Solved

Parsing: Another Approach

Posted on 1997-04-28
7
185 Views
Last Modified: 2010-04-15
HELLO!

This is PARSIT.C ------------------------------------

/* parsit.c */
/* COMPLETELY changed by Jim Nowlin: 4/28/97,
   using expert advice from 'LucHoltkamp'.
   Someday, this will enter the correct brain cells
   in Jim's head, and I will get it to work.*/


/* SPECIAL NOTE: This is NOT eliminating the lines with comments. */

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <malloc.h>

#define MAXARRAY 80
#define TRUE 1
#define FALSE 0

void parseLine(char *line);
char *get_token(char *line);
char *get_lineNumber_token(char *line);
char *remain(char *line);

FILE *check_file;
char buffer[MAXARRAY];
static int inComment = 0;

int main(void)
{


   if ((check_file = fopen("parse1.out","r")) == NULL) {
     printf("Error in input file name.\n");
     exit(1);
     }

     while( fgets(buffer,MAXARRAY , check_file)  != NULL ) {
     parseLine(buffer);

     }



return 0;



}
void parseLine(char *line)
{
int i=0, j=0;
char *line_number = NULL, *string_test = NULL;
char *rest_ofLine = NULL;

while (line[i])
   {
      if (inComment == TRUE) /* are we still in a comment ?? */ {


/* while we are at the beginning of a comment..... */

 while (line[i] && line[i] != '/' && line[i+1] != '*') ++i; /* start of comment */
 if (line[i]) { i+=2; inComment = FALSE; }
      } /* end if (inComment) */

/* when we reach, in a line, the ending comment symbols, */
/*   inComment is assigned FALSE */

      else if (line[i] == '*' && line[i+1] == '/') /* end of comment ? */
      {
            inComment = FALSE; i+=2;
      }

      else if (isalpha(line[i]) || line[i] == '_') /* is this the start of word? */
      {



 string_test = get_token(line);
 if ( string_test != '\0')
   {
   while ( isdigit(string_test[j]) ) j++;
   }
   line_number = string_test;
   rest_ofLine = remain(line);

   /* parseLine(rest_ofLine);   DO I PARSE, AGAIN ? */

   /* while ( *rest_ofLine !='\n' )
   rest_ofLine = get_token(rest_ofLine); */


   printf("line number %s, %s\n", line_number, rest_ofLine);





 /* check to see if word is a C reserved word. If not, add word to Newtree */
 /* if (!TreeSearch(root,word)) {
 second_root =  Newtree(second_root, word);
 }  */

 /*else*/ if (line[i] == '"') /* is this a string?  We don't WANT strings.  */
  ++i;


/* while line[i] is NOT a string, trap for the '\\' escape character sequence. */

 while (line[i] && line[i] != '"')
     {
      if (line[i] == '\\') ++i; /* Watch for special escape char HERE !! */
      ++i;
     }
 }

 else if (line[i] == '\'') /* start of tab, newline character, etc. ?? */
 {
 ++i;
      while (line[i] && line[i] != '\'')
        {
            if (line[i] == '\\')
            ++i; /* Watch special escape char !! */

            ++i;
        }  /* end while (line[i] && line[i] != '\'')  */

 } /* end else-if */


 else ++i; /* skip everything else */

   }  /* end while */

} /* end parseLine(*w) */



char *get_token(char *line)
{

char *return_a_word;
      return_a_word = strtok(line," ");

      return (return_a_word);

}


char *remain(char *line)
{

char *return_a_word;
      return_a_word = strtok(NULL,"");

      return (return_a_word);

}


It works on a test file named: PARSE1.OUT, which looks like
this: --------------------------------------------------

1 #include <stdio.h>
2
3 int integer;
4
5 char character;
6
7 double doubled;
8
9 /* Comment is added here. */
10
11 float check_sum_total( float one, int two);
12
13 int main(void)
14 {
15
16 float first = 0.0, second = 0.0;
17
18 check_sum_total( first, second);
19
20 return 0;
21
22 }
23
24 float check_sum_total( float one, int two)
25 {
26
27 float hold;
28
29 hold = one + two;
30
31 return (hold);
32 }


At present, THIS is its output; NOTICE the comment on line
number 9. It is desired that NO COMMENTS be included
in the output( just like a compiler! )
---------------------------------------------------------

1 #include <stdio.h>
2
3 int integer;
4
5 char character;
6
7 double doubled;
8
9 /* Comment is added here. */
10
11 float check_sum_total( float one, int two);
12
13 int main(void)
14 {
15
16 float first = 0.0, second = 0.0;
17
18 check_sum_total( first, second);
19
20 return 0;
21
22 }
23
24 float check_sum_total( float one, int two)
25 {
26
27 float hold;
28
29 hold = one + two;
30
31 return (hold);
32 }


In the end, the output of PARSE1.OUT MUST( SHOULD ) LOOK LIKE THIS:
---------------------------------------------------------

user-defined words      occurs on line(s). . .

integer                       3
character                     5
doubled                       7
check_sum_total               11, 24
one                           11, 24, 29
two                           11, 24, 29
main                          13
void                          13
first                         16, 18
second                        16, 18
hold                          27, 29, 31

QUESTION:

I was able to extract the line number via a call to
get_token(*line). I was able to extract the remainder of
the line via a call to rest_ofLine = remain(*line).

I WANT to get each valid string and run it through a tree
searching function to 'filter out' ALL 32 RESERVED words
in C.

DO I USE strtok() again or DO I SEND the rest_ofLine string
back recursively into parseLine()?

Jim
0
Comment
Question by:jnowlin
  • 3
  • 3
7 Comments
 
LVL 10

Accepted Solution

by:
RONSLOW earned 150 total points
Comment Utility
Yuck - awful code..

Anyway - here is fix for why comments AREN'T being stripped...

while (line[i]) {
    if (inComment == TRUE)      /* if we are still in a comment */
    {
        if (line[i] == '*' && line[i+1] == '/')      /* are we still in a comment ?? */
        {
            inComment = FALSE;
            i += 2;
        }
        else
        {
            i++;
        }
    }
    else if (line[i] == '/' && line[i+1] == '*') /* start of comment ? */
    {
        inComment = TRUE;
        i+=2;
    }

Haven't looked at the rest of it yet, but that is a start.

Seems like all the incomment logic was completely screwed

You want me to write a nicer solution for you??

I'll provide answers to the rest of your questions subsequently.





0
 

Author Comment

by:jnowlin
Comment Utility
Yes, I could use that.
I need it quick.
I'll buy more points.

JN
0
 
LVL 10

Expert Comment

by:RONSLOW
Comment Utility
Can I have your eMail as posting code here is next to useless due to formatting problems.

I have written some code (still to test) that reads in a C program and _should_ print out a list of line numbers and words (but doesn't group them together yet)

I provided comments in my code, and you should be able to follow it.

I'd like to test it first, but need to set up my VC environment to build stand alone console apps (I don't have the required libraries on my system yet) - I'll put these on my PC tomorrow and give it a try.

Then we just need to build an in-memory list of words and line numbers and print it out.

Please send me your eMail address - mine is Roger_Onslow@compsys.com.au - also feel free t post my code if you can get it formatted nicely.

0
Why You Should Analyze Threat Actor TTPs

After years of analyzing threat actor behavior, it’s become clear that at any given time there are specific tactics, techniques, and procedures (TTPs) that are particularly prevalent. By analyzing and understanding these TTPs, you can dramatically enhance your security program.

 

Author Comment

by:jnowlin
Comment Utility
Hello RONSLOW!!

I THOUGHT there was a problem with code and the way it gets
formatted HERE at EE's HTML page.

I purchased 200 more points yesterday and I will buy MORE!
I figure I owe you at least 200 points, 'LucHoltkamp' at least
100 points, among others. I WILL GET PEOPLE WHO HELP ME THE POINTS
WHICH THEY DESERVE!! It's only right, and it's worth it
for me to get others' knowledge.

My email address is as follows:

jnowlin@ma.ultranet.com

Hope to hear from you soon. I've got some ideas on using some
recursion to properly and correctly parse the files required
to be parsed.

I feel confident, even though this is due this Thursday, May 1, 1997

Later!   :)

Jim Nowlin

jnowlin@ma.ultranet.com

0
 
LVL 3

Expert Comment

by:LucHoltkamp
Comment Utility
Dear Jim,

This is the third time you post this question. Why don't you post comments to the expert that helps you until you solved your problem?? It's cheaper! ;)
Again I must say that you're making a bit of a mess of the code I've written.
I send you a working program (that is, one that extracts all the identifiers and keywords) based on your first question.
Some points that I spot from your code:

1) DONT use recursion, it's making thing more difficult than neccesary.
2) Move the the formatting of linenumbers out of the parseLine function. It will seriously mess up things if you encounter numbers somwhere inside a line.
3) Split up things! (Divide and Conquer) Don't try to do everything in one function.
4) Your function strtok is not enough. Words can also be ended by symbols ({}()[],.;:'-+=!~%^&*etc) tab-characters (\t) newline-characters (\n\r) formfeeds (\f), EOL (\0), EOF (CTRL-Z), etc.
DON'T use strtok, it looks nice but it's not the right choice. Again let me mail you a working parseLine function. It's simpel, small and very fast.

If you email me I will help you, but if you want RONSLOW's help it's ok by me.

Luc Holtkamp (email: lholtkam@plex.nl)
0
 

Author Comment

by:jnowlin
Comment Utility
Hi Luc.

I'll take everbody's help!
I need to get points to you, though.

Do you know how to do that? Right now, the form says to grade
RONSLOW's answer. I guess I fell behind in the last couple
of days.

I put in RONSLOW's version of your first version of
getLine and it DID filter out the comments. I suspect
I messed up your code because of the way EE'x HTML/and or
NetScape is formatting it. I COULDN'T tell WHERE each curly
brace delimited block of code began or ended.

I SHALL send you email.

Jim

PS

Don't forget to let me know how to send you points,
and an 'A'.
0
 
LVL 10

Expert Comment

by:RONSLOW
Comment Utility
Regarding recursion - use it if it is the most natural way to express your problem.  Some things (like binary trees) are naturally recursive and lend themselves to recursive programming.  In such cases your code will be shorter and clearer if you use recursion.  However, the parsing of identifiers in not recursive in nature, so that is NOT the best way to go.  IF you were parsing actual expressions, then recursion IS the way to go, becuase expressions are naturally recursive in nature, but this is NOT what you are doing here - your goals are much simpler.

Do you really have line numbers at the start of each line of your input file - if this was your design decision, then I'd reconsider - let the program do the line number counting for you.  If this format has been forced upon you then I guess you have to live with it.

I definitely agree with the divide and conquer approach - it is much easier to understand a short function with a discrete task to do and whose code fits on a single screen than to come to terms with a monolithic monster.

See my code that I am emailing you on parsing - yes, strtok is not the right choice for this sort of parsing.

Good luck !!!
0

Featured Post

Find Ransomware Secrets With All-Source Analysis

Ransomware has become a major concern for organizations; its prevalence has grown due to past successes achieved by threat actors. While each ransomware variant is different, we’ve seen some common tactics and trends used among the authors of the malware.

Join & Write a Comment

Preface I don't like visual development tools that are supposed to write a program for me. Even if it is Xcode and I can use Interface Builder. Yes, it is a perfect tool and has helped me a lot, mainly, in the beginning, when my programs were small…
Windows programmers of the C/C++ variety, how many of you realise that since Window 9x Microsoft has been lying to you about what constitutes Unicode (http://en.wikipedia.org/wiki/Unicode)? They will have you believe that Unicode requires you to use…
The goal of this video is to provide viewers with basic examples to understand and use conditional statements in the C programming language.
The goal of this video is to provide viewers with basic examples to understand and use switch statements in the C programming language.

772 members asked questions and received personalized solutions in the past 7 days.

Join the community of 500,000 technology professionals and ask your questions.

Join & Ask a Question

Need Help in Real-Time?

Connect with top rated Experts

14 Experts available now in Live!

Get 1:1 Help Now