Threaded Mode | Linear Mode

F-73P · (This post was last modified: 12-11-2021 08:59 AM by F-73P.)

As the compiler for the programming language (mC) introduced in the previous post nears completion, I would like to provide a brief overview of how the compiler works and share some source code.

The compiler is comprised of 3 main components - the scanner, parser and virtual machine. In this post I'll describe the scanner. Below is the C code for the scanner (which is based on the scanner in Kenneth C. Louden's excellent "Compiler Construction:Principles and Practice"):

Code:

//scanner.c v1.00

//DFA scanner - read the source code and return the next token to the parser

//includes

#include <stdbool.h>

#include <ctype.h>

#include <stdint.h>

#include <string.h>

#include "definitions.h"

//structs

struct { //lookup table of reserved words

char* str;

TokenType tok;

} reservedWords[RESERVED] = {{"main",MAIN},{"if",IF},{"else",ELSE},{"do",DO},{"while",WHILE},{"for",FOR}, {"return",RETURN},

                             {"read",READ},{"write",WRITE},{"and",AND},{"or",OR},{"not",NOT},{"true",TRUE},

                             {"false",FALSE},{"local",LOCAL},{"array",ARRAY},{"switch",SWITCH},

                             {"case",CASE},{"break",BREAK},{"default",DEFAULT},{"goto",GOTO},{"nop",NOPR}};

//global variables    

char source[SOURCE+1]={"/*factorial program*/\n" //example program 

                     "main {\n"

                          "read x;\n"                            

                          "factorial=1;\n"

                          "while (x>0) {\n"

                            "factorial = factorial*x;\n"

                            "x--;\n"

                        "}\n"

                        "write factorial;\n"

                      "}\n"                              

                     };

uint32_t sourceIndex=0; //current position in source

uint16_t lineno=0; //current line number in source    

char lexemeString[MAXLEXEMELEN+1];

//prototypes

TokenType reservedLookup(char *lexemeString);

//functions

TokenType getToken(void) { //returns the next token in source    

    char c;    //current character in source

    uint16_t lexemeStringIndex = 0; //index for storing lexeme into lexemeString

    _Bool save; //flag to indicate save to lexemeString

    TokenType currentToken; //holds current token to be returned to the parser    

    StateType state = START; //current state - always begins at START

  while (state!=DONE) {

        c=source[sourceIndex++]; //next character in program

        save = true;

        switch (state) { //begin switch (state)

            case START:

                if (isdigit(c))

                    state = INNUM;

         else if (isalpha(c))

           state = INID;

                 else if (c == '<')

                     state = INLESSTHAN;

                 else if (c == '>')

                     state = INGREATERTHAN;

                 else if (c == '=')

                     state = INEQUAL;

                 else if (c == '!')

                     state = INNOTEQUAL;

                 else if (c == '/') {

                     save = false;

                     state = INDIV;

                 }

                 else if (c =='+') {

                     state = INPLUS;

                 }

                 else if (c =='-') {

                     state = INMINUS;

                 }

         else if ((c == ' ') || (c == '\t') || (c == '\n')) {

                     save = false;

                     if (c == '\n') lineno++;

                 }

         else

         { state = DONE;

           switch (c) {

                         case ENDFILE:

               save = false;

               currentToken = ENDFILE;

               break;

             case '*':

               currentToken = TIMES;

               break;

                         case ':':

               currentToken = COLON;

               break;

             case ';':

               currentToken = SEMI;

               break;

             case ',':

                             currentToken = COMMA;

               break;

             case '(':

               currentToken = LPAREN;

               break;

             case ')':

               currentToken = RPAREN;

               break;

                         case '[':

               currentToken = LBRACKET;

               break;

             case ']':

               currentToken = RBRACKET;

               break;

                         case '{':

               currentToken = LBRACE;

               break;

             case '}':

               currentToken = RBRACE;

               break;

                         case '&':

               currentToken = ADDR;

               break;

             default:

                 //insert error code

                 state = DONE;

           }

         }

         break;

            case INNUM:

         if (!isdigit(c)) { //backup in the input

                     if(c == 'e') {

                         state=INEXP;

                     }

           else if(c != '.') {

                         sourceIndex--;

                         save = false;

                         state = DONE;

                         currentToken = NUM;

                     }

         }

         break;

            case INEXP:

                state=INNUM;

                break;

      case INID: //ID is letters/digits (cannot start with a digit)

         if (!isalpha(c)) { //backup in the input

                     if (!isdigit(c)) {

                         sourceIndex--;

                         save = false;

                         state = DONE;

                         currentToken = ID;

                     }

         }

         break;

            case INLESSTHAN:

              state = DONE;

              if (c == '=') currentToken = LTE;

              else {

          save = false;

                    sourceIndex--;

                    currentToken = LT;

        }

              break;

            case INGREATERTHAN:

              state = DONE;

              if (c == '=') currentToken = GTE;

              else {

          save = false;

                    sourceIndex--;

                    currentToken = GT;

        }

              break;

            case INEQUAL:

                state = DONE;

              if (c == '=') currentToken = EQ;

              else {

                    sourceIndex--;

                    save=false;

                    currentToken = ASSIGN;

                }

                break;

            case INNOTEQUAL:

                state = DONE;

              if (c == '=') currentToken = NEQ;

              else {

                    //insert error code

                    state = DONE;

                }

              break;

            case INDIV:

        save = false;

              if (c == '*') state = INCOMMENT;

              else {

                    state = DONE;

                    sourceIndex--;

                    currentToken = DIV;                        

                    if (lexemeStringIndex < MAXLEXEMELEN) {

                        lexemeString[lexemeStringIndex++] = '/';

                        lexemeString[lexemeStringIndex] = '\0';

                    }

                    else {

                        //insert error code

                    }    

        }

              break;

            case INPLUS:

        state = DONE;

              if (c == '+') currentToken = INC;

              else {

          save = false;

                    sourceIndex--;

                    currentToken = PLUS;

        }

              break;

            case INMINUS:

        state = DONE;

              if (c == '-') currentToken = DEC;

              else {

          save = false;

                    sourceIndex--;

                    currentToken = MINUS;

        }

              break;

      case INCOMMENT:

              save = false;

              if (c == ENDFILE) { 

                    state = DONE;

                  currentToken = ENDFILE;

              }

              else if (c == '*')

                  state = OUTCOMMENT;

                else if (c == '\n') lineno++;

              break;

            case OUTCOMMENT:

                save = false;

                if (c == '/') state = START;

              else { //backup in the input

           sourceIndex--;

           state = INCOMMENT;

        }

                break;

      case DONE:

      default: //should never happen

               //insert error code

                 state = DONE;

         break;

    } //end switch (state)

        if (save) {

            if (lexemeStringIndex < MAXLEXEMELEN) lexemeString[lexemeStringIndex++] = c;

      else {

                //insert error code

            }                

        }

    if (state == DONE) {

            lexemeString[lexemeStringIndex] = '\0';

       if (currentToken == ID) currentToken = reservedLookup(lexemeString);

        }

  }

    return currentToken;

}

TokenType reservedLookup (char *lexemeString) { //lookup an identifier to see if it is a reserved word (linear search)

    uint16_t i;

    for (i=0;i<RESERVED;i++) {

        if (!strcmp(lexemeString,reservedWords[i].str)) return reservedWords[i].tok;

    }

    return ID;

}

The code for the "definitions.h" header file is:

Code:

//definitions.h v1.00

//tokens

typedef enum { ENDFILE, //book-keeping tokens

    MAIN,IF,ELSE,DO,WHILE,FOR,RETURN,READ,WRITE,AND,OR,NOT,TRUE,FALSE,LOCAL,ARR​AY,SWITCH, //reserved words

      CASE,BREAK,DEFAULT,GOTO,NOPR, //reserved words

    ID,NUM, //multicharacter tokens

    //special symbols

    PLUS,MINUS,TIMES,DIV,LT,LTE,GT,GTE,EQ,NEQ,ASSIGN,COLON,SEMI,COMMA,LPAREN,RP​AREN,LBRACKET,RBRACKET,

    LBRACE,RBRACE,INC,DEC,ADDR

} TokenType;

//states in scanner DFA

typedef enum {START,INNUM,INEXP,INID,INLESSTHAN,INGREATERTHAN,INEQUAL,INNOTEQUAL,INDIV,I​NPLUS,INMINUS,

              INCOMMENT,OUTCOMMENT,DONE} StateType;

//definitions

#define RESERVED 22 //the number of reserved words

#define SOURCE 1000 //maximum character length of source code

#define MAXLEXEMELEN 40 //maximum length of lexeme

The program source code is an array of characters terminated by the null character (i.e. a string). To improve readability, the string is padded with white space (blanks,tabs and newlines). Comments (text between /* and */) are ignored. The source code for the example program in "scanner.c v1.00" is:

Code:

/*factorial program 1*/

main{

   read x;                          

   factorial=1;

   while (x>0) {

     factorial = factorial*x;

     x--;                          

   }

   write factorial;

}

The numbers,letters,words and special symbols that make up the source code are called "lexemes". Each lexeme belongs to a special category called a "token". The scanner extracts each lexeme from the source code and determine its token type.

To test the scanner, add the following two files to the project:

Code:

//compiler.c v1.00

//prototypes

void parse(void);

//functions

int main (void) {

    parse();    

}

Code:

//parser.c v1.00

#include "definitions.h"

//global variables

extern char lexemeString[MAXLEXEMELEN+1];

//prototypes

TokenType getToken(void);

//functions

void parse(void) {

    TokenType token;

    while(token!=ENDFILE) {

        token=getToken();

    }        

}

Build the project, start a Debug Session (I use the Keil uVision5 simulator) and repeatedly step over the line "token=getToken();" in "parser.c" to extract the lexemes and determine the corresponding token (the lexemes are stored in the variable "lexemeString"):

Code:

lexeme                                                token

main                                                  MAIN

{                                                     LBRACE

read                                                  READ

x                                                     ID

;                                                     SEMI

factorial                                             ID

=                                                     ASSIGN

1                                                     NUM

;                                                     SEMI

while                                                 WHILE

(                                                     LPAREN

x                                                     ID

>                                                     GT

0                                                     NUM

)                                                     RPAREN

{                                                     LBRACE

factorial                                             ID

=                                                     ASSIGN

factorial                                             ID

*                                                     TIMES

x                                                     ID

;                                                     SEMI

x                                                     ID

--                                                    DEC

;                                                     SEMI

}                                                     RBRACE

write                                                 WRITE

factorial                                             ID

;                                                     SEMI

}                                                     RBRACE

0                                                     ENDFILE

This information is passed to the parser, which will be the subject of the next post.