Post Reply 
monic part 5: writing C programs on the calculator
12-05-2021, 09:44 AM (This post was last modified: 12-11-2021 08:59 AM by F-73P.)
Post: #20
RE: monic part 5: writing C programs on the calculator
As the compiler for the programming language (mC) introduced in the previous post nears completion, I would like to provide a brief overview of how the compiler works and share some source code.

The compiler is comprised of 3 main components - the scanner, parser and virtual machine. In this post I'll describe the scanner. Below is the C code for the scanner (which is based on the scanner in Kenneth C. Louden's excellent "Compiler Construction:Principles and Practice"):

Code:

//scanner.c v1.00

//DFA scanner - read the source code and return the next token to the parser

//includes

#include <stdbool.h>
#include <ctype.h>
#include <stdint.h>
#include <string.h>

#include "definitions.h"

//structs

struct { //lookup table of reserved words
char* str;
TokenType tok;
} reservedWords[RESERVED] = {{"main",MAIN},{"if",IF},{"else",ELSE},{"do",DO},{"while",WHILE},{"for",FOR}, {"return",RETURN},
                             {"read",READ},{"write",WRITE},{"and",AND},{"or",OR},{"not",NOT},{"true",TRUE},
                             {"false",FALSE},{"local",LOCAL},{"array",ARRAY},{"switch",SWITCH},
                             {"case",CASE},{"break",BREAK},{"default",DEFAULT},{"goto",GOTO},{"nop",NOPR}};
//global variables    
                                                         
char source[SOURCE+1]={"/*factorial program*/\n" //example program 

                     "main {\n"
                          "read x;\n"                            
                          "factorial=1;\n"
                          "while (x>0) {\n"
                            "factorial = factorial*x;\n"
                            "x--;\n"
                        "}\n"
                        "write factorial;\n"
                      "}\n"                              
                     };

uint32_t sourceIndex=0; //current position in source
uint16_t lineno=0; //current line number in source    
char lexemeString[MAXLEXEMELEN+1];

//prototypes
                                         
TokenType reservedLookup(char *lexemeString);

//functions

TokenType getToken(void) { //returns the next token in source    
    char c;    //current character in source
    
    uint16_t lexemeStringIndex = 0; //index for storing lexeme into lexemeString
    _Bool save; //flag to indicate save to lexemeString
    
    TokenType currentToken; //holds current token to be returned to the parser    
  
    StateType state = START; //current state - always begins at START

  while (state!=DONE) {
        c=source[sourceIndex++]; //next character in program
        save = true;
        switch (state) { //begin switch (state)
            case START:
                if (isdigit(c))
                    state = INNUM;
         else if (isalpha(c))
           state = INID;
                 else if (c == '<')
                     state = INLESSTHAN;
                 else if (c == '>')
                     state = INGREATERTHAN;
                 else if (c == '=')
                     state = INEQUAL;
                 else if (c == '!')
                     state = INNOTEQUAL;
                 else if (c == '/') {
                     save = false;
                     state = INDIV;
                 }
                 else if (c =='+') {
                     state = INPLUS;
                 }
                 else if (c =='-') {
                     state = INMINUS;
                 }
         else if ((c == ' ') || (c == '\t') || (c == '\n')) {
                     save = false;
                     if (c == '\n') lineno++;
                 }
         else
         { state = DONE;
           switch (c) {
                         case ENDFILE:
               save = false;
               currentToken = ENDFILE;
               break;
             case '*':
               currentToken = TIMES;
               break;
                         case ':':
               currentToken = COLON;
               break;
             case ';':
               currentToken = SEMI;
               break;
             case ',':
                             currentToken = COMMA;
               break;
             case '(':
               currentToken = LPAREN;
               break;
             case ')':
               currentToken = RPAREN;
               break;
                         case '[':
               currentToken = LBRACKET;
               break;
             case ']':
               currentToken = RBRACKET;
               break;
                         case '{':
               currentToken = LBRACE;
               break;
             case '}':
               currentToken = RBRACE;
               break;
                         case '&':
               currentToken = ADDR;
               break;
             default:
                 //insert error code
                 state = DONE;
           }
         }
         break;
            case INNUM:
         if (!isdigit(c)) { //backup in the input
                     if(c == 'e') {
                         state=INEXP;
                     }
           else if(c != '.') {
                         sourceIndex--;
                         save = false;
                         state = DONE;
                         currentToken = NUM;
                     }
         }
         break;
            case INEXP:
                state=INNUM;
                break;
      case INID: //ID is letters/digits (cannot start with a digit)
         if (!isalpha(c)) { //backup in the input
                     if (!isdigit(c)) {
                         sourceIndex--;
                         save = false;
                         state = DONE;
                         currentToken = ID;
                     }
         }
         break;
            case INLESSTHAN:
              state = DONE;
              if (c == '=') currentToken = LTE;
              else {
          save = false;
                    sourceIndex--;
                    currentToken = LT;
        }
              break;
            case INGREATERTHAN:
              state = DONE;
              if (c == '=') currentToken = GTE;
              else {
          save = false;
                    sourceIndex--;
                    currentToken = GT;
        }
              break;
            case INEQUAL:
                state = DONE;
              if (c == '=') currentToken = EQ;
              else {
                    sourceIndex--;
                    save=false;
                    currentToken = ASSIGN;
                }
                break;
            case INNOTEQUAL:
                state = DONE;
              if (c == '=') currentToken = NEQ;
              else {
                    //insert error code
                    state = DONE;
                }
              break;
            case INDIV:
        save = false;
              if (c == '*') state = INCOMMENT;
              else {
                    state = DONE;
                    sourceIndex--;
                    currentToken = DIV;                        
                    if (lexemeStringIndex < MAXLEXEMELEN) {
                        lexemeString[lexemeStringIndex++] = '/';
                        lexemeString[lexemeStringIndex] = '\0';
                    }
                    else {
                        //insert error code
                    }    
        }
              break;
            case INPLUS:
        state = DONE;
              if (c == '+') currentToken = INC;
              else {
          save = false;
                    sourceIndex--;
                    currentToken = PLUS;
        }
              break;
            case INMINUS:
        state = DONE;
              if (c == '-') currentToken = DEC;
              else {
          save = false;
                    sourceIndex--;
                    currentToken = MINUS;
        }
              break;
      case INCOMMENT:
              save = false;
              if (c == ENDFILE) { 
                    state = DONE;
                  currentToken = ENDFILE;
              }
              else if (c == '*')
                  state = OUTCOMMENT;
                else if (c == '\n') lineno++;
              break;
            case OUTCOMMENT:
                save = false;
                if (c == '/') state = START;
              else { //backup in the input
           sourceIndex--;
           state = INCOMMENT;
        }
                break;
      case DONE:
      default: //should never happen
               //insert error code
                 state = DONE;
         break;
    } //end switch (state)

        if (save) {
            if (lexemeStringIndex < MAXLEXEMELEN) lexemeString[lexemeStringIndex++] = c;
      else {
                //insert error code
            }                
        }

    if (state == DONE) {
            lexemeString[lexemeStringIndex] = '\0';
       if (currentToken == ID) currentToken = reservedLookup(lexemeString);
        }
  }
    return currentToken;
}

TokenType reservedLookup (char *lexemeString) { //lookup an identifier to see if it is a reserved word (linear search)
    uint16_t i;
    for (i=0;i<RESERVED;i++) {
        if (!strcmp(lexemeString,reservedWords[i].str)) return reservedWords[i].tok;
    }
    return ID;
}

The code for the "definitions.h" header file is:

Code:

//definitions.h v1.00

//tokens
typedef enum { ENDFILE, //book-keeping tokens
    MAIN,IF,ELSE,DO,WHILE,FOR,RETURN,READ,WRITE,AND,OR,NOT,TRUE,FALSE,LOCAL,ARR​AY,SWITCH, //reserved words
      CASE,BREAK,DEFAULT,GOTO,NOPR, //reserved words
    ID,NUM, //multicharacter tokens
    //special symbols
    PLUS,MINUS,TIMES,DIV,LT,LTE,GT,GTE,EQ,NEQ,ASSIGN,COLON,SEMI,COMMA,LPAREN,RP​AREN,LBRACKET,RBRACKET,
    LBRACE,RBRACE,INC,DEC,ADDR
} TokenType;

//states in scanner DFA
typedef enum {START,INNUM,INEXP,INID,INLESSTHAN,INGREATERTHAN,INEQUAL,INNOTEQUAL,INDIV,I​NPLUS,INMINUS,
              INCOMMENT,OUTCOMMENT,DONE} StateType;

//definitions
#define RESERVED 22 //the number of reserved words
#define SOURCE 1000 //maximum character length of source code
#define MAXLEXEMELEN 40 //maximum length of lexeme

The program source code is an array of characters terminated by the null character (i.e. a string). To improve readability, the string is padded with white space (blanks,tabs and newlines). Comments (text between /* and */) are ignored. The source code for the example program in "scanner.c v1.00" is:

Code:

/*factorial program 1*/

main{
   read x;                          
   factorial=1;
   while (x>0) {
     factorial = factorial*x;
     x--;                          
   }
   write factorial;
}

The numbers,letters,words and special symbols that make up the source code are called "lexemes". Each lexeme belongs to a special category called a "token". The scanner extracts each lexeme from the source code and determine its token type.

To test the scanner, add the following two files to the project:

Code:

//compiler.c v1.00

//prototypes

void parse(void);

//functions

int main (void) {
    parse();    
}

Code:

//parser.c v1.00

#include "definitions.h"

//global variables

extern char lexemeString[MAXLEXEMELEN+1];

//prototypes

TokenType getToken(void);

//functions

void parse(void) {
    TokenType token;
    
    while(token!=ENDFILE) {
        token=getToken();
    }        
}

Build the project, start a Debug Session (I use the Keil uVision5 simulator) and repeatedly step over the line "token=getToken();" in "parser.c" to extract the lexemes and determine the corresponding token (the lexemes are stored in the variable "lexemeString"):

Code:

lexeme                                                token

main                                                  MAIN
{                                                     LBRACE
read                                                  READ
x                                                     ID
;                                                     SEMI
factorial                                             ID
=                                                     ASSIGN
1                                                     NUM
;                                                     SEMI
while                                                 WHILE
(                                                     LPAREN
x                                                     ID
>                                                     GT
0                                                     NUM
)                                                     RPAREN
{                                                     LBRACE
factorial                                             ID
=                                                     ASSIGN
factorial                                             ID
*                                                     TIMES
x                                                     ID
;                                                     SEMI
x                                                     ID
--                                                    DEC
;                                                     SEMI
}                                                     RBRACE
write                                                 WRITE
factorial                                             ID
;                                                     SEMI
}                                                     RBRACE
0                                                     ENDFILE

This information is passed to the parser, which will be the subject of the next post.

The C language combines all the power of assembly language with all the ease-of-use of assembly language
Find all posts by this user
Quote this message in a reply
Post Reply 


Messages In This Thread
RE: monic part 5: writing C programs on the calculator - F-73P - 12-05-2021 09:44 AM



User(s) browsing this thread: 8 Guest(s)