As the compiler for the programming language (
mC) introduced in the previous post nears completion, I would like to provide a brief overview of how the compiler works and share some source code.
The compiler is comprised of 3 main components - the scanner, parser and virtual machine. In this post I'll describe the scanner. Below is the C code for the scanner (which is based on the scanner in Kenneth C. Louden's excellent "Compiler Construction:Principles and Practice"):
Code:
//scanner.c v1.00
//DFA scanner - read the source code and return the next token to the parser
//includes
#include <stdbool.h>
#include <ctype.h>
#include <stdint.h>
#include <string.h>
#include "definitions.h"
//structs
struct { //lookup table of reserved words
char* str;
TokenType tok;
} reservedWords[RESERVED] = {{"main",MAIN},{"if",IF},{"else",ELSE},{"do",DO},{"while",WHILE},{"for",FOR}, {"return",RETURN},
{"read",READ},{"write",WRITE},{"and",AND},{"or",OR},{"not",NOT},{"true",TRUE},
{"false",FALSE},{"local",LOCAL},{"array",ARRAY},{"switch",SWITCH},
{"case",CASE},{"break",BREAK},{"default",DEFAULT},{"goto",GOTO},{"nop",NOPR}};
//global variables
char source[SOURCE+1]={"/*factorial program*/\n" //example program
"main {\n"
"read x;\n"
"factorial=1;\n"
"while (x>0) {\n"
"factorial = factorial*x;\n"
"x--;\n"
"}\n"
"write factorial;\n"
"}\n"
};
uint32_t sourceIndex=0; //current position in source
uint16_t lineno=0; //current line number in source
char lexemeString[MAXLEXEMELEN+1];
//prototypes
TokenType reservedLookup(char *lexemeString);
//functions
TokenType getToken(void) { //returns the next token in source
char c; //current character in source
uint16_t lexemeStringIndex = 0; //index for storing lexeme into lexemeString
_Bool save; //flag to indicate save to lexemeString
TokenType currentToken; //holds current token to be returned to the parser
StateType state = START; //current state - always begins at START
while (state!=DONE) {
c=source[sourceIndex++]; //next character in program
save = true;
switch (state) { //begin switch (state)
case START:
if (isdigit(c))
state = INNUM;
else if (isalpha(c))
state = INID;
else if (c == '<')
state = INLESSTHAN;
else if (c == '>')
state = INGREATERTHAN;
else if (c == '=')
state = INEQUAL;
else if (c == '!')
state = INNOTEQUAL;
else if (c == '/') {
save = false;
state = INDIV;
}
else if (c =='+') {
state = INPLUS;
}
else if (c =='-') {
state = INMINUS;
}
else if ((c == ' ') || (c == '\t') || (c == '\n')) {
save = false;
if (c == '\n') lineno++;
}
else
{ state = DONE;
switch (c) {
case ENDFILE:
save = false;
currentToken = ENDFILE;
break;
case '*':
currentToken = TIMES;
break;
case ':':
currentToken = COLON;
break;
case ';':
currentToken = SEMI;
break;
case ',':
currentToken = COMMA;
break;
case '(':
currentToken = LPAREN;
break;
case ')':
currentToken = RPAREN;
break;
case '[':
currentToken = LBRACKET;
break;
case ']':
currentToken = RBRACKET;
break;
case '{':
currentToken = LBRACE;
break;
case '}':
currentToken = RBRACE;
break;
case '&':
currentToken = ADDR;
break;
default:
//insert error code
state = DONE;
}
}
break;
case INNUM:
if (!isdigit(c)) { //backup in the input
if(c == 'e') {
state=INEXP;
}
else if(c != '.') {
sourceIndex--;
save = false;
state = DONE;
currentToken = NUM;
}
}
break;
case INEXP:
state=INNUM;
break;
case INID: //ID is letters/digits (cannot start with a digit)
if (!isalpha(c)) { //backup in the input
if (!isdigit(c)) {
sourceIndex--;
save = false;
state = DONE;
currentToken = ID;
}
}
break;
case INLESSTHAN:
state = DONE;
if (c == '=') currentToken = LTE;
else {
save = false;
sourceIndex--;
currentToken = LT;
}
break;
case INGREATERTHAN:
state = DONE;
if (c == '=') currentToken = GTE;
else {
save = false;
sourceIndex--;
currentToken = GT;
}
break;
case INEQUAL:
state = DONE;
if (c == '=') currentToken = EQ;
else {
sourceIndex--;
save=false;
currentToken = ASSIGN;
}
break;
case INNOTEQUAL:
state = DONE;
if (c == '=') currentToken = NEQ;
else {
//insert error code
state = DONE;
}
break;
case INDIV:
save = false;
if (c == '*') state = INCOMMENT;
else {
state = DONE;
sourceIndex--;
currentToken = DIV;
if (lexemeStringIndex < MAXLEXEMELEN) {
lexemeString[lexemeStringIndex++] = '/';
lexemeString[lexemeStringIndex] = '\0';
}
else {
//insert error code
}
}
break;
case INPLUS:
state = DONE;
if (c == '+') currentToken = INC;
else {
save = false;
sourceIndex--;
currentToken = PLUS;
}
break;
case INMINUS:
state = DONE;
if (c == '-') currentToken = DEC;
else {
save = false;
sourceIndex--;
currentToken = MINUS;
}
break;
case INCOMMENT:
save = false;
if (c == ENDFILE) {
state = DONE;
currentToken = ENDFILE;
}
else if (c == '*')
state = OUTCOMMENT;
else if (c == '\n') lineno++;
break;
case OUTCOMMENT:
save = false;
if (c == '/') state = START;
else { //backup in the input
sourceIndex--;
state = INCOMMENT;
}
break;
case DONE:
default: //should never happen
//insert error code
state = DONE;
break;
} //end switch (state)
if (save) {
if (lexemeStringIndex < MAXLEXEMELEN) lexemeString[lexemeStringIndex++] = c;
else {
//insert error code
}
}
if (state == DONE) {
lexemeString[lexemeStringIndex] = '\0';
if (currentToken == ID) currentToken = reservedLookup(lexemeString);
}
}
return currentToken;
}
TokenType reservedLookup (char *lexemeString) { //lookup an identifier to see if it is a reserved word (linear search)
uint16_t i;
for (i=0;i<RESERVED;i++) {
if (!strcmp(lexemeString,reservedWords[i].str)) return reservedWords[i].tok;
}
return ID;
}
The code for the "definitions.h" header file is:
Code:
//definitions.h v1.00
//tokens
typedef enum { ENDFILE, //book-keeping tokens
MAIN,IF,ELSE,DO,WHILE,FOR,RETURN,READ,WRITE,AND,OR,NOT,TRUE,FALSE,LOCAL,ARRAY,SWITCH, //reserved words
CASE,BREAK,DEFAULT,GOTO,NOPR, //reserved words
ID,NUM, //multicharacter tokens
//special symbols
PLUS,MINUS,TIMES,DIV,LT,LTE,GT,GTE,EQ,NEQ,ASSIGN,COLON,SEMI,COMMA,LPAREN,RPAREN,LBRACKET,RBRACKET,
LBRACE,RBRACE,INC,DEC,ADDR
} TokenType;
//states in scanner DFA
typedef enum {START,INNUM,INEXP,INID,INLESSTHAN,INGREATERTHAN,INEQUAL,INNOTEQUAL,INDIV,INPLUS,INMINUS,
INCOMMENT,OUTCOMMENT,DONE} StateType;
//definitions
#define RESERVED 22 //the number of reserved words
#define SOURCE 1000 //maximum character length of source code
#define MAXLEXEMELEN 40 //maximum length of lexeme
The program source code is an array of characters terminated by the null character (i.e. a string). To improve readability, the string is padded with white space (blanks,tabs and newlines). Comments (text between /* and */) are ignored. The source code for the example program in "scanner.c v1.00" is:
Code:
/*factorial program 1*/
main{
read x;
factorial=1;
while (x>0) {
factorial = factorial*x;
x--;
}
write factorial;
}
The numbers,letters,words and special symbols that make up the source code are called "lexemes". Each lexeme belongs to a special category called a "token". The scanner extracts each lexeme from the source code and determine its token type.
To test the scanner, add the following two files to the project:
Code:
//compiler.c v1.00
//prototypes
void parse(void);
//functions
int main (void) {
parse();
}
Code:
//parser.c v1.00
#include "definitions.h"
//global variables
extern char lexemeString[MAXLEXEMELEN+1];
//prototypes
TokenType getToken(void);
//functions
void parse(void) {
TokenType token;
while(token!=ENDFILE) {
token=getToken();
}
}
Build the project, start a Debug Session (I use the Keil uVision5 simulator) and repeatedly step over the line "token=getToken();" in "parser.c" to extract the lexemes and determine the corresponding token (the lexemes are stored in the variable "lexemeString"):
Code:
lexeme token
main MAIN
{ LBRACE
read READ
x ID
; SEMI
factorial ID
= ASSIGN
1 NUM
; SEMI
while WHILE
( LPAREN
x ID
> GT
0 NUM
) RPAREN
{ LBRACE
factorial ID
= ASSIGN
factorial ID
* TIMES
x ID
; SEMI
x ID
-- DEC
; SEMI
} RBRACE
write WRITE
factorial ID
; SEMI
} RBRACE
0 ENDFILE
This information is passed to the parser, which will be the subject of the next post.