AST Parser for C-like Language

project4-ast/.DS_Store
project4-ast/ast.c
#include "symbolTable.h"
#include "ast.h"
extern SymbolTableStackEntryPtr symbolStackTop;
extern int scopeDepth;
/*
 * Creates a new expression node.
 *
 * kind: which expression this node represents (stored in eKind).
 *
 * Returns a zero-filled node tagged as EXPRESSION, or NULL if memory is
 * exhausted. Children, sibling, type, symbol pointers and line number are
 * left zeroed for the caller to fill in.
 */
AstNodePtr new_ExprNode(ExpKind kind)
{
    AstNodePtr node = calloc(1, sizeof *node);

    if (node == NULL)
        return NULL;
    node->nKind = EXPRESSION;
    node->eKind = kind;
    return node;
}
/*
 * Creates a new statement node.
 *
 * kind: which statement this node represents (stored in sKind).
 *
 * Returns a zero-filled node tagged as STMT, or NULL if memory is
 * exhausted. Children, sibling, symbol-table pointer and line number are
 * left zeroed for the caller to fill in.
 */
AstNodePtr new_StmtNode(StmtKind kind)
{
    AstNodePtr node = calloc(1, sizeof *node);

    if (node == NULL)
        return NULL;
    node->nKind = STMT;
    node->sKind = kind;
    return node;
}
/*
 * Creates a new type node for entry into the symbol table.
 *
 * kind: the TypeKind to record.
 *
 * Returns a zero-filled Type with its kind set (dimension 0, function
 * pointer zeroed), or NULL if memory is exhausted.
 */
Type* new_type(TypeKind kind)
{
    Type *type = calloc(1, sizeof *type);

    if (type == NULL)
        return NULL;
    type->kind = kind;
    return type;
}

project4-ast/ast.h
#include "globals.h"
/* Category of an AST node (stored in AstNode.nKind). */
typedef enum {
    STMT,
    EXPRESSION,
    FORMALVAR,
    METHOD
} NodeKind;

/* Statement flavours (stored in AstNode.sKind). */
typedef enum {
    IF_THEN_ELSE_STMT,
    WHILE_STMT,
    RETURN_STMT,
    COMPOUND_STMT,
    EXPRESSION_STMT
} StmtKind;

/* Expression flavours (stored in AstNode.eKind). */
typedef enum {
    VAR_EXP,
    ARRAY_EXP,
    ASSI_EXP,
    ADD_EXP,
    SUB_EXP,
    MULT_EXP,
    DIV_EXP,
    GT_EXP,
    LT_EXP,
    GE_EXP,
    LE_EXP,
    EQ_EXP,
    NE_EXP,
    CALL_EXP,
    CONST_EXP
} ExpKind;
typedef struct node {
NodeKind nKind;
StmtKind sKind;
ExpKind eKind;
struct node *children[MAXCHILDREN]; // max needed for
if_then_else
struct node *sibling; //List of statements, formal argument
Type *nType; //node's type for typechecking expressions for
HW5
ElementPtr nSymbolPtr; // for variable and array expressions
int nValue; // for integer constants
SymbolTablePtr nSymbolTabPtr; //For a block, its symbol
table
char *fname; // for a call expr, the name of the called
function
int nLinenumber;
}AstNode;

typedef AstNode *AstNodePtr;
AstNodePtr new_ExprNode(ExpKind kind) ;
AstNodePtr new_StmtNode(StmtKind kind);
Type* new_type(TypeKind kind);
project4-ast/CCOM4087-Proyect4-AST.pdf
mailto:[email protected]
http://piazza.com
http://piazza.com

http://piazza.com
project4-ast/cmlexer.l
/**********************/
/* C header files */
/**********************/
%{
#include "ast.h"
#include "cmparser.tab.h"
char tokenString[TOKENMAX];
int printoutScan = 0;
%}
%option yylineno

/**********************/
/* start your regular definitions here */
/**********************/
NUMBER [0-9]+
IDENTIFIER [a-zA-Z_$][a-zA-Z_$0-9]*
WHITESPACE [ \t\r\n]+
/* start your token specifications here */
/* Token names must come from cmparser.tab.h */
%%
auto { return TOK_ERROR; }
continue { return TOK_ERROR; }
goto { return TOK_ERROR; }
break { return TOK_ERROR; }
default { return TOK_ERROR; }
long { return TOK_ERROR; }
case { return TOK_ERROR; }
do { return TOK_ERROR; }
short { return TOK_ERROR; }
char { return TOK_ERROR; }
double { return TOK_ERROR; }
register { return TOK_ERROR; }
const { return TOK_ERROR; }
for { return TOK_ERROR; }
switch { return TOK_ERROR; }
byte { return TOK_ERROR; }
float { return TOK_ERROR; }
"~" { return TOK_ERROR; }
"&" { return TOK_ERROR; }
"+=" { return TOK_ERROR; }
"^=" { return TOK_ERROR; }

"?" { return TOK_ERROR; }
"|" { return TOK_ERROR; }
"-=" { return TOK_ERROR; }
"%=" { return TOK_ERROR; }
":" { return TOK_ERROR; }
"^" { return TOK_ERROR; }
"*=" { return TOK_ERROR; }
"<<=" { return TOK_ERROR; }
"++" { return TOK_ERROR; }
"<<" { return TOK_ERROR; }
"/=" { return TOK_ERROR; }
">>=" { return TOK_ERROR; }
"--" { return TOK_ERROR; }
">>" { return TOK_ERROR; }
"&=" { return TOK_ERROR; }
">>>=" { return TOK_ERROR; }
"!" { return TOK_ERROR; }
">>>" { return TOK_ERROR; }
"|=" { return TOK_ERROR; }
else { return TOK_ELSE; }
if { return TOK_IF; }
return { return TOK_RETURN; }
void { return TOK_VOID; }
int { return TOK_INT; }
while { return TOK_WHILE; }
"+" { return TOK_PLUS; }
"-" { return TOK_MINUS; }
"*" { return TOK_MULT; }
"/" { return TOK_DIV; }
"<" { return TOK_LT; }
"<=" { return TOK_LE; }
">" { return TOK_GT; }
">=" { return TOK_GE; }
"==" { return TOK_EQ; }
"!=" { return TOK_NE; }
"=" { return TOK_ASSIGN; }

";" { return TOK_SEMI; }
"," { return TOK_COMMA; }
"(" { return TOK_LPAREN; }
")" { return TOK_RPAREN; }
"[" { return TOK_LSQ; }
"]" { return TOK_RSQ; }
"{" { return TOK_LBRACE; }
"}" { return TOK_RBRACE; }
{NUMBER} { /* integer literal: value travels in yylval.iVal */ yylval.iVal = atoi(yytext); return TOK_NUM; }
{IDENTIFIER} { /* the parser receives a heap copy of the lexeme */ yylval.cVal = strdup(yytext); return TOK_ID; }
{WHITESPACE} { /* skip */ }
"//".* { /* skip line comment */ }
"/*"([^*]|\*+[^*/])*("*"+)("/") { /* skip block comment */ }
. { return TOK_ERROR; }
%%
/**********************/
/* C support functions */
/**********************/
void printToken(int token, char *str)
{
/* Print the line number, token name and matched lexeme
-- one per line without any additional characters exactly as
below */
/* Example 13:TOK_INT: 37*/
switch(token)
{
case TOK_ELSE:
fprintf(yyout,"%d : TOK_ELSE :
%sn",yylineno,str);
break;
case TOK_IF:

fprintf(yyout,"%d : TOK_IF : %sn",yylineno,str);
break;
case TOK_RETURN:
fprintf(yyout,"%d : TOK_RETURN :
%sn",yylineno,str);
break;
case TOK_VOID:
fprintf(yyout,"%d : TOK_VOID :
%sn",yylineno,str);
break;
case TOK_INT:
fprintf(yyout,"%d : TOK_INT : %sn",yylineno,str);
break;
case TOK_WHILE:
fprintf(yyout,"%d : TOK_WHILE :
%sn",yylineno,str);
break;
case TOK_ID:
fprintf(yyout,"%d : TOK_ID : %sn",yylineno,str);
break;
case TOK_NUM:
fprintf(yyout,"%d : TOK_NUM : %sn",yylineno,str);
break;
case TOK_PLUS:
fprintf(yyout,"%d : TOK_PLUS : %sn",yylineno,str);
break;
case TOK_MINUS:
fprintf(yyout,"%d : TOK_MINUS :
%sn",yylineno,str);
break;
case TOK_MULT:
fprintf(yyout,"%d : TOK_MULT :
%sn",yylineno,str);
break;
case TOK_DIV:
fprintf(yyout,"%d : TOK_DIV : %sn",yylineno,str);

break;
case TOK_LT:
fprintf(yyout,"%d : TOK_LT : %sn",yylineno,str);
break;
case TOK_LE:
fprintf(yyout,"%d : TOK_LE : %sn",yylineno,str);
break;
case TOK_GT:
fprintf(yyout,"%d : TOK_GT : %sn",yylineno,str);
break;
case TOK_GE:
fprintf(yyout,"%d : TOK_GE : %sn",yylineno,str);
break;
case TOK_EQ:
fprintf(yyout,"%d : TOK_EQ : %sn",yylineno,str);
break;
case TOK_NE:
fprintf(yyout,"%d : TOK_NE : %sn",yylineno,str);
break;
case TOK_ASSIGN:
fprintf(yyout,"%d : TOK_ASSIGN :
%sn",yylineno,str);
break;
case TOK_SEMI:
fprintf(yyout,"%d : TOK_SEMI : %sn",yylineno,str);
break;
case TOK_COMMA:
fprintf(yyout,"%d : TOK_COMMA :
%sn",yylineno,str);
break;
case TOK_LPAREN:
fprintf(yyout,"%d : TOK_LPAREN :
%sn",yylineno,str);
break;
case TOK_RPAREN:
fprintf(yyout,"%d : TOK_RPAREN :

%sn",yylineno,str);
break;
case TOK_LSQ:
fprintf(yyout,"%d : TOK_LSQ : %sn",yylineno,str);
break;
case TOK_RSQ:
fprintf(yyout,"%d : TOK_RSQ : %sn",yylineno,str);
break;
case TOK_LBRACE:
fprintf(yyout,"%d : TOK_LBRACE :
%sn",yylineno,str);
break;
case TOK_RBRACE:
fprintf(yyout,"%d : TOK_RBRACE :
%sn",yylineno,str);
break;
case TOK_ERROR:
fprintf(yyout,"%d : TOK_ERROR :
%sn",yylineno,str);
break;
}
}
/*
 * Fetch the next token from the scanner.
 *
 * Copies the lexeme into the global tokenString (truncated to fit, always
 * NUL-terminated) and, when printoutScan is non-zero, prints the token.
 *
 * Returns the token code from yylex(), or 0 at end of input.
 */
int gettok(void){
    int currentToken = yylex();

    if (currentToken == 0) /* means EOF */
        return 0;

    /* snprintf always terminates the buffer; strncpy does not when the
       lexeme is TOKENMAX characters or longer. */
    snprintf(tokenString, sizeof tokenString, "%s", yytext);

    if (printoutScan)
        printToken(currentToken, tokenString);
    return currentToken;
}
/*
 * Select the scanner's input stream.
 *
 * argc, argv: the program's command line. If a first argument is present
 * it is opened for reading; otherwise input comes from stdin.
 *
 * Returns 1 on success. If the file cannot be opened, reports the error on
 * stderr and returns 0 with yyin left NULL.
 * NOTE(review): flex-generated scanners are documented to read stdin when
 * yyin is NULL, so callers that care should check the return value.
 */
int initLex(int argc, char **argv){
    if (argc > 1) {
        yyin = fopen(argv[1], "r");
        if (yyin == NULL) {
            perror(argv[1]);
            return 0;
        }
    } else {
        yyin = stdin;
    }
    return 1;
}
project4-ast/cmparser.y
%{
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "ast.h"
#include "util.h"
/* other external function prototypes */
extern int yylex();
extern int initLex(int , char **);
/* external global variables */
extern int yydebug;
extern int yylineno;

/* function prototypes */
void yyerror(const char *);
/* global variables */
AstNodePtr program;
%}
/* YYSTYPE */
%union
{
AstNodePtr nodePtr;
int iVal;
char *cVal;
Type *type;
}
/* terminals */
%token TOK_ELSE TOK_IF TOK_RETURN TOK_VOID
TOK_INT TOK_WHILE
%token TOK_PLUS TOK_MINUS TOK_MULT TOK_DIV
TOK_LT TOK_LE TOK_GT TOK_GE TOK_EQ TOK_NE
TOK_ASSIGN TOK_SEMI TOK_COMMA
%token TOK_LPAREN TOK_RPAREN TOK_LSQ TOK_RSQ
TOK_LBRACE TOK_RBRACE TOK_ERROR
%token <cVal> TOK_ID
%token <iVal> TOK_NUM

%type <nodePtr> Declarations Functions
%type <type> Type_Specifier
%type <nodePtr> Compound_Stmt Statements Statement
%type <nodePtr> Expr_Statement If_Else_Statement
Selection_Stmt Iteration_Stmt Return_Stmt
%type <nodePtr> Expression Simple_Expression
Additive_Expression Factor Var Call
%type <nodePtr> Args Args_List
/* associativity and precedence (lowest first).
   TOK_IF / TOK_ELSE resolve the dangling else via %prec in Selection_Stmt. */
%nonassoc TOK_IF
%nonassoc TOK_ELSE
%right TOK_ASSIGN
%left TOK_EQ TOK_NE
%nonassoc TOK_LT TOK_GT TOK_LE TOK_GE
%left TOK_PLUS TOK_MINUS
%left TOK_MULT TOK_DIV
%nonassoc error
%%
Start : Declarations {
}
;
Declarations : Functions {
program = $1;
}
| Var_Declaration Declarations {
}
;
Functions : Fun_Declaration {

}
| Fun_Declaration Functions {
}
;
Var_Declaration : Type_Specifier TOK_ID TOK_SEMI {
}
| Type_Specifier TOK_ID TOK_LSQ TOK_NUM
TOK_RSQ TOK_SEMI {
}
;
Fun_Declaration : Type_Specifier TOK_ID TOK_LPAREN
Params TOK_RPAREN Compound_Stmt {
}
;
Params : Param_List {
}
| TOK_VOID {
}
;
Param_List : Param_List TOK_COMMA Param {
}
| Param {
}
;

Param : Type_Specifier TOK_ID {
}
| Type_Specifier TOK_ID TOK_LSQ TOK_RSQ {
}
;
Type_Specifier : TOK_INT {
}
| TOK_VOID {
}
;
Compound_Stmt : TOK_LBRACE Statements TOK_RBRACE {
}
| TOK_LBRACE Local_Declarations Statements
TOK_RBRACE {
}
;
Local_Declarations : Var_Declaration Local_Declarations {
}
| Var_Declaration {
}
;
Statements : Statement Statements {
}

| {
}
;
Statement : Expr_Statement {
}
| Compound_Stmt {
}
| Selection_Stmt {
}
| Iteration_Stmt {
}
| Return_Stmt {
}
;
Expr_Statement : Expression TOK_SEMI {
}
| TOK_SEMI {
}
;
Selection_Stmt : If_Else_Statement %prec TOK_IF {
}
| If_Else_Statement TOK_ELSE Statement {
}

;
If_Else_Statement : TOK_IF TOK_LPAREN Expression
TOK_RPAREN Statement {
}
;
Iteration_Stmt : TOK_WHILE TOK_LPAREN Expression TOK_RPAREN Statement {
}
;
Return_Stmt : TOK_RETURN Expression TOK_SEMI {
}
| TOK_RETURN TOK_SEMI {
}
;
Expression : Var TOK_ASSIGN Expression {
}
| Simple_Expression {
}
;
Var : TOK_ID {
}
| TOK_ID TOK_LSQ Expression TOK_RSQ {
}

;
Simple_Expression : Additive_Expression TOK_GT Additive_Expression {
}
| Additive_Expression TOK_LT Additive_Expression {
}
| Additive_Expression TOK_GE Additive_Expression {
}
| Additive_Expression TOK_LE Additive_Expression {
}
| Additive_Expression TOK_EQ Additive_Expression {
}
| Additive_Expression TOK_NE Additive_Expression {
}
| Additive_Expression {
}
;
Additive_Expression : Additive_Expression TOK_PLUS Term {
}
| Additive_Expression TOK_MINUS Term {

}
| Term {
}
;
Term : Term TOK_MULT Factor {
}
| Term TOK_DIV Factor {
}
| Factor {
}
;
Factor : TOK_LPAREN Expression TOK_RPAREN {
}
| Var {
}
| Call {
}
| TOK_NUM {
}
;
Call : TOK_ID TOK_LPAREN Args TOK_RPAREN {
}
;

Args : Args_List {
}
| {
}
;
Args_List : Args_List TOK_COMMA Expression {
}
| Expression {
}
;
%%
/*
 * Error callback for the parser: writes "Line <n>: <message>" and a
 * newline to stderr, using the scanner's current line number.
 */
void yyerror (char const *s) {
    fprintf (stderr, "Line %d: %s\n", yylineno, s);
}
/*
 * Program entry point.
 *
 * When built with YYLLEXER defined, only the scanner runs: tokens are
 * pulled until end of input and the program exits with status 0.
 * Otherwise the parser runs and its result becomes the exit status
 * (yyparse() returns 0 when parsing succeeds).
 */
int main(int argc, char **argv){
    initLex(argc,argv);
#ifdef YYLLEXER
    {
        extern int gettok(void); /* defined in the lexer's support code */

        while (gettok() != 0)    /* gettok returns 0 on EOF */
            ;
    }
    return 0;
#else
    {
        int status = yyparse();

        /* print_Ast() can be called here once the grammar actions build the AST. */
        return status;
    }
#endif
}

project4-ast/globals.h
#define TRUE 1
#define FALSE 0
#define TOKENMAX 512
#define MAXHASHSIZE 32
#define MAXCHILDREN 3
project4-ast/Makefile
# Makefile for cminus
CC=gcc
LEX=flex
YACC=bison
MV=/bin/mv
RM=/bin/rm
CP=/bin/cp
LIBRARIES= -ll
CFLAGS=-g
YFLAGS=-d
PROGRAM=cmparser
OBJECTS=cmparser.o cmlexer.o symbolTable.o ast.o util.o
SOURCES=cmlexer.l cmparser.y symbolTable.c ast.c util.c
CTEMPS=cmlexer.c cmparser.c cmparser.tab.h cmparser.tab.c cmparser.output

# clean and submit name actions, not files.
.PHONY: clean submit

$(PROGRAM): $(OBJECTS)
	$(CC) $(CFLAGS) -o $(PROGRAM) $(OBJECTS) $(LIBRARIES)
util.o: util.c
	$(CC) $(CFLAGS) -c util.c
ast.o: ast.c
	$(CC) $(CFLAGS) -c ast.c
symbolTable.o: symbolTable.c
	$(CC) $(CFLAGS) -c symbolTable.c
cmparser.o: cmparser.c
	$(CC) $(CFLAGS) -c cmparser.c
cmparser.c: cmparser.y
	$(YACC) cmparser.y -o cmparser.c
cmlexer.o: cmlexer.c cmparser.tab.h globals.h
	$(CC) $(CFLAGS) -c cmlexer.c
cmparser.tab.h: cmparser.y
	$(YACC) -d cmparser.y
cmlexer.c: cmlexer.l
	$(LEX) -ocmlexer.c cmlexer.l
# -f: do not fail when some generated files are already absent.
clean:
	$(RM) -f $(PROGRAM) $(OBJECTS) $(CTEMPS)
# -p: do not fail when the submit directory already exists.
submit:
	mkdir -p submit
	cp $(SOURCES) submit
	turnin -ccs473 -past submit

project4-ast/symbolTable.c
// Top should point to the top of the scope stack,
// which is the most recent scope pushed
SymbolTableStackEntryPtr symbolStackTop;
int scopeDepth;
/* global function prototypes */
//allocate the global scope entry and symbol table --and set
scopeDepth to 0
// The global scope remains till the end of the program
int initSymbolTable()
{
//initialize a stack
int i=0;
symbolStackTop = (SymbolTableStackEntryPtr) malloc
(sizeof(struct symbolTableStackEntry));
if (symbolStackTop == NULL) return 0;
symbolStackTop -> symbolTablePtr = (SymbolTable*)
malloc (sizeof(SymbolTable));
for(i=0; i<MAXHASHSIZE; i++)
symbolStackTop -> symbolTablePtr->hashTable[i] =
NULL;
symbolStackTop->prevScope = NULL;
scopeDepth = 0;
//printf("n Initialized Stack with Global Hash Table");
return 1;
}

// Look up a given entry.
// Searches from the innermost scope (symbolStackTop) outward through
// prevScope and returns the first element whose id equals name, or NULL
// when no visible scope declares it (or name is NULL).
ElementPtr symLookup(char *name)
{
    SymbolTableStackEntryPtr scope;
    const char *p;
    int ascii = 0;

    if (name == NULL)
        return NULL;

    /* Hash = sum of the character codes; must stay identical to the key
       symInsert computes (ascii % 16). */
    for (p = name; *p != '\0'; p++)
        ascii += (int) *p;

    for (scope = symbolStackTop; scope != NULL; scope = scope->prevScope) {
        ElementPtr symelement = scope->symbolTablePtr->hashTable[ascii % 16];

        for (; symelement != NULL; symelement = symelement->next) {
            if (strcmp(symelement->id, name) == 0)
                return symelement;
        }
    }
    return NULL;
}

// Insert an element with a specified type in a particular line
number
// initialize the scope depth of the entry from the global var
scopeDepth
//symInsert("counter", typ, 10);
ElementPtr symInsert(char *name, Type *type, int line)
{
ElementPtr new_element = (ElementPtr) malloc
(sizeof(Element));
char *a = name;
int ascii = 0 ;
while(*a!='0')
{
ascii+= (int) *a;
a++;
}
new_element->key = ascii%16;
new_element->id = strdup(name);
new_element->linenumber = line;
new_element->scope = scopeDepth;
new_element->stype = type;
new_element->snode = NULL;
if(!symbolStackTop->symbolTablePtr-
>hashTable[new_element->key]) // if the first element of hash
table for this ascii is null
{
new_element->next = NULL;
symbolStackTop->symbolTablePtr-
>hashTable[new_element->key] = new_element;
}
else
{
new_element->next = symbolStackTop-

>symbolTablePtr->hashTable[new_element->key];
symbolStackTop->symbolTablePtr-
>hashTable[new_element->key] = new_element;
}
//printf("n Inserted %s at scope depth %d with ascii %d
",new_element->id,scopeDepth,new_element->key);
return symbolStackTop->symbolTablePtr-
>hashTable[new_element->key];
}
//push a new entry to the symbol stack
// This should modify the variable top and change the scope
depth
int enterScope()
{
int i;
SymbolTableStackEntryPtr newSymbolStackEntry =
(SymbolTableStackEntryPtr) malloc (sizeof(struct
symbolTableStackEntry));
if (newSymbolStackEntry == NULL) return 0;
newSymbolStackEntry -> symbolTablePtr =
(SymbolTable*) malloc (sizeof(SymbolTable));
for(i=0; i<MAXHASHSIZE; i++)
newSymbolStackEntry -> symbolTablePtr-
>hashTable[i] = NULL;
newSymbolStackEntry->prevScope = symbolStackTop;
symbolStackTop = newSymbolStackEntry;
scopeDepth++;
//printf("n New Scope");
return 1;
}

// Pop the innermost scope off the symbol stack and decrement scopeDepth.
// The global scope (the entry with no prevScope) is never popped: in that
// case, or when no scope exists at all, an error is printed and nothing
// changes.
// The popped entry and its table are not freed here.
// NOTE(review): AstNode.nSymbolTabPtr suggests block tables are still
// referenced after the scope closes -- confirm before adding a free().
void leaveScope()
{
    if (symbolStackTop == NULL || symbolStackTop->prevScope == NULL) {
        printf("\n Error .. cannot leave global scope");
        return;
    }
    symbolStackTop = symbolStackTop->prevScope;
    scopeDepth--;
}
// Do not modify this function
// Prints "Line <n>: <id>" (preceded by a newline) for one symbol-table
// entry, or a diagnostic when given NULL.
void printElement(ElementPtr symelement) {
    if (symelement != NULL) {
        printf("\nLine %d: %s", symelement->linenumber,symelement->id);
    }
    else printf("Wrong call! symbol table entry NULL");
}
//should traverse through the entire symbol table and print it
// must use the printElement function given above
void printSymbolTable()
{
int i;
SymbolTableStackEntryPtr SymbolStackEntry =
symbolStackTop;
while(SymbolStackEntry)
{
for(i=0; i<MAXHASHSIZE;i++)
{
ElementPtr symelement = SymbolStackEntry-

>symbolTablePtr->hashTable[i];
while(symelement)
{
printElement(symelement);
}
}
SymbolStackEntry = SymbolStackEntry->prevScope;
}
}
project4-ast/symbolTable.h
#ifndef _SYMBOL_TABLE
#define _SYMBOL_TABLE
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* data structure of one element in the symbol table */
typedef enum {VOID, INT, ARRAY, FUNCTION} TypeKind;
typedef struct type{
TypeKind kind; /* one from the enum above */
int dimension; /* for arrays */
struct type *function; /*function argument and return types
*/
/* Ignore this field until HW5 */
}Type;
typedef Type *TypePtr;
/* One symbol-table entry; entries sharing a hash index are chained via next. */
typedef struct element {
int key; /* hash-table index of this entry */
char * id; /* symbol name */
int linenumber; /* line recorded when the symbol was inserted */
int scope; /* scope depth at declaration */
Type *stype; /* pointer to the type information
*/
struct element *next; /* pointer to the next
symbol with the
same hash table index */
struct node *snode; /* (New) AST Node for method
declaration */
} Element;
typedef Element *ElementPtr;
typedef ElementPtr HashTableEntry;
/* data structure of the symbol table */
typedef struct symbolTable {

HashTableEntry hashTable[MAXHASHSIZE]; /*
hash table */
} SymbolTable;
typedef SymbolTable *SymbolTablePtr;
typedef struct symbolTableStackEntry {
SymbolTablePtr symbolTablePtr;
struct symbolTableStackEntry *prevScope;
} SymbolTableStackEntry;
typedef SymbolTableStackEntry
*SymbolTableStackEntryPtr;
int initSymbolTable() ;
ElementPtr symLookup(char *);
ElementPtr symInsert(char *, Type *, int );
int enterScope();
void leaveScope();
#endif
project4-ast/util.c
/* Original Code: Silvano Bonacia, Fall 2006 */
/* Modifications and changes, Prof. Venkat, Spring 08 */
#include "ast.h"
#include "util.h"
extern AstNode *program;
extern void printSymbolTable();

#define PAD_STR " "
int indLevel = 0;
/* Emit the indentation string PAD_STR `level` times (nothing if level <= 0). */
void pad (int level) {
    int remaining;

    for (remaining = level; remaining > 0; remaining--)
        printf("%s", PAD_STR);
}
/*
 * Print the parameters of the method (parameters being the symbols stored
 * in the given scope), separated by ", ".
 *
 * INT entries print as "int name", ARRAY entries as "int name[]", VOID
 * entries as "void"; FUNCTION entries are skipped. A NULL scope prints
 * nothing. Entries appear in hash-table order.
 */
void printMethodParams (SymbolTableStackEntryPtr scope) {
    int i;
    int printed = 0; /* becomes 1 once something was written, so that the
                        separator only goes between items */

    if (scope == NULL || scope->symbolTablePtr == NULL)
        return;

    for (i = 0; i < MAXHASHSIZE; i++) {
        ElementPtr symelement = scope->symbolTablePtr->hashTable[i];

        for (; symelement != NULL; symelement = symelement->next) {
            if (symelement->stype->kind == FUNCTION)
                continue;
            if (printed)
                printf(", ");
            switch (symelement->stype->kind) {
            case INT :
                printf("int %s", symelement->id);
                break;
            case ARRAY :
                printf("int %s[]", symelement->id);
                break;
            case VOID :
                printf("void");
                break;
            case FUNCTION :
                break;
            }
            printed = 1;
        }
    }
}
/*
 * Print the variable declarations of the scope, one per line, indented to
 * the current indLevel: "int name;" for INT and "int name[dim];" for ARRAY
 * entries. VOID and FUNCTION entries are not printed.
 *
 * Returns the number of variables printed (0 for a NULL scope).
 */
int printVarDeclarations (SymbolTablePtr scope)
{
    int noVar = 0;
    int i;

    if (scope == NULL)
        return 0;

    for (i = 0; i < MAXHASHSIZE; i++) {
        ElementPtr symelement = scope->hashTable[i];

        for (; symelement != NULL; symelement = symelement->next) {
            switch (symelement->stype->kind) {
            case INT :
                pad(indLevel);
                printf("int %s;\n", symelement->id);
                noVar++;
                break;
            case ARRAY :
                pad(indLevel);
                printf("int %s[%d];\n", symelement->id,
                       symelement->stype->dimension);
                noVar++;
                break;
            case VOID :
                break;
            case FUNCTION :
                break;
            }
        }
    }
    return noVar;
}
//traverses through the entire program and symbol table and
prints it
//the output must match the program with the exception of white
spaces
//do a diff -w to ignre white spaces
void printType(TypePtr type, int dummy){
switch(type->kind) {
case INT :
printf("int");
break;
case ARRAY :
printf("int");
break;
case VOID:
printf("void");
break;
case FUNCTION :
printType(type->function, 0);
break;
}
}
/*
 * Print an expression subtree.
 *
 * expr:        the expression node; NULL prints nothing at all.
 * endWithSemi: when 1, ";" and a newline are printed after the expression.
 *
 * Binary and assignment operators are printed infix, " op " between the
 * two operands.
 * NOTE(review): operands are assumed to live in children[0] (left) and
 * children[1] (right), and call arguments to be chained through sibling
 * starting at children[0] -- confirm against the grammar actions.
 * Parentheses from the source are not reproduced.
 */
void print_Expression(AstNodePtr expr, int endWithSemi) {
    const char *op = NULL; /* set for the infix operators only */

    if (expr == NULL) {
        return;
    }
    switch (expr->eKind) {
    case VAR_EXP :
        printf("%s", expr->nSymbolPtr->id);
        break;
    case ARRAY_EXP :
        printf("%s[", expr->nSymbolPtr->id);
        print_Expression(expr->children[0], 0);
        printf("]");
        break;
    case ASSI_EXP : op = " = ";  break;
    case ADD_EXP :  op = " + ";  break;
    case SUB_EXP :  op = " - ";  break;
    case MULT_EXP : op = " * ";  break;
    case DIV_EXP :  op = " / ";  break;
    case GT_EXP :   op = " > ";  break;
    case LT_EXP :   op = " < ";  break;
    case GE_EXP :   op = " >= "; break;
    case LE_EXP :   op = " <= "; break;
    case EQ_EXP :   op = " == "; break;
    case NE_EXP :   op = " != "; break;
    case CALL_EXP : {
        AstNodePtr arg = expr->children[0];

        printf("%s(", expr->fname);
        while (arg != NULL) {
            print_Expression(arg, 0);
            arg = arg->sibling;
            if (arg != NULL)
                printf(", ");
        }
        printf(")");
        break;
    }
    case CONST_EXP :
        printf("%d", expr->nValue);
        break;
    }
    if (op != NULL) {
        print_Expression(expr->children[0], 0);
        printf("%s", op);
        print_Expression(expr->children[1], 0);
    }
    if (endWithSemi == 1)
        printf(";\n");
}
void print_Statement(AstNodePtr stmt);

/*
 * Print the body of an if / else / while.
 * A missing body prints an indented ";". A compound statement is printed
 * at the current level (its braces line up with the keyword); any other
 * statement is printed one level deeper.
 */
static void print_Body(AstNodePtr body) {
    if (body == NULL) {
        pad(indLevel + 1);
        printf(";\n");
    } else if (body->sKind != COMPOUND_STMT) {
        indLevel++;
        print_Statement(body);
        indLevel--;
    } else {
        print_Statement(body);
    }
}

/*
 * Print a statement, then every statement that follows it on the sibling
 * chain. Each statement starts on a line indented to indLevel.
 *
 * NOTE(review): child layout is assumed to be children[0] = condition /
 * returned or evaluated expression / first statement of a block,
 * children[1] = then-branch or loop body, children[2] = else-branch --
 * confirm against the grammar actions.
 */
void print_Statement(AstNodePtr stmt) {
    if (stmt == NULL)
        return;
    pad(indLevel);
    switch (stmt->sKind) {
    case IF_THEN_ELSE_STMT :
        printf("if(");
        print_Expression(stmt->children[0], 0);
        printf(")\n");
        print_Body(stmt->children[1]);
        if (stmt->children[2] != NULL) {
            pad(indLevel);
            printf("else\n");
            print_Body(stmt->children[2]);
        }
        break;
    case WHILE_STMT :
        printf("while(");
        print_Expression(stmt->children[0], 0);
        printf(")\n");
        print_Body(stmt->children[1]);
        break;
    case RETURN_STMT :
        printf("return ");
        if (stmt->children[0] == NULL)
            printf(";\n");
        else
            print_Expression(stmt->children[0], 1);
        break;
    case COMPOUND_STMT :
        printf("{\n");
        indLevel++;
        if (stmt->nSymbolTabPtr != NULL) {
            /* blank line between the declarations and the statements */
            if (printVarDeclarations(stmt->nSymbolTabPtr) > 0)
                printf("\n");
        }
        if (stmt->children[0] != NULL)
            print_Statement(stmt->children[0]); /* prints the whole statement list */
        else {
            pad(indLevel);
            printf(";\n");
        }
        indLevel--;
        pad(indLevel);
        printf("}\n");
        break;
    case EXPRESSION_STMT :
        if (stmt->children[0] == NULL)
            printf(";\n"); /* empty statement */
        else
            print_Expression(stmt->children[0], 1);
        break;
    }
    print_Statement(stmt->sibling); /* print the next statement */
}
/*
 * Print the tree rooted at `root` as source text.
 *
 * METHOD:     return type, name, parameter list ("void" when children[0]
 *             is NULL), body (children[1]), a blank line, then the next
 *             method on the sibling chain.
 * FORMALVAR:  "type name" (plus "[]" for arrays), then ", " and the next
 *             parameter if there is one.
 * STMT:       delegated to print_Statement.
 * EXPRESSION: printed as an expression followed by ";".
 * A NULL root ends the recursion.
 */
void print_Ast_Recursion(AstNodePtr root) {
    if (root == NULL)
        return;
    switch (root->nKind) {
    case METHOD :
        printType(root->nType, 0);
        printf(" %s(", root->nSymbolPtr->id);
        if (root->children[0] == NULL)
            printf("void");
        else
            print_Ast_Recursion(root->children[0]); /* the parameters of the method */
        printf(")\n");
        print_Ast_Recursion(root->children[1]);     /* the body of the method */
        printf("\n");
        print_Ast_Recursion(root->sibling);         /* the next method */
        break;
    case FORMALVAR :
        printType(root->nSymbolPtr->stype, 0);      /* the type */
        printf(" %s", root->nSymbolPtr->id);        /* the name of the variable */
        if (root->nSymbolPtr->stype->kind == ARRAY)
            printf("[]");
        /* Print the next parameter if there's one */
        if (root->sibling != NULL) {
            printf(", ");
            print_Ast_Recursion(root->sibling);
        }
        break;
    case STMT :
        print_Statement(root);
        break;
    case EXPRESSION :
        print_Expression(root, 1);
        break;
    }
}
void print_Ast() {
/*
* First print the entire symbol table
*/
//printSymbolTable();
/*
* Then print the program

*/
indLevel = 0;
if(printVarDeclarations(symbolStackTop->symbolTablePtr) >
0)
printf("n");
print_Ast_Recursion(program);
}
project4-ast/util.h
extern void printNode(AstNodePtr, int);
extern void printType(TypePtr, int);
lex_yacc.pdf
A GUIDE TO LEX & YACC
BY THOMAS NIEMANN
2
PREFACE
This document explains how to construct a compiler using lex
and yacc. Lex and yacc are
tools used to generate lexical analyzers and parsers. I assume

you can program in C, and
understand data structures such as linked-lists and trees.
The introduction describes the basic building blocks of a
compiler and explains the
interaction between lex and yacc. The next two sections
describe lex and yacc in more
detail. With this background, we construct a sophisticated
calculator. Conventional
arithmetic operations and control statements, such as if-else and
while, are implemented.
With minor changes, we convert the calculator into a compiler
for a stack-based machine.
The remaining sections discuss issues that commonly arise in
compiler writing. Source
code for examples may be downloaded from the web site listed
below.
Permission to reproduce portions of this document is given
provided the web site
listed below is referenced, and no additional restrictions apply.
Source code, when part of
a software project, may be used freely without reference to the
author.
THOMAS NIEMANN
Portland, Oregon
[email protected]
http://members.xoom.com/thomasn
http://members.xoom.com/thomasn
3

CONTENTS
1. INTRODUCTION
4
2. LEX
6
2.1 Theory 6
2.2 Practice 7
3. YACC
12
3.1 Theory 12
3.2 Practice, Part I 14
3.3 Practice, Part II 17
4. CALCULATOR
20
4.1 Description 20
4.2 Include File 23
4.3 Lex Input 24
4.4 Yacc Input 25
4.5 Interpreter 29
4.6 Compiler 30
5. MORE LEX
32
5.1 Strings 32
5.2 Reserved Words 33
5.3 Debugging Lex 33
6. MORE YACC
35
6.1 Recursion 35
6.2 If-Else Ambiguity 35

6.3 Error Messages 36
6.4 Inherited Attributes 37
6.5 Embedded Actions 37
6.6 Debugging Yacc 38
7. BIBLIOGRAPHY
39
4
1. Introduction
Until 1975, writing a compiler was a very time-consuming
process. Then Lesk [1975]
and Johnson [1975] published papers on lex and yacc. These
utilities greatly simplify
compiler writing. Implementation details for lex and yacc may
be found in Aho [1986].
Lex and yacc are available from
• Mortice Kern Systems (MKS), at http://www.mks.com,
• GNU flex and bison, at http://www.gnu.org,
• Ming, at http://agnes.dida.physik.uni-
essen.de/~janjaap/mingw32,
• Cygnus, at http://www.cygnus.com/misc/gnu-win32, and
• me, at
http://members.xoom.com/thomasn/y_gnu.zip (executables), and
http://members.xoom.com/thomasn/y_gnus.zip (source for
y_gnu.zip).
The version from MKS is a high-quality commercial product
that retails for about
$300US. GNU software is free. Output from flex may be used in
a commercial product,

and, as of version 1.24, the same is true for bison. Ming and
Cygnus are 32-bit Windows-
95/NT ports of the GNU software. My version is based on
Ming’s, but is compiled with
Visual C++ and includes a minor bug fix in the file handling
routine. If you download my
version, be sure to retain directory structure when you unzip.
Lexical Analyzer
Syntax Analyzer
a = b + c * d
id1 = id2 + id3 * id4
=
+
*
id1
source code
tokens
syntax tree
id2
id3 id4
load id3
mul id4

add id2
store id1
Code Generator
generated code
Figure 1-1: Compilation Sequence
http://members.xoom.com/thomasn/y_lex.pdf
http://members.xoom.com/thomasn/y_yacc.pdf
http://www.amazon.com/exec/obidos/ISBN=0201100886/none01
A/#Bibliography
http://www.mks.com/
http://www.gnu.org/
http://agnes.dida.physik.uni-essen.de/~janjaap/mingw32
http://www.cygnus.com/misc/gnu-win32
http://members.xoom.com/thomasn/y_gnu.zip
http://members.xoom.com/thomasn/y_gnus.zip
5
Lex generates C code for a lexical analyzer, or scanner. It uses
patterns that match
strings in the input and converts the strings to tokens. Tokens
are numerical
representations of strings, and simplify processing. This is
illustrated in Figure 1-1.
As lex finds identifiers in the input stream, it enters them in a
symbol table. The
symbol table may also contain other information such as data
type (integer or real) and
location of the variable in memory. All subsequent references to
identifiers refer to the

appropriate symbol table index.
Yacc generates C code for a syntax analyzer, or parser. Yacc
uses grammar rules that
allow it to analyze tokens from lex and create a syntax tree. A
syntax tree imposes a
hierarchical structure on tokens. For example, operator
precedence and associativity are
apparent in the syntax tree. The next step, code generation, does
a depth-first walk of the
syntax tree to generate code. Some compilers produce machine
code, while others, as
shown above, output assembly.
lex
yacc
cc
bas.y
bas.l lex.yy.c
y.tab.c
bas.exe
source
compiled output
(yylex)
(yyparse)
y.tab.h

Figure 1-2: Building a Compiler with Lex/Yacc
Figure 1-2 illustrates the file naming conventions used by lex
and yacc. We'll assume
our goal is to write a BASIC compiler. First, we need to specify
all pattern matching rules
for lex (bas.l) and grammar rules for yacc (bas.y). Commands to
create our compiler,
bas.exe, are listed below:
yacc -d bas.y # create y.tab.h, y.tab.c
lex bas.l # create lex.yy.c
cc lex.yy.c y.tab.c -obas.exe # compile/link
Yacc reads the grammar descriptions in bas.y and generates a
parser, function
yyparse, in file y.tab.c. Included in file bas.y are token
declarations. These are
converted to constant definitions by yacc and placed in file
y.tab.h. Lex reads the
pattern descriptions in bas.l, includes file y.tab.h, and generates
a lexical analyzer,
function yylex, in file lex.yy.c.
Finally, the lexer and parser are compiled and linked together to
form the executable,
bas.exe. From main, we call yyparse to run the compiler.
Function yyparse
automatically calls yylex to obtain each token.
6
2. Lex

2.1 Theory
The first phase in a compiler reads the input source and
converts strings in the source to
tokens. Using regular expressions, we can specify patterns to
lex that allow it to scan and
match strings in the input. Each pattern in lex has an associated
action. Typically an
action returns a token, representing the matched string, for
subsequent use by the parser.
To begin with, however, we will simply print the matched string
rather than return a
token value. We may scan for identifiers using the regular
expression
letter(letter|digit)*
This pattern matches a string of characters that begins with a
single letter, and is followed
by zero or more letters or digits. This example nicely illustrates
operations allowed in
regular expressions:
• repetition, expressed by the “*” operator
• alternation, expressed by the “|” operator
• concatenation
Any regular expression may be expressed as a finite
state automaton
(FSA). We can represent an FSA using states, and transitions
between states. There is one
start state, and one or more final or accepting states.
0 1 2
letter

letter or digit
otherstart
Figure 2-1: Finite State Automaton
In Figure 2-1, state 0 is the start state, and state 2 is the
accepting state. As characters are
read, we make a transition from one state to another. When the
first letter is read, we
transition to state 1. We remain in state 1 as more letters or
digits are read. When we read
a character other than a letter or digit, we transition to state 2,
the accepting state. Any
FSA may be expressed as a computer program. For example, our
3-state machine is
easily programmed:
7
start: goto state0
state0: read c
if c = letter goto state1
goto state0
state1: read c
if c = letter goto state1
if c = digit goto state1
goto state2
state2: accept string
This is the technique used by lex. Regular expressions are

translated by lex to a computer
program that mimics an FSA. Using the next input character,
and current state, the next
state is easily determined by indexing into a computer-
generated state table.
Now we can easily understand some of lex’s limitations. For
example, lex cannot be
used to recognize nested structures such as parentheses. Nested
structures are handled by
incorporating a stack. Whenever we encounter a “(”, we push it
on the stack. When a “)”
is encountered, we match it with the top of the stack, and pop
the stack. Lex, however,
only has states and transitions between states. Since it has no
stack, it is not well suited
for parsing nested structures. Yacc augments an FSA with a
stack, and can process
constructs such as parentheses with ease. The important thing is
to use the right tool for
the job. Lex is good at pattern matching. Yacc is appropriate for
more challenging tasks.
2.2 Practice
Metacharacter Matches
. any character except newline
\n newline
* zero or more copies of preceding expression
+ one or more copies of preceding expression
? zero or one copy of preceding expression
^ beginning of line
$ end of line
a|b a or b
(ab)+ one or more copies of ab (grouping)
"a+b" literal “a+b” (C escapes still work)

[] character class
Table 2-1: Pattern Matching Primitives
8
Expression Matches
abc abc
abc* ab, abc, abcc, abccc, …
abc+ abc, abcc, abccc, …
a(bc)+ abc, abcbc, abcbcbc, …
a(bc)? a, abc
[abc] a, b, c
[a-z] any letter, a through z
[a\-z] a, -, z
[-az] -, a, z
[A-Za-z0-9]+ one or more alphanumeric characters
[ \t\n]+ whitespace
[^ab] anything except: a, b
[a^b] a, ^, b
[a|b] a, |, b
a|b a or b
Table 2-2: Pattern Matching Examples

Regular expressions in lex are composed of metacharacters
(Table 2-1). Pattern matching
examples are shown in Table 2-2. Within a character class,
normal operators lose their
meaning. Two operators allowed in a character class are the
hyphen (“-”) and circumflex
(“^”). When used between two characters, the hyphen represents
a range of characters.
The circumflex, when used as the first character, negates the
expression. If two patterns
match the same string, the longest match wins. In case both
matches are the same length,
then the first pattern listed is used.
Input to Lex is divided into three sections, with %% dividing
the sections. This is best
illustrated by example. The first example is the shortest
possible lex file:
%%
Input is copied to output, one character at a time. The first %%
is always required, as there
must always be a rules section. However, if we don’t specify
any rules, then the default
action is to match everything and copy it to output. Defaults for
input and output are
stdin and stdout, respectively. Here is the same example, with
defaults explicitly
coded:
... definitions ...
%%
... rules ...
%%

... subroutines ...
9
%%
/* match everything except newline */
. ECHO;
/* match newline */
\n ECHO;
%%
int yywrap(void) {
return 1;
}
int main(void) {
yylex();
return 0;
}
Two patterns have been specified in the rules section. Each
pattern must begin in column
one. This is followed by whitespace (space, tab or newline), and
an optional action
associated with the pattern. The action may be a single C
statement, or multiple C
statements enclosed in braces. Anything not starting in column
one is copied verbatim to
the generated C file. We may take advantage of this behavior to

specify comments in our
lex file. In this example there are two patterns, “.” and “\n”,
with an ECHO action
associated for each pattern. Several macros and variables are
predefined by lex. ECHO is a
macro that writes code matched by the pattern. This is the
default action for any
unmatched strings. Typically, ECHO is defined as:
#define ECHO fwrite(yytext, yyleng, 1, yyout)
Variable yytext is a pointer to the matched string (NULL-
terminated), and yyleng is
the length of the matched string. Variable yyout is the output
file, and defaults to
stdout. Function yywrap is called by lex when input is
exhausted. Return 1 if you are
done, or 0 if more processing is required. Every C program
requires a main function. In
this case, we simply call yylex, the main entry-point for lex.
Some implementations of
lex include copies of main and yywrap in a library, eliminating
the need to code them
explicitly. This is why our first example, the shortest lex
program, functioned properly.
10
name function
int yylex(void) call to invoke lexer, returns token
char *yytext pointer to matched string
yyleng length of matched string
yylval value associated with token
int yywrap(void) wrapup, return 1 if done, 0 if not done

FILE *yyout output file
FILE *yyin input file
INITIAL initial start condition
BEGIN condition switch start condition
ECHO write matched string
Table 2-3: Lex Predefined Variables
Here’s a program that does nothing at all. All input is matched,
but no action is
associated with any pattern, so there will be no output.
%%
    /* Both patterns have an empty action, so every input character is discarded. */
.
\n
The following example prepends line numbers to each line in a
file. Some
implementations of lex predefine and calculate yylineno. The
input file for lex is yyin,
and defaults to stdin.
%{
    int yylineno;
%}
%%
    /* Print each input line prefixed by its line number and a tab. */
^(.*)\n     printf("%4d\t%s", ++yylineno, yytext);
%%
/*
 * Number the lines of the file named by argv[1].
 * Returns 1 when no file name is given or the file cannot be opened.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s file\n", argv[0]);
        return 1;
    }
    yyin = fopen(argv[1], "r");
    if (yyin == NULL) {
        perror(argv[1]);
        return 1;
    }
    yylex();
    fclose(yyin);
    return 0;
}

The definitions section is composed of substitutions, code, and
start states. Code in
the definitions section is simply copied as-is to the top of the
generated C file, and must
be bracketed with “%{“ and “%}” markers. Substitutions
simplify pattern-matching rules.
For example, we may define digits and letters:
11
digit   [0-9]
letter  [A-Za-z]
%{
    int count;
%}
%%
    /* match identifier: a letter followed by letters or digits */
{letter}({letter}|{digit})*     count++;
%%
/* Scan all input, then report how many identifiers were matched. */
int main(void) {
    yylex();
    printf("number of identifiers = %d\n", count);
    return 0;
}
Whitespace must separate the defining term and the associated
expression.
References to substitutions in the rules section are surrounded
by braces ({letter}) to

distinguish them from literals. When we have a match in the
rules section, the associated
C code is executed. Here is a scanner that counts the number of
characters, words, and
lines in a file (similar to Unix wc):
%{
    int nchar, nword, nline;
%}
%%
    /* A newline counts as one character and ends a line. */
\n          { nline++; nchar++; }
    /* A run of non-blank characters is one word. */
[^ \t\n]+   { nword++, nchar += yyleng; }
    /* Any other single character (space or tab) only adds to the character count. */
.           { nchar++; }
%%
/* Scan all input, then print the character, word and line counts separated by tabs. */
int main(void) {
    yylex();
    printf("%d\t%d\t%d\n", nchar, nword, nline);
    return 0;
}
12
3. Yacc
3.1 Theory
Grammars for yacc are described using a variant of Backus Naur
Form (BNF). This
technique was pioneered by John Backus and Peter Naur, and
used to describe
ALGOL60. A BNF grammar can be used to express context-free
languages. Most

constructs in modern programming languages can be represented
in BNF. For example,
the grammar for an expression that multiplies and adds numbers
is
1 E -> E + E
2 E -> E * E
3 E -> id
Three productions have been specified. Terms that appear on the
left-hand side (lhs) of a
production, such as E (expression) are nonterminals. Terms such
as id (identifier) are
terminals (tokens returned by lex) and only appear on the right-
hand side (rhs) of a
production. This grammar specifies that an expression may be
the sum of two
expressions, the product of two expressions, or an identifier.
We can use this grammar to
generate expressions:
E -> E * E (r2)
-> E * z (r3)
-> E + E * z (r1)
-> E + y * z (r3)
-> x + y * z (r3)
At each step we expanded a term, replacing the lhs of a
production with the
corresponding rhs. The numbers on the right indicate which rule
applied. To parse an
expression, we actually need to do the reverse operation.
Instead of starting with a single
nonterminal (start symbol) and generating an expression from a
grammar, we need to
reduce an expression to a single nonterminal. This is known as

bottom-up or shift-reduce
parsing, and uses a stack for storing terms. Here is the same
derivation, but in reverse
order:
13
1 . x + y * z shift
2 x . + y * z reduce(r3)
3 E . + y * z shift
4 E + . y * z shift
5 E + y . * z reduce(r3)
6 E + E . * z shift
7 E + E * . z shift
8 E + E * z . reduce(r3)
9 E + E * E . reduce(r2) emit multiply
10 E + E . reduce(r1) emit add
11 E . accept
Terms to the left of the dot are on the stack, while remaining
input is to the right of the
dot. We start by shifting tokens onto the stack. When the top of
the stack matches the rhs
of a production, we replace the matched tokens on the stack
with the lhs of the
production. Conceptually, the matched tokens of the rhs are
popped off the stack, and the
lhs of the production is pushed on the stack. The matched
tokens are known as a handle,
and we are reducing the handle to the lhs of the production.
This process continues until
we have shifted all input to the stack, and only the starting
nonterminal remains on the
stack. In step 1 we shift the x to the stack. Step 2 applies rule r3

to the stack, changing x
to E. We continue shifting and reducing, until a single
nonterminal, the start symbol,
remains in the stack. In step 9, when we reduce rule r2, we emit
the multiply instruction.
Similarly, the add instruction is emitted in step 10. Thus,
multiply has a higher
precedence than addition.
Consider, however, the shift at step 6. Instead of shifting, we
could have reduced,
applying rule r1. This would result in addition having a higher
precedence than
multiplication. This is known as a shift-reduce conflict. Our
grammar is ambiguous, as
there is more than one possible derivation that will yield the
expression. In this case,
operator precedence is affected. As another example,
associativity in the rule
E -> E + E
is ambiguous, for we may recurse on the left or the right. To
remedy the situation, we
could rewrite the grammar, or supply yacc with directives that
indicate which operator
has precedence. The latter method is simpler, and will be
demonstrated in the practice
section.
The following grammar has a reduce-reduce conflict. With an id
on the stack, we
may reduce to T, or reduce to E.
E -> T
E -> id

T -> id
Yacc takes a default action when there is a conflict. For shift-
reduce conflicts, yacc will
shift. For reduce-reduce conflicts, it will use the first rule in the
listing. It also issues a
warning message whenever a conflict exists. The warnings may
be suppressed by making
14
the grammar unambiguous. Several methods for removing
ambiguity will be presented in
subsequent sections.
3.2 Practice, Part I
Input to yacc is divided into three sections. The definitions
section consists of token
declarations, and C code bracketed by “%{“ and “%}”. The BNF
grammar is placed in the
rules section, and user subroutines are added in the subroutines
section.
This is best illustrated by constructing a small calculator that
can add and subtract
numbers. We’ll begin by examining the linkage between lex and
yacc. Here is the
definitions section for the yacc input file:
%token INTEGER
This definition declares an INTEGER token. When we run yacc,
it generates a parser in

file y.tab.c, and also creates an include file, y.tab.h:
#ifndef YYSTYPE
#define YYSTYPE int
#endif
#define INTEGER 258
extern YYSTYPE yylval;
Lex includes this file and utilizes the definitions for token
values. To obtain tokens, yacc
calls yylex. Function yylex has a return type of int, and returns
the token value.
Values associated with the token are returned by lex in variable
yylval. For example,
[0-9]+ {
yylval = atoi(yytext);
return INTEGER;
}
would store the value of the integer in yylval, and return token
INTEGER to yacc. The
type of yylval is determined by YYSTYPE. Since the default
type is integer, this works
well in this case. Token values 0-255 are reserved for character
values. For example, if
you had a rule such as
[-+] return *yytext; /* return operator */
... definitions ...
%%
... rules ...
%%
... subroutines ...

15
the character value for minus or plus is returned. Note that we
placed the minus sign first
so that it wouldn’t be mistaken for a range designator.
Generated token values typically
start around 258, as lex reserves several values for end-of-file
and error processing. Here
is the complete lex input specification for our calculator:
%{
#include <stdlib.h>
#include "y.tab.h"
%}
%%
    /* An integer: hand its numeric value to yacc through yylval. */
[0-9]+      {
                yylval = atoi(yytext);
                return INTEGER;
            }
    /* Operators and newline are returned as their own character code. */
[-+\n]      return *yytext;
[ \t]       ;       /* skip whitespace */
.           yyerror("invalid character");
%%
/* Called by lex at end of input; returning 1 means there is no more input. */
int yywrap(void) {
    return 1;
}
Internally, yacc maintains two stacks in memory; a parse stack
and a value stack. The
parse stack contains terminals and nonterminals, and represents
the current parsing state.
The value stack is an array of YYSTYPE elements, and
associates a value with each
element in the parse stack. For example, when lex returns an
INTEGER token, yacc shifts
this token to the parse stack. At the same time, the
corresponding yylval is shifted to
the value stack. The parse and value stacks are always
synchronized, so finding a value
related to a token on the stack is easily accomplished. Here is
the yacc input specification
for our calculator:
16
%token INTEGER
%{
    #include <stdio.h>
    int yylex(void);
    int yyerror(char *s);
%}
%%
/* A program is zero or more expressions, each terminated by a newline. */
program:
        program expr '\n'       { printf("%d\n", $2); }
        |
        ;
/* An expression is an integer, or the sum or difference of two expressions. */
expr:
        INTEGER                 { $$ = $1; }
        | expr '+' expr         { $$ = $1 + $3; }
        | expr '-' expr         { $$ = $1 - $3; }
        ;
%%
/* Report a parse error on stderr, one message per line. */
int yyerror(char *s) {
    fprintf(stderr, "%s\n", s);
    return 0;
}

/* Run the parser over standard input. */
int main(void) {
    yyparse();
    return 0;
}
The rules section resembles the BNF grammar discussed earlier.
The left-hand side of a
production, or nonterminal, is entered left-justified, followed by
a colon. This is followed
by the right-hand side of the production. Actions associated
with a rule are entered in
braces.
By utilizing left-recursion, we have specified that a program
consists of zero or more
expressions. Each expression terminates with a newline. When a
newline is detected, we
print the value of the expression. When we apply the rule
expr: expr '+' expr { $$ = $1 + $3; }
we replace the right-hand side of the production in the parse
stack with the left-hand side
of the same production. In this case, we pop “expr '+' expr” and
push “expr”. We
have reduced the stack by popping three terms off the stack, and

pushing back one term.
We may reference positions in the value stack in our C code by
specifying “$1” for the
first term on the right-hand side of the production, “$2” for the
second, and so on. “$$”
designates the top of the stack after reduction has taken place.
The above action adds the
value associated with two expressions, pops three terms off the
value stack, and pushes
back a single sum. Thus, the parse and value stacks remain
synchronized.
17
Numeric values are initially entered on the stack when we
reduce from INTEGER to
expr. After INTEGER is shifted to the stack, we apply the rule
expr: INTEGER { $$ = $1; }
The INTEGER token is popped off the parse stack, followed by
a push of expr. For the
value stack, we pop the integer value off the stack, and then
push it back on again. In
other words, we do nothing. In fact, this is the default action,
and need not be specified.
Finally, when a newline is encountered, the value associated
with expr is printed.
In the event of syntax errors, yacc calls the user-supplied
function yyerror. If you
need to modify the interface to yyerror, you can alter the canned
file that yacc includes
to fit your needs. The last function in our yacc specification is

main … in case you were
wondering where it was. This example still has an ambiguous
grammar. Yacc will issue
shift-reduce warnings, but will still process the grammar using
shift as the default
operation.
3.3 Practice, Part II
In this section we will extend the calculator from the previous
section to incorporate
some new functionality. New features include arithmetic
operators multiply, and divide.
Parentheses may be used to over-ride operator precedence, and
single-character variables
may be specified in assignment statements. The following
illustrates sample input and
calculator output:
user: 3 * (4 + 5)
calc: 27
user: x = 3 * (5 + 4)
user: y = 5
user: x
calc: 27
user: y
calc: 5
user: x + 2*y
calc: 37
The lexical analyzer returns VARIABLE and INTEGER tokens.
For variables, yylval
specifies an index to sym, our symbol table. For this program,
sym merely holds the
value of the associated variable. When INTEGER tokens are
returned, yylval contains
the number scanned. Here is the input specification for lex:

18
%{
#include <stdlib.h>
#include "y.tab.h"
%}
%%
    /* variables: yylval is the index of the letter in sym[] */
[a-z]       {
                yylval = *yytext - 'a';
                return VARIABLE;
            }
    /* integers: yylval is the number scanned */
[0-9]+      {
                yylval = atoi(yytext);
                return INTEGER;
            }
    /* operators */
[-+()=/*\n] { return *yytext; }
    /* skip whitespace */
[ \t]       ;
    /* anything else is an error */
.           yyerror("invalid character");
%%
/* Called by lex at end of input; returning 1 means there is no more input. */
int yywrap(void) {
    return 1;
}
The input specification for yacc follows. The tokens for
INTEGER and VARIABLE are
utilized by yacc to create #defines in y.tab.h for use in lex. This
is followed by
definitions for the arithmetic operators. We may specify %left,
for left-associative, or
%right, for right associative. The last definition listed has the
highest precedence. Thus,
multiplication and division have higher precedence than
addition and subtraction. All
four operators are left-associative. Using this simple technique,
we are able to
disambiguate our grammar.
%token INTEGER VARIABLE
%left '+' '-'
%left '*' '/'
%{
    #include <stdio.h>
    int yylex(void);
    int yyerror(char *s);
    int sym[26];    /* one value per single-letter variable */
%}
%%
/* A program is zero or more statements, each terminated by a newline. */
program:
        program statement '\n'
        |
        ;
/* A bare expression is printed; an assignment stores into sym[]. */
statement:
        expr                    { printf("%d\n", $1); }
        | VARIABLE '=' expr     { sym[$1] = $3; }
        ;
expr:
        INTEGER
        | VARIABLE              { $$ = sym[$1]; }
        | expr '+' expr         { $$ = $1 + $3; }
        | expr '-' expr         { $$ = $1 - $3; }
        | expr '*' expr         { $$ = $1 * $3; }
        | expr '/' expr         { $$ = $1 / $3; }
        | '(' expr ')'          { $$ = $2; }
        ;
%%
/* Report a parse error on stderr, one message per line. */
int yyerror(char *s) {
    fprintf(stderr, "%s\n", s);
    return 0;
}

/* Run the parser over standard input. */
int main(void) {
    yyparse();
    return 0;
}

20
4. Calculator
4.1 Description
This version of the calculator is substantially more complex
than previous versions.
Major changes include control constructs such as if-else and
while. In addition, a syntax
tree is constructed during parsing. After parsing, we walk the
syntax tree to produce
output. Two versions of the tree walk routine are supplied:
• an interpreter that executes statements during the tree walk,
and
• a compiler that generates code for a hypothetical stack-based
machine.
To make things more concrete, here is a sample program,
x = 0;
while (x < 3) {
print x;
x = x + 1;
}
with output for the interpretive version,
0
1
2
and output for the compiler version.

push 0
pop x
L000:
push x
push 3
compLT
jz L001
push x
print
push x
push 1
add
pop x
jmp L000
L001:
21
The include file contains declarations for the syntax tree and
symbol table. The
symbol table, sym, allows for single-character variable names.
A node in the syntax tree
may hold a constant (conNodeType), an identifier
(idNodeType), or an internal node
with an operator (oprNodeType). Union nodeType encapsulates
all three variants, and
nodeType.type is used to determine which structure we have.
The lex input file contains patterns for VARIABLE and
INTEGER tokens. In addition,
tokens are defined for 2-character operators such as EQ and NE.

Single-character
operators are simply returned as themselves.
The yacc input file defines YYSTYPE, the type of yylval, as
%union {
    int iValue;         /* integer value */
    char sIndex;        /* symbol table index */
    nodeType *nPtr;     /* node pointer */
};
This causes the following to be generated in y.tab.h:
typedef union {
    int iValue;         /* integer value */
    char sIndex;        /* symbol table index */
    nodeType *nPtr;     /* node pointer */
} YYSTYPE;
extern YYSTYPE yylval;
Constants, variables, and nodes can be represented by yylval in
the parser’s value stack.
Notice the type definitions
%token <iValue> INTEGER
%type <nPtr> expr
This binds expr to nPtr, and INTEGER to iValue in the
YYSTYPE union. This is
required so that yacc can generate the correct code. For
example, the rule
expr: INTEGER { $$ = con($1); }

should generate the following code. Note that yyvsp[0]
addresses the top of the value
stack, or the value associated with INTEGER.
yylval.nPtr = con(yyvsp[0].iValue);
22
The unary minus operator is given higher priority than binary
operators as follows:
%left GE LE EQ NE '>' '<'
%left '+' '-'
%left '*' '/'
%nonassoc UMINUS
The %nonassoc indicates no associativity is implied. It is
frequently used in conjunction
with %prec to specify precedence of a rule. Thus, we have
expr: '-' expr %prec UMINUS { $$ = node(UMINUS, 1, $2); }
indicating that the precedence of the rule is the same as the
precedence of token UMINUS.
And, as defined above, UMINUS has higher precedence than the
other operators. A similar
technique is used to remove ambiguity associated with the if-
else statement (see If-Else
Ambiguity, p. 35).
The syntax tree is constructed bottom-up, allocating the leaf
nodes when variables
and integers are reduced. When operators are encountered, a
node is allocated and

pointers to previously allocated nodes are entered as operands.
As statements are reduced,
ex is called to do a depth-first walk of the syntax tree. Since the
tree was constructed
bottom-up, a depth-first walk visits nodes in the order that they
were originally allocated.
This results in operators being applied in the order that they
were encountered during
parsing. Two versions of ex, an interpretive version, and a
compiler version, are
included.
23
4.2 Include File
/* Discriminator stored as the first member of every node variant. */
typedef enum { typeCon, typeId, typeOpr } nodeEnum;

/* constants */
typedef struct {
    nodeEnum type;              /* type of node */
    int value;                  /* value of constant */
} conNodeType;

/* identifiers */
typedef struct {
    nodeEnum type;              /* type of node */
    int i;                      /* subscript to ident array */
} idNodeType;

/* operators */
typedef struct {
    nodeEnum type;              /* type of node */
    int oper;                   /* operator */
    int nops;                   /* number of operands */
    union nodeTypeTag *op[1];   /* operands (expandable) */
} oprNodeType;

/*
 * A syntax-tree node. Every variant starts with the same nodeEnum tag, so
 * `type` can be read through the union to find out which variant is stored.
 */
typedef union nodeTypeTag {
    nodeEnum type;              /* type of node */
    conNodeType con;            /* constants */
    idNodeType id;              /* identifiers */
    oprNodeType opr;            /* operators */
} nodeType;

extern int sym[26];
24
4.3 Lex Input
%{
#include <stdlib.h>
#include "calc3.h"
#include "y.tab.h"
void yyerror(char *);
%}
%%
    /* single-letter variable: yylval carries its index into sym[] */
[a-z]       {
                yylval.sIndex = *yytext - 'a';
                return VARIABLE;
            }
    /* integer constant: yylval carries its value */
[0-9]+      {
                yylval.iValue = atoi(yytext);
                return INTEGER;
            }
    /* single-character operators are returned as themselves */
[-()<>=+*/;{}.] {
                return *yytext;
            }
">="        return GE;
"<="        return LE;
"=="        return EQ;
"!="        return NE;
"while"     return WHILE;
"if"        return IF;
"else"      return ELSE;
"print"     return PRINT;
[ \t\n]+    ;       /* ignore whitespace */
.           yyerror("Unknown character");
%%
/* Called by lex at end of input; returning 1 means there is no more input. */
int yywrap(void) {
    return 1;
}
25

4.4 Yacc Input
%{
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include "calc3.h"

/* prototypes */
nodeType *opr(int oper, int nops, ...);
nodeType *id(int i);
nodeType *con(int value);
void freeNode(nodeType *p);
int ex(nodeType *p);
int yylex(void);
void yyerror(char *s);

int sym[26];                    /* symbol table */
%}

/* Every member named in a %token/%type tag below must be declared here. */
%union {
    int iValue;                 /* integer value */
    char sIndex;                /* symbol table index */
    nodeType *nPtr;             /* node pointer */
};

%token <iValue> INTEGER
%token <sIndex> VARIABLE
%token WHILE IF PRINT
%nonassoc IFX
%nonassoc ELSE

%left GE LE EQ NE '>' '<'
%left '+' '-'
%left '*' '/'
%nonassoc UMINUS

%type <nPtr> stmt expr stmt_list
%%
26
/* The whole input: a sequence of statements terminated by '.'; exits on completion. */
program:
function '.' { exit(0); }
;
/* Each statement is walked by ex() and its tree freed as soon as it is reduced. */
function:
function stmt { ex($2); freeNode($2); }
| /* NULL */
;
/* Statements; every alternative yields a syntax-tree node in $$. */
stmt:
';' { $$ = opr(';', 2, NULL, NULL); }
| expr ';' { $$ = $1; }
| PRINT expr ';' { $$ = opr(PRINT, 1, $2); }
| VARIABLE '=' expr ';' { $$ = opr('=', 2, id($1), $3); }
| WHILE '(' expr ')' stmt
{ $$ = opr(WHILE, 2, $3, $5); }
/* %prec IFX gives the else-less form the precedence of IFX instead of its last token */
| IF '(' expr ')' stmt %prec IFX
{ $$ = opr(IF, 2, $3, $5); }
| IF '(' expr ')' stmt ELSE stmt
{ $$ = opr(IF, 3, $3, $5, $7); }
| '{' stmt_list '}' { $$ = $2; }
;

/* Consecutive statements are chained left-recursively with ';' operator nodes. */
stmt_list:
stmt { $$ = $1; }
| stmt_list stmt { $$ = opr(';', 2, $1, $2); }
;
/* Expressions: leaves come from con()/id(), interior nodes from opr(). */
expr:
INTEGER { $$ = con($1); }
| VARIABLE { $$ = id($1); }
| '-' expr %prec UMINUS { $$ = opr(UMINUS, 1, $2); }
| expr '+' expr { $$ = opr('+', 2, $1, $3); }
| expr '-' expr { $$ = opr('-', 2, $1, $3); }
| expr '*' expr { $$ = opr('*', 2, $1, $3); }
| expr '/' expr { $$ = opr('/', 2, $1, $3); }
| expr '<' expr { $$ = opr('<', 2, $1, $3); }
| expr '>' expr { $$ = opr('>', 2, $1, $3); }
| expr GE expr { $$ = opr(GE, 2, $1, $3); }
| expr LE expr { $$ = opr(LE, 2, $1, $3); }
| expr NE expr { $$ = opr(NE, 2, $1, $3); }
| expr EQ expr { $$ = opr(EQ, 2, $1, $3); }
| '(' expr ')' { $$ = $2; }
;
%%
27
/*
 * con - allocate a constant node holding `value`.
 *
 * Returns a malloc'd node tagged typeCon. If allocation fails, the error is
 * reported through yyerror() and the program exits rather than writing
 * through a NULL pointer.
 */
nodeType *con(int value) {
    nodeType *p;

    /* allocate node */
    if ((p = malloc(sizeof(conNodeType))) == NULL) {
        yyerror("out of memory");
        exit(EXIT_FAILURE);
    }

    /* copy information */
    p->type = typeCon;
    p->con.value = value;
    return p;
}
/*
 * id - allocate an identifier node for symbol-table subscript `i`.
 *
 * Returns a malloc'd node tagged typeId. If allocation fails, the error is
 * reported through yyerror() and the program exits rather than writing
 * through a NULL pointer.
 */
nodeType *id(int i) {
    nodeType *p;

    /* allocate node */
    if ((p = malloc(sizeof(idNodeType))) == NULL) {
        yyerror("out of memory");
        exit(EXIT_FAILURE);
    }

    /* copy information; the tag must be set on every successful allocation */
    p->type = typeId;
    p->id.i = i;
    return p;
}
/*
 * opr - allocate an operator node.
 *
 * oper: operator code stored in the node.
 * nops: number of operand pointers that follow in the argument list.
 * ...:  `nops` arguments of type nodeType*, copied into op[0..nops-1].
 *
 * Returns a malloc'd node tagged typeOpr. If allocation fails, the error is
 * reported through yyerror() and the program exits rather than writing
 * through a NULL pointer.
 */
nodeType *opr(int oper, int nops, ...) {
    va_list ap;
    nodeType *p;
    size_t size;
    int i;

    /* allocate node; oprNodeType already has room for one operand pointer */
    size = sizeof(oprNodeType) + (nops - 1) * sizeof(nodeType*);
    if ((p = malloc(size)) == NULL) {
        yyerror("out of memory");
        exit(EXIT_FAILURE);
    }

    /* copy information */
    p->type = typeOpr;
    p->opr.oper = oper;
    p->opr.nops = nops;
    va_start(ap, nops);
    for (i = 0; i < nops; i++)
        p->opr.op[i] = va_arg(ap, nodeType*);
    va_end(ap);
    return p;
}
28
/*
 * freeNode - release a syntax-tree node.
 *
 * A NULL pointer is ignored. For an operator node every operand subtree is
 * released first, then the node itself.
 */
void freeNode(nodeType *p) {
    if (p != NULL) {
        if (p->type == typeOpr) {
            int k = 0;
            while (k < p->opr.nops) {
                freeNode(p->opr.op[k]);
                k++;
            }
        }
        free(p);
    }
}
/* Print the parser's error message `s` on stdout, followed by a newline. */
void yyerror(char *s) {
    fprintf(stdout, "%s\n", s);
}

/* Entry point: run the parser and exit with status 0, ignoring yyparse's result. */
int main(void) {
    (void)yyparse();
    return 0;
}
29
4.5 Interpreter
#include <stdio.h>
#include "calc3.h"
#include "y.tab.h"
/*
 * ex - interpret a syntax tree.
 *
 * Walks the tree rooted at `p` and returns the value of that node.
 * A NULL node, a statement node (WHILE, IF, PRINT) and an unrecognised
 * operator all yield 0.
 */
int ex(nodeType *p) {
    if (!p) return 0;
    switch(p->type) {
    case typeCon:       return p->con.value;
    case typeId:        return sym[p->id.i];
    case typeOpr:
        switch(p->opr.oper) {
        case WHILE:     while(ex(p->opr.op[0]))
                            ex(p->opr.op[1]);
                        return 0;
        case IF:        if (ex(p->opr.op[0]))
                            ex(p->opr.op[1]);
                        else if (p->opr.nops > 2)
                            ex(p->opr.op[2]);
                        return 0;
        case PRINT:     printf("%d\n", ex(p->opr.op[0]));
                        return 0;
        case ';':       ex(p->opr.op[0]);
                        return ex(p->opr.op[1]);
        case '=':       return sym[p->opr.op[0]->id.i] =
                            ex(p->opr.op[1]);
        case UMINUS:    return -ex(p->opr.op[0]);
        case '+':       return ex(p->opr.op[0]) + ex(p->opr.op[1]);
        case '-':       return ex(p->opr.op[0]) - ex(p->opr.op[1]);
        case '*':       return ex(p->opr.op[0]) * ex(p->opr.op[1]);
        case '/':       return ex(p->opr.op[0]) / ex(p->opr.op[1]);
        case '<':       return ex(p->opr.op[0]) < ex(p->opr.op[1]);
        case '>':       return ex(p->opr.op[0]) > ex(p->opr.op[1]);
        case GE:        return ex(p->opr.op[0]) >= ex(p->opr.op[1]);
        case LE:        return ex(p->opr.op[0]) <= ex(p->opr.op[1]);
        case NE:        return ex(p->opr.op[0]) != ex(p->opr.op[1]);
        case EQ:        return ex(p->opr.op[0]) == ex(p->opr.op[1]);
        }
    }
    /* reached for an unknown node type or operator; callers use the value */
    return 0;
}
30
4.6 Compiler
#include <stdio.h>
#include "calc3.h"
#include "y.tab.h"

static int lbl;     /* next unused label number */

/*
 * ex - emit stack-machine code for a syntax tree.
 *
 * Walks the tree rooted at `p` and prints one instruction per line on
 * stdout (tab, mnemonic, optional tab and operand). Labels are printed as
 * "Lnnn:" and numbered from the file-scope counter `lbl`. Always returns 0.
 */
int ex(nodeType *p) {
    int lbl1, lbl2;

    if (!p) return 0;
    switch(p->type) {
    case typeCon:
        printf("\tpush\t%d\n", p->con.value);
        break;
    case typeId:
        printf("\tpush\t%c\n", p->id.i + 'a');
        break;
    case typeOpr:
        switch(p->opr.oper) {
        case WHILE:
            printf("L%03d:\n", lbl1 = lbl++);
            ex(p->opr.op[0]);
            printf("\tjz\tL%03d\n", lbl2 = lbl++);
            ex(p->opr.op[1]);
            printf("\tjmp\tL%03d\n", lbl1);
            printf("L%03d:\n", lbl2);
            break;
        case IF:
            ex(p->opr.op[0]);
            if (p->opr.nops > 2) {
                /* if else */
                printf("\tjz\tL%03d\n", lbl1 = lbl++);
                ex(p->opr.op[1]);
                printf("\tjmp\tL%03d\n", lbl2 = lbl++);
                printf("L%03d:\n", lbl1);
                ex(p->opr.op[2]);
                printf("L%03d:\n", lbl2);
            } else {
                /* if */
                printf("\tjz\tL%03d\n", lbl1 = lbl++);
                ex(p->opr.op[1]);
                printf("L%03d:\n", lbl1);
            }
            break;
        case PRINT:
            ex(p->opr.op[0]);
            printf("\tprint\n");
            break;
        case '=':
            ex(p->opr.op[1]);
            printf("\tpop\t%c\n", p->opr.op[0]->id.i + 'a');
            break;
        case UMINUS:
            ex(p->opr.op[0]);
            printf("\tneg\n");
            break;
        default:
            /* binary operator: emit both operands, then the operation */
            ex(p->opr.op[0]);
            ex(p->opr.op[1]);
            switch(p->opr.oper) {
            case '+':   printf("\tadd\n"); break;
            case '-':   printf("\tsub\n"); break;
            case '*':   printf("\tmul\n"); break;
            case '/':   printf("\tdiv\n"); break;
            case '<':   printf("\tcompLT\n"); break;
            case '>':   printf("\tcompGT\n"); break;
            case GE:    printf("\tcompGE\n"); break;
            case LE:    printf("\tcompLE\n"); break;
            case NE:    printf("\tcompNE\n"); break;
            case EQ:    printf("\tcompEQ\n"); break;
            }
        }
    }
    return 0;
}
32
5. More Lex
5.1 Strings
Quoted strings frequently appear in programming languages.
Here is one way to match a
string in lex:
%{
    char *yylval;
    #include <string.h>
%}
%%
    /* An opening quote, any run without quote or newline, then a quote or newline. */
\"[^"\n]*["\n]  {
        yylval = strdup(yytext+1);      /* copy without the opening quote */
        if (yylval[yyleng-2] != '"')
            warning("improperly terminated string");
        else
            yylval[yyleng-2] = 0;       /* drop the closing quote */
        printf("found '%s'\n", yylval);
    }
The above example ensures that strings don’t cross line
boundaries, and removes
enclosing quotes. If we wish to add escape sequences, such as \n
or \", start states
simplify matters:
%{
    char buf[100];
    char *s;
%}
%x STRING
%%
    /* Opening quote: enter the STRING state and start filling buf. */
\"              { BEGIN STRING; s = buf; }
    /* Escape sequences written as backslash-n, backslash-t and backslash-quote. */
<STRING>\\n     { *s++ = '\n'; }
<STRING>\\t     { *s++ = '\t'; }
<STRING>\\\"    { *s++ = '\"'; }
    /* Closing quote: terminate buf and return to the initial state. */
<STRING>\"      {
                    *s = 0;
                    BEGIN 0;
                    printf("found '%s'\n", buf);
                }
    /* A raw newline inside a string is an error. */
<STRING>\n      { printf("invalid string"); exit(1); }
<STRING>.       { *s++ = *yytext; }
Exclusive start state STRING is defined in the definition
section. When the scanner
detects a quote, the BEGIN macro shifts lex into the STRING
state. Lex stays in the
STRING state, recognizing only patterns that begin with
<STRING>, until another BEGIN
33
is executed. Thus, we have a mini-environment for scanning
strings. When the trailing
quote is recognized, we switch back to state 0, the initial state.
5.2 Reserved Words
If your program has a large collection of reserved words, it is
more efficient to let lex
simply match a string, and determine in your own code whether
it is a variable or
reserved word. For example, instead of coding
"if" return IF;
"then" return THEN;
"else" return ELSE;
{letter}({letter}|{digit})* {
yylval.id = symLookup(yytext);
return IDENTIFIER;
}
where symLookup returns an index into the symbol table, it is

better to detect reserved
words and identifiers simultaneously, as follows:
{letter}({letter}|{digit})* {
int i;
if ((i = resWord(yytext)) != 0)
return (i);
yylval.id = symLookup(yytext);
return (IDENTIFIER);
}
This technique significantly reduces the number of states
required, and results in smaller
scanner tables.
5.3 Debugging Lex
Lex has facilities that enable debugging. This feature may vary
with different versions of
lex, so you should consult documentation for details. The code
generated by lex in file
lex.yy.c includes debugging statements that are enabled by
specifying command-line
option “-d”. Debug output may be toggled on and off by setting
yy_flex_debug.
Output includes the rule applied and corresponding matched
text. If you’re running lex
and yacc together, specify the following in your yacc input file:
extern int yy_flex_debug;
int main(void) {
yy_flex_debug = 1;
yyparse();

}
34
Alternatively, you may write your own debug code by defining
functions that display
information for the token value, and each variant of the yylval
union. This is illustrated
in the following example. When DEBUG is defined, the debug
functions take effect, and a
trace of tokens and associated values is displayed.
%union {
    int ivalue;
    ...
};
%{
#ifdef DEBUG
    /* Print the name of a token and pass the token through. */
    int dbgToken(int tok, char *s) {
        printf("token %s\n", s);
        return tok;
    }
    /* Print the name of a token with its integer value and pass the token through. */
    int dbgTokenIvalue(int tok, char *s) {
        printf("token %s (%d)\n", s, yylval.ivalue);
        return tok;
    }
    #define RETURN(x) return dbgToken(x, #x)
    #define RETURN_ivalue(x) return dbgTokenIvalue(x, #x)
#else
    #define RETURN(x) return(x)
    #define RETURN_ivalue(x) return(x)
#endif
%}
%%
[0-9]+      {
                yylval.ivalue = atoi(yytext);
                RETURN_ivalue(INTEGER);
            }
"if"        RETURN(IF);
"else"      RETURN(ELSE);
35
6. More Yacc
6.1 Recursion
When specifying a list, we may do so using left recursion,
list:
item
| list ',' item
;
or right recursion:

list:
item
| item ',' list
If right recursion is used, all items on the list are pushed on the
stack. After the last item
is pushed, we start reducing. With left recursion, we never have
more than three terms on
the stack, since we reduce as we go along. For this reason, it is
advantageous to use left
recursion.
6.2 If-Else Ambiguity
A shift-reduce conflict that frequently occurs involves the if-
else construct. Assume we
have the following rules:
stmt:
IF expr stmt
| IF expr stmt ELSE stmt
...
and the following state:
IF expr stmt IF expr stmt . ELSE stmt
We need to decide if we should shift the ELSE, or reduce the IF
expr stmt at the top
of the stack. If we shift, then we have
IF expr stmt IF expr stmt ELSE . stmt
IF expr stmt IF expr stmt ELSE stmt .
IF expr stmt stmt .
where the second ELSE is paired with the second IF. If we

reduce, we have
36
IF expr stmt stmt . ELSE stmt
IF expr stmt . ELSE stmt
IF expr stmt ELSE . stmt
IF expr stmt ELSE stmt .
where the second ELSE is paired with the first IF. Modern
programming languages pair
an ELSE with the most recent unpaired IF, so the former
behavior is expected. This
works well with yacc, since default behavior, when a shift-
reduce conflict is encountered,
is to shift.
Although yacc does the right thing, it also issues a shift-reduce
warning message. To
remove the message, give IF-ELSE a higher precedence than the
simple IF statement:
%nonassoc IFX
%nonassoc ELSE
stmt:
IF expr stmt %prec IFX
| IF expr stmt ELSE stmt
6.3 Error Messages
A nice compiler gives the user meaningful error messages. For
example, not much
information is conveyed by the following message:

syntax error
If we track the line number in lex, then we can at least give the
user a line number:
/* Report error message `s` on stderr, prefixed by the current value of yylineno. */
void yyerror(char *s) {
    fprintf(stderr, "line %d: %s\n", yylineno, s);
}
When yacc discovers a parsing error, default action is to call
yyerror, and then return
from yyparse with a return value of one. A more graceful action
flushes the input stream
to a statement delimiter, and continues to scan:
stmt:
        ';'
        | expr ';'
        | PRINT expr ';'
        | VARIABLE '=' expr ';'
        | WHILE '(' expr ')' stmt
        | IF '(' expr ')' stmt %prec IFX
        | IF '(' expr ')' stmt ELSE stmt
        | '{' stmt_list '}'
        /* error recovery: resynchronise at the next ';' or '}' */
        | error ';'
        | error '}'
        ;
37
The error token is a special feature of yacc that will match all
input until the token

following error is found. For this example, when yacc detects an
error in a statement it
will call yyerror, flush input up to the next semicolon or brace,
and resume scanning.
6.4 Inherited Attributes
The examples so far have used synthesized attributes. At any
point in a syntax tree we can
determine the attributes of a node based on the attributes of its
children. Consider the rule
expr: expr '+' expr { $$ = $1 + $3; }
Since we are parsing bottom-up, the values of both operands are
available, and we can
determine the value associated with the left-hand side. An
inherited attribute of a node
depends on the value of a parent or sibling node. The following
grammar defines a C
variable declaration:
decl: type varlist
type: INT | FLOAT
varlist:
VAR { setType($1, $0); }
| varlist ',' VAR { setType($3, $0); }
Here is a sample parse:
. INT VAR
INT . VAR
type . VAR
type VAR .
type varlist .
decl .

When we reduce VAR to varlist, we should annotate the symbol
table with the type of
the variable. However, the type is buried in the stack. This
problem is resolved by
indexing back into the stack. Recall that $1 designates the first
term on the right-hand
side. We can index backwards, using $0, $-1, and so on. In this
case, $0 will do just
fine. If you need to specify a token type, the syntax is
$<tokentype>0, angle brackets
included. In this particular example, care must be taken to
ensure that type always
precedes varlist.
6.5 Embedded Actions
Rules in yacc may contain embedded actions:
list: item1 { do_item1($1); } item2 { do_item2($3); } item3
Note that the actions take a slot in the stack, so do_item2 must
use $3 to reference
item2. Actually, this grammar is transformed by yacc into the
following:
38
list: item1 _rule01 item2 _rule02 item3
_rule01: { do_item1($0); }
_rule02: { do_item2($0); }
6.6 Debugging Yacc
Yacc has facilities that enable debugging. This feature may vary
with different versions

of yacc, so you should consult documentation for details. The
code generated by yacc in
file y.tab.c includes debugging statements that are enabled by
defining YYDEBUG and
setting it to a non-zero value. This may also be done by
specifying command-line option
“-t”. With YYDEBUG properly set, debug output may be
toggled on and off by setting
yydebug. Output includes tokens scanned and shift/reduce
actions.
%{
#define YYDEBUG 1
%}
%%
...
%%
int main(void) {
#if YYDEBUG
yydebug = 1;
#endif
yylex();
}
In addition, you can dump the parse states by specifying
command-line option "-v".
States are dumped to file y.output, and are often useful when
debugging a grammar.
Alternatively, you can write your own debug code by defining a
TRACE macro, as
illustrated below. When DEBUG is defined, a trace of
reductions, by line number, is
displayed.

%{
#ifdef DEBUG
#define TRACE printf("reduce at line %d\n", __LINE__);
#else
#define TRACE
#endif
%}
%%
statement_list:
statement
{ TRACE $$ = $1; }
| statement_list statement
{ TRACE $$ = newNode(';', 2, $1, $2); }
;
39
7. Bibliography
Aho, Alfred V., Ravi Sethi and Jeffrey D. Ullman [1986].
Compilers, Principles,
Techniques and Tools. Addison-Wesley, Reading,
Massachusetts.
Gardner, Jim, Chris Retterath and Eric Gisin [1988]. MKS Lex
& Yacc. Mortice Kern
Systems Inc., Waterloo, Ontario, Canada.
Johnson, Stephen C. [1975]. Yacc: Yet Another Compiler
Compiler. Computing Science

Technical Report No. 32, Bell Laboratories, Murray Hill, New
Jersey.
Lesk, M. E. and E. Schmidt [1975]. Lex – A Lexical Analyzer
Generator. Computing
Science Technical Report No. 39, Bell Laboratories, Murray
Hill, New Jersey.
Levine, John R., Tony Mason and Doug Brown [1992]. Lex &
Yacc. O’Reilly &
Associates, Inc. Sebastopol, California.
A/
A/
http://members.xoom.com/thomasn/y_yacc.pdf
http://members.xoom.com/thomasn/y_lex.pdf
A/PrefaceIntroductionLexTheoryPracticeYaccTheoryPractice,
Part IPractice, Part IICalculatorDescriptionInclude FileLex
InputYacc InputInterpreterCompilerMore LexStringsReserved
WordsDebugging LexMore YaccRecursionIf-Else
AmbiguityError MessagesInherited AttributesEmbedded
ActionsDebugging YaccBibliography
cmparser.y
%{
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "ast.h"
#include "util.h"

/* other external function prototypes */
extern int yylex();
extern int initLex(int , char **);
extern AstNodePtr new_Node(NodeKind kind);
/* external global variables */
extern int yydebug;
extern int yylineno;
/* function prototypes */
void yyerror(const char *);
/* global variables */
AstNodePtr program;
FILE *outfile;
%}
/* YYSTYPE: the semantic value carried by every grammar symbol */
%union
{
AstNodePtr nodePtr; /* AST node; used by the nonterminals declared with %type <nodePtr> */
int iVal; /* value attached to TOK_NUM */
char *cVal; /* value attached to TOK_ID */
Type *type; /* type descriptor produced by Type_Specifier */
}
/* terminals */

%token TOK_ELSE TOK_IF TOK_RETURN TOK_VOID
TOK_INT TOK_WHILE
%token TOK_PLUS TOK_MINUS TOK_MULT TOK_DIV
TOK_LT TOK_LE TOK_GT TOK_GE TOK_EQ TOK_NE
TOK_ASSIGN TOK_SEMI TOK_COMMA
%token TOK_LPAREN TOK_RPAREN TOK_LSQ TOK_RSQ
TOK_LBRACE TOK_RBRACE TOK_ERROR
%token <cVal> TOK_ID
%token <iVal> TOK_NUM
%type <nodePtr> Declarations Functions Fun_Declaration
%type <type> Type_Specifier
%type <nodePtr> Compound_Stmt Statements Statement
%type <nodePtr> Expr_Statement If_Else_Statement
Selection_Stmt Iteration_Stmt Return_Stmt
%type <nodePtr> Expression Simple_Expression
Additive_Expression Factor Var Call Term
%type <nodePtr> Args Args_List
%type <nodePtr> Params Param_List Param
/* associativity and precedence, listed from lowest to highest.
   TOK_IF / TOK_ELSE resolve the dangling-else conflict: the else-less
   selection rule is tagged %prec TOK_IF, which ranks below TOK_ELSE. */
%nonassoc TOK_IF
%nonassoc TOK_ELSE
%right TOK_ASSIGN
%left TOK_EQ TOK_NE
%nonassoc TOK_LT TOK_GT TOK_LE TOK_GE
/* the subtraction token declared above is TOK_MINUS, not TOK_SUB */
%left TOK_PLUS TOK_MINUS
%left TOK_MULT TOK_DIV
%nonassoc error
%%
/* Start symbol: the whole translation unit. */
Start : Declarations {}
      ;

/* Global variable declarations may precede the list of functions.
   The AST root (the global `program`) is set when the function list
   has been reduced. */
Declarations : Functions { program = $1; }
             | Var_Declaration Declarations { }
             ;

/* Functions are chained in source order through their sibling pointers. */
Functions : Fun_Declaration {
                /* last function in the list: terminate the chain explicitly */
                $1->sibling = NULL;
                $$ = $1;
            }
          | Fun_Declaration Functions {
                $1->sibling = $2;
                $$ = $1;
            }
          ;
/* Declares a scalar (`T id;`) or an array (`T id[NUM];`) in the current scope.
   Only INT element types are accepted.  A name may shadow a declaration from
   an outer scope, but declaring it twice at the same scope depth is an error.
   On success the Type built by Type_Specifier is handed to the symbol table. */
Var_Declaration : Type_Specifier TOK_ID TOK_SEMI {
        if ($1->kind != INT) {
            yyerror("non-int variable declared\n");
            yyerror($2);
        }
        ElementPtr symElement = symLookup($2);
        if (!symElement || scopeDepth != symElement->scope)
            symInsert($2, $1, yylineno);
        else
        {
            yyerror("Redeclaration of variable");
            yyerror($2);
            free($1);
        }
    }
    | Type_Specifier TOK_ID TOK_LSQ TOK_NUM TOK_RSQ TOK_SEMI {
        if ($1->kind != INT) {
            yyerror("non-int variable declared\n");
            yyerror($2);
        }
        ElementPtr symElement = symLookup($2);
        if (!symElement || scopeDepth != symElement->scope)
        {
            /* reuse Type for storing dimension */
            $1->kind = ARRAY;
            $1->dimension = $4;
            $1->function = NULL;
            /* register the array; without this the name is never declared */
            symInsert($2, $1, yylineno);
        }
        else
        {
            yyerror("Redeclaration of variable");
            yyerror($2);
            free($1);
        }
    }
    ;
/* Function definition: `T id ( Params ) Compound_Stmt`.
   Symbol positions: $1 type, $2 name, $3 '(', $4 first mid-rule action (the
   METHOD node), $5 Params, $6 ')', $7 second mid-rule action, $8 body.
   children[0] holds the formal parameter list, children[1] the body. */
Fun_Declaration : Type_Specifier TOK_ID TOK_LPAREN {
        $<nodePtr>$ = new_Node(METHOD);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->nType = $1;
        ElementPtr symElement = symLookup($2);
        if (!symElement || scopeDepth != symElement->scope)
        {
            $<nodePtr>$->nSymbolPtr = symInsert($2, $1, yylineno);
            /* the return type is kept in a separate Type hung off `function`,
               because $1 itself is re-tagged as FUNCTION just below */
            $<nodePtr>$->nSymbolPtr->stype->function = (Type*) malloc(sizeof(Type));
            $<nodePtr>$->nSymbolPtr->stype->function->kind = $1->kind;
            $<nodePtr>$->nSymbolPtr->snode = $<nodePtr>$;
            $1->kind = FUNCTION;
            /* scope for the formal parameters; closed in the final action */
            enterScope();
        }
        else
        {
            yyerror("Redeclaration of function");
            yyerror($2);
            free($1);
        }
    }
    Params TOK_RPAREN {
        $<nodePtr>4->children[0] = $5;
    }
    Compound_Stmt {
        $<nodePtr>4->children[1] = $8;
        $$ = $<nodePtr>4;
        leaveScope();
    }
    ;
/* Formal parameters: either a non-empty list or the keyword token TOK_VOID,
   which yields an empty (NULL) list. */
Params : Param_List { $$ = $1; }
       | TOK_VOID { $$ = NULL; }
       ;

/* Left-recursive list; each new parameter is appended at the tail so the
   sibling chain preserves source order.  $$ is always the list head. */
Param_List : Param_List TOK_COMMA Param {
        AstNodePtr tail = $1;
        while (tail->sibling)
            tail = tail->sibling;
        tail->sibling = $3;
        $3->sibling = NULL;
        $$ = $1;
    }
    | Param {
        $1->sibling = NULL;
        $$ = $1;
    }
    ;
/* A single formal parameter: scalar `T id` or array `T id[]`.
   Builds a FORMALVAR node and inserts the name into the current scope.
   Only INT is accepted as the (element) type; a second parameter with the
   same name at the same scope depth is a redeclaration error. */
Param : Type_Specifier TOK_ID {
        ElementPtr symElement = symLookup($2);
        if (!symElement || scopeDepth != symElement->scope)
        {
            $$ = new_Node(FORMALVAR);
            $$->nLinenumber = yylineno;
            if ($1->kind != INT) {
                yyerror("non-int variable declared\n");
            }
            else {
                $$->nSymbolPtr = symInsert($2, $1, yylineno);
                $$->nType = $1;
            }
        }
        else
        {
            yyerror("Redeclaration of variable");
            yyerror($2);
            free($1);
        }
    }
    | Type_Specifier TOK_ID TOK_LSQ TOK_RSQ {
        ElementPtr symElement = symLookup($2);
        if (!symElement || scopeDepth != symElement->scope)
        {
            Type *typ;
            if ($1->kind != INT) {
                yyerror("non-int array declared\n");
            }
            /* array parameters get a fresh ARRAY type; the element Type
               produced by Type_Specifier is no longer needed */
            typ = new_type(ARRAY);
            $$ = new_Node(FORMALVAR);
            $$->nLinenumber = yylineno;
            $$->nSymbolPtr = symInsert($2, typ, yylineno);
            $$->nType = typ;
            free($1);
        }
        else
        {
            yyerror("Redeclaration of variable\n");
            yyerror($2);
            free($1);
        }
    }
    ;
/* Builds a fresh Type descriptor for the declared base type:
   kind INT for TOK_INT, kind VOID for TOK_VOID. */
Type_Specifier : TOK_INT {
$$ = (Type*) new_type(INT);
}
| TOK_VOID {
$$ = (Type*) new_type(VOID);
}
;
/* Block statement, with or without leading local declarations.
   The mid-rule action after '{' opens a new scope and creates the
   COMPOUND_STMT node (reachable later as $<nodePtr>2), recording the symbol
   table that is on top of the scope stack at that point.  children[0] points
   to the first statement of the block; the scope is closed at '}'. */
Compound_Stmt : TOK_LBRACE {
        enterScope();
        $<nodePtr>$ = new_StmtNode(COMPOUND_STMT);
        $<nodePtr>$->nSymbolTabPtr = symbolStackTop->symbolTablePtr;
        $<nodePtr>$->nLinenumber = yylineno;
    }
    Statements {
        /* should point to first statement */
        $<nodePtr>2->children[0] = $3;
    }
    TOK_RBRACE {
        leaveScope();
        $$ = $<nodePtr>2;
    }
    | TOK_LBRACE {
        enterScope();
        $<nodePtr>$ = new_StmtNode(COMPOUND_STMT);
        $<nodePtr>$->nSymbolTabPtr = symbolStackTop->symbolTablePtr;
        /* record the line here too, as the declaration-free form does */
        $<nodePtr>$->nLinenumber = yylineno;
    }
    Local_Declarations Statements {
        /* should point to first statement */
        $<nodePtr>2->children[0] = $4;
    }
    TOK_RBRACE {
        leaveScope();
        $$ = $<nodePtr>2;
    }
    ;
/* One or more variable declarations at the start of a block.
   The actions are empty: no semantic value is produced here. */
Local_Declarations : Var_Declaration Local_Declarations {}
| Var_Declaration {}
;
/* Possibly empty statement list, right-recursive: each statement's sibling
   points to the rest of the list, and the empty list is NULL. */
Statements : Statement Statements {
        $1->sibling = $2;
        $$ = $1;
    }
    | { $$ = NULL; }
    ;

/* A statement is exactly one of the five statement forms; the node built by
   the sub-rule is passed through unchanged. */
Statement : Expr_Statement { $$ = $1; }
    | Compound_Stmt { $$ = $1; }
    | Selection_Stmt { $$ = $1; }
    | Iteration_Stmt { $$ = $1; }
    | Return_Stmt { $$ = $1; }
    ;
/* Expression statement.  children[0] is the expression, or NULL for the
   empty statement consisting of TOK_SEMI alone. */
Expr_Statement : Expression TOK_SEMI {
$$ =
new_StmtNode(EXPRESSION_STMT);
$$->children[0] = $1;
}
| TOK_SEMI {
$$ =
new_StmtNode(EXPRESSION_STMT) ;
$$->children[0] = NULL;
}
;
/* if / if-else.  The else-less alternative is tagged %prec TOK_IF, which is
   declared with lower precedence than TOK_ELSE, so when TOK_ELSE is the
   lookahead the parser shifts it.  The else branch is stored in children[2]
   of the node built by If_Else_Statement. */
Selection_Stmt : If_Else_Statement %prec TOK_IF {
$$ = $1;
}
| If_Else_Statement TOK_ELSE Statement {
$$ = $1;
$$->children[2]= $3;
}
;
/* The `if ( Expression ) Statement` prefix shared by both selection forms.
   Symbol positions: $3 condition, $4 mid-rule action (the IF_THEN_ELSE_STMT
   node), $5 ')', $6 then-branch.  children[0] is the condition and
   children[1] the then-branch; Selection_Stmt fills children[2] when an
   else branch follows. */
If_Else_Statement : TOK_IF TOK_LPAREN Expression {
        $<nodePtr>$ = new_StmtNode(IF_THEN_ELSE_STMT);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $3;
    }
    TOK_RPAREN Statement {
        $<nodePtr>4->children[1] = $6;
        $$ = $<nodePtr>4;
    }
    ;
/* `while ( Expression ) Statement`.
   Symbol positions: $3 condition, $4 mid-rule action (the WHILE_STMT node),
   $5 ')', $6 loop body.  children[0] is the condition, children[1] the body.
   The line number is recorded once, when the loop header has been read. */
Iteration_Stmt : TOK_WHILE TOK_LPAREN Expression {
        $<nodePtr>$ = new_StmtNode(WHILE_STMT);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $3;
    }
    TOK_RPAREN Statement {
        $<nodePtr>4->children[1] = $6;
        $$ = $<nodePtr>4;
    }
    ;
/* Return statement.  children[0] is the returned expression, or NULL when
   TOK_RETURN is followed directly by TOK_SEMI. */
Return_Stmt : TOK_RETURN Expression TOK_SEMI {
$$ =
new_StmtNode(RETURN_STMT);
$$->children[0] = $2;
}
| TOK_RETURN TOK_SEMI {
$$ =
new_StmtNode(RETURN_STMT);
$$->children[0] = NULL;
}
;

/* Assignment or simple expression.  The rule recurses on its right-hand
   side, so `a = b = c` nests as a = (b = c).
   Symbol positions in the assignment form: $1 target, $2 TOK_ASSIGN,
   $3 mid-rule action (the ASSI_EXP node), $4 assigned expression.
   children[0] is the target, children[1] the value. */
Expression : Var TOK_ASSIGN {
$<nodePtr>$ =
new_ExprNode(ASSI_EXP);
$<nodePtr>$->children[0] = $1;
}
Expression {
$<nodePtr>3->children[1] = $4;
$$ = $<nodePtr>3;
}
| Simple_Expression {
$$ = $1;
}
;
/* Variable reference: plain identifier (VAR_EXP) or subscripted array
   element (ARRAY_EXP, index expression in children[0]).  nSymbolPtr is the
   symbol-table entry returned by symLookup(); a NULL result means the name
   was never declared and is reported as an error. */
Var : TOK_ID {
        $$ = new_ExprNode(VAR_EXP);
        $$->nSymbolPtr = symLookup($1);
        if (!$$->nSymbolPtr)
        {
            yyerror("Variable must be declared before use");
            yyerror($1);
            free($$);
            /* never hand a freed node to the enclosing rule */
            $$ = NULL;
        }
    }
    | TOK_ID TOK_LSQ Expression TOK_RSQ {
        $$ = new_ExprNode(ARRAY_EXP);
        $$->nSymbolPtr = symLookup($1);
        if (!$$->nSymbolPtr)
        {
            yyerror("Variable must be declared before use");
            yyerror($1);
            free($$);
            /* never hand a freed node to the enclosing rule */
            $$ = NULL;
        }
        else
            $$->children[0] = $3;
    }
    ;
/* Relational and equality expressions (non-chaining: both operands are
   Additive_Expression).  Every operator alternative has the same shape:
   $1 left operand, $2 operator token, $3 mid-rule action creating the
   operator node, $4 right operand.  children[0] is the left operand and
   children[1] the right operand. */
Simple_Expression : Additive_Expression TOK_GT {
        $<nodePtr>$ = new_ExprNode(GT_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Additive_Expression {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Additive_Expression TOK_LT {
        $<nodePtr>$ = new_ExprNode(LT_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Additive_Expression {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Additive_Expression TOK_GE {
        $<nodePtr>$ = new_ExprNode(GE_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Additive_Expression {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Additive_Expression TOK_LE {
        $<nodePtr>$ = new_ExprNode(LE_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Additive_Expression {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Additive_Expression TOK_EQ {
        $<nodePtr>$ = new_ExprNode(EQ_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Additive_Expression {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Additive_Expression TOK_NE {
        $<nodePtr>$ = new_ExprNode(NE_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Additive_Expression {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Additive_Expression { $$ = $1; }
    ;
/* Addition and subtraction, left-associative through left recursion.
   Symbol positions: $1 left operand, $2 operator token, $3 mid-rule action
   creating the operator node, $4 right operand (a Term).
   children[0] is the left operand, children[1] the right operand. */
Additive_Expression : Additive_Expression TOK_PLUS {
        $<nodePtr>$ = new_ExprNode(ADD_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Term {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Additive_Expression TOK_MINUS {
        $<nodePtr>$ = new_ExprNode(SUB_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Term {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Term {
        $$ = $1;
    }
    ;
/* Multiplication and division, left-associative through left recursion.
   Symbol positions: $1 left operand, $2 operator token, $3 mid-rule action
   creating the operator node, $4 right operand (a Factor).
   children[0] is the left operand, children[1] the right operand. */
Term : Term TOK_MULT {
        $<nodePtr>$ = new_ExprNode(MULT_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Factor {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Term TOK_DIV {
        $<nodePtr>$ = new_ExprNode(DIV_EXP);
        $<nodePtr>$->nLinenumber = yylineno;
        $<nodePtr>$->children[0] = $1;
    }
    Factor {
        $<nodePtr>3->children[1] = $4;
        $$ = $<nodePtr>3;
    }
    | Factor {
        $$ = $1;
    }
    ;

/* Primary expressions: parenthesised expression, variable, call, or integer
   constant.  Parentheses add no node of their own; a constant becomes a
   CONST_EXP node carrying the token's value in nValue. */
Factor : TOK_LPAREN Expression TOK_RPAREN { $$ = $2; }
    | Var { $$ = $1; }
    | Call { $$ = $1; }
    | TOK_NUM {
        $$ = new_ExprNode(CONST_EXP);
        $$->nValue = $1;
    }
    ;
/* Function call `id ( Args )`.  children[0] is the argument list (NULL when
   there are no arguments).  If the callee is not in the symbol table yet it
   is treated as a forward reference: nSymbolPtr stays NULL and a copy of the
   name is kept in fname.  Otherwise fname is NULL. */
Call : TOK_ID TOK_LPAREN Args TOK_RPAREN {
        $$ = new_ExprNode(CALL_EXP);
        $$->nSymbolPtr = symLookup($1);
        /* give fname a determinate value on every path */
        $$->fname = NULL;
        if (!$$->nSymbolPtr)
        {
            printf("function %s not found, forward reference\n", $1);
            $$->fname = strdup($1);
        }
        $$->children[0] = $3;
    }
    ;
/* Actual arguments of a call: a non-empty list, or nothing (NULL). */
Args : Args_List {$$=$1;}
| { $$ = NULL; }
;
/* Left-recursive argument list.  Each new expression is appended at the tail
   of the sibling chain so the arguments stay in source order. */
Args_List : Args_List TOK_COMMA Expression
{
/* $<nodePtr>$ is used as a scratch cursor to walk to the last node */
$<nodePtr>$ = $1;
while($<nodePtr>$->sibling)

$<nodePtr>$ =
$<nodePtr>$->sibling;
$<nodePtr>$->sibling = $3;
$3->sibling = NULL;
/* the rule's value is the head of the list, not the cursor */
$$ = $1;
}
| Expression {
$1->sibling = NULL;
$$ = $1;
}
;
%%
void yyerror (char const *s) {
fprintf (stderr, "Error at line %d: %sn", yylineno, s);
exit(1);
}
/*
 * Program entry point.
 * Passes the command line to initLex().  When built with YYLLEXER defined,
 * only the scanner is exercised: tokens are read until gettok() returns 0.
 * Otherwise the symbol table is initialised and the parser is run.
 *
 * Returns 0 in scanner-only mode, otherwise the status returned by yyparse().
 */
int main(int argc, char **argv){
    initLex(argc,argv);
#ifdef YYLLEXER
    while (gettok() !=0) ; //gettok returns 0 on EOF
    return 0;
#else
    initSymbolTable();
    return yyparse();
#endif
}

ast.c
#include "ast.h"

#include <stdlib.h>
/*
 * Creates a new expression node.
 *
 * kind: the expression kind to store in eKind.
 *
 * Returns a newly allocated node with nKind set to EXPRESSION, or NULL if
 * the allocation fails.  The node is zero-filled before the kind fields are
 * set, so the child and sibling links do not hold indeterminate values.
 * The caller owns the node.
 */
AstNodePtr new_ExprNode(ExpKind kind)
{
    AstNodePtr node = calloc(1, sizeof *node);
    if (node == NULL)
        return NULL;
    node->nKind = EXPRESSION;
    node->eKind = kind;
    return node;
}
/*
 * Creates a new statement node.
 *
 * kind: the statement kind to store in sKind.
 *
 * Returns a newly allocated node with nKind set to STMT, or NULL if the
 * allocation fails.  The node is zero-filled before the kind fields are set,
 * so the child and sibling links do not hold indeterminate values.
 * The caller owns the node.
 */
AstNodePtr new_StmtNode(StmtKind kind)
{
    AstNodePtr node = calloc(1, sizeof *node);
    if (node == NULL)
        return NULL;
    node->nKind = STMT;
    node->sKind = kind;
    return node;
}
/*
 * Creates a new node of an arbitrary node kind.
 *
 * kind: the value to store in nKind.
 *
 * Returns a newly allocated, zero-filled node with nKind set, or NULL if the
 * allocation fails.  The caller owns the node.
 */
AstNodePtr new_Node(NodeKind kind)
{
    AstNodePtr node = calloc(1, sizeof *node);
    if (node == NULL)
        return NULL;
    node->nKind = kind;
    return node;
}
/*
 * Creates a new type node for entry into the symbol table.
 *
 * kind: the type kind to store.
 *
 * Returns a newly allocated Type with `kind` set, or NULL if the allocation
 * fails.  The object is zero-filled first, so fields other than `kind` do
 * not hold indeterminate values.  The caller owns the object.
 */
Type* new_type(TypeKind kind)
{
    Type *typ1 = calloc(1, sizeof *typ1);
    if (typ1 == NULL)
        return NULL;
    typ1->kind = kind;
    return typ1;
}

AST Parser for C-like Language

Recommended

Recommended

More Related Content

Similar to AST Parser for C-like Language

Similar to AST Parser for C-like Language (20)

More from wkyra78

More from wkyra78 (20)

Recently uploaded

Recently uploaded (20)

AST Parser for C-like Language