Scanner source code

This commit is contained in:
Victor Fernandes 2017-03-06 10:05:09 -05:00
parent f3c285ec50
commit db10e77bc7
3 changed files with 558 additions and 0 deletions

372
scanner.c Executable file
View File

@ -0,0 +1,372 @@
/* Filename: scanner.c
* PURPOSE:
* SCANNER.C: Functions implementing a Lexical Analyzer (Scanner)
* as required for CST8152, Assignment #2
* scanner_init() must be called before using the scanner.
* The file is incomplete;
* Provided by: Svillen Ranev
* Version: 1.16.02
* Date: 29 September 2016
*******************************************************************
* REPLACE THIS HEADER WITH YOUR HEADER
*******************************************************************
*/
/* The #define _CRT_SECURE_NO_WARNINGS should be used in MS Visual Studio projects
* to suppress the warnings about using "unsafe" functions like fopen()
* and standard string library functions defined in string.h.
* The define does not have any effect in Borland compiler projects.
*/
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h> /* standard input / output */
#include <ctype.h> /* conversion functions */
#include <stdlib.h> /* standard library functions and constants */
#include <string.h> /* string functions */
#include <limits.h> /* integer types constants */
#include <float.h> /* floating-point types constants */
/*#define NDEBUG to suppress assert() call */
#include <assert.h> /* assert() prototype */
/* project header files */
#include "buffer.h"
#include "token.h"
#include "table.h"
#define DEBUG /* for conditional processing */
#undef DEBUG
/* Global objects - variables */
/* This buffer is used as a repository for string literals.
It is defined in platy_st.c */
extern Buffer * str_LTBL; /* string literal table; reset by scanner_init() */
int line; /* current line number of the source code; scanner_init() sets it to 1 */
extern int scerrnum; /* defined in platy_st.c - run-time error number */
/* Local (file-scope) global objects - variables */
static Buffer *lex_buf;/* pointer to the temporary lexeme buffer */
/* No other global variable declarations/definitions are allowed */
/* scanner.c static (local) function prototypes */
static int char_class(char c); /* character class function: maps a character to a transition table column */
static int get_next_state(int, char, int *); /* state machine function */
static int iskeyword(char * kw_lexeme); /* keyword lookup function */
static long atool(char * lexeme); /* converts an octal string to its integer value */
/*
 * Purpose:    Prepares the scanner for a fresh pass over the input buffer.
 *             Must be called before the scanner is used.
 * Parameters: sc_buf - the buffer holding the source text to be scanned.
 * Returns:    EXIT_FAILURE when b_isempty() reports a non-zero result for
 *             sc_buf (nothing is touched in that case); EXIT_SUCCESS otherwise.
 * Effects:    On success the mark of sc_buf is set to 0 and the buffer is
 *             retracted to it (in case it has been read previously), the
 *             string literal table str_LTBL is reset and the global line
 *             counter restarts at 1.
 * Note:       scerrnum is deliberately not cleared here; it is a global
 *             object and therefore already starts out as zero in ANSI C.
 */
int scanner_init(Buffer * sc_buf)
{
    int status = EXIT_FAILURE;

    if (!b_isempty(sc_buf)) {
        /* rewind the input: the buffer may have been consumed by an earlier pass */
        b_setmark(sc_buf, 0);
        b_retract_to_mark(sc_buf);

        /* start with an empty string literal table and the first source line */
        b_reset(str_LTBL);
        line = 1;

        status = EXIT_SUCCESS;
    }

    return status;
}
Token mlwpar_next_token(Buffer * sc_buf)
{
Token t; /* token to return after recognition */
unsigned char c; /* input symbol */
int state = 0; /* initial state of the FSM */
short lexstart; /*start offset of a lexeme in the input buffer */
short lexend; /*end offset of a lexeme in the input buffer */
int accept = NOAS; /* type of state - initially not accepting */
/*
lexstart is the offset from the beginning of the char buffer of the
input buffer (sc_buf) to the first character of the current lexeme,
which is being processed by the scanner.
lexend is the offset from the beginning of the char buffer of the
input buffer (sc_buf) to the last character of the current lexeme,
which is being processed by the scanner.
*/
DECLARE YOUR VARIABLES HERE IF NEEDED
while (1){ /* endless loop broken by token returns it will generate a warning */
GET THE NEXT SYMBOL FROM THE INPUT BUFFER
c = b_getc(sc_buf);
/* special cases or token driven processing */
WRITE YOUR CODE FOR PROCESSING THE SPECIAL CASES HERE.
COMMENTS AND STRING LITERALS ARE ALSO PROCESSED HERE.
WHAT FOLLOWS IS A PSEUDO CODE. YOU CAN USE switch STATEMENT
INSTEAD OF if-else TO PROCESS THE SPECIAL CASES
DO NOT FORGET TO COUNT THE PROGRAM LINES
IF (c == SOME CHARACTER)
...
SKIP CHARACTER (FOR EXAMPLE SPACE)
continue;
OR SET TOKEN (SET TOKEN CODE AND TOKEN ATTRIBUTE(IF AVAILABLE))
return t;
EXAMPLE:
if (c == ' ') continue;
if (c == '{'){ t.code = LBR_T; /*no attribute */ return t; }
if (c == '+'){ t.code = ART_OP_T; t.attribute.arr_op = PLUS; return t; }
...
IF (c == '.') TRY TO PROCESS .AND. or .OR.
IF SOMETHING ELSE FOLLOWS . OR THE LAST . IS MISSING
RETURN AN ERROR TOKEN
IF (c == '!') TRY TO PROCESS COMMENT
IF THE FOLLOWING CHAR IS NOT < REPORT AN ERROR
ELSE IN A LOOP SKIP CHARACTERS UNTIL line terminator is found THEN continue;
...
IF STRING (FOR EXAMPLE, "text") IS FOUND
SET MARK TO MARK THE BEGINNING OF THE STRING
IF THE STRING IS LEGAL
USING b_addc(..)COPY THE text FROM INPUT BUFFER INTO str_LTBL
ADD '\0' AT THE END TO MAKE THE STRING A C-TYPE STRING
SET STRING TOKEN
(the attribute of the string token is the offset from
the beginning of the str_LTBL char buffer to the beginning
of the string (TEXT in the example))
return t;
ELSE
THE STRING LITERAL IS ILLEGAL
SET ERROR TOKEN FOR ILLEGAL STRING (see assignment)
DO NOT STORE THE ILLEGAL STRING IN THE str_LTBL
return t;
IF (c == ANOTHER CHARACTER)
SET TOKEN
return t;
/* Process state transition table */
IF (c is a digit OR c is a letter){
SET THE MARK AT THE BEGINNING OF THE LEXEME
b_setmark(sc_buf,forward);
....
CODE YOUR FINITE STATE MACHINE HERE (FSM or DFA)
IT IMPLEMENTS THE FOLLOWING ALGORITHM:
FSM0. Begin with state = 0 and the input character c
FSM1. Get the next state from the transition table calling
state = get_next_state(state, c, &accept);
FSM2. Get the next character
FSM3. If the state is not accepting (accept == NOAS), go to step FSM1
If the state is accepting, the token is found; leave the machine and
call an accepting function as described below.
RETRACT getc_offset IF THE FINAL STATE IS A RETRACTING FINAL STATE
GET THE BEGINNING AND THE END OF THE LEXEME
lexstart = b_getmark(sc_buf);
SET lexend TO getc_offset USING AN APPROPRIATE BUFFER FUNCTION
CREATE A TEMPORARY LEXEME BUFFER HERE;
lex_buf = b_create(...);
. RETRACT getc_offset to the MARK SET PREVIOUSLY AT THE BEGINNING OF THE LEXEME AND
. USING b_getc() COPY THE LEXEME BETWEEN lexstart AND lexend FROM THE INPUT BUFFER INTO lex_buf USING b_addc(...),
. WHEN VID (KEYWORDS INCLUDED), FPL OR IL IS RECOGNIZED
. YOU MUST CALL THE ACCEPTING FUNCTION USING THE ARRAY aa_table ,WHICH
. CONTAINS POINTERS TO FUNCTIONS. THE ARRAY INDEX OF THE FUNCTION TO BE
. CALLED IS STORED IN THE VARIABLE state.
. YOU ARE NOT ALLOWED TO CALL ANY OF THE ACCEPTING FUNCTIONS BY NAME.
. THE ARGUMENT TO THE FUNCTION IS THE STRING STORED IN lex_buf.
....
b_free(lex_buf);
return t;
CHECK OTHER CHARS HERE if NEEDED, SET A TOKEN AND RETURN IT.
FOR ILLEGAL CHARACTERS SET ERROR TOKEN.
THE ILLEGAL CHAR IS THE ATTRIBUTE OF THE ERROR TOKEN
IN A CASE OF RUNTIME ERROR, THE FUNCTION MUST STORE
A NON-NEGATIVE NUMBER INTO THE GLOBAL VARIABLE scerrnum
AND RETURN AN ERROR TOKEN. THE ERROR TOKEN ATTRIBUTE MUST
BE THE STRING "RUN TIME ERROR: "
}//end while(1)
}
DO NOT MODIFY THE CODE OF THIS FUNCTION
YOU CAN REMOVE THE COMMENTS
/*
 * Purpose:    Performs one transition of the scanner's finite state machine.
 * Parameters: state  - current state (row index into st_table)
 *             c      - input character; char_class() maps it to a column
 *             accept - out parameter, receives as_table[next], i.e. the type
 *                      of the state that was reached
 * Returns:    the next state read from st_table[state][col].
 *
 * Illegal transitions:
 *   A table entry equal to IS marks a transition that must never be taken.
 *   Reaching one is reported in two complementary ways:
 *
 *   1. Conditional preprocessing: when DEBUG is defined (see the top of the
 *      file) a detailed description of the offending input is printed. Once
 *      the program is tested thoroughly, #define DEBUG is commented out or
 *      #undef DEBUG is used.
 *
 *   2. assert(): the macro expands to an if statement; if the test evaluates
 *      to false (zero), assert aborts the program (by calling abort()) and
 *      sends the following message to stderr:
 *          Assertion failed: test, file filename, line linenum
 *      Placing #define NDEBUG before #include <assert.h> removes the check.
 *
 *   The diagnostic is printed BEFORE the assertion so that it is not lost
 *   when both mechanisms are enabled. If the assertion is compiled out
 *   (NDEBUG) the function still terminates the program with exit(1) instead
 *   of indexing as_table with IS, which would read outside the array.
 */
int get_next_state(int state, char c, int *accept)
{
    int col;
    int next;

    col = char_class(c);
    next = st_table[state][col];
#ifdef DEBUG
    printf("Input symbol: %c Row: %d Column: %d Next: %d \n",c,state,col,next);
#endif

    if (next == IS) {
#ifdef DEBUG
        printf("Scanner Error: Illegal state:\n");
        printf("Input symbol: %c Row: %d Column: %d\n",c,state,col);
        fflush(stdout); /* abort() is not required to flush open streams */
#endif
        assert(next != IS); /* aborts here unless NDEBUG is defined */
        exit(1);            /* never fall through to as_table[IS] */
    }

    *accept = as_table[next];
    return next;
}
int char_class (char c)
{
int val;
THIS FUNCTION RETURNS THE COLUMN NUMBER IN THE TRANSITION
TABLE st_table FOR THE INPUT CHARACTER c.
SOME COLUMNS MAY REPRESENT A CHARACTER CLASS .
FOR EXAMPLE IF COLUMN 1 REPRESENTS [A-Z]
THE FUNCTION RETURNS 1 EVERY TIME c IS ONE
OF THE LETTERS A,B,...,Z.
return val;
}
HERE YOU WRITE THE DEFINITIONS FOR YOUR ACCEPTING FUNCTIONS.
************************************************************
ACCEPTING FUNCTION FOR THE arithmetic variable identifier AND keywords (VID - AVID/KW)
REPLACE XX WITH THE CORRESPONDING ACCEPTING STATE NUMBER
Token aa_funcXX(char lexeme[]){
WHEN CALLED THE FUNCTION MUST
1. CHECK IF THE LEXEME IS A KEYWORD.
IF YES, IT MUST RETURN A TOKEN WITH THE CORRESPONDING ATTRIBUTE
FOR THE KEYWORD. THE ATTRIBUTE CODE FOR THE KEYWORD
IS ITS INDEX IN THE KEYWORD LOOKUP TABLE (kw_table in table.h).
IF THE LEXEME IS NOT A KEYWORD, GO TO STEP 2.
2. SET AN AVID TOKEN.
IF THE lexeme IS LONGER than VID_LEN (see token.h) CHARACTERS,
ONLY FIRST VID_LEN CHARACTERS ARE STORED
INTO THE VARIABLE ATTRIBUTE ARRAY vid_lex[](see token.h) .
ADD \0 AT THE END TO MAKE A C-type STRING.
return t;
}
ACCEPTING FUNCTION FOR THE string variable identifier (VID - SVID)
REPLACE XX WITH THE CORRESPONDING ACCEPTING STATE NUMBER
Token aa_funcXX(char lexeme[]){
WHEN CALLED THE FUNCTION MUST
1. SET AN SVID TOKEN.
IF THE lexeme IS LONGER than VID_LEN characters,
ONLY FIRST VID_LEN-1 CHARACTERS ARE STORED
INTO THE VARIABLE ATTRIBUTE ARRAY vid_lex[],
AND THEN THE % CHARACTER IS APPENDED TO THE NAME.
ADD \0 AT THE END TO MAKE A C-type STRING.
return t;
}
ACCEPTING FUNCTION FOR THE floating-point literal (FPL)
Token aa_funcXX(char lexeme[]){
THE FUNCTION MUST CONVERT THE LEXEME TO A FLOATING POINT VALUE,
WHICH IS THE ATTRIBUTE FOR THE TOKEN.
THE VALUE MUST BE IN THE SAME RANGE AS the value of 4-byte float in C.
IN CASE OF ERROR (OUT OF RANGE) THE FUNCTION MUST RETURN ERROR TOKEN
THE ERROR TOKEN ATTRIBUTE IS lexeme. IF THE ERROR lexeme IS LONGER
than ERR_LEN characters, only the first ERR_LEN characters are
stored in err_lex.
return t;
}
ACCEPTING FUNCTION FOR THE integer literal(IL) - decimal constant (DIL) AND ZERO (0)
Token aa_funcXX(char lexeme[]){
THE FUNCTION MUST CONVERT THE LEXEME REPRESENTING A DECIMAL CONSTANT AND 0
TO A DECIMAL INTEGER VALUE, WHICH IS THE ATTRIBUTE FOR THE TOKEN.
THE VALUE MUST BE IN THE SAME RANGE AS the value of 2-byte integer in C.
IN CASE OF ERROR (OUT OF RANGE) THE FUNCTION MUST RETURN ERROR TOKEN
THE ERROR TOKEN ATTRIBUTE IS lexeme. IF THE ERROR lexeme IS LONGER
than ERR_LEN characters, only the first ERR_LEN characters are
stored in err_lex.
return t;
}
ACCEPTING FUNCTION FOR THE integer literal(IL) - octal constant (OIL)
Token aa_funcXX(char lexeme[]){
THE FUNCTION MUST CONVERT THE LEXEME REPRESENTING AN OCTAL CONSTANT
TO A DECIMAL INTEGER VALUE WHICH IS THE ATTRIBUTE FOR THE TOKEN.
THE VALUE MUST BE IN THE SAME RANGE AS the value of 2-byte integer in C.
THIS FUNCTION IS SIMILAR TO THE FUNCTION ABOVE AND THEY CAN BE
COMBINED INTO ONE FUNCTION
THE MAIN DIFFERENCE IS THAT THIS FUNCTION CALLS
THE FUNCTION atool(char * lexeme) WHICH CONVERTS AN ASCII STRING
REPRESENTING AN OCTAL NUMBER TO INTEGER VALUE
IN CASE OF ERROR (OUT OF RANGE) THE FUNCTION MUST RETURN ERROR TOKEN
THE ERROR TOKEN ATTRIBUTE IS lexeme. IF THE ERROR lexeme IS LONGER
than ERR_LEN characters, only the first ERR_LEN characters are
stored in err_lex.
return t;
}
ACCEPTING FUNCTION FOR THE ERROR TOKEN
Token aa_funcXX(char lexeme[]){
THE FUNCTION SETS THE ERROR TOKEN. lexeme[] CONTAINS THE ERROR
THE ATTRIBUTE OF THE ERROR TOKEN IS THE lexeme ITSELF
AND IT MUST BE STORED in err_lex. IF THE ERROR lexeme IS LONGER
than ERR_LEN characters, only the first ERR_LEN characters are
stored in err_lex.
return t;
}
CONVERSION FUNCTION
long atool(char * lexeme){
THE FUNCTION CONVERTS AN ASCII STRING
REPRESENTING AN OCTAL INTEGER CONSTANT TO INTEGER VALUE
}
HERE YOU WRITE YOUR ADDITIONAL FUNCTIONS (IF ANY).
FOR EXAMPLE
int iskeyword(char * kw_lexeme){}

112
table.h Executable file
View File

@ -0,0 +1,112 @@
/* Filename: table.h
* Transition Table and function declarations necessary for the scanner implementation
* as required for CST8152 - Assignment #2.
* Version: 1.16.02
* Date: 29 September 2016
* Provided by: Svillen Ranev
* The file is incomplete. You are to complete it.
***************************************************
* REPLACE THIS HEADER WITH YOUR HEADER
***************************************************
*/
#ifndef TABLE_H_
#define TABLE_H_
#ifndef BUFFER_H_
#include "buffer.h"
#endif
/* Make sure NULL is available for the accepting function table below.
 * <stddef.h> is the standard C header that defines the NULL pointer
 * constant; it replaces the compiler-specific <_null.h>.
 */
#ifndef NULL
#include <stddef.h> /* NULL pointer constant is defined there */
#endif
/* Source end-of-file (SEOF) sentinel symbol
* '\0' or only one of the following constants: 255, 0xFF , EOF
*/
/* Single-lexeme tokens processed separately one by one
* in the token-driven part of the scanner
* '=' , ' ' , '(' , ')' , '{' , '}' , == , <> , '>' , '<' ,
* space
* !<comment , ',' , '"' , ';' , '-' , '+' , '*' , '/', # ,
* .AND., .OR. , SEOF, 'wrong symbol',
*/
REPLACE *ESN* WITH YOUR ERROR STATE NUMBER
#define ES *ESN* /* Error state */
#define IS -1 /* Invalid state: a transition that must never be taken (get_next_state() asserts next != IS) */
/* State transition table definition */
REPLACE *CN* WITH YOUR COLUMN NUMBER
#define TABLE_COLUMNS *CN*
/*transition table - type of states defined in separate table */
int st_table[ ][TABLE_COLUMNS] = {
/* State 0 */ {YOUR INITIALIZATION},
/* State 1 */ {YOUR INITIALIZATION},
.
. YOUR TABLE INITIALIZATION HERE
.
/* State N */ {YOUR INITIALIZATION}
};
/* Accepting state table definition */
REPLACE *N1*, *N2*, and *N3* WITH YOUR NUMBERS
#define ASWR *N1* /* accepting state with retract */
#define ASNR *N2* /* accepting state with no retract */
#define NOAS *N3* /* not accepting state */
int as_table[ ] = {YOUR INITIALIZATION HERE - USE ASWR, ASNR, NOAS };
/* Accepting action function declarations */
FOR EACH OF YOUR ACCEPTING STATES YOU MUST PROVIDE
ONE FUNCTION PROTOTYPE. THEY ALL RETURN Token AND TAKE
ONE ARGUMENT: A string REPRESENTING A TOKEN LEXEME.
Token aa_funcXX(char *lexeme);
Replace XX with the number of the accepting state: 02, 03 and so on.
/* defining a new type: pointer to function (of one char * argument)
returning Token
*/
typedef Token (*PTR_AAF)(char *lexeme);
/* Accepting function (action) callback table (array) definition */
/* If you do not want to use the typedef, the equivalent declaration is:
* Token (*aa_table[])(char lexeme[]) = {
*/
PTR_AAF aa_table[ ] ={
HERE YOU MUST PROVIDE AN INITIALIZATION FOR AN ARRAY OF POINTERS
TO ACCEPTING FUNCTIONS. THE ARRAY HAS THE SAME SIZE AS as_table[ ].
YOU MUST INITIALIZE THE ARRAY ELEMENTS WITH THE CORRESPONDING
ACCEPTING FUNCTIONS (FOR THE STATES MARKED AS ACCEPTING IN as_table[]).
THE REST OF THE ELEMENTS MUST BE SET TO NULL.
};
/* Keyword lookup table (.AND. and .OR. are not keywords).
 * The position of a keyword in this table is the attribute code stored
 * in the keyword token, so the order of the entries must not change.
 */
#define KWT_SIZE 8
char *kw_table[KWT_SIZE] = {
    /* 0 */ "ELSE",
    /* 1 */ "IF",
    /* 2 */ "INPUT",
    /* 3 */ "OUTPUT",
    /* 4 */ "PLATYPUS",
    /* 5 */ "REPEAT",
    /* 6 */ "THEN",
    /* 7 */ "USING"
};
#endif

74
token.h Executable file
View File

@ -0,0 +1,74 @@
/* Filename: token.h
* Token declarations necessary for the scanner implementation
* CST8152, Assignment #2
* Version: 1.16.02
* Date: 29 September 2016
* Provided by: Svillen Ranev
* The file is complete and MUST NOT be modified.
*/
#ifndef TOKEN_H_
#define TOKEN_H_
/*#pragma warning(1:4001) *//*to enforce C89 type comments - to make //comments a warning */
/*#pragma warning(error:4001)*//* to enforce C89 comments - to make // comments an error */
/* Constants */
#define VID_LEN 8 /* variable identifier length (vid_lex holds VID_LEN+1 chars) */
#define ERR_LEN 20 /* error message length (err_lex holds ERR_LEN+1 chars) */
#define INL_LEN 5 /* maximum number of digits for IL */
/* Token codes - the values stored in the code member of a Token */
#define ERR_T 0 /* Error token */
#define SEOF_T 1 /* Source end-of-file token */
#define AVID_T 2 /* Arithmetic Variable identifier token */
#define SVID_T 3 /* String Variable identifier token */
#define FPL_T 4 /* Floating point literal token */
#define INL_T 5 /* Integer literal token */
#define STR_T 6 /* String literal token */
#define SCC_OP_T 7 /* String concatenation operator token */
#define ASS_OP_T 8 /* Assignment operator token */
#define ART_OP_T 9 /* Arithmetic operator token */
#define REL_OP_T 10 /* Relational operator token */
#define LOG_OP_T 11 /* Logical operator token */
#define LPR_T 12 /* Left parenthesis token */
#define RPR_T 13 /* Right parenthesis token */
#define LBR_T 14 /* Left brace token */
#define RBR_T 15 /* Right brace token */
#define KW_T 16 /* Keyword token */
#define COM_T 17 /* Comma token */
#define EOS_T 18 /* End of statement (semicolon) token */
/* Operators token attributes.
 * Each enumeration lists the attribute codes carried by one kind of
 * operator token; the numeric values follow the declaration order.
 */

/* attribute codes of an arithmetic operator token */
typedef enum ArithmeticOperators {
    PLUS = 0,
    MINUS = 1,
    MULT = 2,
    DIV = 3
} Arr_Op;

/* attribute codes of a relational operator token */
typedef enum RelationalOperators {
    EQ = 0,
    NE = 1,
    GT = 2,
    LT = 3
} Rel_Op;

/* attribute codes of a logical operator token */
typedef enum LogicalOperators {
    AND = 0,
    OR = 1
} Log_Op;
/* Structure declaring the token and its attributes */
/* TokenAttribute (TA): the attribute value of a token. It is a union, so
 * all members share the same storage and only one of them is meaningful
 * for a given token.
 */
typedef union TokenAttribute{
int get_int; /* integer attributes accessor */
Arr_Op arr_op; /* arithmetic operator attribute code */
Rel_Op rel_op; /* relational operator attribute code */
Log_Op log_op; /* logical operator attribute code */
int int_value; /* integer literal attribute (value) */
int kwt_idx; /* keyword index in the keyword table */
short str_offset; /* string literal offset from the beginning of */
/* the string literal buffer (str_LTBL->cb_head) */
float flt_value; /* floating-point literal attribute (value) */
char vid_lex[VID_LEN+1]; /* variable identifier token attribute (VID_LEN chars + '\0') */
char err_lex[ERR_LEN+1]; /* error token attribute (ERR_LEN chars + '\0') */
} TA;
/* Token: the unit produced by the scanner - a token code paired with
 * the attribute value that belongs to it.
 */
typedef struct Token
{
int code; /* token code (one of the *_T constants) */
TA attribute; /* token attribute */
} Token;
#endif