/* Filename: scanner.c PURPOSE: * SCANNER.C: Functions implementing a Lexical Analyzer (Scanner) * as required for CST8152, Assignment #2 * scanner_init() must be called before using the scanner. * The file is incomplete; * Author: Victor Fernandes, 040772243 * Provided by: Svillen Ranev * Version: 1.17.1 * Date: 30 January 2017 * Function list: scanner_init, malar_next_token, get_next_state, char_class, aa_func02, aa_func03, aa_func05, aa_func08, aa_func10, aa_func12 aa_func13, atool, iskeyword */ /* The #define _CRT_SECURE_NO_WARNINGS should be used in MS Visual Studio projects * to suppress the warnings about using "unsafe" functions like fopen() * and standard sting library functions defined in string.h. * The define does not have any effect in Borland compiler projects. */ #define _CRT_SECURE_NO_WARNINGS #include <stdio.h> /* standard input / output */ #include <ctype.h> /* conversion functions */ #include <stdlib.h> /* standard library functions and constants */ #include <string.h> /* string functions */ #include <limits.h> /* integer types constants */ #include <float.h> /* floating-point types constants */ /*#define NDEBUG to suppress assert() call */ #include <assert.h> /* assert() prototype */ /* project header files */ #include "buffer.h" #include "token.h" #include "stable.h" #include "table.h" #define DEBUG /* for conditional processing */ #undef DEBUG /* Global objects - variables */ /* This buffer is used as a repository for string literals. It is defined in platy_st.c */ extern Buffer * str_LTBL; /*String literal table */ int line; /* current line number of the source code */ extern int scerrnum; /* defined in platy_st.c - run-time error number */ extern STD sym_table; /* symbol table */ /* Local(file) global objects - variables */ static Buffer *lex_buf; /*pointer to temporary lexeme buffer*/ /* No other global variable declarations/definitiond are allowed */ /* scanner.c static(local) function prototypes */ static int char_class(char c); /* character class function */ static int get_next_state(int, char, int *); /* state machine function */ static int iskeyword(char * kw_lexeme); /*keywords lookup functuion */ static long atool(char * lexeme); /* converts octal string to decimal value */ /* Prepares the Scanner to read the source code buffer * Author: Svillen Ranev * Called functions: b_isempty, b_setmark, b_retract_to_mark, b_reset * Parameters: * - pBuffer sc_buf * Return values: * - 1 (failure), 0 (success) */ int scanner_init(Buffer * sc_buf) { if (b_isempty(sc_buf)) return EXIT_FAILURE;/*1*/ /* in case the buffer has been read previously */ b_setmark(sc_buf, 0); b_retract_to_mark(sc_buf); b_reset(str_LTBL); line = 1; return EXIT_SUCCESS;/*0*/ /* scerrnum = 0; *//*no need - global ANSI C */ } /* Reads the source code buffer and generates a token * Author: Victor Fernandes * Version: 0.0.1 * Called functions: aa_table[], b_getc, b_setmark, b_getcoffset, b_retract_to_mark, b_retract, b_mark, b_eob, b_create, b_addc, b_free, isalpha, isalnum, get_next_state * Parameters: - pBuffer sc_buf * Return values: Token * Algorithm: Read a character from the source buffer, one by one, and match string patterns to tokens. If an illegal sequence is found while starting a pattern off of the first matching character, it returns a token with an error code with the infringing character. If the scanner matches a valid pattern it returns a Token with the appropriate code. */ Token malar_next_token(Buffer * sc_buf) { Token t; /* token to return after recognition */ unsigned char c; /* input symbol */ int state = 0; /* initial state of the FSM */ short lexstart; /* start offset of a lexeme in the input buffer */ short lexend; /* end offset of a lexeme in the input buffer */ int accept = NOAS; /* type of state - initially not accepting */ /* Counter for loops in string error case */ int i; /*String offset for the str_LTBL*/ static short str_offset = 0; /* temporary buffer used to store an erroneous string literal*/ pBuffer err_lex_buf; if (sc_buf == NULL) { scerrnum = 1; return aa_table[ES]("RUN TIME ERROR: "); /* WHOOPS */ } while (1) { /* endless loop broken by token returns; it will generate a warning */ /* Get symbol from buffer */ c = b_getc(sc_buf); switch (c) { case 255: t.code = SEOF_T; return t; /* EOF */ case '\0': t.code = SEOF_T; return t; /* Source EOF */ case '\n': line++; continue; /* Ignore new line, increment line count */ case '\r': line++; continue; /* CR, increment line count*/ case ' ': continue; /* Ignore white space */ case '\t': continue; /* Ignore tabs */ case ';': t.code = EOS_T; return t; /* End of statement */ case ',': t.code = COM_T; return t; /* Comma */ case '{': t.code = LBR_T; return t; /* Left brace */ case '}': t.code = RBR_T; return t; /* Right brace */ case '(': t.code = LPR_T; return t; /* Left parenthesis */ case ')': t.code = RPR_T; return t; /* Right parenthesis */ case '+': t.code = ART_OP_T; t.attribute.arr_op = PLUS; return t; /* Addition operator */ case '-': t.code = ART_OP_T; t.attribute.arr_op = MINUS; return t; /* Substraction operator */ case '*': t.code = ART_OP_T; t.attribute.arr_op = MULT; return t; /* Multiplication operator */ case '/': t.code = ART_OP_T; t.attribute.arr_op = DIV; return t; /* Devision operator */ case '>': t.code = REL_OP_T; t.attribute.rel_op = GT; return t; /* Greater-than relational operator */ case '<': c = b_getc(sc_buf); if (c == '>') { t.code = REL_OP_T; t.attribute.rel_op = NE; /* Negation operator */ return t; } else if (c == '<') { t.code = SCC_OP_T; /* String concatenation operator */ return t; } else { t.code = REL_OP_T; t.attribute.rel_op = LT; /* Less-than operator */ b_retract(sc_buf); return t; } case '.': b_setmark(sc_buf, b_getcoffset(sc_buf)); /* Set mark before continuing (AND|OR case) */ c = b_getc(sc_buf); if (c == 'A' && b_getc(sc_buf) == 'N' && b_getc(sc_buf) == 'D' && b_getc(sc_buf) == '.') { t.code = LOG_OP_T; t.attribute.log_op = AND; return t; } else if (c == 'O' && b_getc(sc_buf) == 'R' && b_getc(sc_buf) == '.') { t.code = LOG_OP_T; t.attribute.log_op = OR; return t; } t.code = ERR_T; /* "That character's not supposed to be here" case */ t.attribute.err_lex[0] = '.'; t.attribute.err_lex[1] = '\0'; b_retract_to_mark(sc_buf); return t; case '!': c = b_getc(sc_buf); if (c == '<') { /* It's a comment line */ /* Consume chars until line ends */ for (; c != '\0' && c != '\r' && c != '\n' && c != 255; c = b_getc(sc_buf)); ++line; continue; } else { /* Bad character, pump out an error token */ t.code = ERR_T; b_retract(sc_buf); b_retract(sc_buf); /* Retract twice to re-read '!' */ t.attribute.err_lex[0] = c = b_getc(sc_buf); t.attribute.err_lex[1] = c = b_getc(sc_buf); t.attribute.err_lex[2] = '\0'; /* Consume the rest of the caracters to ignore the line*/ for (; c != '\0' && c != '\r' && c != '\n' && c != 255; c = b_getc(sc_buf)); ++line; return t; } case '=': c = b_getc(sc_buf); if (c == '=') { /* Relational equals-to operator */ t.code = REL_OP_T; t.attribute.rel_op = EQ; return t; } b_retract(sc_buf); t.code = ASS_OP_T; /* Assignment operator */ return t; case '\"': /* Don't quote me on this */ /* Track the beginning of string */ b_setmark(sc_buf, b_getcoffset(sc_buf)); lexstart = b_mark(sc_buf); lexend = lexstart; c = b_getc(sc_buf); /* Step through the string literal and track progress */ for (; c != '\"'; c = b_getc(sc_buf), ++lexend) { if (c == '\n' || c == '\r') ++line; if (c == '\0' || c == 255) { /* Illegal string, make it an error token */ b_retract_to_mark(sc_buf); b_retract(sc_buf); /* Retract one more time to re-read '"' into err_lex */ t.code = ERR_T; err_lex_buf = b_create(100, 10, 'a'); /* Start up temporary buffer */ c = b_getc(sc_buf); for (i = 0; i < (lexend - lexstart); c = b_getc(sc_buf), ++i) { /* Continue until the end of the lexeme where error was found */ if (i < (ERR_LEN) || c != 255 || c != '\0') b_addc(err_lex_buf, c); } /* Pass the complete erroneous string to error state accepting function*/ t = aa_table[ES](b_setmark(err_lex_buf, 0)); b_free(err_lex_buf); /* Clean up the temporary buffer */ return t; } } /* end for loop, string finished and considered valid */ b_retract_to_mark(sc_buf); /* Copy the matched string literal to str_LTBL */ t.attribute.str_offset = str_offset; c = b_getc(sc_buf); for (; lexstart < lexend; c = b_getc(sc_buf), ++lexstart, ++str_offset) { b_addc(str_LTBL, c); } b_addc(str_LTBL, '\0'); ++str_offset; t.code = STR_T; return t; default: if (isalpha(c) || isalnum(c)) { /*Set mark to beginning of lexeme*/ b_retract(sc_buf); b_setmark(sc_buf, b_getcoffset(sc_buf)); lexstart = b_mark(sc_buf); lexend = lexstart; state = 0; while (accept == NOAS) { state = get_next_state(state, b_getc(sc_buf), &accept); if (accept != NOAS) { break; } } /* * Entering Accepting State */ if (as_table[state] == ASWR) { b_retract(sc_buf); } /* Get end of lexeme */ lexend = b_getcoffset(sc_buf); b_retract_to_mark(sc_buf); lex_buf = b_create(20, 8, 'a'); /* Copy the scanned lexeme into lexical buffer */ for (; lexstart < lexend; ++lexstart) { b_addc(lex_buf, b_getc(sc_buf)); } b_addc(lex_buf, '\0'); if (aa_table[state] != NULL) { t = aa_table[state](b_setmark(lex_buf, 0)); } else { scerrnum = 1; t = aa_table[ES]("RUN TIME ERROR: "); return t; } b_free(lex_buf); } /* Invalid character */ else { t.code = ERR_T; t.attribute.err_lex[0] = c; t.attribute.err_lex[1] = '\0'; } return t; } } /*end while(1)*/ } /* Looks up the transition table for the next state given the input character Author: Victor Fernandes Version: 0.0.1 Called functions: char_class, assert, printf, as_table Parameters: - int state: the starting point for the transition table lookup - char c: the input character for table lookup - int *accept: pointer to the accepting state of the scanner Return values: int (the next state value of the scanner) */ int get_next_state(int state, char c, int *accept) { int col; int next; col = char_class(c); next = st_table[state][col]; #ifdef DEBUG printf("Input symbol: %c Row: %d Column: %d Next: %d \n", c, state, col, next); #endif assert(next != IS); #ifdef DEBUG if (next == IS) { printf("Scanner Error: Illegal state:\n"); printf("Input symbol: %c Row: %d Column: %d\n", c, state, col); exit(1); } #endif *accept = as_table[next]; return next; } /* Matches the column value in the transition table to the given input character Author: Victor Fernandes Version: 0.0.1 Called functions: N/A Parameters: - char c: the input character to be matched in the transition table Return values: int (the value representing the column in the transition table) */ int char_class(char c) { int val; if (isalpha(c)) val = 0; else if (c == '0') val = 1; else if (c > '0' && c < '8') val = 2; else if (c == '8' || c == '9') val = 3; else if (c == '.') val = 4; else if (c == '#') val = 5; else val = 6; return val; } /* Generates a token for an arithmetic variable identifer or keyword Author: Victor Fernandes Version: 0.0.1 Called functions: iskeyword, calloc, aa_table[], strlen, strncpy, free Parameters: - char* lexeme: the string pattern matched by the FA Return values: Token */ Token aa_func02(char lexeme[]) { unsigned kw_idx, offset; /* Variable to contain keyword table index */ Token t; char v_type; /*char* temp_str;*/ #ifdef DEBUG printf("Lexeme: '%s'\n", lexeme); #endif kw_idx = iskeyword(lexeme); if (kw_idx != -1) { /* Keyword check */ t.code = KW_T; t.attribute.kwt_idx = kw_idx; return t; } /* Not a keyword? Must be AVID*/ t.code = AVID_T; /*if ((temp_str = (char*)calloc(VID_LEN + 1, sizeof(char))) == NULL) { return aa_table[ES]("RUN TIME ERROR: "); } strncpy(temp_str, lexeme, VID_LEN);*/ switch (lexeme[0]) { /* Read first character of lexeme for implicit type (not used yet?)*/ case 'i': case 'o': case 'd': case 'n': /* Integer */ v_type = 'I'; break; default: /* Floating point*/ v_type = 'F'; break; } if ((offset = st_install(sym_table, lexeme, v_type, line)) == -1){ printf("Error: Install failed - Symbol Table is full.\n"); st_store(sym_table); free(lexeme); exit(1); } t.attribute.vid_offset = offset; /*free(temp_str);*/ return t; } /* Generates a token for an string variable identifer Author: Victor Fernandes Version: 0.0.1 Called functions: calloc, aa_table[], strlen, strncpy, free Parameters: - char* lexeme: the string pattern matched by the FA Return values: Token */ Token aa_func03(char lexeme[]) { Token t; unsigned offset;/* char* temp_str; if ((temp_str = (char*)calloc(VID_LEN + 2, sizeof(char))) == NULL) { return aa_table[ES]("RUN TIME ERROR: "); }*/ /*strncpy(temp_str, lexeme, VID_LEN);*/ lexeme[strlen(lexeme) - 1] = '#'; /* Add # to end of the SVID */ if ((offset = st_install(sym_table, lexeme, 'S', line)) == -1){ printf("Error: Install failed - Symbol Table is full.\n"); st_store(sym_table); free(lexeme); exit(1); } t.code = SVID_T; t.attribute.vid_offset = offset; /*free(temp_str);*/ return t; } /* Generates a token for a decimal integer literal constant (DIL) Author: Victor Fernandes Version: 0.0.1 Called functions: atol, aa_table[] Parameters: - char* lexeme: the string pattern matched by the FA Return values: Token */ Token aa_func05(char lexeme[]) { Token t; long temp_num; temp_num = atol(lexeme); /* MOVE TO DEFINE */ if (temp_num > 32767 || temp_num < 0) { /* Overflow error */ t = aa_table[ES](lexeme); return t; } t.code = INL_T; t.attribute.int_value = (int)temp_num; return t; } /* Generates a token for a floating-point literal Author: Victor Fernandes Version: 0.0.1 Called functions: strtof, aa_table[] Parameters: - char* lexeme: the string pattern matched by the FA Return values: Token */ Token aa_func08(char lexeme[]) { Token t; double temp_dbl = 0.0; temp_dbl = atof(lexeme); if ((temp_dbl > FLT_MAX) || ((temp_dbl != 0.0) && (temp_dbl < FLT_MIN))) { /* Overflow error */ return aa_table[ES](lexeme); } t.code = FPL_T; t.attribute.flt_value = (float)temp_dbl; return t; } /* Generates a token for an octal integer literal Author: Victor Fernandes Version: 0.0.1 Called functions: strlen, aa_table[], atool Parameters: - char* lexeme: the string pattern matched by the FA Return values: Token */ Token aa_func10(char lexeme[]) { Token t; long new_olval; if (strlen(lexeme) > INL_LEN + 1) { t = aa_table[ES](lexeme); } t.code = INL_T; new_olval = atool(lexeme); if (new_olval < 0 || new_olval > PLT_SHRT_MAX) { t = aa_table[ES](lexeme); return t; } t.code = INL_T; t.attribute.int_value = (int)new_olval; return t; } /* Generates a token for a general error token Author: Victor Fernandes Version: 0.0.1 Called functions: aa_table[] Parameters: - char* lexeme: the string pattern matched by the FA Return values: Token */ Token aa_func12(char lexeme[]) { /* This function does the same as aa_func13, except that it is marked as non-retracting in the accepting function state, but the token is generated exactly the same way */ return aa_table[ESWR](lexeme); } /* Generates a token for a general error token Author: Victor Fernandes Version: 0.0.1 Called functions: strlen, aa_table[] Parameters: - char* lexeme: the string pattern matched by the FA Return values: Token */ Token aa_func13(char lexeme[]) { Token t; unsigned int i = strlen(lexeme); t.code = ERR_T; /* for (i = 0; i < (ERR_LEN) && i < strlen(lexeme); i++) t.attribute.err_lex[i] = lexeme[i]; */ if (strlen(lexeme) > ERR_LEN) { t.attribute.err_lex[i - 1] = '.'; t.attribute.err_lex[i - 2] = '.'; t.attribute.err_lex[i - 3] = '.'; } t.attribute.err_lex[i] = '\0'; return t; } /* Returns an octal representation of a string Author: Victor Fernandes Version: 0.0.1 Called functions: N/A Parameters: - char* lexeme: the string pattern to convert Return values: long (integer representation of the octal string) */ long atool(char * lexeme) { int i, x = 1; long result = 0; for (i = strlen(lexeme); i > 0; i--, x *= 8) result += x*(lexeme[i - 1] - '0'); return result; } /* Looks up the string pattern on the keyword table Author: Victor Fernandes Version: 0.0.1 Called functions: N/A Parameters: - char* lexeme: the string pattern to look up in kw_table Return values: int -1 (could not find a match), int [1 - KW_SIZE] index location of the matching keyword */ int iskeyword(char * kw_lexeme) { int i; if (kw_lexeme == NULL) return -1; for (i = 0; i < KWT_SIZE; i++) { if (strcmp(kw_table[i], kw_lexeme) == 0) { return i; } } return -1; }