From 9ba1bb3d2007b37e3392208ef027f88d78e51c3e Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Fri, 13 Aug 2021 11:36:31 +0200 Subject: cc: some work on the scanner, expression parser --- miniany/README | 6 +- miniany/REQUIREMENTS | 12 ++ miniany/cc.c | 280 +++++++++++++++++++++++++++++++++++++++++--- miniany/libc-freestanding.c | 51 +++++++- miniany/libc-hosted.c | 2 + old/c.ebnf | 229 ++++++++++++++++++++++++++++++++++++ old/minic.ebnf | 9 ++ 7 files changed, 567 insertions(+), 22 deletions(-) create mode 100644 old/c.ebnf diff --git a/miniany/README b/miniany/README index fcd9e06..851db39 100644 --- a/miniany/README +++ b/miniany/README @@ -34,11 +34,13 @@ tcc -O0 -g -o c4 c4.c inspiration for what makes up a minimal C language * tiny.c - * http://www.iro.umontreal.ca/~felipe/IFT2030-Automne2002/Complements/tinyc.c + * http://www.iro.umontreal.ca/~felipe/IFT2030-Automne2002/Complements/tinyc.c, + Marc Feeley, really easy and much more readable, meant as educational compiler * c.c in swieros: https://github.com/rswier/swieros.git * documentation * "Compiler Construction", Niklaus Wirth * https://github.com/DoctorWkt/acwj - + * https://www.engr.mun.ca/~theo/Misc/exp_parsing.htm#climbing, + https://en.wikipedia.org/wiki/Operator-precedence_parser#Precedence_climbing_method diff --git a/miniany/REQUIREMENTS b/miniany/REQUIREMENTS index 474e975..cd6ffcf 100644 --- a/miniany/REQUIREMENTS +++ b/miniany/REQUIREMENTS @@ -29,3 +29,15 @@ not implementing: - linker - have compilation units needs a linker do build an executable +- symname[t] printing the symbol and not the number, + requires static initializers for array of char* +- ASTs are basically only useful when you start to optimize, + till then you can use an intermediate format (as C4) does + and a stack machine. They also make the code easier readable. + For use they fore the introduction of pointers, references and structs. + In expression parsing we see, that const folding already needs + an AST, because we should not emit code when still reading + a constant expression. It also seperates syntactical stuff like '[' + from logical stuff like 'declaration of array size' and 'derefencing + a pointer'. + diff --git a/miniany/cc.c b/miniany/cc.c index 1637627..ebdebdb 100644 --- a/miniany/cc.c +++ b/miniany/cc.c @@ -1,45 +1,297 @@ int col; int row; +int pushback; + +int token; + +int DEBUG_SCANNER; + +enum { + MAX_IDENT_LEN = 20, + TEST = -1 +}; + +void pushBack( int c ) +{ + pushback = c; +} + int getChar( ) { int c; + if( pushback ) { + c = pushback; + pushback = 0; + return c; + } + c = getchar( ); if( c == EOF ) { return c; } col++; if( c == '\n' ) { - col = 1; + col = 0; row++; } return c; } -int main( int argc, char **argv ) +int skipWhite( ) { int c; - col = 1; - row = 1; + c = getChar( ); + while( isspace( c ) ) { + c = getChar( ); + } + + return c; +} - putstring( "Hello CC" ); - putnl( ); +enum { + S_PLUS = 1, + S_MINUS, + S_STAR, + S_SLASH, + S_SEMICOLON, + S_EQUALS, + S_INT = 10, + S_IDENT, + S_NUM = 20, + S_ERR = 30, + S_EOI = 31 +}; - c = getChar( ); +void printErrorHeader( ) +{ + putstring( "Error line " ); putint( row ); + putstring( ", pos " ); + putint( col ); putstring( ": " ); - while( c != EOF ) { - if( c == '\n' ) { - putchar( '$' ); - putchar( c ); - putint( row ); - putstring( ": " ); +} + +int num; + +void scanNumber( int c ) +{ + num = c - '0'; + c = getChar( ); + while( isdigit( c ) ) { + num = 10 * num + ( c - '0' ); + c = getChar( ); + } + pushBack( c ); +} + +/* c4: no data segment allocation in char array decleration */ +char *ident; +/*char ident[20]; + char ident[MAX_IDENT_LEN]; +*/ + +void scanIdent( int c ) +{ + int n; + + n = 0; + while( isalnum( c ) || ( c == '_' ) ) { + ident[n] = c; + n++; + if( n >= MAX_IDENT_LEN - 1 ) { + printErrorHeader( ); + putstring( "too long identifier" ); + putnl( ); + exit( EXIT_FAILURE ); + } + c = getChar( ); + } + ident[n] = 0; /* c4 doesn't handle '\0' */ + pushBack( c ); +} + +int keyword( char *ident ) +{ + if( *ident == 'i' ) { + if( strcmp( ident, "int" ) == 0 ) { + return S_INT; } else { - putchar( c ); + return 0; } + } + + return 0; +} + +int getToken( ) +{ + int t; + int c; + + c = skipWhite( ); + + if( c == EOF ) { + t = S_EOI; + } else if( c == '+' ) { + t = S_PLUS; + } else if( c == '-' ) { + t = S_MINUS; + } else if( c == '*' ) { + t = S_STAR; + } else if ( c == '/' ) { c = getChar( ); + if( c == '/' ) { + while( c != '\n' ) { + c = getChar( ); + } + t = getToken( ); + } else if( c == '*' ) { + do { + while( c != '*' ) { + c = getChar( ); + } + c = getChar( ); + } while( c != '/' ); + c = getChar( ); + t = getToken( ); + } else { + pushBack( c ); + t = S_SLASH; + } + } else if( c == ';' ) { + t = S_SEMICOLON; + } else if( c == '=' ) { + t = S_EQUALS; + } else if( isdigit( c ) ) { + scanNumber( c ); + t = S_NUM; + } else if( c >= 'a' && c <= 'z' ) { + scanIdent( c ); + if( ( t = keyword( ident ) ) ) { + } else { + t = S_IDENT; + } + } else { + t = S_ERR; + printErrorHeader( ); + putstring( "unknown token '" ); + putchar( c ); + putstring( "'" ); + putnl( ); + exit( EXIT_FAILURE ); + } + + if( DEBUG_SCANNER ) { + putint( row ); + putchar( '/' ); + putint( col ); + putstring( ": " ); + putint( t ); + if( t == S_NUM ) { + putchar( '(' ); + putint( num ); + putchar( ')' ); + } + putnl( ); + } + + return t; +} + +void expect( int must, char *what ) +{ + if( token == must ) { + token = getToken( ); + } else { + printErrorHeader( ); + putstring( what ); + putstring( " expected" ); + putnl( ); + exit( EXIT_FAILURE ); + } +} + +void parseExpression( ) +{ + if( token == S_EOI ) { + printErrorHeader( ); + putstring( "unexpected eof in expression" ); + putnl( ); + exit( EXIT_FAILURE ); + } + + if( token == S_NUM ) { + putstring( "immediate int " ); + putint( num ); + putnl( ); + } + + token = getToken( ); + if( token == S_PLUS ) { + token = getToken( ); + parseExpression( ); + } else if( token == S_MINUS ) { + token = getToken( ); + parseExpression( ); + } else if( token == S_STAR ) { + token = getToken( ); + parseExpression( ); + } else if( token == S_SLASH ) { + token = getToken( ); + parseExpression( ); + } else if( token == S_EOI || token == S_SEMICOLON ) { + return; + } else { + printErrorHeader( ); + putstring( "unexpected token '" ); + putint( token ); + putstring( "' in expression" ); + putnl( ); + exit( EXIT_FAILURE ); + } +} + +void parseDeclaration( ) +{ + expect( S_INT, "int" ); + expect( S_IDENT, "identifier" ); + putstring( "Adding glob: " ); putstring( ident ); putnl( ); + expect( S_SEMICOLON, ";" ); +} + +void parseAssignment( ) +{ + token = getToken( ); + expect( S_EQUALS, "=" ); + parseExpression( ); + expect( S_SEMICOLON, ";" ); +} + +void parseStatement( ) +{ + if( token == S_INT ) { + parseDeclaration( ); + } else if( token == S_IDENT ) { + parseAssignment( ); + } else if( token == S_EOI ) { + return; + } +} + +int main( int argc, char **argv ) +{ + col = 0; + row = 1; + pushback = 0; + DEBUG_SCANNER = 1; + ident = "12345678901234567890"; + + token = getToken( ); + while( token != S_EOI ) { + parseStatement( ); } exit( EXIT_SUCCESS ); diff --git a/miniany/libc-freestanding.c b/miniany/libc-freestanding.c index 8b1a378..ffed093 100644 --- a/miniany/libc-freestanding.c +++ b/miniany/libc-freestanding.c @@ -20,6 +20,40 @@ int strlen( char *s ) return p - s; } +int strcmp( char *s1, char *s2 ) +{ + while( ( *s1 != '\0' ) && ( *s2 != '\0' ) && *s1 == *s2 ) { + s1++; + s2++; + } + + return *s1 - *s2; +} + +int isspace( int c ) +{ + if( c == ' ' || c == '\r' || c == '\n' || c == '\t' ) return 1; + return 0; +} + +int isdigit( int c ) +{ + if( c >= '0' && c <= '9' ) return 1; + return 0; +} + +int isalpha( int c ) +{ + if( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) ) return 1; + return 0; +} + +int isalnum( int c ) +{ + if( isalpha( c ) || isdigit( c ) ) return 1; + return 0; +} + enum { EXIT_SUCCESS = 0, EXIT_FAILURE = 1 @@ -31,9 +65,18 @@ enum { SYSCALL_WRITE = 4 }; +/* EOF = -1 is a const expression for c4 */ +/* EOF = 0xFFFFFFFF clang doesn't like this */ +/* EOF = 0xFFFFFFFF gcc overflow in conversion from 'enum ' to 'int' changes value from '4294967295' to '-1' */ +/* EOF = 256, this is not standard, but doesn't really matter in freestanding mode, + * code should really not relly on the fact what the value of EOF is (-1 usually) */ +enum { + EOF = 256 +}; + int errno; -static __attribute__((noinline)) int syscall1( int id, int arg0 ) +int syscall1( int id, int arg0 ) { int retval; @@ -54,7 +97,7 @@ static __attribute__((noinline)) int syscall1( int id, int arg0 ) return retval; } -static __attribute__((noinline)) int syscall3( int id, int arg0, int arg1, int arg2 ) +int syscall3( int id, int arg0, int arg1, int arg2 ) { int retval; @@ -109,10 +152,6 @@ static int read_string( int fd, char *buf, int size ) return syscall3( SYSCALL_READ, fd, (int)buf, size ); } -enum { - EOF = -1 -}; - int puts( char *s ) { print_string( s ); diff --git a/miniany/libc-hosted.c b/miniany/libc-hosted.c index 4c663bf..4154aa1 100644 --- a/miniany/libc-hosted.c +++ b/miniany/libc-hosted.c @@ -5,6 +5,8 @@ #include #include +#include +#include int putstring( char *s ) { diff --git a/old/c.ebnf b/old/c.ebnf new file mode 100644 index 0000000..9a3c0ae --- /dev/null +++ b/old/c.ebnf @@ -0,0 +1,229 @@ + ::= {}* + + ::= + | + + ::= {}* {}* + + ::= + | + | + + ::= auto + | register + | static + | extern + | typedef + + ::= void + | char + | short + | int + | long + | float + | double + | signed + | unsigned + | + | + | + + ::= { {}+ } + | { {}+ } + | + + ::= struct + | union + + ::= {}* + + ::= + | + + ::= + | , + + ::= + | : + | : + + ::= {}? + + ::= * {}* {}? + + ::= const + | volatile + + ::= + | ( ) + | [ {}? ] + | ( ) + | ( {}* ) + + ::= + + ::= + | ? : + + ::= + | || + + ::= + | && + + ::= + | | + + ::= + | ^ + + ::= + | & + + ::= + | == + | != + + ::= + | < + | > + | <= + | >= + + ::= + | << + | >> + + ::= + | + + | - + + ::= + | * + | / + | % + + ::= + | ( ) + + ::= + | ++ + | -- + | + | sizeof + | sizeof + + ::= + | [ ] + | ( {}* ) + | . + | -> + | ++ + | -- + + ::= + | + | + | ( ) + + ::= + | + | + | + + ::= + | , + + ::= + | + + ::= = + | *= + | /= + | %= + | += + | -= + | <<= + | >>= + | &= + | ^= + | |= + + ::= & + | * + | + + | - + | ~ + | ! + + ::= {}+ {}? + + ::= + | , ... + + ::= + | , + + ::= {}+ + | {}+ + | {}+ + + ::= + | + | + + ::= ( ) + | {}? [ {}? ] + | {}? ( {}? ) + + ::= enum { } + | enum { } + | enum + + ::= + | , + + ::= + | = + + ::= + + ::= {}+ {}* ; + + ::= + | = + + ::= + | { } + | { , } + + ::= + | , + + ::= { {}* {}* } + + ::= + | + | + | + | + | + + ::= : + | case : + | default : + + ::= {}? ; + + ::= if ( ) + | if ( ) else + | switch ( ) + + ::= while ( ) + | do while ( ) ; + | for ( {}? ; {}? ; {}? ) + + ::= goto ; + | continue ; + | break ; + | return {}? ; diff --git a/old/minic.ebnf b/old/minic.ebnf index 9379e35..4c354e6 100644 --- a/old/minic.ebnf +++ b/old/minic.ebnf @@ -5,3 +5,12 @@ Integer = Digit { Digit }. Letter = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" | "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z". Digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9". + +Expression = Integer + | Expression "*" Expression + | Expression "/" Expression + | Expression "+" Expression + | Expression "-" Expression + . + + -- cgit v1.2.3-54-g00ecf