// c4.c - C in four functions // char, int, and pointer types // if, while, return, and expression statements // just enough features to allow self-compilation and a bit more // Written by Robert Swierczek #include #include #include #include #include #include //#define int long long char *p, *lp, // current position in source code *data; // data/bss pointer int *e, *le, // current position in emitted code *id, // currently parsed identifier *sym, // symbol table (simple list of identifiers) tk, // current token ival, // current token value ty, // current expression type loc, // local variable offset line, // current line number src, // print source and assembly flag debug; // print executed instructions // tokens and classes (operators last and in precedence order) enum { Num = 128, Fun, Sys, Glo, Loc, Id, Char, Else, Enum, If, Int, Return, Sizeof, Do, While, Assign, Cond, Lor, Lan, Or, Xor, And, Eq, Ne, Lt, Gt, Le, Ge, Shl, Shr, Add, Sub, Mul, Div, Mod, Inc, Dec, Brak }; // opcodes enum { LEA ,IMM ,JMP ,JSR ,BZ ,BNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PSH , OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD , OPEN,READ,CLOS,PRTF,MALC,FREE,MSET,MCMP,GETC,PUTS,PUTN,PUTC,PUTI,ISPC,IDGT,IANU,IALP,SCMP,EXIT }; // types enum { CHAR, INT, PTR }; // identifier offsets (since we can't create an ident struct) enum { Tk, Hash, Name, Class, Type, Val, HClass, HType, HVal, Idsz }; void next() { char *pp; while (tk = *p) { ++p; if (tk == '\n') { if (src) { printf("%d: %.*s", line, p - lp, lp); lp = p; while (le < e) { printf("%8.4s", &"LEA ,IMM ,JMP ,JSR ,BZ ,BNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PSH ," "OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD ," "OPEN,READ,CLOS,PRTF,MALC,FREE,MSET,MCMP,GETC,PUTS,PUTN,PUTC,PUTI,ISPC,IDGT,IANU.IALP,SCMP,EXIT,"[*++le * 5]); if (*le <= ADJ) printf(" %d\n", *++le); else printf("\n"); } } ++line; } else if (tk == '#') { while (*p != 0 && *p != '\n') ++p; } else if ((tk >= 'a' && tk <= 'z') || (tk >= 'A' && tk <= 'Z') || tk == '_') { pp = p - 1; while ((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || (*p >= '0' && *p <= '9') || *p == '_') tk = tk * 147 + *p++; tk = (tk << 6) + (p - pp); id = sym; while (id[Tk]) { if (tk == id[Hash] && !memcmp((char *)id[Name], pp, p - pp)) { tk = id[Tk]; return; } id = id + Idsz; } id[Name] = (int)pp; id[Hash] = tk; tk = id[Tk] = Id; return; } else if (tk >= '0' && tk <= '9') { if (ival = tk - '0') { while (*p >= '0' && *p <= '9') ival = ival * 10 + *p++ - '0'; } else if (*p == 'x' || *p == 'X') { while ((tk = *++p) && ((tk >= '0' && tk <= '9') || (tk >= 'a' && tk <= 'f') || (tk >= 'A' && tk <= 'F'))) ival = ival * 16 + (tk & 15) + (tk >= 'A' ? 9 : 0); } else { while (*p >= '0' && *p <= '7') ival = ival * 8 + *p++ - '0'; } tk = Num; return; } else if (tk == '/') { if (*p == '/') { ++p; while (*p != 0 && *p != '\n') ++p; } else if (*p != 0 && *p == '*') { ++p; while (*p != 0 && *p != '/') { if (*p == '\n') line++; if (*p == '*'); ++p; } ++p; } else { tk = Div; return; } } else if (tk == '\'' || tk == '"') { pp = data; while (*p != 0 && *p != tk) { if ((ival = *p++) == '\\') { if ((ival = *p++) == 'n') ival = '\n'; } if (tk == '"') *data++ = ival; } ++p; if (tk == '"') ival = (int)pp; else tk = Num; return; } else if (tk == '=') { if (*p == '=') { ++p; tk = Eq; } else tk = Assign; return; } else if (tk == '+') { if (*p == '+') { ++p; tk = Inc; } else tk = Add; return; } else if (tk == '-') { if (*p == '-') { ++p; tk = Dec; } else tk = Sub; return; } else if (tk == '!') { if (*p == '=') { ++p; tk = Ne; } return; } else if (tk == '<') { if (*p == '=') { ++p; tk = Le; } else if (*p == '<') { ++p; tk = Shl; } else tk = Lt; return; } else if (tk == '>') { if (*p == '=') { ++p; tk = Ge; } else if (*p == '>') { ++p; tk = Shr; } else tk = Gt; return; } else if (tk == '|') { if (*p == '|') { ++p; tk = Lor; } else tk = Or; return; } else if (tk == '&') { if (*p == '&') { ++p; tk = Lan; } else tk = And; return; } else if (tk == '^') { tk = Xor; return; } else if (tk == '%') { tk = Mod; return; } else if (tk == '*') { tk = Mul; return; } else if (tk == '[') { tk = Brak; return; } else if (tk == '?') { tk = Cond; return; } else if (tk == '~' || tk == ';' || tk == '{' || tk == '}' || tk == '(' || tk == ')' || tk == ']' || tk == ',' || tk == ':') return; } } void expr(int lev) { int t, *d; if (!tk) { printf("%d: unexpected eof in expression\n", line); exit(-1); } else if (tk == Num) { *++e = IMM; *++e = ival; next(); ty = INT; } else if (tk == '"') { *++e = IMM; *++e = ival; next(); while (tk == '"') next(); data = (char *)((int)data + sizeof(int) & -sizeof(int)); ty = PTR; } else if (tk == Sizeof) { next(); if (tk == '(') next(); else { printf("%d: open paren expected in sizeof\n", line); exit(-1); } ty = INT; if (tk == Int) next(); else if (tk == Char) { next(); ty = CHAR; } while (tk == Mul) { next(); ty = ty + PTR; } if (tk == ')') next(); else { printf("%d: close paren expected in sizeof\n", line); exit(-1); } *++e = IMM; *++e = (ty == CHAR) ? sizeof(char) : sizeof(int); ty = INT; } else if (tk == Id) { d = id; next(); if (tk == '(') { next(); t = 0; while (tk != ')') { expr(Assign); *++e = PSH; ++t; if (tk == ',') next(); } next(); if (d[Class] == Sys) *++e = d[Val]; else if (d[Class] == Fun) { *++e = JSR; *++e = d[Val]; } else { printf("%d: bad function call\n", line); exit(-1); } if (t) { *++e = ADJ; *++e = t; } ty = d[Type]; } else if (d[Class] == Num) { *++e = IMM; *++e = d[Val]; ty = INT; } else { if (d[Class] == Loc) { *++e = LEA; *++e = loc - d[Val]; } else if (d[Class] == Glo) { *++e = IMM; *++e = d[Val]; } else { printf("%d: undefined variable\n", line); exit(-1); } *++e = ((ty = d[Type]) == CHAR) ? LC : LI; } } else if (tk == '(') { next(); if (tk == Int || tk == Char) { t = (tk == Int) ? INT : CHAR; next(); while (tk == Mul) { next(); t = t + PTR; } if (tk == ')') next(); else { printf("%d: bad cast\n", line); exit(-1); } expr(Inc); ty = t; } else { expr(Assign); if (tk == ')') next(); else { printf("%d: close paren expected\n", line); exit(-1); } } } else if (tk == Mul) { next(); expr(Inc); if (ty > INT) ty = ty - PTR; else { printf("%d: bad dereference\n", line); exit(-1); } *++e = (ty == CHAR) ? LC : LI; } else if (tk == And) { next(); expr(Inc); if (*e == LC || *e == LI) --e; else { printf("%d: bad address-of\n", line); exit(-1); } ty = ty + PTR; } else if (tk == '!') { next(); expr(Inc); *++e = PSH; *++e = IMM; *++e = 0; *++e = EQ; ty = INT; } else if (tk == '~') { next(); expr(Inc); *++e = PSH; *++e = IMM; *++e = -1; *++e = XOR; ty = INT; } else if (tk == Add) { next(); expr(Inc); ty = INT; } else if (tk == Sub) { next(); *++e = IMM; if (tk == Num) { *++e = -ival; next(); } else { *++e = -1; *++e = PSH; expr(Inc); *++e = MUL; } ty = INT; } else if (tk == Inc || tk == Dec) { t = tk; next(); expr(Inc); if (*e == LC) { *e = PSH; *++e = LC; } else if (*e == LI) { *e = PSH; *++e = LI; } else { printf("%d: bad lvalue in pre-increment\n", line); exit(-1); } *++e = PSH; *++e = IMM; *++e = (ty > PTR) ? sizeof(int) : sizeof(char); *++e = (t == Inc) ? ADD : SUB; *++e = (ty == CHAR) ? SC : SI; } else { printf("%d: bad expression\n", line); exit(-1); } while (tk >= lev) { // "precedence climbing" or "Top Down Operator Precedence" method t = ty; if (tk == Assign) { next(); if (*e == LC || *e == LI) *e = PSH; else { printf("%d: bad lvalue in assignment\n", line); exit(-1); } expr(Assign); *++e = ((ty = t) == CHAR) ? SC : SI; } else if (tk == Cond) { next(); *++e = BZ; d = ++e; expr(Assign); if (tk == ':') next(); else { printf("%d: conditional missing colon\n", line); exit(-1); } *d = (int)(e + 3); *++e = JMP; d = ++e; expr(Cond); *d = (int)(e + 1); } else if (tk == Lor) { next(); *++e = BNZ; d = ++e; expr(Lan); *d = (int)(e + 1); ty = INT; } else if (tk == Lan) { next(); *++e = BZ; d = ++e; expr(Or); *d = (int)(e + 1); ty = INT; } else if (tk == Or) { next(); *++e = PSH; expr(Xor); *++e = OR; ty = INT; } else if (tk == Xor) { next(); *++e = PSH; expr(And); *++e = XOR; ty = INT; } else if (tk == And) { next(); *++e = PSH; expr(Eq); *++e = AND; ty = INT; } else if (tk == Eq) { next(); *++e = PSH; expr(Lt); *++e = EQ; ty = INT; } else if (tk == Ne) { next(); *++e = PSH; expr(Lt); *++e = NE; ty = INT; } else if (tk == Lt) { next(); *++e = PSH; expr(Shl); *++e = LT; ty = INT; } else if (tk == Gt) { next(); *++e = PSH; expr(Shl); *++e = GT; ty = INT; } else if (tk == Le) { next(); *++e = PSH; expr(Shl); *++e = LE; ty = INT; } else if (tk == Ge) { next(); *++e = PSH; expr(Shl); *++e = GE; ty = INT; } else if (tk == Shl) { next(); *++e = PSH; expr(Add); *++e = SHL; ty = INT; } else if (tk == Shr) { next(); *++e = PSH; expr(Add); *++e = SHR; ty = INT; } else if (tk == Add) { next(); *++e = PSH; expr(Mul); if ((ty = t) > PTR) { *++e = PSH; *++e = IMM; *++e = sizeof(int); *++e = MUL; } *++e = ADD; } else if (tk == Sub) { next(); *++e = PSH; expr(Mul); if (t > PTR && t == ty) { *++e = SUB; *++e = PSH; *++e = IMM; *++e = sizeof(int); *++e = DIV; ty = INT; } else if ((ty = t) > PTR) { *++e = PSH; *++e = IMM; *++e = sizeof(int); *++e = MUL; *++e = SUB; } else *++e = SUB; } else if (tk == Mul) { next(); *++e = PSH; expr(Inc); *++e = MUL; ty = INT; } else if (tk == Div) { next(); *++e = PSH; expr(Inc); *++e = DIV; ty = INT; } else if (tk == Mod) { next(); *++e = PSH; expr(Inc); *++e = MOD; ty = INT; } else if (tk == Inc || tk == Dec) { if (*e == LC) { *e = PSH; *++e = LC; } else if (*e == LI) { *e = PSH; *++e = LI; } else { printf("%d: bad lvalue in post-increment\n", line); exit(-1); } *++e = PSH; *++e = IMM; *++e = (ty > PTR) ? sizeof(int) : sizeof(char); *++e = (tk == Inc) ? ADD : SUB; *++e = (ty == CHAR) ? SC : SI; *++e = PSH; *++e = IMM; *++e = (ty > PTR) ? sizeof(int) : sizeof(char); *++e = (tk == Inc) ? SUB : ADD; next(); } else if (tk == Brak) { next(); *++e = PSH; expr(Assign); if (tk == ']') next(); else { printf("%d: close bracket expected\n", line); exit(-1); } if (t > PTR) { *++e = PSH; *++e = IMM; *++e = sizeof(int); *++e = MUL; } else if (t < PTR) { printf("%d: pointer type expected\n", line); exit(-1); } *++e = ADD; *++e = ((ty = t - PTR) == CHAR) ? LC : LI; } else { printf("%d: compiler error tk=%d\n", line, tk); exit(-1); } } } void stmt() { int *a, *b; if (tk == If) { next(); if (tk == '(') next(); else { printf("%d: open paren expected\n", line); exit(-1); } expr(Assign); if (tk == ')') next(); else { printf("%d: close paren expected\n", line); exit(-1); } *++e = BZ; b = ++e; stmt(); if (tk == Else) { *b = (int)(e + 3); *++e = JMP; b = ++e; next(); stmt(); } *b = (int)(e + 1); } else if (tk == Do) { next(); a = e + 1; stmt(); if (tk != While) { printf("%d: while expected to terminate do loop\n", line); exit(-1); } next(); if (tk == '(') next(); else { printf("%d: open paren expected\n", line); exit(-1); } expr(Assign); if (tk == ')') next(); else { printf("%d: close paren expected\n", line); exit(-1); } *++e = BZ; b = ++e; *++e = JMP; *++e = (int)a; *b = (int)(e + 1); } else if (tk == While) { next(); a = e + 1; if (tk == '(') next(); else { printf("%d: open paren expected\n", line); exit(-1); } expr(Assign); if (tk == ')') next(); else { printf("%d: close paren expected\n", line); exit(-1); } *++e = BZ; b = ++e; stmt(); *++e = JMP; *++e = (int)a; *b = (int)(e + 1); } else if (tk == Return) { next(); if (tk != ';') expr(Assign); *++e = LEV; if (tk == ';') next(); else { printf("%d: semicolon expected\n", line); exit(-1); } } else if (tk == '{') { next(); while (tk != '}') stmt(); next(); } else if (tk == ';') { next(); } else { expr(Assign); if (tk == ';') next(); else { printf("%d: semicolon expected\n", line); exit(-1); } } } int main(int argc, char **argv) { int fd, bt, ty, poolsz, *idmain; int *pc, *sp, *bp, a, cycle; // vm registers int i, *t; // temps --argc; ++argv; if (argc > 0 && **argv == '-' && (*argv)[1] == 's') { src = 1; --argc; ++argv; } if (argc > 0 && **argv == '-' && (*argv)[1] == 'd') { debug = 1; --argc; ++argv; } if (argc < 1) { printf("usage: c4 [-s] [-d] file ...\n"); return -1; } if ((fd = open(*argv, 0)) < 0) { printf("could not open(%s)\n", *argv); return -1; } poolsz = 256*1024; // arbitrary size if (!(sym = malloc(poolsz))) { printf("could not malloc(%d) symbol area\n", poolsz); return -1; } if (!(le = e = malloc(poolsz))) { printf("could not malloc(%d) text area\n", poolsz); return -1; } if (!(data = malloc(poolsz))) { printf("could not malloc(%d) data area\n", poolsz); return -1; } if (!(sp = malloc(poolsz))) { printf("could not malloc(%d) stack area\n", poolsz); return -1; } memset(sym, 0, poolsz); memset(e, 0, poolsz); memset(data, 0, poolsz); p = "char else enum if int return sizeof do while " "EOF EXIT_SUCCESS EXIT_FAILURE " "open read close printf malloc free memset memcmp getchar putstring putnl putchar putint isspace isdigit isalnum isalpha strcmp exit void main"; i = Char; while (i <= While) { next(); id[Tk] = i++; } // add keywords to symbol table // add library constants next(); id[Class] = Num; id[Type] = INT; id[Val] = -1; next(); id[Class] = Num; id[Type] = INT; id[Val] = 0; next(); id[Class] = Num; id[Type] = INT; id[Val] = 1; i = OPEN; while (i <= EXIT) { next(); id[Class] = Sys; id[Type] = INT; id[Val] = i++; } // add library to symbol table next(); id[Tk] = Char; // handle void type next(); idmain = id; // keep track of main if (!(lp = p = malloc(poolsz))) { printf("could not malloc(%d) source area\n", poolsz); return -1; } if ((i = read(fd, p, poolsz-1)) <= 0) { printf("read() returned %d\n", i); return -1; } p[i] = 0; close(fd); // parse declarations line = 1; next(); while (tk) { bt = INT; // basetype if (tk == Int) next(); else if (tk == Char) { next(); bt = CHAR; } else if (tk == Enum) { next(); if (tk != '{') next(); if (tk == '{') { next(); i = 0; while (tk != '}') { if (tk != Id) { printf("%d: bad enum identifier %d\n", line, tk); return -1; } next(); if (tk == Assign) { next(); if (tk != Num) { printf("%d: bad enum initializer\n", line); return -1; } i = ival; next(); } id[Class] = Num; id[Type] = INT; id[Val] = i++; if (tk == ',') next(); } next(); } } while (tk != ';' && tk != '}') { ty = bt; while (tk == Mul) { next(); ty = ty + PTR; } if (tk == Brak) { next(); if (tk == Num) next(); else { printf("%d: integer as dimensions of array expected\n", line); return -1; } if (tk == ']') next(); else { printf("%d: ] expected\n", line); return -1; } id[Type] = ty + PTR; id[Class] = Glo; id[Val] = (int)data; data = data + ival * sizeof(char); } else { if (tk != Id) { printf("%d: bad global declaration\n", line); return -1; } if (id[Class]) { printf("%d: duplicate global definition\n", line); return -1; } next(); id[Type] = ty; if (tk == '(') { // function id[Class] = Fun; id[Val] = (int)(e + 1); next(); i = 0; while (tk != ')') { ty = INT; if (tk == Int) next(); else if (tk == Char) { next(); ty = CHAR; } while (tk == Mul) { next(); ty = ty + PTR; } if (tk != Id) { printf("%d: bad parameter declaration\n", line); return -1; } if (id[Class] == Loc) { printf("%d: duplicate parameter definition\n", line); return -1; } id[HClass] = id[Class]; id[Class] = Loc; id[HType] = id[Type]; id[Type] = ty; id[HVal] = id[Val]; id[Val] = i++; next(); if (tk == ',') next(); } next(); if (tk != '{') { printf("%d: bad function definition\n", line); return -1; } loc = ++i; next(); while (tk == Int || tk == Char) { bt = (tk == Int) ? INT : CHAR; next(); while (tk != ';') { ty = bt; while (tk == Mul) { next(); ty = ty + PTR; } if (tk != Id) { printf("%d: bad local declaration\n", line); return -1; } if (id[Class] == Loc) { printf("%d: duplicate local definition\n", line); return -1; } id[HClass] = id[Class]; id[Class] = Loc; id[HType] = id[Type]; id[Type] = ty; id[HVal] = id[Val]; id[Val] = ++i; next(); if (tk == ',') next(); } next(); } *++e = ENT; *++e = i - loc; while (tk != '}') stmt(); *++e = LEV; id = sym; // unwind symbol table locals while (id[Tk]) { if (id[Class] == Loc) { id[Class] = id[HClass]; id[Type] = id[HType]; id[Val] = id[HVal]; } id = id + Idsz; } } else { id[Class] = Glo; id[Val] = (int)data; data = data + sizeof(int); } } if (tk == ',') next(); } next(); } if (!(pc = (int *)idmain[Val])) { printf("main() not defined\n"); return -1; } if (src) return 0; // setup stack bp = sp = (int *)((int)sp + poolsz); *--sp = EXIT; // call exit if main returns *--sp = PSH; t = sp; *--sp = argc; *--sp = (int)argv; *--sp = (int)t; // run... cycle = 0; while (1) { i = *pc++; ++cycle; if (debug) { printf("%d> %.4s", cycle, &"LEA ,IMM ,JMP ,JSR ,BZ ,BNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PSH ," "OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD ," "OPEN,READ,CLOS,PRTF,MALC,FREE,MSET,MCMP,GETC,PUTS,PUTN,PUTC,PUTI,ISPC,IDGT,IANU.IALP,SCMP,EXIT,"[i * 5]); if (i <= ADJ) printf(" %d\n", *pc); else printf("\n"); } if (i == LEA) a = (int)(bp + *pc++); // load local address else if (i == IMM) a = *pc++; // load global address or immediate else if (i == JMP) pc = (int *)*pc; // jump else if (i == JSR) { *--sp = (int)(pc + 1); pc = (int *)*pc; } // jump to subroutine else if (i == BZ) pc = a ? pc + 1 : (int *)*pc; // branch if zero else if (i == BNZ) pc = a ? (int *)*pc : pc + 1; // branch if not zero else if (i == ENT) { *--sp = (int)bp; bp = sp; sp = sp - *pc++; } // enter subroutine else if (i == ADJ) sp = sp + *pc++; // stack adjust else if (i == LEV) { sp = bp; bp = (int *)*sp++; pc = (int *)*sp++; } // leave subroutine else if (i == LI) a = *(int *)a; // load int else if (i == LC) a = *(char *)a; // load char else if (i == SI) *(int *)*sp++ = a; // store int else if (i == SC) a = *(char *)*sp++ = a; // store char else if (i == PSH) *--sp = a; // push else if (i == OR) a = *sp++ | a; else if (i == XOR) a = *sp++ ^ a; else if (i == AND) a = *sp++ & a; else if (i == EQ) a = *sp++ == a; else if (i == NE) a = *sp++ != a; else if (i == LT) a = *sp++ < a; else if (i == GT) a = *sp++ > a; else if (i == LE) a = *sp++ <= a; else if (i == GE) a = *sp++ >= a; else if (i == SHL) a = *sp++ << a; else if (i == SHR) a = *sp++ >> a; else if (i == ADD) a = *sp++ + a; else if (i == SUB) a = *sp++ - a; else if (i == MUL) a = *sp++ * a; else if (i == DIV) a = *sp++ / a; else if (i == MOD) a = *sp++ % a; else if (i == OPEN) a = open((char *)sp[1], *sp); else if (i == READ) a = read(sp[2], (char *)sp[1], *sp); else if (i == CLOS) a = close(*sp); else if (i == PRTF) { t = sp + pc[1]; a = printf((char *)t[-1], t[-2], t[-3], t[-4], t[-5], t[-6]); } else if (i == MALC) a = (int)malloc(*sp); else if (i == FREE) free((void *)*sp); else if (i == MSET) a = (int)memset((char *)sp[2], sp[1], *sp); else if (i == MCMP) a = memcmp((char *)sp[2], (char *)sp[1], *sp); else if (i == GETC) a = getchar(); else if (i == PUTS) { t = sp + pc[1]; a = printf("%s", (char *)t[-1]); } else if (i == PUTN) a = printf("\n"); else if (i == PUTC) a = putchar(*sp); else if (i == PUTI) { t = sp + pc[1]; a = printf("%d", (int)t[-1]); } else if (i == ISPC) { t = sp + pc[1]; a = isspace( (int)t[-1]); } else if (i == IDGT) { t = sp + pc[1]; a = isdigit( (int)t[-1]); } else if (i == IANU) { t = sp + pc[1]; a = isalnum( (int)t[-1]); } else if (i == IALP) { t = sp + pc[1]; a = isalpha( (int)t[-1]); } else if (i == SCMP) { t = sp + pc[1]; a = strcmp((char *)t[-1], (char *)t[-2]); } else if (i == EXIT) { printf("exit(%d) cycle = %d\n", *sp, cycle); return *sp; } else { printf("unknown instruction = %d! cycle = %d\n", i, cycle); return -1; } } }