summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <mail@andreasbaumann.cc>2021-09-06 16:00:12 +0000
committerAndreas Baumann <mail@andreasbaumann.cc>2021-09-06 16:00:12 +0000
commita8eeffaef662df8e9b174b5c484df95ecdba6a9e (patch)
treeeb3cf3e1c62314dde49d66ef6fceba3343c328d8
parent660a0c3c89090548ff9ea5802eb3325dc1432727 (diff)
downloadcompilertests-a8eeffaef662df8e9b174b5c484df95ecdba6a9e.tar.gz
compilertests-a8eeffaef662df8e9b174b5c484df95ecdba6a9e.tar.bz2
c4 is freestanding now
-rw-r--r--miniany/README21
-rw-r--r--miniany/REQUIREMENTS25
-rwxr-xr-xminiany/build.sh20
-rw-r--r--miniany/c4.c380
-rw-r--r--miniany/doc/journal.stuffwithstuff.com_2011_03_19_pratt-parsers-expression-parsing-made-easy.txt564
-rw-r--r--miniany/hello.c3
-rw-r--r--miniany/libc-freestanding.c39
-rw-r--r--miniany/test1.hexmust8
-rwxr-xr-xminiany/torture.sh24
9 files changed, 908 insertions, 176 deletions
diff --git a/miniany/README b/miniany/README
index c011906..53266d6 100644
--- a/miniany/README
+++ b/miniany/README
@@ -1,17 +1,20 @@
-# Building
+# Running on host:
```
./build.sh cc tcc hosted d
./build.sh cc tcc freestanding d
+./cc < test1.c > test1.asm
+fasm test1.asm test1.bin
+ndisasm -b32 -o1000000h -a test1.bin
```
# Running on c4:
```
-tcc -O0 -g -o c4 c4.c
-./c4 cc.c < hello.c
-./c4 c4.c cc.c < hello.c
-./c4 c4.c c4.c cc.c < hello.c
+echo -n -e "\034" > EOF
+cat cc.c EOF hello.c | ./c4
+cat c4.c EOF cc.c EOF hello.c | ./c4
+cat c4.c4 EOF c4.c EOF cc.c EOF hello.c | ./c4
```
# Local version of c4
@@ -33,6 +36,14 @@ complex things into our own compiler.
- some simplified functions for printing like putstring, putint, putnl
- strict C89 conformance, mainly use standard comment blocks, also
removed some warnings
+ - some casts around malloc and memset to fit to non-void freestanding-libc
+ - converted printf to putstring/putint/putnl and some helper functions
+ for error reporting like error()
+ - removed memory leaks
+ - de-POSIX-ified, no open/read/close, use getchar from stdin only
+ (don't assume the existence of a file system), this also means
+ we have to create sort of an old style tape-file with FS markers
+ to separate the files piped to c4.
# Acknoledgments and references
diff --git a/miniany/REQUIREMENTS b/miniany/REQUIREMENTS
index 0a6076c..645bdfa 100644
--- a/miniany/REQUIREMENTS
+++ b/miniany/REQUIREMENTS
@@ -59,4 +59,27 @@ not implementing:
you have a grammatical construct to help recognizing it.
- register number for register alloation
https://en.wikipedia.org/wiki/Strahler_number
-
+- volatile: we are not doing any optimizations for now, so volatile (as const)
+ can just be a ignored keyword.
+- c4 freestanding
+ - uses some casts, the malloc ones are actually good for clarification,
+ the ones in memset are not so usefull (this is all because we don't
+ have 'void *')
+ - open/read/close is POSIX, we would prefer either C style file handling
+ (we have it in libc-freestanding.c or some stdin, stdout thingy)
+ - again printf and varargs, either use libc-freestanding.c or revert to
+ putint, putstring, putnl..
+ - if (tk == '(') next(); else { printf("%d: open paren expected\n", line); exit(-1); }
+ =>
+ error("open paren expected"); }
+ - printf("%d: compiler error tk=%d\n", line, tk); exit(-1);
+ - printf("could not malloc(%d) symbol area\n") => remove size, also map to error
+ - printf("read() returned %d\n", i); => dito
+ - we also print a non-sensical line, but we don't really care about this
+ - printf("%d: bad enum identifier %d\n", line, tk); he number 'tk' looks like
+ debug output here, so we drop it.
+ error1int is the other option (also choosen in other places)
+ - other cases translate by hand:
+ - case EXIT: /* putstring("exit("); putint(*sp); putstring(") cycle = "); putint(cycle); putnl(); */ return *sp;
+ - default: putstring("unknown instruction = "); putint(i); putstring("! cycle = "); putint(cycle); putnl(); return -1;
+
diff --git a/miniany/build.sh b/miniany/build.sh
index 72a702c..68124b0 100755
--- a/miniany/build.sh
+++ b/miniany/build.sh
@@ -30,22 +30,29 @@ case "${COMPILER}" in
;;
esac
-case "${LEVEL}" in
- 0|1|2|3|s)
+case "${COMPILER}:${LEVEL}" in
+ *:0|*:1|*:2|*:3|*:s)
CFLAGS+=" -O${LEVEL}"
;;
- d)
+ pcc:d)
+ CFLAGS+=" -g -O0"
+ DEBUG=1
+ ;;
+ *:d)
CFLAGS+=" -g -Og"
DEBUG=1
;;
*)
- echo "ERROR: Unknown compilation level '${LEVEL}' (use one of 0123 for -O<n>, or d for -O0 and debugging)" 1>&2
+ echo "ERROR: Unknown compilation level '${LEVEL}' (use one of 0123s for -O<n>, or d for '-Og -g' and debugging)" 1>&2
exit 1
;;
esac
case "${MODE}" in
- freestanding|hosted)
+ freestanding)
+ CFLAGS+=" -D__FREESTANDING__"
+ ;;
+ hosted)
;;
*)
echo "ERROR: Unknown environment '${MODE}' (use 'freestanding' or 'hosted')" 1>&2
@@ -67,6 +74,9 @@ case "${COMPILER}:${MODE}" in
CFLAGS+=" -fno-bultin -nostdlib"
MODULES+=("_start-stub.c")
;;
+ pcc:hosted)
+ CFLAGS+=" -Wl,-emain"
+ ;;
*:hosted)
#~ CFLAGS+=" -lbsd"
;;
diff --git a/miniany/c4.c b/miniany/c4.c
index 8317947..ca6d3d3 100644
--- a/miniany/c4.c
+++ b/miniany/c4.c
@@ -6,18 +6,23 @@
/* Written by Robert Swierczek */
+#if !defined(__FREESTANDING__)
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <unistd.h>
#include <fcntl.h>
#include <ctype.h>
+#endif
/* #define int long long */
char *p, *lp, /* current position in source code */
- *data; /* data/bss pointer */
+ *startp, /* start of the source code (for free) */
+ *data, /* data/bss pointer */
+ *sdata; /* start of data/bss pointer (for free) */
int *e, *le, /* current position in emitted code */
+ *se, /* start of emited code (for free) */
*cas, /* case statement patch-up pointer */
*brak, /* break statement patch-up pointer */
*def, /* default statement patch-up pointer */
@@ -63,28 +68,63 @@ enum {
/* opcodes */
enum { LEA ,IMM ,JMP ,JSR ,BZ ,BNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PSH ,
OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD ,
- OPEN,READ,CLOS,PRTF,MALC,FREE,MSET,MCMP,GETC,PUTS,PUTN,PUTC,PUTI,ISPC,IDGT,IANU,IALP,SCMP,SDUP,EXIT };
+ MALC,FREE,MCPY,MSET,MCMP,GETC,PUTS,PUTN,PUTC,PUTI,ISPC,IDGT,IANU,IALP,SCMP,SDUP,EXIT };
/* types */
enum { CHAR, INT, PTR = 256, PTR2 = 512 };
+void error(char *msg)
+{
+ putint(line);
+ putstring(": ");
+ putstring(msg);
+ putnl();
+ exit(-1);
+}
+
+void error1int(char *msg, int i)
+{
+ putint(line);
+ putstring(": ");
+ putstring(msg);
+ putint(i);
+ putnl();
+ exit(-1);
+}
+
void next()
{
char *pp;
+ char *buf;
while ((tk = *p) != 0) {
++p;
switch (tk) {
case '\n':
if (src) {
- printf("%d: %.*s", line, p - lp, lp);
+ putint(line);
+ putstring(": ");
+ buf = (char *)malloc(512);
+ memcpy(buf, lp, p - lp);
+ buf[p-lp] = 0;
+ putstring(buf);
lp = p;
while (le < e) {
- printf("%8.4s", &"LEA ,IMM ,JMP ,JSR ,BZ ,BNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PSH ,"
+ le++;
+ memcpy(buf, &"LEA ,IMM ,JMP ,JSR ,BZ ,BNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PSH ,"
"OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD ,"
- "OPEN,READ,CLOS,PRTF,MALC,FREE,MSET,MCMP,GETC,PUTS,PUTN,PUTC,PUTI,ISPC,IDGT,IANU,IALP,SCMP,SDUP,EXIT,"[*++le * 5]);
- if (*le <= ADJ) printf(" %d\n", *++le); else printf("\n");
+ "MALC,FREE,MCPY,MSET,MCMP,GETC,PUTS,PUTN,PUTC,PUTI,ISPC,IDGT,IANU.IALP,SCMP,SDUP,EXIT,"[*le * 5], 4);
+ buf[4] = 0;
+ putstring(" ");
+ putstring(buf);
+ if (*le <= ADJ) {
+ le++;
+ putchar(' ');
+ putint(*le);
+ }
+ putnl();
}
+ free(buf);
}
++line;
break;
@@ -191,7 +231,7 @@ void expr(int lev)
struct member_s *m;
switch (tk) {
- case 0: printf("%d: unexpected eof in expression\n", line); exit(-1);
+ case 0: error("unexpected eof in expression");
case Num: *++e = IMM; *++e = ival; next(); ty = INT; break;
case '"':
*++e = IMM; *++e = ival; next();
@@ -199,11 +239,11 @@ void expr(int lev)
data = (char *)(((int)data + sizeof(int)) & -sizeof(int)); ty = PTR;
break;
case Sizeof:
- next(); if (tk == '(') next(); else { printf("%d: open paren expected in sizeof\n", line); exit(-1); }
+ next(); if (tk == '(') next(); else { error("open paren expected in sizeof"); }
ty = INT; if (tk == Int) next(); else if (tk == Char) { next(); ty = CHAR; }
- else if (tk == Struct) { next(); if (tk != Id) { printf("%d: bad struct type\n", line); exit(-1); } ty = id->stype; next(); }
+ else if (tk == Struct) { next(); if (tk != Id) { error("bad struct type"); } ty = id->stype; next(); }
while (tk == Mul) { next(); ty = ty + PTR; }
- if (tk == ')') next(); else { printf("%d: close paren expected in sizeof\n", line); exit(-1); }
+ if (tk == ')') next(); else { error("close paren expected in sizeof"); }
*++e = IMM; *++e = ty >= PTR ? sizeof(int) : tsize[ty];
ty = INT;
break;
@@ -216,7 +256,7 @@ void expr(int lev)
next();
if (d->class == Sys) *++e = d->val;
else if (d->class == Fun) { *++e = JSR; *++e = d->val; }
- else { printf("%d: bad function call\n", line); exit(-1); }
+ else { error("bad function call"); }
if (t) { *++e = ADJ; *++e = t; }
ty = d->type;
}
@@ -224,7 +264,7 @@ void expr(int lev)
else {
if (d->class == Loc) { *++e = LEA; *++e = loc - d->val; }
else if (d->class == Glo) { *++e = IMM; *++e = d->val; }
- else { printf("%d: undefined variable\n", line); exit(-1); }
+ else { error("undefined variable"); }
if ((ty = d->type) <= INT || ty >= PTR) *++e = (ty == CHAR) ? LC : LI;
}
break;
@@ -232,25 +272,25 @@ void expr(int lev)
next();
if (tk == Int || tk == Char || tk == Struct) {
if (tk == Int) { next(); t = INT; } else if (tk == Char) { next(); t = CHAR; }
- else { next(); if (tk != Id) { printf("%d: bad struct type\n", line); exit(-1); } t = id->stype; next(); }
+ else { next(); if (tk != Id) { error("bad struct type"); } t = id->stype; next(); }
while (tk == Mul) { next(); t = t + PTR; }
- if (tk == ')') next(); else { printf("%d: bad cast\n", line); exit(-1); }
+ if (tk == ')') next(); else { error("bad cast"); }
expr(Inc);
ty = t;
}
else {
expr(Assign);
- if (tk == ')') next(); else { printf("%d: close paren expected\n", line); exit(-1); }
+ if (tk == ')') next(); else { error("close paren expected"); }
}
break;
case Mul:
next(); expr(Inc);
- if (ty > INT) ty = ty - PTR; else { printf("%d: bad dereference\n", line); exit(-1); }
+ if (ty > INT) ty = ty - PTR; else { error("bad dereference"); }
if (ty <= INT || ty >= PTR) *++e = (ty == CHAR) ? LC : LI;
break;
case And:
next(); expr(Inc);
- if (*e == LC || *e == LI) --e; else { printf("%d: bad address-of\n", line); exit(-1); }
+ if (*e == LC || *e == LI) --e; else { error("bad address-of"); }
ty = ty + PTR;
break;
case '!': next(); expr(Inc); *++e = PSH; *++e = IMM; *++e = 0; *++e = EQ; ty = INT; break;
@@ -266,13 +306,13 @@ void expr(int lev)
t = tk; next(); expr(Inc);
if (*e == LC) { *e = PSH; *++e = LC; }
else if (*e == LI) { *e = PSH; *++e = LI; }
- else { printf("%d: bad lvalue in pre-increment\n", line); exit(-1); }
+ else { error("bad lvalue in pre-increment"); }
*++e = PSH;
*++e = IMM; *++e = ty >= PTR2 ? sizeof(int) : (ty >= PTR) ? tsize[ty - PTR] : 1;
*++e = (t == Inc) ? ADD : SUB;
*++e = (ty == CHAR) ? SC : SI;
break;
- default: printf("%d: bad expression\n", line); exit(-1);
+ default: error("bad expression");
}
while (tk >= lev) { /* "precedence climbing" or "Top Down Operator Precedence" method */
@@ -280,14 +320,14 @@ void expr(int lev)
switch (tk) {
case Assign:
next();
- if (*e == LC || *e == LI) *e = PSH; else { printf("%d: bad lvalue in assignment\n", line); exit(-1); }
+ if (*e == LC || *e == LI) *e = PSH; else { error("bad lvalue in assignment"); }
expr(Assign); *++e = ((ty = t) == CHAR) ? SC : SI;
break;
case Cond:
next();
*++e = BZ; b = ++e;
expr(Assign);
- if (tk == ':') next(); else { printf("%d: conditional missing colon\n", line); exit(-1); }
+ if (tk == ':') next(); else { error("conditional missing colon"); }
*b = (int)(e + 3); *++e = JMP; b = ++e;
expr(Cond);
*b = (int)(e + 1);
@@ -326,7 +366,7 @@ void expr(int lev)
case Dec:
if (*e == LC) { *e = PSH; *++e = LC; }
else if (*e == LI) { *e = PSH; *++e = LI; }
- else { printf("%d: bad lvalue in post-increment\n", line); exit(-1); }
+ else { error("bad lvalue in post-increment"); }
sz = ty >= PTR2 ? sizeof(int) : ty >= PTR ? tsize[ty - PTR] : 1;
*++e = PSH; *++e = IMM; *++e = sz;
*++e = (tk == Inc) ? ADD : SUB;
@@ -338,11 +378,11 @@ void expr(int lev)
case Dot:
ty = ty + PTR;
case Arrow:
- if (ty <= PTR+INT || ty >= PTR2) { printf("%d: structure expected\n", line); exit(-1); }
+ if (ty <= PTR+INT || ty >= PTR2) { error("structure expected"); }
next();
- if (tk != Id) { printf("%d: structure member expected\n", line); exit(-1); }
+ if (tk != Id) { error("structure member expected"); }
m = members[ty - PTR]; while (m && m->id != id) m = m->next;
- if (!m) { printf("%d: structure member not found\n", line); exit(-1); }
+ if (!m) { error("structure member not found"); }
if (m->offset) { *++e = PSH; *++e = IMM; *++e = m->offset; *++e = ADD; }
ty = m->type;
if (ty <= INT || ty >= PTR) *++e = (ty == CHAR) ? LC : LI;
@@ -350,14 +390,14 @@ void expr(int lev)
break;
case Brak:
next(); *++e = PSH; expr(Assign);
- if (tk == ']') next(); else { printf("%d: close bracket expected\n", line); exit(-1); }
- if (t < PTR) { printf("%d: pointer type expected\n", line); exit(-1); }
+ if (tk == ']') next(); else { error("close bracket expected"); }
+ if (t < PTR) { error("pointer type expected"); }
sz = (t = t - PTR) >= PTR ? sizeof(int) : tsize[t];
if (sz > 1) { *++e = PSH; *++e = IMM; *++e = sz; *++e = MUL; }
*++e = ADD;
if ((ty = t) <= INT || ty >= PTR) *++e = (ty == CHAR) ? LC : LI;
break;
- default: printf("%d: compiler error tk=%d\n", line, tk); exit(-1);
+ default: error1int("compiler error tk=", tk);
}
}
}
@@ -369,9 +409,9 @@ void stmt()
switch (tk) {
case If:
next();
- if (tk == '(') next(); else { printf("%d: open paren expected\n", line); exit(-1); }
+ if (tk == '(') next(); else { error("open paren expected"); }
expr(Assign);
- if (tk == ')') next(); else { printf("%d: close paren expected\n", line); exit(-1); }
+ if (tk == ')') next(); else { error("close paren expected\n"); }
*++e = BZ; b = ++e;
stmt();
if (tk == Else) {
@@ -385,11 +425,11 @@ void stmt()
next();
a = e + 1;
stmt();
- if (tk != While) { printf("%d: while expected to terminate do loop\n", line); exit(-1); }
+ if (tk != While) { error("while expected to terminate do loop"); }
next();
- if (tk == '(') next(); else { printf("%d: open paren expected\n", line); exit(-1); }
+ if (tk == '(') next(); else { error("open paren expected"); }
expr(Assign);
- if (tk == ')') next(); else { printf("%d: close paren expected\n", line); exit(-1); }
+ if (tk == ')') next(); else { error("close paren expected"); }
*++e = BZ; b = ++e;
*++e = JMP; *++e = (int)a;
*b = (int)(e + 1);
@@ -397,9 +437,9 @@ void stmt()
case While:
next();
a = e + 1;
- if (tk == '(') next(); else { printf("%d: open paren expected\n", line); exit(-1); }
+ if (tk == '(') next(); else { error("open paren expected"); }
expr(Assign);
- if (tk == ')') next(); else { printf("%d: close paren expected\n", line); exit(-1); }
+ if (tk == ')') next(); else { error("close paren expected"); }
*++e = BZ; b = ++e;
stmt();
*++e = JMP; *++e = (int)a;
@@ -407,9 +447,9 @@ void stmt()
return;
case Switch:
next();
- if (tk == '(') next(); else { printf("%d: open paren expected\n", line); exit(-1); }
+ if (tk == '(') next(); else { error("open paren expected"); }
expr(Assign);
- if (tk == ')') next(); else { printf("%d: close paren expected\n", line); exit(-1); }
+ if (tk == ')') next(); else { error("close paren expected"); }
a = cas; *++e = JMP; cas = ++e;
b = brak; d = def; brak = def = 0;
stmt();
@@ -421,19 +461,19 @@ void stmt()
*++e = JMP; ++e; *e = (int)(e + 7); *++e = PSH; i = *cas; *cas = (int)e;
next();
expr(Or);
- if (e[-1] != IMM) { printf("%d: bad case immediate\n", line); exit(-1); }
+ if (e[-1] != IMM) { error("bad case immediate"); }
*e = *e - i; *++e = SUB; *++e = BNZ; cas = ++e; *e = i + e[-3];
- if (tk == ':') next(); else { printf("%d: colon expected\n", line); exit(-1); }
+ if (tk == ':') next(); else { error("colon expected"); }
stmt();
return;
case Break:
next();
- if (tk == ';') next(); else { printf("%d: semicolon expected\n", line); exit(-1); }
+ if (tk == ';') next(); else { error("semicolon expected"); }
*++e = JMP; *++e = (int)brak; brak = e;
return;
case Default:
next();
- if (tk == ':') next(); else { printf("%d: colon expected\n", line); exit(-1); }
+ if (tk == ':') next(); else { error("colon expected"); }
def = e + 1;
stmt();
return;
@@ -441,7 +481,7 @@ void stmt()
next();
if (tk != ';') expr(Assign);
*++e = LEV;
- if (tk == ';') next(); else { printf("%d: semicolon expected\n", line); exit(-1); }
+ if (tk == ';') next(); else { error("semicolon expected"); }
return;
case '{':
next();
@@ -453,56 +493,60 @@ void stmt()
return;
default:
expr(Assign);
- if (tk == ';') next(); else { printf("%d: semicolon expected\n", line); exit(-1); }
+ if (tk == ';') next(); else { error("semicolon expected"); }
}
}
int main(int argc, char **argv)
{
- int fd, bt, mbt, ty, poolsz;
+ int bt, mbt, ty, poolsz;
struct ident_s *idmain;
- struct member_s *m;
+ struct member_s *m, *mtmp;
int *pc, *sp, *bp, a, cycle; /* vm registers */
int i, *t, neg; /* temps */
-
+ int rt, run, n;
+ int *ssp;
+ char *buf;
+
--argc; ++argv;
if (argc > 0 && **argv == '-' && (*argv)[1] == 's') { src = 1; --argc; ++argv; }
if (argc > 0 && **argv == '-' && (*argv)[1] == 'd') { debug = 1; --argc; ++argv; }
- if (argc < 1) { printf("usage: c4 [-s] [-d] file ...\n"); return -1; }
-
- if ((fd = open(*argv, 0)) < 0) { printf("could not open(%s)\n", *argv); return -1; }
poolsz = 256*1024; /* arbitrary size */
- if (!(sym = malloc(poolsz))) { printf("could not malloc(%d) symbol area\n", poolsz); return -1; }
- if (!(le = e = malloc(poolsz))) { printf("could not malloc(%d) text area\n", poolsz); return -1; }
- if (!(data = malloc(poolsz))) { printf("could not malloc(%d) data area\n", poolsz); return -1; }
- if (!(sp = malloc(poolsz))) { printf("could not malloc(%d) stack area\n", poolsz); return -1; }
- if (!(tsize = malloc(PTR * sizeof(int)))) { printf("could not malloc() tsize area\n"); return -1; }
- if (!(members = malloc(PTR * sizeof(struct member_s *)))) { printf("could not malloc() members area\n"); return -1; }
+ if (!(sym = (struct ident_s *)malloc(poolsz))) { error("could not malloc symbol area"); }
+ if (!(se = le = e = (int *)malloc(poolsz))) { error("could not malloc text area"); }
+ if (!(sdata = data = malloc(poolsz))) { error("could not malloc data area"); }
+ if (!(ssp = sp = (int *)malloc(poolsz))) { error("could not malloc stack area"); }
+ if (!(tsize = (int *)malloc(PTR * sizeof(int)))) { error("could not malloc tsize area"); }
+ if (!(members = (struct member_s **)malloc(PTR * sizeof(struct member_s *)))) { error("could not malloc members area"); }
- memset(sym, 0, poolsz);
- memset(e, 0, poolsz);
- memset(data, 0, poolsz);
- memset(tsize, 0, PTR * sizeof(int));
- memset(members, 0, PTR * sizeof(struct member_s *));
+ memset((char *)sym, 0, poolsz);
+ memset((char *)e, 0, poolsz);
+ memset((char *)data, 0, poolsz);
+ memset((char *)tsize, 0, PTR * sizeof(int));
+ memset((char *)members, 0, PTR * sizeof(struct member_s *));
p = "break case char default else enum if int return sizeof do struct switch while "
"EOF EXIT_SUCCESS EXIT_FAILURE NULL "
- "open read close printf malloc free memset memcmp getchar putstring putnl putchar putint isspace isdigit isalnum isalpha strcmp strdup exit void main";
+ "malloc free memcpy memset memcmp getchar putstring putnl putchar putint isspace isdigit isalnum isalpha strcmp strdup exit void main";
i = Break; while (i <= While) { next(); id->tk = i++; } /* add keywords to symbol table */
/* add library constants */
next(); id->class = Num; id->type = INT; id->val = -1;
next(); id->class = Num; id->type = INT; id->val = 0;
next(); id->class = Num; id->type = INT; id->val = 1;
next(); id->class = Num; id->type = INT; id->val = (int)NULL;
- i = OPEN; while (i <= EXIT) { next(); id->class = Sys; id->type = INT; id->val = i++; } /* add library to symbol table */
+ i = MALC; while (i <= EXIT) { next(); id->class = Sys; id->type = INT; id->val = i++; } /* add library to symbol table */
next(); id->tk = Char; /* handle void type */
next(); idmain = id; /* keep track of main */
- if (!(lp = p = malloc(poolsz))) { printf("could not malloc(%d) source area\n", poolsz); return -1; }
- if ((i = read(fd, p, poolsz-1)) <= 0) { printf("read() returned %d\n", i); return -1; }
+ if (!(startp = lp = p = malloc(poolsz))) { error("could not malloc source area"); }
+
+ i = 0; p[i] = getchar();
+ while (p[i]!=28 /* FS file separator */ && p[i]!=EOF) {
+ i++; p[i] = getchar();
+ if (i>=poolsz-1) { error("too small read buffer"); }
+ }
p[i] = 0;
- close(fd);
/* add primitive types */
tsize[tnew++] = sizeof(char);
@@ -522,13 +566,13 @@ int main(int argc, char **argv)
next();
i = 0;
while (tk != '}') {
- if (tk != Id) { printf("%d: bad enum identifier %d\n", line, tk); return -1; }
+ if (tk != Id) { error("bad enum identifier"); }
next();
if (tk == Assign) {
next();
neg = 0;
if (tk == Sub) { next(); neg =1; }
- if (tk != Num) { printf("%d: bad enum initializer\n", line); return -1; }
+ if (tk != Num) { error("bad enum initializer"); }
i = ival;
if (neg) {
i = -ival;
@@ -552,7 +596,7 @@ int main(int argc, char **argv)
}
if (tk == '{') {
next();
- if (members[bt]) { printf("%d: duplicate structure definition\n", line); return -1; }
+ if (members[bt]) { error("duplicate structure definition"); }
i = 0;
while (tk != '}') {
mbt = INT;
@@ -560,15 +604,15 @@ int main(int argc, char **argv)
else if (tk == Char) { next(); mbt = CHAR; }
else if (tk == Struct) {
next();
- if (tk != Id) { printf("%d: bad struct declaration\n", line); return -1; }
+ if (tk != Id) { error("bad struct declaration\n"); }
mbt = id->stype;
next();
}
while (tk != ';') {
ty = mbt;
while (tk == Mul) { next(); ty = ty + PTR; }
- if (tk != Id) { printf("%d: bad struct member definition\n", line); return -1; }
- m = malloc(sizeof(struct member_s));
+ if (tk != Id) { error("bad struct member definition"); }
+ m = (struct member_s *)malloc(sizeof(struct member_s));
m->id = id;
m->offset = i;
m->type = ty;
@@ -588,8 +632,8 @@ int main(int argc, char **argv)
while (tk != ';' && tk != '}') {
ty = bt;
while (tk == Mul) { next(); ty = ty + PTR; }
- if (tk != Id) { printf("%d: bad global declaration\n", line); return -1; }
- if (id->class) { printf("%d: duplicate global definition\n", line); return -1; }
+ if (tk != Id) { error("bad global declaration"); }
+ if (id->class) { error("duplicate global definition"); }
next();
id->type = ty;
if (tk == '(') { /* function */
@@ -602,13 +646,13 @@ int main(int argc, char **argv)
else if (tk == Char) { next(); ty = CHAR; }
else if (tk == Struct) {
next();
- if (tk != Id) { printf("%d: bad struct declaration\n", line); return -1; }
+ if (tk != Id) { error("bad struct declaration"); }
ty = id->stype;
next();
}
while (tk == Mul) { next(); ty = ty + PTR; }
- if (tk != Id) { printf("%d: bad parameter declaration\n", line); return -1; }
- if (id->class == Loc) { printf("%d: duplicate parameter definition\n", line); return -1; }
+ if (tk != Id) { error("bad parameter declaration"); }
+ if (id->class == Loc) { error("duplicate parameter definition"); }
id->hclass = id->class; id->class = Loc;
id->htype = id->type; id->type = ty;
id->hval = id->val; id->val = i++;
@@ -616,21 +660,21 @@ int main(int argc, char **argv)
if (tk == ',') next();
}
next();
- if (tk != '{') { printf("%d: bad function definition\n", line); return -1; }
+ if (tk != '{') { error("bad function definition"); }
loc = ++i;
next();
while (tk == Int || tk == Char || tk == Struct) {
if (tk == Int) bt = INT; else if (tk == Char) bt = CHAR; else {
next();
- if (tk != Id) { printf("%d: bad struct declaration\n", line); return -1; }
+ if (tk != Id) { error("bad struct declaration"); }
bt = id->stype;
}
next();
while (tk != ';') {
ty = bt;
while (tk == Mul) { next(); ty = ty + PTR; }
- if (tk != Id) { printf("%d: bad local declaration\n", line); return -1; }
- if (id->class == Loc) { printf("%d: duplicate local definition\n", line); return -1; }
+ if (tk != Id) { error("bad local declaration"); }
+ if (id->class == Loc) { error("duplicate local definition"); }
id->hclass = id->class; id->class = Loc;
id->htype = id->type; id->type = ty;
id->hval = id->val; id->val = ++i;
@@ -662,83 +706,113 @@ int main(int argc, char **argv)
next();
}
- if (!(pc = (int *)idmain->val)) { printf("main() not defined\n"); return -1; }
- if (src) return 0;
+ if (!(pc = (int *)idmain->val)) { error("main() not defined\n"); }
+ rt = 0;
+ if (!src) {
- /* setup stack */
- bp = sp = (int *)((int)sp + poolsz);
- *--sp = EXIT; /* call exit if main returns */
- *--sp = PSH; t = sp;
- *--sp = argc;
- *--sp = (int)argv;
- *--sp = (int)t;
+ /* setup stack */
+ bp = sp = (int *)((int)sp + poolsz);
+ *--sp = EXIT; /* call exit if main returns */
+ *--sp = PSH; t = sp;
+ *--sp = argc;
+ *--sp = (int)argv;
+ *--sp = (int)t;
- /* run... */
- cycle = 0;
- a = 0;
- while (1) {
- i = *pc++; ++cycle;
- if (debug) {
- printf("%d> %.4s", cycle,
- &"LEA ,IMM ,JMP ,JSR ,BZ ,BNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PSH ,"
- "OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD ,"
- "OPEN,READ,CLOS,PRTF,MALC,FREE,MSET,MCMP,GETC,PUTS,PUTN,PUTC,PUTI,ISPC,IDGT,IANU.IALP,SCMP,SDUP,EXIT,"[i * 5]);
- if (i <= ADJ) printf(" %d\n", *pc); else printf("\n");
- }
- switch (i) {
- case LEA: a = (int)(bp + *pc++); break; /* load local address */
- case IMM: a = *pc++; break; /* load global address or immediate */
- case JMP: pc = (int *)*pc; break; /* jump */
- case JSR: *--sp = (int)(pc + 1); pc = (int *)*pc; break; /* jump to subroutine */
- case BZ: pc = a ? pc + 1 : (int *)*pc; break; /* branch if zero */
- case BNZ: pc = a ? (int *)*pc : pc + 1; break; /* branch if not zero */
- case ENT: *--sp = (int)bp; bp = sp; sp = sp - *pc++; break; /* enter subroutine */
- case ADJ: sp = sp + *pc++; break; /* stack adjust */
- case LEV: sp = bp; bp = (int *)*sp++; pc = (int *)*sp++; break; /* leave subroutine */
- case LI: a = *(int *)a; break; /* load int */
- case LC: a = *(char *)a; break; /* load char */
- case SI: *(int *)*sp++ = a; break; /* store int */
- case SC: a = *(char *)*sp++ = a; break; /* store char */
- case PSH: *--sp = a; break; /* push */
+ /* run... */
+ cycle = 0;
+ a = 0;
+ run = 1;
+ while (run) {
+ i = *pc++; ++cycle;
+ if (debug) {
+ putint(cycle);
+ putstring("> ");
+ buf = (char *)malloc( 5 );
+ memcpy(buf, &"LEA ,IMM ,JMP ,JSR ,BZ ,BNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PSH ,"
+ "OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD ,"
+ "MALC,FREE,MCPY,MSET,MCMP,GETC,PUTS,PUTN,PUTC,PUTI,ISPC,IDGT,IANU.IALP,SCMP,SDUP,EXIT,"[i * 5],4);
+ buf[4] = 0;
+ putstring(buf);
+ free(buf);
+ if (i <= ADJ) {
+ putchar(' ');
+ putint(*pc);
+ }
+ putnl();
+ }
+ switch (i) {
+ case LEA: a = (int)(bp + *pc++); break; /* load local address */
+ case IMM: a = *pc++; break; /* load global address or immediate */
+ case JMP: pc = (int *)*pc; break; /* jump */
+ case JSR: *--sp = (int)(pc + 1); pc = (int *)*pc; break; /* jump to subroutine */
+ case BZ: pc = a ? pc + 1 : (int *)*pc; break; /* branch if zero */
+ case BNZ: pc = a ? (int *)*pc : pc + 1; break; /* branch if not zero */
+ case ENT: *--sp = (int)bp; bp = sp; sp = sp - *pc++; break; /* enter subroutine */
+ case ADJ: sp = sp + *pc++; break; /* stack adjust */
+ case LEV: sp = bp; bp = (int *)*sp++; pc = (int *)*sp++; break; /* leave subroutine */
+ case LI: a = *(int *)a; break; /* load int */
+ case LC: a = *(char *)a; break; /* load char */
+ case SI: *(int *)*sp++ = a; break; /* store int */
+ case SC: a = *(char *)*sp++ = a; break; /* store char */
+ case PSH: *--sp = a; break; /* push */
- case OR: a = *sp++ | a; break;
- case XOR: a = *sp++ ^ a; break;
- case AND: a = *sp++ & a; break;
- case EQ: a = *sp++ == a; break;
- case NE: a = *sp++ != a; break;
- case LT: a = *sp++ < a; break;
- case GT: a = *sp++ > a; break;
- case LE: a = *sp++ <= a; break;
- case GE: a = *sp++ >= a; break;
- case SHL: a = *sp++ << a; break;
- case SHR: a = *sp++ >> a; break;
- case ADD: a = *sp++ + a; break;
- case SUB: a = *sp++ - a; break;
- case MUL: a = *sp++ * a; break;
- case DIV: a = *sp++ / a; break;
- case MOD: a = *sp++ % a; break;
+ case OR: a = *sp++ | a; break;
+ case XOR: a = *sp++ ^ a; break;
+ case AND: a = *sp++ & a; break;
+ case EQ: a = *sp++ == a; break;
+ case NE: a = *sp++ != a; break;
+ case LT: a = *sp++ < a; break;
+ case GT: a = *sp++ > a; break;
+ case LE: a = *sp++ <= a; break;
+ case GE: a = *sp++ >= a; break;
+ case SHL: a = *sp++ << a; break;
+ case SHR: a = *sp++ >> a; break;
+ case ADD: a = *sp++ + a; break;
+ case SUB: a = *sp++ - a; break;
+ case MUL: a = *sp++ * a; break;
+ case DIV: a = *sp++ / a; break;
+ case MOD: a = *sp++ % a; break;
- case OPEN: a = open((char *)sp[1], *sp); break;
- case READ: a = read(sp[2], (char *)sp[1], *sp); break;
- case CLOS: a = close(*sp); break;
- case PRTF: t = sp + pc[1]; a = printf((char *)t[-1], t[-2], t[-3], t[-4], t[-5], t[-6]); break;
- case MALC: a = (int)malloc(*sp); break;
- case FREE: free((void *)*sp); break;
- case MSET: a = (int)memset((char *)sp[2], sp[1], *sp); break;
- case MCMP: a = memcmp((char *)sp[2], (char *)sp[1], *sp); break;
- case GETC: a = getchar(); break;
- case PUTS: t = sp + pc[1]; a = printf("%s", (char *)t[-1]); break;
- case PUTN: a = printf("\n"); break;
- case PUTC: a = putchar(*sp); break;
- case PUTI: t = sp + pc[1]; a = printf("%d", (int)t[-1]); break;
- case ISPC: t = sp + pc[1]; a = isspace( (int)t[-1]); break;
- case IDGT: t = sp + pc[1]; a = isdigit( (int)t[-1]); break;
- case IANU: t = sp + pc[1]; a = isalnum( (int)t[-1]); break;
- case IALP: t = sp + pc[1]; a = isalpha( (int)t[-1]); break;
- case SCMP: t = sp + pc[1]; a = strcmp((char *)t[-1], (char *)t[-2]); break;
- case SDUP: t = sp + pc[1]; a = (int)strdup((char *)t[-1]); break;
- case EXIT: /* printf("exit(%d) cycle = %d\n", *sp, cycle); */ return *sp;
- default: printf("unknown instruction = %d! cycle = %d\n", i, cycle); return -1;
+ case MALC: a = (int)malloc(*sp); break;
+ case FREE: free((void *)*sp); break;
+ case MCPY: a = (int)memcpy((char *)sp[2], (char *)sp[1], *sp); break;
+ case MSET: a = (int)memset((char *)sp[2], sp[1], *sp); break;
+ case MCMP: a = memcmp((char *)sp[2], (char *)sp[1], *sp); break;
+ case GETC: a = getchar(); break;
+ case PUTS: t = sp + pc[1]; a = putstring((char *)t[-1]); break;
+ case PUTN: a = putnl(); break;
+ case PUTC: a = putchar(*sp); break;
+ case PUTI: t = sp + pc[1]; a = putint((int)t[-1]); break;
+ case ISPC: t = sp + pc[1]; a = isspace( (int)t[-1]); break;
+ case IDGT: t = sp + pc[1]; a = isdigit( (int)t[-1]); break;
+ case IANU: t = sp + pc[1]; a = isalnum( (int)t[-1]); break;
+ case IALP: t = sp + pc[1]; a = isalpha( (int)t[-1]); break;
+ case SCMP: t = sp + pc[1]; a = strcmp((char *)t[-1], (char *)t[-2]); break;
+ case SDUP: t = sp + pc[1]; a = (int)strdup((char *)t[-1]); break;
+ case EXIT: /* putstring("exit("); putint(*sp); putstring(") cycle = "); putint(cycle); putnl(); */ rt = *sp; run = 0; break;
+ default: putstring("unknown instruction = "); putint(i); putstring("! cycle = "); putint(cycle); putnl(); rt = -1; run = 0; break;
+ }
+ }
+ }
+ free((char *)startp);
+ n = 0;
+ while (n<PTR) {
+ if (members[n]) {
+ m = members[n];
+ while (m) {
+ mtmp = m;
+ m = m->next;
+ free((char *)mtmp);
+ }
}
+ n++;
}
+ free((char *)members);
+ free((char *)tsize);
+ free((char *)ssp);
+ free((char *)sdata);
+ free((char *)se);
+ free((char *)sym);
+ exit(rt);
+ return rt;
}
diff --git a/miniany/doc/journal.stuffwithstuff.com_2011_03_19_pratt-parsers-expression-parsing-made-easy.txt b/miniany/doc/journal.stuffwithstuff.com_2011_03_19_pratt-parsers-expression-parsing-made-easy.txt
new file mode 100644
index 0000000..b1b1b7d
--- /dev/null
+++ b/miniany/doc/journal.stuffwithstuff.com_2011_03_19_pratt-parsers-expression-parsing-made-easy.txt
@@ -0,0 +1,564 @@
+ #[1]RSS 2.0 [2]Atom 1.0
+
+[3]Pratt Parsers: Expression Parsing Made Easy
+
+ [4]&#8617; [5]&#8618;
+
+[6]March 19, 2011 [7]code [8]java [9]js [10]language [11]magpie [12]parsing
+
+ Every now and then, I stumble onto some algorithm or idea that's so
+ clever and such a perfect solution to a problem that I feel like I got
+ smarter or gained [13]a new superpower just by learning it. [14]Heaps
+ (just about the only thing I got out of my truncated CS education) were
+ one thing like this. I recently stumbled onto another: [15]Pratt
+ parsers.
+
+ When you're writing a parser, [16]recursive descent is as easy as
+ spreading peanut butter. It excels when you can figure out what to do
+ next based on the next chunk of code you're parsing. That's usually
+ true at the top level of a language where things like classes are and
+ also for statements since most start with something that uniquely
+ identifies them (if, for, while, etc.).
+
+ But it gets tricky when you get to expressions. When it comes to infix
+ operators like +, postfix ones like ++, and even mixfix expressions
+ like ?:, it can be hard to tell what kind of expression you're parsing
+ until you're halfway through it. You can do this with recursive
+ descent, but it's a chore. You have to write separate functions for
+ each level of precedence (JavaScript has 17 of them, for example),
+ manually handle associativity, and smear your grammar across a bunch of
+ parsing code until it's hard to see.
+
+PB & J, The Secret Weapon
+
+ Pratt parsing fixes just that. If recursive descent is peanut butter,
+ Pratt parsing is jelly. When you mix the two together, you get a
+ simple, terse, readable parser that can handle any grammar you throw at
+ it.
+
+ Pratt's technique for handling operator precedence and infix
+ expressions is so simple and effective it's a mystery why almost no one
+ knows about it. After the seventies, top down operator precedence
+ parsers seem to have fallen off the Earth. Douglas Crockford's
+ [17]JSLint uses one to [18]parse JavaScript, but his treatment is one
+ of the [19]very few remotely modern articles about it.
+
+ Part of the problem, I think, is that Pratt's terminology is opaque,
+ and Crockford's article is itself rather murky. Pratt uses terms like
+ "null denominator" and Crockford mixes in extra stuff like tracking
+ lexical scope that obscures the core idea.
+
+ This is where I come in. I won't do anything revolutionary. I'll just
+ try to get the core concepts behind top down operator precedence
+ parsers and present them as clearly as I can. I'll switch out some
+ terms to (I hope) clarify things. Hopefully I won't offend anyone's
+ purist sensibilities. I'll be coding in Java, the vulgar Latin of
+ programming languages. I figure if you can write it in Java, you can
+ write it in anything.
+
+What We'll Be Making
+
+ I'm a learn-by-doing person, which means I'm also a teach-by-doing one.
+ So to show how Pratt parsers work, we'll build a parser for a [20]tiny
+ little toy language called Bantam. It just has expressions since that's
+ where Pratt parsing is really helpful, but that should be enough to
+ convince of its usefulness.
+
+ Even though it's simple, it has a full gamut of operators: prefix (+,
+ -, ~, !), postfix (!), infix (+, -, *, /, ^), and even a mixfix
+ conditional operator (?:). It has multiple precedence levels and both
+ right and left associative operators. It also has assignment, function
+ calls and parentheses for grouping. If we can parse this, we can parse
+ anything.
+
+What We'll Start With
+
+ All we care about is parsing, so we'll ignore the tokenizing phase. I
+ slapped together [21]a crude lexer that works and we'll just pretend
+ that tokens are raining down from heaven or something.
+
+ A token is just a chunk of meaningful code with a type and a string
+ associated with it. Given a + b(c), the tokens would be:
+NAME "a"
+PLUS "+"
+NAME "b"
+LEFT_PAREN "("
+NAME "c"
+RIGHT_PAREN ")"
+
+ Likewise, we won't be interpreting or compiling this code. We just want
+ to parse it to a nice data structure. For our purposes, that means our
+ parser should chew up a bunch of [22]Token objects and spit out an
+ instance of some class that implements [23]Expression. To give you an
+ idea, here's a simplified version of the class for a [24]conditional
+ expression:
+class ConditionalExpression implements Expression {
+ public ConditionalExpression(
+ Expression condition,
+ Expression thenArm,
+ Expression elseArm) {
+ this.condition = condition;
+ this.thenArm = thenArm;
+ this.elseArm = elseArm;
+ }
+
+ public final Expression condition;
+ public final Expression thenArm;
+ public final Expression elseArm;
+}
+
+ (You gotta love Java's "please sign it in quadruplicate" level of
+ bureaucracy here. Like I said, if you can do this in Java, you can do
+ it in any language.)
+
+ We'll be building this starting from a simple [25]Parser class. This
+ owns the token stream, handles lookahead and provides the basic methods
+ you'll need to write a top-down recursive descent parser with a single
+ token of lookahead (i.e. it's LL(1)). This is enough to get us going.
+ If we need more later, it's easy to extend it.
+
+ OK, let's build ourselves a parser!
+
+First Things First
+
+ Even though a "full" Pratt parser is pretty tiny, I found it to be a
+ bit hard to decipher. Sort of like [26]quicksort, the implementation is
+ a deceptively-simple handful of deeply intertwined code. To untangle
+ it, we'll build it up one tiny step at a time.
+
+ The simplest expressions to parse are prefix operators and single-token
+ ones. For those, the current token tells us all that we need to do.
+ Bantam has one single-token expression, named variables, and four
+ prefix operators: +, -, ~, and !. The simplest possible code to parse
+ that would be:
+Expression parseExpression() {
+ if (match(TokenType.NAME)) // return NameExpression...
+ else if (match(TokenType.PLUS)) // return prefix + operator...
+ else if (match(TokenType.MINUS)) // return prefix - operator...
+ else if (match(TokenType.TILDE)) // return prefix ~ operator...
+ else if (match(TokenType.BANG)) // return prefix ! operator...
+ else throw new ParseException();
+}
+
+ But that's a bit monolithic. As you can see, we're switching off of a
+ TokenType to branch to different parsing behavior. Let's encode that
+ directly by making a Map from TokenTypes to chunks of parsing code.
+ We'll call these chunks "parselets", and they will implement this:
+interface PrefixParselet {
+ Expression parse(Parser parser, Token token);
+}
+
+ An implementation of this to parse variable names is just:
+class NameParselet implements PrefixParselet {
+ public Expression parse(Parser parser, Token token) {
+ return new NameExpression(token.getText());
+ }
+}
+
+ We can use a single class for all of the prefix operators since they
+ only differ in the actual operator token itself:
+class PrefixOperatorParselet implements PrefixParselet {
+ public Expression parse(Parser parser, Token token) {
+ Expression operand = parser.parseExpression();
+ return new PrefixExpression(token.getType(), operand);
+ }
+}
+
+ You'll note that it calls back into parseExpression() to parse the
+ operand that appears after the operator (i.e. to parse the a in -a).
+ This recursion takes care of nested operators like -+~!a.
+
+ Back in Parser, the chained if statements are replaced with a cleaner
+ map:
+class Parser {
+ public void register(TokenType token, PrefixParselet parselet) {
+ mPrefixParselets.put(token, parselet);
+ }
+
+ public Expression parseExpression() {
+ Token token = consume();
+ PrefixParselet prefix = mPrefixParselets.get(token.getType());
+
+ if (prefix == null) throw new ParseException(
+ "Could not parse \"" + token.getText() + "\".");
+
+ return prefix.parse(this, token);
+ }
+
+ // Other stuff...
+
+ private final Map<TokenType, PrefixParselet> mPrefixParselets =
+ new HashMap<TokenType, PrefixParselet>();
+}
+
+ To define the grammar we have so far (variables and the four prefix
+ operators), we'll make this helper method:
+void prefix(TokenType token) {
+ register(token, new PrefixOperatorParselet());
+}
+
+ And now we can define the grammar like:
+register(TokenType.NAME, new NameParselet());
+prefix(TokenType.PLUS);
+prefix(TokenType.MINUS);
+prefix(TokenType.TILDE);
+prefix(TokenType.BANG);
+
+ This is already an improvement over a recursive descent parser because
+ our grammar is now more declarative instead of being spread out over a
+ few imperative functions, and we can see the actual grammar all in one
+ place. Even better, we can extend the grammar just by registering new
+ parselets. We don't have to change the Parser class itself.
+
+ If we only had prefix expressions, we'd be done now. Alas, we don't.
+
+Stuck In the Middle
+
+ What we have so far only works if the first token tells us what kind of
+ expression we're parsing, but that isn't always the case. With an
+ expression like a + b, we don't know we have an add expression until
+ after we parse the a and get to +. We'll have to extend the parser to
+ support that.
+
+ Fortunately, we're in a good place to do so. Our current
+ parseExpression() method will parse a complete prefix expression
+ including any nested prefix expressions and then stop. So, if we throw
+ this at it:
+-a + b
+
+ It will parse -a and leave us sitting on +. That's exactly the token we
+ need to tell what infix expression we're parsing. The only difference
+ between an infix expression and a prefix one here is that there's
+ another expression before the infix operator that it needs to have as
+ an argument. Let's define a parselet that supports that:
+interface InfixParselet {
+ Expression parse(Parser parser, Expression left, Token token);
+}
+
+ The only difference is that left argument, which is just the expression
+ we parsed before we got to the infix token. We'll wire this up to our
+ parser by having another table of infix parselets.
+
+ Having separate tables for prefix and infix expressions is important
+ because we'll often have both a prefix and infix parselet for a single
+ TokenType. For example, the prefix parselet for ( handles grouping in
+ an expression like a * (b + c). Meanwhile, the infix parselet handles
+ function calls like a(b).
+
+ Now, after we parse the prefix expression, we hand it off to any infix
+ one that subsumes it:
+class Parser {
+ public void register(TokenType token, InfixParselet parselet) {
+ mInfixParselets.put(token, parselet);
+ }
+
+ public Expression parseExpression() {
+ Token token = consume();
+ PrefixParselet prefix = mPrefixParselets.get(token.getType());
+
+ if (prefix == null) throw new ParseException(
+ "Could not parse \"" + token.getText() + "\".");
+
+ Expression left = prefix.parse(this, token);
+
+ token = lookAhead(0);
+ InfixParselet infix = mInfixParselets.get(token.getType());
+
+ // No infix expression at this point, so we're done.
+ if (infix == null) return left;
+
+ consume();
+ return infix.parse(this, left, token);
+ }
+
+ // Other stuff...
+
+ private final Map<TokenType, InfixParselet> mInfixParselets =
+ new HashMap<TokenType, InfixParselet>();
+}
+
+ Pretty straightforward. We can implement an infix parselet for binary
+ arithmetic operators like + using something like:
+class BinaryOperatorParselet implements InfixParselet {
+ public Expression parse(Parser parser,
+ Expression left, Token token) {
+ Expression right = parser.parseExpression();
+ return new OperatorExpression(left, token.getType(), right);
+ }
+}
+
+ This also works for postfix operators. I'm calling them "infix"
+ parselets, but they're really "anything but prefix". If there's some
+ expression that comes before the token, it will be handled by an infix
+ parselet, and that includes postfix expressions and mixfix ones like
+ ?:.
+
+ Postfix is as easy as a single-token prefix parselet: it just takes the
+ left expression and wraps it in another expression:
+class PostfixOperatorParselet implements InfixParselet {
+ public Expression parse(Parser parser, Expression left,
+ Token token) {
+ return new PostfixExpression(left, token.getType());
+ }
+}
+
+ Mixfix is easy too. It's pretty much a familiar recursive descent
+ parser:
+class ConditionalParselet implements InfixParselet {
+ public Expression parse(Parser parser, Expression left,
+ Token token) {
+ Expression thenArm = parser.parseExpression();
+ parser.consume(TokenType.COLON);
+ Expression elseArm = parser.parseExpression();
+
+ return new ConditionalExpression(left, thenArm, elseArm);
+ }
+}
+
+ Now we can parse prefix expressions, postfix, infix, and even mixfix.
+ With a pretty small amount of code, we can parse expressions like a +
+ (b ? c! : -d). We're done, right? Well... almost.
+
+Excuse You, Aunt Sally
+
+ Our parser can parse all of this stuff, but it doesn't parse it with
+ the right precedence or associativity. If you throw a - b - c at it, it
+ will parse it like a - (b - c), which isn't right. (Well, actually it
+ is right--associative that is. We need it to be left.)
+
+ And this last step is where Pratt parsers go from pretty nice to
+ totally radical. We'll make two simple changes. We'll extend
+ parseExpression() to take a precedence--a number that tells which
+ expressions can be parsed by that call. If it encounters an expression
+ whose precedence is lower than we allow, it just stops parsing and
+ returns what it has so far.
+
+ To make that check we need to know the precedence of any given infix
+ expression. We'll do that by letting the parselet specify it:
+public interface InfixParselet {
+ Expression parse(Parser parser, Expression left, Token token);
+ int getPrecedence();
+}
+
+ Using that, our core expression parser becomes:
+public Expression parseExpression(int precedence) {
+ Token token = consume();
+ PrefixParselet prefix = mPrefixParselets.get(token.getType());
+
+ if (prefix == null) throw new ParseException(
+ "Could not parse \"" + token.getText() + "\".");
+
+ Expression left = prefix.parse(this, token);
+
+ while (precedence < getPrecedence()) {
+ token = consume();
+
+ InfixParselet infix = mInfixParselets.get(token.getType());
+ left = infix.parse(this, left, token);
+ }
+
+ return left;
+}
+
+ That relies on a tiny helper function to get the precedence of the
+ current token or default if there's no infix parselet for it:
+private int getPrecedence() {
+ InfixParselet parser = mInfixParselets.get(
+ lookAhead(0).getType());
+ if (parser != null) return parser.getPrecedence();
+
+ return 0;
+}
+
+ And that's it. To use this, we'll set up a little precedence table:
+public class Precedence {
+ public static final int ASSIGNMENT = 1;
+ public static final int CONDITIONAL = 2;
+ public static final int SUM = 3;
+ public static final int PRODUCT = 4;
+ public static final int EXPONENT = 5;
+ public static final int PREFIX = 6;
+ public static final int POSTFIX = 7;
+ public static final int CALL = 8;
+}
+
+ To make our operators correctly handle precedence, they'll pass an
+ appropriate value back into parseExpression() when they call it
+ recursively. For example, the [27]BinaryOperatorParselet instance that
+ handles the + operator will pass in Precedence.SUM when it parses its
+ right-hand operand.
+
+ Associativity is easy too. If an infix parselet calls parseExpression()
+ with the same precedence that it returns for its own getPrecedence()
+ call, you'll get left associativity. To be right-associative, it just
+ needs to pass in one less than that instead.
+
+Go Forth and Multiply
+
+ I've rewritten the [28]parser for Magpie using this and it worked like
+ a charm. I'm also working on a JavaScript parser based on this and
+ again it's been a great fit.
+
+ I find parsing like this to be simple, terse, extensible (Magpie, for
+ example, uses this to [29]let you extend its own syntax at runtime),
+ and easy to read. I'm at the point where I can't imagine writing a
+ parser any other way. I never thought I'd say this, but parsers are
+ easy now.
+
+ To see for yourself, just take a look at [30]the complete program.
+ Please enable JavaScript to view the [31]comments powered by Disqus.
+
+ [32][dogshot_square.jpg]
+
+ Hi! I'm Bob Nystrom, the one on the left.
+
+ I wrote a book called [33]Game Programming Patterns. I'm working on
+ another book called [34]Crafting Interpreters.
+
+ You can email me at robert at this site or follow me on twitter at
+ [35]@munificentbob.
+
+Elsewhere
+
+ * Code at [36]github
+ * Tweets at [37]twitter
+ * Photos at [38]500px
+ * Photos at [39]flickr
+
+Categories
+
+ * [40]code 67
+ * [41]language 43
+ * [42]magpie 24
+ * [43]c-sharp 13
+ * [44]dart 13
+ * [45]game-dev 12
+ * [46]java 10
+ * [47]cpp 8
+ * [48]design 7
+ * [49]game-patterns 6
+ * [50]parsing 6
+ * [51]roguelike 6
+ * [52]book 5
+ * [53]go 5
+ * [54]js 4
+ * [55]personal 4
+ * [56]c 3
+ * [57]finch 3
+ * [58]python 3
+ * [59]ruby 3
+ * [60]blog 2
+ * [61]f-sharp 2
+ * [62]lua 2
+ * [63]music 2
+ * [64]ai 1
+ * [65]beta 1
+ * [66]blogofile 1
+ * [67]game 1
+ * [68]jasic 1
+ * [69]javascript 1
+ * [70]oop 1
+ * [71]optimization 1
+ * [72]oscon 1
+ * [73]politics 1
+ * [74]scheme 1
+ * [75]typescript 1
+ * [76]visualization 1
+
+ All [77]76 articles...
+
+ This blog is built using [78]jekyll. The source repo for it is
+ [79]here.
+
+ © 2008-2021 Robert Nystrom
+
+References
+
+ Visible links:
+ 1. https://journal.stuffwithstuff.com/rss.xml
+ 2. https://journal.stuffwithstuff.com/atom.xml
+ 3. https://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/
+ 4. https://journal.stuffwithstuff.com/2011/02/13/extending-syntax-from-within-a-language/
+ 5. https://journal.stuffwithstuff.com/2011/04/21/multimethods-multiple-inheritance-multiawesome/
+ 6. https://journal.stuffwithstuff.com/archive
+ 7. https://journal.stuffwithstuff.com/category/code
+ 8. https://journal.stuffwithstuff.com/category/java
+ 9. https://journal.stuffwithstuff.com/category/js
+ 10. https://journal.stuffwithstuff.com/category/language
+ 11. https://journal.stuffwithstuff.com/category/magpie
+ 12. https://journal.stuffwithstuff.com/category/parsing
+ 13. http://xkcd.com/208/
+ 14. http://en.wikipedia.org/wiki/Heap_%28data_structure%29
+ 15. http://en.wikipedia.org/wiki/Vaughan_Pratt
+ 16. http://en.wikipedia.org/wiki/Recursive_descent
+ 17. http://www.jslint.com/
+ 18. http://javascript.crockford.com/tdop/tdop.html
+ 19. http://effbot.org/zone/simple-top-down-parsing.htm
+ 20. https://github.com/munificent/bantam
+ 21. https://github.com/munificent/bantam/blob/master/src/com/stuffwithstuff/bantam/Lexer.java
+ 22. https://github.com/munificent/bantam/blob/master/src/com/stuffwithstuff/bantam/Token.java
+ 23. https://github.com/munificent/bantam/blob/master/src/com/stuffwithstuff/bantam/expressions/Expression.java
+ 24. https://github.com/munificent/bantam/blob/master/src/com/stuffwithstuff/bantam/expressions/ConditionalExpression.java
+ 25. https://github.com/munificent/bantam/blob/master/src/com/stuffwithstuff/bantam/Parser.java
+ 26. http://en.wikipedia.org/wiki/Quicksort
+ 27. https://github.com/munificent/bantam/blob/master/src/com/stuffwithstuff/bantam/parselets/BinaryOperatorParselet.java
+ 28. https://github.com/munificent/magpie/blob/master/src/com/stuffwithstuff/magpie/parser/MagpieParser.java
+ 29. http://journal.stuffwithstuff.com/2011/02/13/extending-syntax-from-within-a-language/
+ 30. http://github.com/munificent/bantam
+ 31. http://disqus.com/?ref_noscript
+ 32. https://journal.stuffwithstuff.com/
+ 33. http://gameprogrammingpatterns.com/
+ 34. http://craftinginterpreters.com/
+ 35. https://twitter.com/intent/user?screen_name=munificentbob
+ 36. http://github.com/munificent
+ 37. http://twitter.com/munificentbob
+ 38. https://500px.com/munificent
+ 39. http://www.flickr.com/photos/bobisbob/
+ 40. https://journal.stuffwithstuff.com/category/code
+ 41. https://journal.stuffwithstuff.com/category/language
+ 42. https://journal.stuffwithstuff.com/category/magpie
+ 43. https://journal.stuffwithstuff.com/category/c-sharp
+ 44. https://journal.stuffwithstuff.com/category/dart
+ 45. https://journal.stuffwithstuff.com/category/game-dev
+ 46. https://journal.stuffwithstuff.com/category/java
+ 47. https://journal.stuffwithstuff.com/category/cpp
+ 48. https://journal.stuffwithstuff.com/category/design
+ 49. https://journal.stuffwithstuff.com/category/game-patterns
+ 50. https://journal.stuffwithstuff.com/category/parsing
+ 51. https://journal.stuffwithstuff.com/category/roguelike
+ 52. https://journal.stuffwithstuff.com/category/book
+ 53. https://journal.stuffwithstuff.com/category/go
+ 54. https://journal.stuffwithstuff.com/category/js
+ 55. https://journal.stuffwithstuff.com/category/personal
+ 56. https://journal.stuffwithstuff.com/category/c
+ 57. https://journal.stuffwithstuff.com/category/finch
+ 58. https://journal.stuffwithstuff.com/category/python
+ 59. https://journal.stuffwithstuff.com/category/ruby
+ 60. https://journal.stuffwithstuff.com/category/blog
+ 61. https://journal.stuffwithstuff.com/category/f-sharp
+ 62. https://journal.stuffwithstuff.com/category/lua
+ 63. https://journal.stuffwithstuff.com/category/music
+ 64. https://journal.stuffwithstuff.com/category/ai
+ 65. https://journal.stuffwithstuff.com/category/beta
+ 66. https://journal.stuffwithstuff.com/category/blogofile
+ 67. https://journal.stuffwithstuff.com/category/game
+ 68. https://journal.stuffwithstuff.com/category/jasic
+ 69. https://journal.stuffwithstuff.com/category/javascript
+ 70. https://journal.stuffwithstuff.com/category/oop
+ 71. https://journal.stuffwithstuff.com/category/optimization
+ 72. https://journal.stuffwithstuff.com/category/oscon
+ 73. https://journal.stuffwithstuff.com/category/politics
+ 74. https://journal.stuffwithstuff.com/category/scheme
+ 75. https://journal.stuffwithstuff.com/category/typescript
+ 76. https://journal.stuffwithstuff.com/category/visualization
+ 77. https://journal.stuffwithstuff.com/archive
+ 78. http://jekyllrb.com/
+ 79. https://github.com/munificent/journal
+
+ Hidden links:
+ 81. https://www.reddit.com/submit?url=http://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy//
+ 82. https://news.ycombinator.com/submitlink?u=http://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy//&t=Pratt%20Parsers:%20Expression%20Parsing%20Made%20Easy
+ 83. http://twitter.com/share?url=http://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/&text=%22Pratt%20Parsers:%20Expression%20Parsing%20Made%20Easy%22%20%40munificentbob
+ 84. http://www.facebook.com/share.php?u=http://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/
+ 85. https://plus.google.com/share?url=http://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/
+ 86. https://journal.stuffwithstuff.com/rss.xml
diff --git a/miniany/hello.c b/miniany/hello.c
index ab06506..062ab5f 100644
--- a/miniany/hello.c
+++ b/miniany/hello.c
@@ -2,6 +2,7 @@
int main()
{
- printf("hello, world\n");
+ putstring("hello, world");
+ putnl();
return 0;
}
diff --git a/miniany/libc-freestanding.c b/miniany/libc-freestanding.c
index 31808e7..a68a0c5 100644
--- a/miniany/libc-freestanding.c
+++ b/miniany/libc-freestanding.c
@@ -30,6 +30,33 @@ int strcmp( char *s1, char *s2 )
return *s1 - *s2;
}
+int memcmp( char *s1, char *s2, int n )
+{
+ while( n > 0 ) {
+ if( *s1 != *s2 ) {
+ return *s1 - *s2;
+ }
+ n--;
+ s1++;
+ s2++;
+ }
+
+ return 0;
+}
+
+char *memset( char *d, int c, int n )
+{
+ char *s = d;
+
+ while( n > 0 ) {
+ *s = (char)c;
+ s++;
+ n--;
+ }
+
+ return d;
+}
+
int isspace( int c )
{
if( c == ' ' || c == '\r' || c == '\n' || c == '\t' ) return 1;
@@ -132,9 +159,10 @@ enum {
static void print_char( int fd, int c )
{
- /* gcc 10 and 11 think on -O1 and -O2 that s can be held in registers, thus
- * clobbering the output? Let's put a volatile here seems to help. Neither
- * clang nor tcc show this behaviour..
+ /* gcc 10 and 11 think on -Os, -O1 and -O2 that 's' can be held
+ * in registers, thus clobbering the output? Let's put a
+ * volatile here seems to help. Neither clang, pcc or tcc show
+ * this behaviour..
*/
volatile char s[1];
s[0] = c;
@@ -396,17 +424,18 @@ void free( char *ptr )
}
}
-char *memcpy( char *d, char *s, int len )
+char *memcpy( char *d, char *s, int n )
{
char *p;
char *q;
p = s;
q = d;
- while( len-- ) {
+ while( n > 0 ) {
*q = *p;
q++;
p++;
+ n--;
}
return d;
diff --git a/miniany/test1.hexmust b/miniany/test1.hexmust
new file mode 100644
index 0000000..1f67e53
--- /dev/null
+++ b/miniany/test1.hexmust
@@ -0,0 +1,8 @@
+00000000 b8 0c 00 00 00 bb 19 00 00 00 b9 05 00 00 00 ba |................|
+00000010 02 00 00 00 be 03 00 00 00 50 89 d0 f7 e6 89 c2 |.........P......|
+00000020 58 29 d1 50 52 89 d8 ba 00 00 00 00 f7 f1 89 c3 |X).PR...........|
+00000030 5a 58 01 d8 a3 65 00 00 01 a1 65 00 00 01 bb 03 |ZX...e....e.....|
+00000040 00 00 00 b9 03 00 00 00 ba 04 00 00 00 50 52 89 |.............PR.|
+00000050 c8 f7 e2 89 c1 5a 58 01 cb f7 f3 a3 61 00 00 01 |.....ZX.....a...|
+00000060 f4 00 00 00 00 00 00 00 00 |.........|
+00000069
diff --git a/miniany/torture.sh b/miniany/torture.sh
index 530cd7c..c3d2db7 100755
--- a/miniany/torture.sh
+++ b/miniany/torture.sh
@@ -2,14 +2,26 @@
for COMPILER in gcc clang pcc tcc; do
for MODE in freestanding hosted; do
- for LEVEL in d 0 1 2 3; do
-# freestanding c4 needs much more work
+ for LEVEL in d s 0 1 2 3; do
+ echo "## $COMPILER $MODE $LEVEL ##"
+# pcc and hosted c4 fail, use another compiler for this case for now
# ./build.sh c4 $COMPILER $MODE $LEVEL
- $COMPILER -m32 -o c4 c4.c
+ if test "$COMPILER" = "pcc"; then
+ ./build.sh c4 tcc $MODE $LEVEL
+ else
+ ./build.sh c4 $COMPILER $MODE $LEVEL
+ fi
./build.sh cc $COMPILER $MODE $LEVEL
- ./cc < test1.c
- ./c4 cc.c < test1.c
- ./c4 c4.c cc.c < test1.c
+ ./cc < test1.c > test1.asm
+ cat cc.c EOF test1.c | ./c4 > test1-c4.asm
+ cat c4.c EOF cc.c EOF test1.c | ./c4 > test1-c4-c4.asm
+ diff test1.asm test1-c4.asm
+ diff test1.asm test1-c4-c4.asm
+ fasm test1.asm test1.bin
+ hexdump -C test1.bin > test1.hex
+ diff test1.hex test1.hexmust
+ ndisasm -b32 -o1000000h -a test1.bin > test1.dis
+ diff test1.dis test1.disasmust
done
done
done