summaryrefslogtreecommitdiff
path: root/miniany/doc/Writing a C Compiler, Part 1.html
blob: bb8d2e0f87dda482de0193520540b5cf7c7df4a3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
<!DOCTYPE html>
<!-- saved from url=(0056)https://norasandler.com/2017/11/29/Write-a-Compiler.html -->
<html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <title>Writing a C Compiler, Part 1</title>
  <meta name="description" content="This is the first post in a series on writing your own C compiler. Here are some reasons to write a compiler:">

  <link rel="stylesheet" href="./Writing a C Compiler, Part 1_files/main.css">
  <link rel="canonical" href="https://norasandler.com/2017/11/29/Write-a-Compiler.html">
  <link rel="alternate" type="application/rss+xml" title="Nora Sandler" href="https://norasandler.com/feed.xml">
  
</head>


  <body>

    <header class="site-header" role="banner">

  <div class="wrapper">
    
    
    <a class="site-title" href="https://norasandler.com/">Nora Sandler</a>
  
    
      <nav class="site-nav">
        <input type="checkbox" id="nav-trigger" class="nav-trigger">
        <label for="nav-trigger">
          <span class="menu-icon">
            <svg viewBox="0 0 18 15" width="18px" height="15px">
              <path fill="#424242" d="M18,1.484c0,0.82-0.665,1.484-1.484,1.484H1.484C0.665,2.969,0,2.304,0,1.484l0,0C0,0.665,0.665,0,1.484,0 h15.031C17.335,0,18,0.665,18,1.484L18,1.484z"></path>
              <path fill="#424242" d="M18,7.516C18,8.335,17.335,9,16.516,9H1.484C0.665,9,0,8.335,0,7.516l0,0c0-0.82,0.665-1.484,1.484-1.484 h15.031C17.335,6.031,18,6.696,18,7.516L18,7.516z"></path>
              <path fill="#424242" d="M18,13.516C18,14.335,17.335,15,16.516,15H1.484C0.665,15,0,14.335,0,13.516l0,0 c0-0.82,0.665-1.484,1.484-1.484h15.031C17.335,12.031,18,12.696,18,13.516L18,13.516z"></path>
            </svg>
          </span>
        </label>

        <div class="trigger">
          
            
            
              
            
          
            
            
              
              <a class="page-link" href="https://norasandler.com/about/">About</a>
              
            
          
            
            
              
              <a class="page-link" href="https://norasandler.com/archive/">Archive</a>
              
            
          
            
            
          
            
            
              
            
          
            
            
              
            
          
            
            
              
            
          
          <a class="page-link" href="https://github.com/nlsandler">Github</a>
          <a href="https://norasandler.com/feed.xml"><img id="rss" height="20" width="20" src="./Writing a C Compiler, Part 1_files/rss.png"></a>

        </div>
      </nav>
    
  </div>
</header>


    <main class="page-content" aria-label="Content">
      <div class="wrapper">
        <article class="post h-entry" itemscope="" itemtype="http://schema.org/BlogPosting">

  <header class="post-header">
    <h1 class="post-title p-name" itemprop="name headline">Writing a C Compiler, Part 1</h1>
    <p class="post-meta">
      <time class="dt-published" datetime="2017-11-29T22:00:00+00:00" itemprop="datePublished">Nov 29, 2017
      </time></p>
  </header>

  <div class="post-content e-content" itemprop="articleBody">
    <p>This is the first post in a series on writing your own C compiler. Here are some reasons to write a compiler:</p>

<ol>
  <li>You’ll learn about abstract syntax trees (ASTs) and how programs can represent and manipulate other programs. Handy for working with linters, static analyzers, and metaprogramming of all sorts.</li>
  <li>You’ll learn about assembly, calling conventions, and all the gritty, low-level details of how computers, like, do stuff.</li>
  <li>It seems like an impossibly hard project (but isn’t!), so writing one will make you feel like a badass.</li>
</ol>

<p>I’ve been working on my own C compiler, <a href="https://github.com/nlsandler/nqcc">nqcc</a> for the past several weeks, using Abdulaziz Ghuloum’s <a href="http://scheme2006.cs.uchicago.edu/11-ghuloum.pdf">An Incremental Approach to Compiler Construction</a> as a roadmap. I really like Ghuloum’s approach: you start by compiling a tiny, trivial subset of your source language all the way down to <a href="https://drawings.jvns.ca/assembly/">x86 assembly</a>. Then you add new language features, one step at a time. In step one, you just return constants; in a later step you handle addition and subtraction; and so on. Every step is small enough to feel manageable, and at the end of the every step you have a working compiler.</p>

<p>This series is adapted from Ghuloum’s paper - the original paper is about compiling Scheme, so I had to make some adjustments to compile C instead. I’ll cover arithmetic operations, conditionals, local variables, function calls, and perhaps more. I’ve also written some <a href="https://github.com/nlsandler/write_a_c_compiler">test programs</a> that you can use to validate that each stage of your compiler works correctly.</p>

<h1 id="preliminaries">Preliminaries</h1>

<p>Before you start, you need to decide on two things: what language to write your compiler in, and how to handle parsing and lexing. You can implement the compiler in whatever language you like, but I’d recommend using a language with sum types and pattern matching<sup id="anchor1"><a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#fn1">1</a></sup>, like OCaml, Haskell, or Rust. It will be SO MUCH EASIER to build and traverse an AST if you do. I started writing nqcc in Python, which I know very well, then got fed up and switched to OCaml, which I didn’t know well at all, and it was definitely worth it.</p>

<p>You also need to decide whether to write your own parser and lexer or use automatic parser and scanner generators (e.g. <a href="https://github.com/westes/flex">flex</a> and <a href="https://www.gnu.org/software/bison/">bison</a>). In this series of posts, I’ll show you how to write a lexer (or scanner) and recursive descent parser by hand. Using a parser generator is probably easier, but I haven’t tried it so I could be wrong. You could probably also use a scanner generator for lexing, but hand-write your own parser. Basically, do whatever you like, but I’m only going to talk about hand-writing a lexer and parser for the rest of this series, so if you want to use bison and flex you’re on your own.</p>

<h2 id="update-21819">Update 2/18/19</h2>

<p>There’s one more thing you need to decide on: whether to target <a href="https://en.wikipedia.org/wiki/IA-32">32-bit</a> or <a href="https://en.wikipedia.org/wiki/X86-64">64-bit</a> assembly. This series uses 32-bit architecture because that’s what Ghuloum’s paper used. However, I’ve realized since starting the series that this was a bad call. Because 32-bit architecture is increasingly obsolete, compiling and running 32-bit binaries can be a headache. I’ve decided to go back and add 64-bit examples to this series when I get the chance. Until I do that, you have one of two options:</p>

<ol>
  <li>Figure out on your own how to adapt these posts to a 64-bit instruction set. If you’re at all familiar with assembly, this isn’t too hard and it’s what I’d recommend.</li>
  <li>
    <p>Stick with the 32-bit instruction set I’ve used in these posts. This will require a little extra work up front, depending on your OS:</p>

    <ul>
      <li>
        <p>On Linux, you’ll need to install some extra libraries in order to turn your 32-bit assembly into an executable. <a href="https://github.com/namin/inc/blob/master/Dockerfile">This Dockerfile</a> lists the libraries you’ll need (plus some Scheme-related stuff you can ignore). Many thanks to Jaseem Abid, who had previously worked through Ghuloum’s paper, for creating this Dockerfile and telling me about it.</p>
      </li>
      <li>
        <p>32-bit support is being phased out on macOS, and the next version probably won’t let you run 32-bit binaries at all. At the moment, the <code class="language-plaintext highlighter-rouge">gcc</code> binary that ships with XCode won’t compile 32-bit applications by default. You can just install GCC from Homebrew and use that instead, or you can futz around with XCode and figure out how to make it build 32-bit binaries. I went with the former.</p>
      </li>
    </ul>
  </li>
</ol>

<h1 id="week-1-integers">Week 1: Integers</h1>

<p>This week, we’ll compile a program that returns a single integer. We’ll also set up the three basic passes of our compiler. This will be a lot of work for not that much payoff, but the architecture we define now will make it easy to add more language features later on.</p>

<p>Here’s a program we’d like to compile - we’ll call it return_2.c:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kt">int</span> <span class="nf">main</span><span class="p">()</span> <span class="p">{</span>
    <span class="k">return</span> <span class="mi">2</span><span class="p">;</span>
<span class="p">}</span>
</code></pre></div></div>

<p>We’ll only handle programs with a single function, <code class="language-plaintext highlighter-rouge">main</code>, consisting of a single return statement. The only thing that varies is the value of the integer being returned. We won’t handle hex or octal integer literals, just decimal. To verify that your compiler works correctly, you’ll need to compile a program, run it, and check its return code:</p>

<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nv">$ </span>./YOUR_COMPILER return_2.c <span class="c"># compile the source file shown above</span>
<span class="nv">$ </span>./gcc <span class="nt">-m32</span> return_2.s <span class="nt">-o</span> return_2 <span class="c"># assemble it into an executable</span>
<span class="nv">$ </span>./return_2 <span class="c"># run the executable you just compiled</span>
<span class="nv">$ </span><span class="nb">echo</span> <span class="nv">$?</span> <span class="c"># check the return code; it should be 2</span>
2 
</code></pre></div></div>

<p>Your compiler will produce x86 assembly. We won’t transform the assembly into an executable ourselves - that’s the job of the assembler and linker, which are separate programs<sup id="anchor2"><a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#fn2">2</a></sup>. To see how this program looks in assembly, let’s compile it with gcc<sup id="anchor3"><a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#fn3">3</a></sup>:</p>

<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nv">$ </span>gcc <span class="nt">-S</span> <span class="nt">-O3</span> <span class="nt">-fno-asynchronous-unwind-tables</span> return_2.c
<span class="nv">$ </span><span class="nb">cat </span>return_2.s
    .section __TEXT,__text_startup,regular,pure_instructions
    .align 4
    .globl _main
_main:
    movl    <span class="nv">$2</span>, %eax
    ret
    .subsections_via_symbols
</code></pre></div></div>

<p>Now, let’s look at the assembly itself. We can ignore the <code class="language-plaintext highlighter-rouge">.section</code>, <code class="language-plaintext highlighter-rouge">.align</code> and <code class="language-plaintext highlighter-rouge">.subsections_via_symbols</code> directives - if you delete them, you can still assemble and run return_2.s<sup id="anchor4"><a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#fn4">4</a></sup>. <code class="language-plaintext highlighter-rouge">.globl _main</code> indicates that the <code class="language-plaintext highlighter-rouge">_main</code> symbol should be visible to the linker; otherwise it can’t find the entry point to the program. (If you’re on a Unix-like system other than OS X, this symbol will just be <code class="language-plaintext highlighter-rouge">main</code>, no underscore.)</p>

<p>Finally, we have our actual assembly instructions:</p>

<div class="language-nasm highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="nl">_main:</span>                  <span class="c1">; label for start of "main" function</span>
    <span class="nf">movl</span>    <span class="kc">$</span><span class="mi">2</span><span class="p">,</span> <span class="o">%</span><span class="nb">eax</span>    <span class="c1">; move constant "2" into the EAX register</span>
    <span class="nf">ret</span>                 <span class="c1">; return from function</span>
</code></pre></div></div>

<p>The most important point here is that when a function returns, the EAX register<sup id="anchor5"><a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#fn5">5</a></sup> will contain its return value. The <code class="language-plaintext highlighter-rouge">main</code> function’s return value will be the program’s exit code.</p>

<p>An important side note: throughout this tutorial, I’ll use AT&amp;T assembly syntax, because that’s what GCC uses by default. Some online resources might use Intel syntax, which has operands in the reverse order from AT&amp;T syntax. Whenever you’re reading assembly, make sure you know what syntax it’s using!</p>

<p>The only thing that can change in the snippet of assembly above is the return value. So one very simple approach would be to use a regular expression to extract the return value from the source code, then plug it into the assembly. Here’s a 20-line Python script to do that:</p>

<div class="language-python highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kn">import</span> <span class="nn">sys</span><span class="p">,</span> <span class="n">os</span><span class="p">,</span> <span class="n">re</span>

<span class="c1">#expected form of a C program, without line breaks
</span><span class="n">source_re</span> <span class="o">=</span> <span class="sa">r</span><span class="s">"int main\s*\(\s*\)\s*{\s*return\s+(?P&lt;ret&gt;[0-9]+)\s*;\s*}"</span> 

<span class="c1"># Use 'main' instead of '_main' if not on OS X
</span><span class="n">assembly_format</span> <span class="o">=</span> <span class="s">"""    
    .globl _main
_main:
    movl    ${}, %eax
    ret
"""</span>

<span class="n">source_file</span> <span class="o">=</span> <span class="n">sys</span><span class="p">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="n">assembly_file</span> <span class="o">=</span> <span class="n">os</span><span class="p">.</span><span class="n">path</span><span class="p">.</span><span class="n">splitext</span><span class="p">(</span><span class="n">source_file</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="s">".s"</span>

<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">source_file</span><span class="p">,</span> <span class="s">'r'</span><span class="p">)</span> <span class="k">as</span> <span class="n">infile</span><span class="p">,</span> <span class="nb">open</span><span class="p">(</span><span class="n">assembly_file</span><span class="p">,</span> <span class="s">'w'</span><span class="p">)</span> <span class="k">as</span> <span class="n">outfile</span><span class="p">:</span>
    <span class="n">source</span> <span class="o">=</span> <span class="n">infile</span><span class="p">.</span><span class="n">read</span><span class="p">().</span><span class="n">strip</span><span class="p">()</span>
    <span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="p">.</span><span class="n">match</span><span class="p">(</span><span class="n">source_re</span><span class="p">,</span> <span class="n">source</span><span class="p">)</span>

    <span class="c1"># extract the named "ret" group, containing the return value
</span>    <span class="n">retval</span> <span class="o">=</span> <span class="n">match</span><span class="p">.</span><span class="n">group</span><span class="p">(</span><span class="s">'ret'</span><span class="p">)</span> 
    <span class="n">outfile</span><span class="p">.</span><span class="n">write</span><span class="p">(</span><span class="n">assembly_format</span><span class="p">.</span><span class="nb">format</span><span class="p">(</span><span class="n">retval</span><span class="p">))</span>
</code></pre></div></div>

<p>But parsing the whole program with one big regular expression isn’t a viable long-term strategy. Instead, we’ll split up the compiler into three stages: lexing, parsing, and code generation. As far as I know, this is a pretty standard compiler architecture, except you’d normally want a bunch of optimization passes between parsing and code generation.</p>

<h2 id="lexing">Lexing</h2>

<p>The lexer (also called the scanner or tokenizer) is the phase of the compiler that breaks up a string (the source code) into a list of tokens. A token is the smallest unit the parser can understand - if a program is like a paragraph, tokens are like individual words. (Many tokens <em>are</em> individual words, separated by whitespace.) Variable names, keywords, and constants, and punctuation like braces are all examples of tokens. Here’s a list of all the tokens in return_2.c:</p>

<ul>
  <li><code class="language-plaintext highlighter-rouge">int</code> keyword</li>
  <li>Identifier “main”</li>
  <li>Open parentheses</li>
  <li>Close parentheses</li>
  <li>Open brace</li>
  <li><code class="language-plaintext highlighter-rouge">return</code> keyword</li>
  <li>Constant “2”</li>
  <li>Semicolon</li>
  <li>Close brace</li>
</ul>

<p>Note that some tokens have a value (e.g. the constant token has value “2”) and some don’t (like parentheses and braces). Also note that there are no whitespace tokens. (In some languages, like Python, whitespace is significant and you do need tokens to represent it.)</p>

<p>Here are all the tokens your lexer needs to recognize, and the regular expression defining each of them:</p>
<ul>
  <li>Open brace <code class="language-plaintext highlighter-rouge">{</code></li>
  <li>Close brace <code class="language-plaintext highlighter-rouge">}</code></li>
  <li>Open parenthesis <code class="language-plaintext highlighter-rouge">\(</code></li>
  <li>Close parenthesis <code class="language-plaintext highlighter-rouge">\)</code></li>
  <li>Semicolon <code class="language-plaintext highlighter-rouge">;</code></li>
  <li>Int keyword <code class="language-plaintext highlighter-rouge">int</code></li>
  <li>Return keyword <code class="language-plaintext highlighter-rouge">return</code></li>
  <li>Identifier <code class="language-plaintext highlighter-rouge">[a-zA-Z]\w*</code></li>
  <li>Integer literal <code class="language-plaintext highlighter-rouge">[0-9]+</code></li>
</ul>

<p>If you want, you could just have a “keyword” token type, instead of a different token type for each keyword.</p>

<h4 id="-task">☑ Task:</h4>
<p>Write a <em>lex</em> function that accepts a file and returns a list of tokens. It should work for all stage 1 examples in the test suite, including the invalid ones. (The invalid examples should raise errors in the parser, not the lexer.) To keep things simple, we only lex decimal integers. If you like, you can extend your lexer to handle octal and hex integers too.</p>

<p>You might notice that we can’t lex negative integers. That’s not an accident - C doesn’t have negative integer constants. It just has a negation operator, which can be applied to positive integers. We’ll add negation in the next post.</p>

<h2 id="parsing">Parsing</h2>

<p>The next step is transforming our list of tokens into an abstract syntax tree. An AST is one way to represent the structure of a program. In most programming languages, language constructs like conditionals and function declarations are made up of simpler constructs, like variables and constants. ASTs capture this relationship; the root of the AST will be the entire program, and each node will have children representing its constituent parts. Let’s look at a small example:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>if (a &lt; b) {
    c = 2;
    return c;
} else {
    c = 3;
}
</code></pre></div></div>

<p>This code snippet is an if statement, so we’ll label the root of our AST “if statement”. It will have three children:</p>
<ul>
  <li>The condition (<code class="language-plaintext highlighter-rouge">a &lt; b</code>)</li>
  <li>The if body (<code class="language-plaintext highlighter-rouge">c = 2; return c;</code>)</li>
  <li>The else body (<code class="language-plaintext highlighter-rouge">c = 3;</code>)</li>
</ul>

<p>Each of these components can be broken down further. For example, the condition is a binary <code class="language-plaintext highlighter-rouge">&lt;</code> operation with two children:</p>

<ul>
  <li>The first operand (variable <code class="language-plaintext highlighter-rouge">a</code>)</li>
  <li>The second operand (variable <code class="language-plaintext highlighter-rouge">b</code>)</li>
</ul>

<p>An assignment statement (like <code class="language-plaintext highlighter-rouge">c=2;</code>) also has two children: the variable being updated (<code class="language-plaintext highlighter-rouge">c</code>), and the expression assigned to it (<code class="language-plaintext highlighter-rouge">2</code>).</p>

<p>The if body, on the other hand, can have an arbitrary number of children - each statement is a child node. In this case it has two children because there are two statements. The children are ordered - <code class="language-plaintext highlighter-rouge">c=2;</code> precedes <code class="language-plaintext highlighter-rouge">return c;</code> because it comes first in the source code.</p>

<p>Here’s the full AST for this code snippet:</p>

<p><img src="./Writing a C Compiler, Part 1_files/AST.svg" alt="Image of diagram; text outline follows"></p>

<div class="screen-reader-only">
  <ul>
    <li>if statement
      <ul>
        <li>condition: binary operation (&lt;)
          <ul>
            <li>operand 1: variable a</li>
            <li>operand 2: variable b</li>
          </ul>
        </li>
        <li>if body: statement list
          <ul>
            <li>statement 1: assignment
              <ul>
                <li>variable: c</li>
                <li>right-hand side: constant 2</li>
              </ul>
            </li>
            <li>statement 2: return
              <ul>
                <li>return value: variable c</li>
              </ul>
            </li>
          </ul>
        </li>
        <li>else body: statement list
          <ul>
            <li>statement 1: assignment
              <ul>
                <li>variable: c</li>
                <li>right-hand side: constant 3</li>
              </ul>
            </li>
          </ul>
        </li>
      </ul>
    </li>
  </ul>
</div>

<p>And here’s pseudocode for constructing this AST:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>//create if condition
cond = BinaryOp(op='&gt;', operand_1=Var(a), operand_2=Var(b))

//create if body
assign = Assignment(var=Var(c), rhs=Const(2))
return = Return(val=Var(c))
if_body = [assign, return]

//create else body
assign_else = Assignment(var=Var(c), rhs=Const(3))
else_body = [assign_else]

//construct if statement
if = If(condition=cond, body=if_body, else=else_body)
</code></pre></div></div>

<p>For now, though, we don’t need to worry about conditionals, variable assignments, or binary operators. Right now, the only AST nodes we need to support are programs, function declarations, statements, and expressions. Here’s how we’ll define each of them:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>program = Program(function_declaration)
function_declaration = Function(string, statement) //string is the function name
statement = Return(exp)
exp = Constant(int) 
</code></pre></div></div>

<p>Right now, a program consists of a single function, <code class="language-plaintext highlighter-rouge">main</code>. Later on we’ll define a program as a list of functions. A function has a name and a body. Later, a function will also have a list of arguments. In a real C compiler, we’d also need to store the function’s return type, but right now we only have integer types. A function body is a single statement; later it will be a list of statements. There’s only one type of statement: a return statement. Later we’ll add other types of statements, like conditionals and variable declarations. A return statement has one child, an expression - this is the value being returned. For now an expression can only be an integer constant. Later we’ll let expressions include arithmetic operations, which will allow us to parse statements like <code class="language-plaintext highlighter-rouge">return 2+2;</code>.</p>

<p>As we add new language constructs, we’ll update the definitions of our AST nodes. For example, we’ll eventually add a new type of statement: variable assignment. When we do, we’ll add a new form to our <code class="language-plaintext highlighter-rouge">statement</code> definition:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>statement = Return(exp) | Assign(variable, exp)
</code></pre></div></div>

<p>Here’s a diagram of the AST for return_2.c:</p>

<p><img src="./Writing a C Compiler, Part 1_files/return_2_ast.svg" alt="Image of diagram; text outline follows"></p>

<div class="screen-reader-only">
  <ul>
    <li>Program
      <ul>
        <li>Function (name: main)
          <ul>
            <li>body
              <ul>
                <li>return statement
                  <ul>
                    <li>constant (value: 2)</li>
                  </ul>
                </li>
              </ul>
            </li>
          </ul>
        </li>
      </ul>
    </li>
  </ul>
</div>

<p>Finally, we need a formal grammar, which defines how series of tokens can be combined to form language constructs. We’ll define it here in <a href="https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form">Backus-Naur Form</a>:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&lt;program&gt; ::= &lt;function&gt;
&lt;function&gt; ::= "int" &lt;id&gt; "(" ")" "{" &lt;statement&gt; "}"
&lt;statement&gt; ::= "return" &lt;exp&gt; ";"
&lt;exp&gt; ::= &lt;int&gt;
</code></pre></div></div>

<p>Each of the lines above is a <em>production rule</em>, defining how a language construct can be built from other language constructs and tokens. Every symbol that appears on the left-hand side of a production rule (i.e. <code class="language-plaintext highlighter-rouge">&lt;program&gt;</code>, <code class="language-plaintext highlighter-rouge">&lt;function&gt;</code>, <code class="language-plaintext highlighter-rouge">&lt;statement&gt;</code>) is a non-terminal symbol. Individual tokens (keywords, ids, punctuation, etc.) are terminal symbols. Note that, while this grammar tells us what sequence of tokens constitutes a valid C program, it <em>doesn’t</em> tell us exactly how to transform that program into an AST - for example, there’s no production rule corresponding to the Constant node in the AST. We could rewrite our grammar to have a production rule for constants, but we don’t have to in order to parse the program.</p>

<p>Right now the grammar is extremely simple; there’s only one production rule for each non-terminal symbol. Later, some non-terminal symbols will have multiple production rules. For example, if we added support for variable declarations, we could have the following rule for deriving statements:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&lt;statement&gt; ::= "return" &lt;int&gt; ";" | "int" &lt;id&gt; "=" &lt;int&gt; ";"
</code></pre></div></div>

<p>To transform a list of tokens into an AST, we’ll use a technique called recursive descent parsing. We’ll define a function to parse each non-terminal symbol in the grammar and return a corresponding AST node. The function to parse symbol <em>S</em> should remove tokens from the start of the list until it reaches a valid derivation of <em>S</em>. If, before it’s done parsing, it hits a token that isn’t in the production rule for <em>S</em>, it should fail. If the rule for <em>S</em> contains other non-terminals, it should call other functions to parse them.</p>

<p>Here’s the pseudocode for parsing a statement:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>def parse_statement(tokens):
    tok = tokens.next()
    if tok.type != "RETURN_KEYWORD":
        fail()
    tok = tokens.next()
    if tok.type != "INT"
        fail()
    exp = parse_exp(tokens) //parse_exp will pop off more tokens
    statement = Return(exp)

    tok = tokens.next()
    if tok.type != "SEMICOLON":
        fail()

    return statement
</code></pre></div></div>

<p>Later, the production rules will be recursive (e.g. an arithmetic expression can contain other expressions), which means the parsing functions will be recursive too - hence the name recursive descent parser.</p>

<h4 id="-task-1">☑ Task:</h4>
<p>Write a <em>parse</em> function that accepts a list of tokens and returns an AST, rooted at a Program node. The function should build the correct AST for all valid stage 1 examples, and raise an error on all invalid stage 1 examples. If you want, you can also have your parser fail gracefully if it encounters integers above your system’s INT_MAX.</p>

<p>There are a lot of ways to represent an AST in code - each type of node could be its own class or its own datatype, depending on what language you’re writing your compiler in. For example, here’s how you might define AST nodes as OCaml datatypes:</p>

<div class="language-ocaml highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">type</span> <span class="n">exp</span> <span class="o">=</span> <span class="nc">Const</span><span class="p">(</span><span class="kt">int</span><span class="p">)</span>
<span class="k">type</span> <span class="n">statement</span> <span class="o">=</span> <span class="nc">Return</span><span class="p">(</span><span class="n">exp</span><span class="p">)</span>
<span class="k">type</span> <span class="n">fun_decl</span> <span class="o">=</span> <span class="nc">Fun</span><span class="p">(</span><span class="kt">string</span><span class="o">,</span> <span class="n">statement</span><span class="p">)</span>
<span class="k">type</span> <span class="n">prog</span> <span class="o">=</span> <span class="nc">Prog</span><span class="p">(</span><span class="n">fun_decl</span><span class="p">)</span>
</code></pre></div></div>

<h2 id="code-generation">Code Generation</h2>

<p>Now that we’ve built an AST, we’re ready to generate some assembly! Like we saw before, we only need to emit four lines of assembly. To emit it, we’ll traverse the AST in roughly the order that the program executes. That means we’ll visit, in order:</p>

<ul>
  <li>The function name (not really a <em>node</em>, but the first thing in the function definition)</li>
  <li>The return value</li>
  <li>The return statement</li>
</ul>

<p>Note that we often (though not always) traverse the tree in <a href="https://en.wikipedia.org/wiki/Tree_traversal#Post-order">post-order</a>, visiting a child before its parent. For example, we need to generate the return value before it’s referenced in a return statement. In later posts, we’ll need to generate the operands of arithmetic expressions before generating the code that operates on them.</p>

<p>Here’s the assembly we need:</p>

<ol>
  <li>To generate a function (e.g. function “foo”):
    <div class="language-nasm highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="nf">.globl</span> <span class="nv">_foo</span>
<span class="nl">_foo:</span>
 <span class="err">&lt;</span><span class="nf">FUNCTION</span> <span class="nv">BODY</span> <span class="nv">GOES</span> <span class="nv">HERE</span><span class="o">&gt;</span>
</code></pre></div>    </div>
  </li>
  <li>To generate a return statement (e.g. <code class="language-plaintext highlighter-rouge">return 3;</code>):
    <div class="language-nasm highlighter-rouge"><div class="highlight"><pre class="highlight"><code> <span class="nf">movl</span>    <span class="kc">$</span><span class="mi">3</span><span class="p">,</span> <span class="o">%</span><span class="nb">eax</span>
 <span class="nf">ret</span>
</code></pre></div>    </div>
  </li>
</ol>

<h4 id="-task-2">☑ Task:</h4>
<p>Write a <em>generate</em> function that accepts an AST and generates assembly. It can return the assembly as a string or write it directly to a file. It should generate correct assembly for all valid stage 1 examples.</p>

<h2 id="optional-pretty-printing">(Optional) Pretty printing</h2>

<p>You’ll probably want a utility function to print out your AST, to help with debugging. You can write it now, or wait until you need it. Here’s what nqcc’s pretty printer outputs for return_2.c:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>FUN INT main:
    params: ()
    body:
        RETURN Int&lt;2&gt;
</code></pre></div></div>

<p>This example includes some information your AST doesn’t need, like the return type and list of function parameters.</p>

<h4 id="-task-3">☑ Task:</h4>
<p>Write a <em>pretty-print</em> funcion that takes an AST and prints it out in a readable way.</p>

<h2 id="putting-it-all-together">Putting it all together</h2>

<h4 id="-task-4">☑ Task:</h4>
<p>Write a program that accepts a C source file and outputs an executable. The program should:</p>

<ol>
  <li>Read in the file</li>
  <li>Lex it</li>
  <li>Parse it</li>
  <li>Generate assembly</li>
  <li>Write the assembly to a file</li>
  <li>Invoke GCC command to convert the assembly to an executable:
    <div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>gcc -m32 assembly.s -o out
</code></pre></div>    </div>

    <p>In this command, “assembly.s” is the name of the assembly file and “out” is the name of the executable you want to generate. The <code class="language-plaintext highlighter-rouge">-m32</code> option tells GCC to build a 32-bit binary. You can omit that option and build 64-bit binaries if you want, but you’ll need to make some changes to the code generation steps later on (e.g. using 64-bit registers).</p>
  </li>
  <li>(Optional) Delete the assembly file.</li>
</ol>

<h2 id="testing">Testing</h2>

<p>You can test that your compiler is working properly with the test script <a href="https://github.com/nlsandler/write_a_c_compiler">here</a>. It will compile a set of test programs using your compiler, execute them, and make sure they return the right value.</p>

<p>To invoke it:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>./test_compiler.sh /path/to/your/compiler
</code></pre></div></div>

<p>In order to test it with the script, your compiler needs to follow this spec:</p>

<ol>
  <li>It can be invoked from the command line, taking only a C source file as an argument, e.g.: 
<code class="language-plaintext highlighter-rouge">./YOUR_COMPILER /path/to/program.c</code></li>
  <li>When passed <code class="language-plaintext highlighter-rouge">program.c</code>, it generates executable <code class="language-plaintext highlighter-rouge">program</code> in the same directory.</li>
  <li>It doesn’t generate assembly or an executable if parsing fails (this is what the test script checks for invalid test programs).</li>
</ol>

<p>The script doesn’t check whether your compiler outputs sensible error messages, but you can use the invalid test programs to test that manually.</p>

<h2 id="up-next">Up Next</h2>

<p>In the <a href="https://norasandler.com/2017/12/05/Write-a-Compiler-2.html">next post</a>, we’ll add three unary operators: <code class="language-plaintext highlighter-rouge">-</code>, <code class="language-plaintext highlighter-rouge">~</code>, and <code class="language-plaintext highlighter-rouge">!</code>. Stay tuned!</p>

<p><em>If you have any questions, corrections, or other feedback, you can <a href="mailto:nora@norasandler.com">email me</a> or <a href="https://github.com/nlsandler/write_a_c_compiler/issues">open an issue</a>.</em></p>

<h2 id="further-reading">Further Reading</h2>

<ul>
  <li><a href="http://www.wilfred.me.uk/blog/2014/08/27/baby-steps-to-a-c-compiler/">Baby Steps to a C Compiler</a> - a post about another C compiler inspired by Ghuloum’s paper.</li>
  <li>The <a href="http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf">C11 Standard</a>, the current C language specification. Annex A is a summary of C’s grammar, so it’s a good reference for parsing. You probably don’t need to read this all the way through.</li>
</ul>

<div class="footnote">
  <p><sup id="fn1">1</sup>
If you’re not familiar with sum types or pattern matching, there’s a good introduction <a href="https://chadaustin.me/2015/07/sum-types/">here</a>.<a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#anchor1">↩</a></p>
</div>

<div class="footnote">
  <p><sup id="fn2">2</sup>
An assembler converts a bunch of human-readable assembly instructions (like <code class="language-plaintext highlighter-rouge">inc</code>) into binary opcodes (like <code class="language-plaintext highlighter-rouge">1000000</code>). A linker combines multiple object files (the files produced by the assembler) into a single executable. Even though return_2.c doesn’t reference any external libraries, we still need the linker, for two reasons:</p>
  <ul>
    <li>Object files produced by the assembler aren’t in the right file format.</li>
    <li>The linker includes some initialization code, called <a href="https://en.wikipedia.org/wiki/Crt0">crt0</a>, even though it’s not explicitly referenced. <a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#anchor2">↩</a></li>
  </ul>
</div>

<div class="footnote">
  <p><sup id="fn3">3</sup>
In case you’re curious about these GCC options: <code class="language-plaintext highlighter-rouge">-S</code> tells GCC to generate an assembly file (return_2.s) instead of an executable. 
<code class="language-plaintext highlighter-rouge">-O3</code> turns on a bunch of compiler optimizations - this removes a lot of boilerplate and makes the code easier to read - at least for extremely simple programs like this one. <code class="language-plaintext highlighter-rouge">-fno-asynchronous-unwind-tables</code> tells it not to generate an unwind table, which contains information needed to generate stack traces. Hiding the unwind table also makes the code smaller and more readable.<a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#anchor3">↩</a></p>
</div>

<div class="footnote">
  <p><sup id="fn4">4</sup>
I think these directives will vary between platforms; these were generated using Homebrew gcc 7.2.0 on OS X. Here’s what they mean:</p>
  <ul>
    <li><code class="language-plaintext highlighter-rouge">.section __TEXT,__text_startup,regular,pure_instructions</code> tells the assembler that this is the text section, which contains assembly instructions (other sections might contain string literals, initialized data, debug information, etc.).</li>
    <li><code class="language-plaintext highlighter-rouge">.align 4</code> tells the assembler to align the first instruction in <code class="language-plaintext highlighter-rouge">main</code> to a multiple of 16 bytes - the 4 here is a power of 2, 2^4 = 16. On some architectures <code class="language-plaintext highlighter-rouge">.align 4</code> would mean align to a multiple of 4 bytes. I think the purpose of aligning the start of the function is to optimize instruction prefetching; section 11.5 of <a href="https://www.agner.org/optimize/optimizing_assembly.pdf">Agner Fog’s optimization manual</a> talks about this a little bit. <strong>Update 10/11/23</strong>: fixed the explanation of this directive, the previous one was incorrect.</li>
    <li><code class="language-plaintext highlighter-rouge">.subsections_via_symbols</code> is used to eliminate dead code. It indicates that each chunk of assembly beginning with a symbol can be treated as an individual block, and removed if it’s not used by any other block.
More info in the documentation <a href="https://developer.apple.com/library/content/documentation/DeveloperTools/Reference/Assembler/040-Assembler_Directives/asm_directives.html#//apple_ref/doc/uid/TP30000823-SW1">here</a>.<a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#anchor4">↩</a></li>
  </ul>
</div>

<div class="footnote">
  <p><sup id="fn5">5</sup>
A <a href="https://en.wikipedia.org/wiki/Processor_register">register</a> is a very tiny, very fast memory cell that sits right on the CPU and has a name you can refer to in assembly.
<a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html#anchor5">↩</a></p>
</div>

  </div><a class="u-url" href="https://norasandler.com/2017/11/29/Write-a-Compiler.html" hidden=""></a>
</article>

      </div>
    </main>

    <footer class="site-footer">

  <div class="wrapper">
      <div class="footer-col-wrapper">
        <div class="footer-col footer-col-1">
            <div class="rc-scout" data-scout-rendered="true"><p class="rc-scout__text"><i class="rc-scout__logo"></i> Want to become a better programmer? <a class="rc-scout__link" href="https://www.recurse.com/scout/click?t=8f520efbc4be09fb83a71920f53a07b7">Join the Recurse Center!</a></p></div><script async="" defer="" src="./Writing a C Compiler, Part 1_files/loader.js"></script>
        </div>
      </div>
    <div class="footer-col-wrapper">
      <div class="footer-col footer-col-1">
        © 2023 Nora Sandler.
      </div>
    </div>
  </div>

</footer>


  


<script async="" src="./Writing a C Compiler, Part 1_files/scout-176be16681a03cbddd686f3cc96694d3aab338ea5cb65452f83d989309810528.js"></script><style class="rc-scout__style" type="text/css">.rc-scout {
  display: block;
  padding: 0;
  border: 0;
  margin: 0;
}
.rc-scout__text {
  display: block;
  padding: 0;
  border: 0;
  margin: 0;
  height: 100%;
  font-size: 100%;
}
.rc-scout__logo {
  display: inline-block;
  padding: 0;
  border: 0;
  margin: 0;
  width: 0.85em;
  height: 0.85em;
  background: no-repeat center url('data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20viewBox%3D%220%200%2012%2015%22%3E%3Crect%20x%3D%220%22%20y%3D%220%22%20width%3D%2212%22%20height%3D%2210%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%221%22%20y%3D%221%22%20width%3D%2210%22%20height%3D%228%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%222%22%20y%3D%222%22%20width%3D%228%22%20height%3D%226%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%222%22%20y%3D%223%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%224%22%20y%3D%223%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%226%22%20y%3D%223%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%223%22%20y%3D%225%22%20width%3D%222%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%226%22%20y%3D%225%22%20width%3D%222%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%224%22%20y%3D%229%22%20width%3D%224%22%20height%3D%223%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%221%22%20y%3D%2211%22%20width%3D%2210%22%20height%3D%224%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%220%22%20y%3D%2212%22%20width%3D%2212%22%20height%3D%223%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%222%22%20y%3D%2213%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%223%22%20y%3D%2212%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%224%22%20y%3D%2213%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%225%22%20y%3D%2212%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%226%22%20y%3D%2213%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%227%22%20y%3D%2212%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%228%22%20y%3D%2213%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%229%22%20y%3D%2212%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3C%2Fsvg%3E');
}
.rc-scout__link:link, .rc-scout__link:visited {
  color: #3dc06c;
  text-decoration: underline;
}
.rc-scout__link:hover, .rc-scout__link:active {
  color: #4e8b1d;
}
</style></body></html>