summaryrefslogtreecommitdiff
path: root/miniany/doc/Writing a C Compiler, Part 5.html
blob: fa0ed619a63d79b5dd189cdb301aa0477c3ba549 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
<!DOCTYPE html>
<!-- saved from url=(0058)https://norasandler.com/2018/01/08/Write-a-Compiler-5.html -->
<html lang="en"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <title>Writing a C Compiler, Part 5</title>
  <meta name="description" content="This is the fifth post in a series. Read part 1 here.">

  <link rel="stylesheet" href="./Writing a C Compiler, Part 5_files/main.css">
  <link rel="canonical" href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html">
  <link rel="alternate" type="application/rss+xml" title="Nora Sandler" href="https://norasandler.com/feed.xml">
  
</head>


  <body>

    <header class="site-header" role="banner">

  <div class="wrapper">
    
    
    <a class="site-title" href="https://norasandler.com/">Nora Sandler</a>
  
    
      <nav class="site-nav">
        <input type="checkbox" id="nav-trigger" class="nav-trigger">
        <label for="nav-trigger">
          <span class="menu-icon">
            <svg viewBox="0 0 18 15" width="18px" height="15px">
              <path fill="#424242" d="M18,1.484c0,0.82-0.665,1.484-1.484,1.484H1.484C0.665,2.969,0,2.304,0,1.484l0,0C0,0.665,0.665,0,1.484,0 h15.031C17.335,0,18,0.665,18,1.484L18,1.484z"></path>
              <path fill="#424242" d="M18,7.516C18,8.335,17.335,9,16.516,9H1.484C0.665,9,0,8.335,0,7.516l0,0c0-0.82,0.665-1.484,1.484-1.484 h15.031C17.335,6.031,18,6.696,18,7.516L18,7.516z"></path>
              <path fill="#424242" d="M18,13.516C18,14.335,17.335,15,16.516,15H1.484C0.665,15,0,14.335,0,13.516l0,0 c0-0.82,0.665-1.484,1.484-1.484h15.031C17.335,12.031,18,12.696,18,13.516L18,13.516z"></path>
            </svg>
          </span>
        </label>

        <div class="trigger">
          
            
            
              
            
          
            
            
              
              <a class="page-link" href="https://norasandler.com/about/">About</a>
              
            
          
            
            
              
              <a class="page-link" href="https://norasandler.com/archive/">Archive</a>
              
            
          
            
            
          
            
            
              
            
          
            
            
              
            
          
            
            
              
            
          
          <a class="page-link" href="https://github.com/nlsandler">Github</a>
          <a href="https://norasandler.com/feed.xml"><img id="rss" height="20" width="20" src="./Writing a C Compiler, Part 5_files/rss.png"></a>

        </div>
      </nav>
    
  </div>
</header>


    <main class="page-content" aria-label="Content">
      <div class="wrapper">
        <article class="post h-entry" itemscope="" itemtype="http://schema.org/BlogPosting">

  <header class="post-header">
    <h1 class="post-title p-name" itemprop="name headline">Writing a C Compiler, Part 5</h1>
    <p class="post-meta">
      <time class="dt-published" datetime="2018-01-08T20:00:00+00:00" itemprop="datePublished">Jan 8, 2018
      </time></p>
  </header>

  <div class="post-content e-content" itemprop="articleBody">
    <p><em>This is the fifth post in a series. Read part 1 <a href="https://norasandler.com/2017/11/29/Write-a-Compiler.html">here</a>.</em></p>

<p>We’ve spent the last two weeks adding binary primitives, and I don’t know about you, but I’m starting to get kind of bored with it.
This week, we’ll do something completely different and add support for local variables.
We’ll finally be able to compile functions longer than one line! Hooray!</p>

<p>As always, accompanying tests are <a href="https://github.com/nlsandler/write_a_c_compiler">here</a>.</p>

<h1 id="week-5-local-variables">Week 5: Local Variables</h1>

<p>We’re adding variables this week! Programming without variables is hard, so this is very exciting. 
To keep things simple, we’re going to support variables in a very restricted way for now:</p>

<ul>
  <li>We only support local variables, which are declared in <code class="language-plaintext highlighter-rouge">main</code>. No global variables.</li>
  <li>We only support variables of type <code class="language-plaintext highlighter-rouge">int</code>.</li>
  <li>We don’t support type modifiers like <code class="language-plaintext highlighter-rouge">short</code>, <code class="language-plaintext highlighter-rouge">long</code> or <code class="language-plaintext highlighter-rouge">unsigned</code>, storage-class specifiers like <code class="language-plaintext highlighter-rouge">static</code>,
or type qualifiers like <code class="language-plaintext highlighter-rouge">const</code>. Just plain old <code class="language-plaintext highlighter-rouge">int</code>.</li>
  <li>You can only declare one variable per statement. We won’t support statements like <code class="language-plaintext highlighter-rouge">int a, b;</code></li>
</ul>

<p>There are three things you can do with a variable:</p>

<ul>
  <li>Declare it (<code class="language-plaintext highlighter-rouge">int a;</code>)
    <ul>
      <li>When you declare it, you can also optionally initialize it (<code class="language-plaintext highlighter-rouge">int a = 2;</code>)</li>
    </ul>
  </li>
  <li>Assign to it (<code class="language-plaintext highlighter-rouge">a = 3;</code>)</li>
  <li>Reference it in an expression (<code class="language-plaintext highlighter-rouge">a + 2</code>)</li>
</ul>

<p>We’ll need to add support for these three things. We’ll also add support for functions containing more than one statement.</p>

<h2 id="lexing">Lexing</h2>

<p>The only new token this week is the assignment operator, <code class="language-plaintext highlighter-rouge">=</code>. Here’s our list of tokens, with the newest addition in bold at the bottom:</p>

<ul>
  <li>Open brace <code class="language-plaintext highlighter-rouge">{</code></li>
  <li>Close brace <code class="language-plaintext highlighter-rouge">}</code></li>
  <li>Open parenthesis <code class="language-plaintext highlighter-rouge">(</code></li>
  <li>Close parenthesis <code class="language-plaintext highlighter-rouge">)</code></li>
  <li>Semicolon <code class="language-plaintext highlighter-rouge">;</code></li>
  <li>Int keyword <code class="language-plaintext highlighter-rouge">int</code></li>
  <li>Return keyword <code class="language-plaintext highlighter-rouge">return</code></li>
  <li>Identifier <code class="language-plaintext highlighter-rouge">[a-zA-Z]\w*</code></li>
  <li>Integer literal <code class="language-plaintext highlighter-rouge">[0-9]+</code></li>
  <li>Minus <code class="language-plaintext highlighter-rouge">-</code></li>
  <li>Bitwise complement <code class="language-plaintext highlighter-rouge">~</code></li>
  <li>Logical negation <code class="language-plaintext highlighter-rouge">!</code></li>
  <li>Addition <code class="language-plaintext highlighter-rouge">+</code></li>
  <li>Multiplication <code class="language-plaintext highlighter-rouge">*</code></li>
  <li>Division <code class="language-plaintext highlighter-rouge">/</code></li>
  <li>AND <code class="language-plaintext highlighter-rouge">&amp;&amp;</code></li>
  <li>OR <code class="language-plaintext highlighter-rouge">||</code></li>
  <li>Equal <code class="language-plaintext highlighter-rouge">==</code></li>
  <li>Not Equal <code class="language-plaintext highlighter-rouge">!=</code></li>
  <li>Less than <code class="language-plaintext highlighter-rouge">&lt;</code></li>
  <li>Less than or equal <code class="language-plaintext highlighter-rouge">&lt;=</code></li>
  <li>Greater than <code class="language-plaintext highlighter-rouge">&gt;</code></li>
  <li>Greater than or equal <code class="language-plaintext highlighter-rouge">&gt;=</code></li>
  <li><strong>Assignment <code class="language-plaintext highlighter-rouge">=</code></strong></li>
</ul>

<h4 id="-task">☑ Task:</h4>
<p>Update the <em>lex</em> function to handle the <code class="language-plaintext highlighter-rouge">=</code> token. It should work for all stage 1-5 examples in the test suite, including the invalid ones.</p>

<h2 id="parsing">Parsing</h2>

<p>We need to make a lot of changes to our AST this week. Let’s look at a sample program we’d like to handle:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kt">int</span> <span class="nf">main</span><span class="p">()</span> <span class="p">{</span>
    <span class="kt">int</span> <span class="n">a</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span>
    <span class="n">a</span> <span class="o">=</span> <span class="n">a</span> <span class="o">+</span> <span class="mi">1</span><span class="p">;</span>
    <span class="k">return</span> <span class="n">a</span><span class="p">;</span>
<span class="p">}</span>
</code></pre></div></div>

<p>In this program, <code class="language-plaintext highlighter-rouge">main</code> contains three statements:</p>

<ol>
  <li>A variable declaration (<code class="language-plaintext highlighter-rouge">int a = 1;</code>)</li>
  <li>A variable assignment (<code class="language-plaintext highlighter-rouge">a = a + 1;</code>)</li>
  <li>A return statement (<code class="language-plaintext highlighter-rouge">return a;</code>)</li>
</ol>

<p>We need to update the defintion of <code class="language-plaintext highlighter-rouge">function_declaration</code> in the AST so a function can contain a list of statements, not just a single statement:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>function_declaration = Function(string, statement list) //string is function name
</code></pre></div></div>

<p>Right now, the only statements we’ve defined are <code class="language-plaintext highlighter-rouge">return</code> statements. That’s not right either. Let’s add some more:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>statement = Return(exp) 
          | Declare(string, exp option) //string is variable name
                                        //exp is optional initializer
          | Exp(exp)
</code></pre></div></div>

<p>We’ve added <code class="language-plaintext highlighter-rouge">Decl</code> for variable declarations. We can use an option type (<code class="language-plaintext highlighter-rouge">Maybe</code> in Haskell) to represent that we may or may not have an initializer.</p>

<p>The AST for <code class="language-plaintext highlighter-rouge">int a;</code> might look like this:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>decl = Declare("a", None) //None because we don't initialize it
</code></pre></div></div>

<p>And the AST for <code class="language-plaintext highlighter-rouge">int a = 3</code> might look like this:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>init_exp = Const(3)
decl = Declare("a", Some(init_exp))
</code></pre></div></div>

<p>Note that we don’t store the variable’s type anywhere in our AST; we don’t need to, because it can only have type <code class="language-plaintext highlighter-rouge">int</code>. We’ll need to start tracking type information once we have multiple types</p>

<p>We’ve also added a standalone <code class="language-plaintext highlighter-rouge">Exp</code> statement, which means we can now write programs like this:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kt">int</span> <span class="nf">main</span><span class="p">()</span> <span class="p">{</span>
    <span class="mi">2</span> <span class="o">+</span> <span class="mi">2</span><span class="p">;</span>
    <span class="k">return</span> <span class="mi">0</span><span class="p">;</span>
<span class="p">}</span>
</code></pre></div></div>

<p>This is valid C; if you compile it with gcc, it will issue a warning but it won’t fail.</p>

<p>However, <code class="language-plaintext highlighter-rouge">2+2;</code> isn’t a very useful statement. The real reason to add an <code class="language-plaintext highlighter-rouge">Exp</code> statement is so we can write statements like this:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">a</span> <span class="o">=</span> <span class="mi">2</span><span class="p">;</span>
</code></pre></div></div>

<p>Variable assignment is just an expression! That’s why you this statement is valid:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">a</span> <span class="o">=</span> <span class="mi">2</span> <span class="o">*</span> <span class="p">(</span><span class="n">b</span> <span class="o">=</span> <span class="mi">2</span><span class="p">);</span>
</code></pre></div></div>

<p>In the code snippet above, the expression <code class="language-plaintext highlighter-rouge">b = 2</code> has the value <code class="language-plaintext highlighter-rouge">2</code>, and the side effect of updating <code class="language-plaintext highlighter-rouge">b</code> to have that value.
This would be evaluated as:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="n">a</span> <span class="o">=</span> <span class="mi">2</span> <span class="o">*</span> <span class="p">(</span><span class="n">b</span> <span class="o">=</span> <span class="mi">2</span><span class="p">)</span>
<span class="n">a</span> <span class="o">=</span> <span class="mi">2</span> <span class="o">*</span> <span class="mi">2</span> <span class="c1">//also b is 2 now</span>
<span class="n">a</span> <span class="o">=</span> <span class="mi">4</span>
</code></pre></div></div>

<p>Now we need to update <code class="language-plaintext highlighter-rouge">exp</code> in our AST definition to handle assignment operators. My first thought was to just add <code class="language-plaintext highlighter-rouge">=</code> as another binary operator – after all, <code class="language-plaintext highlighter-rouge">a = b</code> <em>looks</em> kind of like <code class="language-plaintext highlighter-rouge">a + b</code>. But that’s totally wrong: the two operands of a binary operator can be arbitrary expressions, but the left side of an assignment operator can’t. A statement like <code class="language-plaintext highlighter-rouge">2 = 2</code> doesn’t make any sense, because you can’t assign a new value to <code class="language-plaintext highlighter-rouge">2</code>.</p>

<p>Instead, we’ll just define assignment as a new type of expression:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>exp = Assign(string, exp) //string is variable, exp is value to assign
    | BinOp(binary_operator, exp, exp)
    | UnOp(unary_operator, exp)
    | Constant(int)
</code></pre></div></div>

<p>Now we can write the AST for the statement <code class="language-plaintext highlighter-rouge">a = 2;</code> like this:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>assign_exp = Assign("a", Const(2))
assign_statement = Exp(assign_exp)
</code></pre></div></div>

<p>Now we can define variables and update their values, but that’s not super helpful unless we can actually reference them.
Let’s add variable reference as another type of expression:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>exp = Assign(string, exp)
    | Var(string) //string is variable name
    | BinOp(binary_operator, exp, exp)
    | UnOp(unary_operator, exp)
    | Constant(int)
</code></pre></div></div>

<p>Now we can write the AST <code class="language-plaintext highlighter-rouge">return a;</code> like this:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>return_exp = Var("a")
return_statement = Return(return_exp)
</code></pre></div></div>

<p>If we put it all together, here’s our new AST, with changes bolded:</p>

<pre>program = Program(function_declaration)
<b>function_declaration = Function(string, statement list) //string is the function name</b>

statement = Return(exp) 
<b>          | Declare(string, exp option) //string is variable name
                                        //exp is optional initializer
          | Exp(exp) </b>

exp = Assign(string, exp)
<b>    | Var(string) //string is variable name </b>
    | BinOp(binary_operator, exp, exp)
    | UnOp(unary_operator, exp)
    | Constant(int)
</pre>

<p>We also need to update our grammar. First, we need to update <code class="language-plaintext highlighter-rouge">&lt;function&gt;</code> to allow multiple statements.</p>

<p>Old definition:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&lt;function&gt; ::= "int" &lt;id&gt; "(" ")" "{" &lt;statement&gt; "}"
</code></pre></div></div>

<p>New definition:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&lt;function&gt; ::= "int" &lt;id&gt; "(" ")" "{" { &lt;statement&gt; } "}"
</code></pre></div></div>

<p>Thanks to the interspersed <code class="language-plaintext highlighter-rouge">{</code>/<code class="language-plaintext highlighter-rouge">}</code>, indicating repetitition, and <code class="language-plaintext highlighter-rouge">"{"</code>/<code class="language-plaintext highlighter-rouge">"}"</code>, indicating literal curly braces, this is almost completely unreadable. But it just means a function can have more than one statement now.</p>

<p>We need to handle multiple types of statement. We already have return statements:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>"return" &lt;exp&gt; ";"
</code></pre></div></div>

<p>And standalone expressions are super easy:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&lt;exp&gt; ";"
</code></pre></div></div>

<p>A variable declaration needs a type specifier (<code class="language-plaintext highlighter-rouge">int</code>) followed by a name, optionally followed by an initializer. We use <code class="language-plaintext highlighter-rouge">[]</code> here to indicate something is optional:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>"int" &lt;id&gt; [ = &lt;exp&gt; ] ";"
</code></pre></div></div>

<p>Let’s put it all together to get a our new definition of <code class="language-plaintext highlighter-rouge">&lt;statement&gt;</code>:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&lt;statement&gt; ::= "return" &lt;exp&gt; ";"
              | &lt;exp&gt; ";"
              | "int" &lt;id&gt; [ = &lt;exp&gt; ] ";"
</code></pre></div></div>

<p>Finally, we need to update <code class="language-plaintext highlighter-rouge">&lt;exp&gt;</code>. Assignment is our lowest-precedence operator, so it becomes our top level <code class="language-plaintext highlighter-rouge">&lt;exp&gt;</code> expression. Also note that, unlike most of our other operators, it’s right-associative, which makes it a bit simpler to express.</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&lt;exp&gt; ::= &lt;id&gt; "=" &lt;exp&gt; | &lt;logical-or-exp&gt;
&lt;logical-or-exp&gt; ::= &lt;logical-and-exp&gt; { "||" &lt;logical-and-exp&gt; }
</code></pre></div></div>

<p>The grammar for all our binary operations (<code class="language-plaintext highlighter-rouge">&lt;logical-and-exp&gt;</code> on down to <code class="language-plaintext highlighter-rouge">&lt;term&gt;</code>) is unchanged. 
We just need to change <code class="language-plaintext highlighter-rouge">&lt;factor&gt;</code> so we can refer to variables as well as constants:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>&lt;factor&gt; ::= "(" &lt;exp&gt; ")" | &lt;unary_op&gt; &lt;factor&gt; | &lt;int&gt; | &lt;id&gt;
</code></pre></div></div>

<p>When you put it all together, here’s our new grammar, with changes bolded:</p>

<pre>&lt;program&gt; ::= &lt;function&gt;
<b>&lt;function&gt; ::= "int" &lt;id&gt; "(" ")" "{" { &lt;statement&gt; } "}"</b>
&lt;statement&gt; ::= "return" &lt;exp&gt; ";"
<b>              | &lt;exp&gt; ";"
              | "int" &lt;id&gt; [ = &lt;exp&gt;] ";" </b>
<b>&lt;exp&gt; ::= &lt;id&gt; "=" &lt;exp&gt; | &lt;logical-or-exp&gt;
&lt;logical-or-exp&gt; ::= &lt;logical-and-exp&gt; { "||" &lt;logical-and-exp&gt; } </b>
&lt;logical-and-exp&gt; ::= &lt;equality-exp&gt; { "&amp;&amp;" &lt;equality-exp&gt; }
&lt;equality-exp&gt; ::= &lt;relational-exp&gt; { ("!=" | "==") &lt;relational-exp&gt; }
&lt;relational-exp&gt; ::= &lt;additive-exp&gt; { ("&lt;" | "&gt;" | "&lt;=" | "&gt;=") &lt;additive-exp&gt; }
&lt;additive-exp&gt; ::= &lt;term&gt; { ("+" | "-") &lt;term&gt; }
&lt;term&gt; ::= &lt;factor&gt; { ("*" | "/") &lt;factor&gt; }
<b>&lt;factor&gt; ::= "(" &lt;exp&gt; ")" | &lt;unary_op&gt; &lt;factor&gt; | &lt;int&gt; | &lt;id&gt;</b>
&lt;unary_op&gt; ::= "!" | "~" | "-"
</pre>

<h4 id="-task-1">☑ Task:</h4>
<p>Update your expression-parsing code to handle variable declaration, assignment, and references. It should successfully parse all valid stage 1-5 examples in the test suite. The invalid examples are a little different this week. Some of them should fail during parsing; others can be parsed successfully but should cause errors during code generation (e.g. because they reference variables that haven’t been declared.) I decided to deal with this in the laziest way possible; the names of the invalid examples that should fail during parsing all start with <code class="language-plaintext highlighter-rouge">syntax_err</code>.</p>

<h2 id="code-generation">Code Generation</h2>

<p>We need to save local variables somewhere, so we’ll save them on the stack.
We also need to remember exactly where on the stack each variable was saved, so we can refer to it later.
To track this information, we’ll create a map from variable names to locations.</p>

<p>But how are we supposed to know a variable’s location at compile time? Absolute memory addresses aren’t determined until runtime. We could store the variable’s offset from ESP, except that the value of ESP changes whenever we push something onto the stack.
The solution is to store the variable’s offset from a different register, EBP.
To understand why this will work, we need to know a little bit about stack frames.</p>

<h3 id="stack-frames">Stack Frames</h3>

<p>Whenever we call a function, we allocate a chunk of memory for it on top of the stack – this memory is called the <em>stack frame</em>. The stack frame holds function arguments, the address to jump to after the function returns, and of course local variables. We already know that ESP points to the top of stack, which is also the top of the current stack frame<sup id="anchor1"><a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#fn1">1</a></sup>. The EBP (or base pointer) register points to the bottom of the current stack frame.
Without EBP, we wouldn’t know where once stack frame ends and the other begins, and we wouldn’t be able to find important values like a function’s return address.</p>

<p><img src="./Writing a C Compiler, Part 5_files/call_stack.svg" alt="Obligatory call stack diagram"></p>

<div class="screen-reader-only">
  <p>Call stack diagram, from higher address on the bottom of the stack to lower address on top:
  I. Caller’s stack frame
    * Caller’s local variable y
    * Caller’s local variable x
    * return address
  II. Callee’s stack frame
    * Saved EBP (current EBP points here)
    * local variable a
    * local variable b (top of stack; current ESP points here)</p>
</div>

<p>When a function (let’s call it <code class="language-plaintext highlighter-rouge">f</code>) returns, its caller needs to be able to pick up where it left off. That means its stack frame, and the values in ESP and EBP, all need to be exactly the same as they were before <code class="language-plaintext highlighter-rouge">f</code> was called. The first thing <code class="language-plaintext highlighter-rouge">f</code> needs to do is set up a new stack frame for itself, using the following instructions:</p>

<div class="language-nasm highlighter-rouge"><div class="highlight"><pre class="highlight"><code>    <span class="nf">push</span> <span class="o">%</span><span class="nb">ebp</span>       <span class="c1">; save old value of EBP</span>
    <span class="nf">movl</span> <span class="o">%</span><span class="nb">esp</span><span class="p">,</span> <span class="o">%</span><span class="nb">ebp</span> <span class="c1">; current top of stack is bottom of new stack frame</span>
</code></pre></div></div>

<p>These instructions are called the function prologue. 
Immediately before <code class="language-plaintext highlighter-rouge">f</code> returns, it executes the function epilogue to remove this stack frame, 
leaving everything just as it was before the function prologue:</p>

<div class="language-nasm highlighter-rouge"><div class="highlight"><pre class="highlight"><code>    <span class="nf">movl</span> <span class="o">%</span><span class="nb">ebp</span><span class="p">,</span> <span class="o">%</span><span class="nb">esp</span> <span class="c1">; restore ESP; now it points to old EBP</span>
    <span class="nf">pop</span> <span class="o">%</span><span class="nb">ebp</span>        <span class="c1">; restore old EBP; now ESP is where it was before prologue</span>
    <span class="nf">ret</span>
</code></pre></div></div>

<p>Up to this point, we could get away with not having a function prologue or epilogue, but now we need to add them.
Adding them helps us in two ways:</p>

<ul>
  <li><strong>We can store variable locations as offsets from EBP</strong>. We know there’s nothing above EBP (because we set up an empty stack frame in the function prologue),
and we know that EBP won’t change until the function epilogue.</li>
  <li>We can safely push local variables onto the stack without changing the caller’s stack frame<sup id="anchor2"><a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#fn2">2</a></sup>.</li>
</ul>

<p>You should generate the function prologue at the start of the function definition, right after the function’s label.
You should generate the function epilogue as part of the return statement, right before <code class="language-plaintext highlighter-rouge">ret</code>.</p>

<p>Besides our variable map, we need to keep track of a <em>stack index</em>, which tells us the offset of the next available spot on the stack, relative to EBP. The next available spot is always the four-byte stack slot right after ESP, at <code class="language-plaintext highlighter-rouge">ESP - 4</code>. Right after the function prologue, EBP and ESP are the same. That means the stack index will also be -4. Whenever we push a variable onto the stack, we’ll decrement the stack index by 4<sup id="anchor3"><a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#fn3">3</a></sup>.</p>

<p>Now let’s look at how we can handle declaring, assigning, and referring to variables.</p>

<h3 id="variable-declaration">Variable Declaration</h3>

<p>When you encounter a variable declaration, just save the variable onto the stack and add it to the variable map<sup id="anchor4"><a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#fn4">4</a></sup>. Note that it’s illegal to declare a variable twice in the same local scope<sup id="anchor5"><a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#fn5">5</a></sup>, as in the following code snippet:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kt">int</span> <span class="n">a</span><span class="p">;</span>
<span class="kt">int</span> <span class="n">a</span><span class="p">;</span>
</code></pre></div></div>

<p>So your program should fail if the variable is already in the variable map.
Here’s how you might generate assembly for the statement <code class="language-plaintext highlighter-rouge">int a = expression</code>:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>  if var_map.contains("a"):
    fail() //shouldn't declare a var twice
  generate_exp(expression)      // generate assembly to calculate e1 and move it to eax
  emit "    pushl %eax" // save initial value of "a" onto the stack
  var_map = var_map.put("a", stack_index) // record location of a in the variable map
  stack_index = stack_index - 4 // stack location of next address will be 4 bytes lower
</code></pre></div></div>

<p>A few points here:</p>

<ul>
  <li>If a variable isn’t initialized, you can just initialize it to 0. Or whatever you want, really.</li>
  <li>The variable map exists during code generation, not at runtime.</li>
  <li>You should <strong>definitely use an immutable data structure</strong> for your variable map. In the next post we’ll add <code class="language-plaintext highlighter-rouge">if</code> statements, and then we’ll have nested scopes; a variable declared inside an <code class="language-plaintext highlighter-rouge">if</code> block isn’t accessible outside it. If you have to worry about code from an inner scope messing with the variable map in an outer scope, you will not be a happy camper.</li>
</ul>

<h3 id="variable-assignment">Variable Assignment</h3>

<p>We can look up a variable’s location in memory in our map; to assign it a new value, just move that value to the right memory location.
Here’s how to handle <code class="language-plaintext highlighter-rouge">a = expression</code>:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>  generate_exp(expression) // generate assembly to calculate expression and move it to eax 
  var_offset = var_map.find("a") //if "a" isn't in the map, fail b/c it hasn't been declared yet
  emit "    movl %eax, {}(%ebp)".format(var_offset) //using python-style string formatting here
</code></pre></div></div>

<p>Note that the value of <code class="language-plaintext highlighter-rouge">expression</code> is still in EAX, so this assignment expression has the correct value.</p>

<h3 id="variable-reference">Variable Reference</h3>

<p>To refer to a variable in an expression, just copy it from the stack to EAX:</p>

<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>  var_offset = var_map.find("a") //find location of variable "a" on the stack
                                 //should fail if it hasn't been declared yet
  emit "    movl {}(%ebp), %eax".format(var_offset) //retrieve value of variable
</code></pre></div></div>

<h3 id="missing-return-statements">Missing Return Statements</h3>

<p>Now that we support multiple types of statements, we can successfully parse programs with no return statement at all:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kt">int</span> <span class="nf">main</span><span class="p">()</span> <span class="p">{</span>
  <span class="kt">int</span> <span class="n">a</span> <span class="o">=</span> <span class="mi">2</span><span class="p">;</span>
<span class="p">}</span>
</code></pre></div></div>

<p>What’s the expected behavior here? According to section 5.1.2.2.3 of the <a href="http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf">C11 standard</a>:</p>

<blockquote>
  <p>If the return type of the <code class="language-plaintext highlighter-rouge">main</code> function is a type compatible with <code class="language-plaintext highlighter-rouge">int</code>, a return from the
initial call to the <code class="language-plaintext highlighter-rouge">main</code> function is equivalent to calling the <code class="language-plaintext highlighter-rouge">exit</code> function with the value
returned by the <code class="language-plaintext highlighter-rouge">main</code> function as its argument; reaching the <code class="language-plaintext highlighter-rouge">}</code> that terminates the
<code class="language-plaintext highlighter-rouge">main</code> function returns a value of 0.</p>
</blockquote>

<p>So, <code class="language-plaintext highlighter-rouge">main</code> needs to return 0 if it’s missing a return statement. Right now <code class="language-plaintext highlighter-rouge">main</code> is our only function, so that’s the only case we need to handle.</p>

<p>Eventually, we’ll need to deal with this problem in functions other than <code class="language-plaintext highlighter-rouge">main</code>. Here’s what section 6.9.1 of the standard says about missing return statements in general:</p>

<blockquote>
  <p>If the <code class="language-plaintext highlighter-rouge">}</code> that terminates a function is reached, and the value of the function call is used by the caller, the behavior is undefined.</p>
</blockquote>

<p>So this program has undefined behavior:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kt">int</span> <span class="nf">foo</span><span class="p">()</span> <span class="p">{</span>
  <span class="mi">1</span> <span class="o">+</span> <span class="mi">1</span><span class="p">;</span>
<span class="p">}</span>

<span class="kt">int</span> <span class="nf">main</span><span class="p">()</span> <span class="p">{</span>
  <span class="k">return</span> <span class="n">foo</span><span class="p">();</span>
<span class="p">}</span>
</code></pre></div></div>

<p>You could technically handle this however you want – fail, continue silently, issue a <a href="https://en.wikipedia.org/wiki/Halt_and_Catch_Fire">HALT AND CATCH FIRE</a> instruction.</p>

<p>This program, on the other hand, is perfectly valid, because the value returned from <code class="language-plaintext highlighter-rouge">foo()</code> is never used:</p>

<div class="language-c highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="kt">int</span> <span class="nf">foo</span><span class="p">()</span> <span class="p">{</span>
  <span class="mi">1</span> <span class="o">+</span> <span class="mi">1</span><span class="p">;</span>
<span class="p">}</span>

<span class="kt">int</span> <span class="nf">main</span><span class="p">()</span> <span class="p">{</span>
  <span class="n">foo</span><span class="p">();</span>
  <span class="k">return</span> <span class="mi">0</span><span class="p">;</span>
<span class="p">}</span>
</code></pre></div></div>

<p>Honestly, the specification here seems really dumb to me. If I write a non-<code class="language-plaintext highlighter-rouge">void</code> function without a return statement, that is WRONG and I want the compiler to save me from myself, even if I haven’t technically used it in an illegal way yet. I can’t think of any situation where we’d want this behavior; if you can, please let me know.</p>

<p>However, that’s the spec, so our functions have to return successfully even when they’re missing a return statement. That means you need to issue the function epilogue and <code class="language-plaintext highlighter-rouge">ret</code> instruction even if the return statement is missing. It’s probably easiest to handle <code class="language-plaintext highlighter-rouge">main</code> and all other functions uniformly, so you can just return 0 from any function without a return statement.</p>

<h4 id="-task-2">☑ Task:</h4>

<p>Update your code-generation pass to:</p>
<ul>
  <li>Generate function prologues and epilogues.</li>
  <li>Generate correct code for variable declarations, assignments, and references.</li>
  <li>Make <code class="language-plaintext highlighter-rouge">main</code> return 0 even if the return statement is missing.</li>
</ul>

<p>Your code should succeed on all valid examples and fail on all invalid examples for stages 1-5.</p>

<h2 id="bonus-features">Bonus features</h2>

<p>At this point, there are a handful of other features you can implement pretty easily:</p>

<h3 id="compound-assignment-operators">Compound Assignment Operators</h3>

<ul>
  <li><code class="language-plaintext highlighter-rouge">+=</code></li>
  <li><code class="language-plaintext highlighter-rouge">-=</code></li>
  <li><code class="language-plaintext highlighter-rouge">/=</code></li>
  <li><code class="language-plaintext highlighter-rouge">*=</code></li>
  <li><code class="language-plaintext highlighter-rouge">%=</code></li>
  <li><code class="language-plaintext highlighter-rouge">&lt;&lt;=</code></li>
  <li><code class="language-plaintext highlighter-rouge">&gt;&gt;=</code></li>
  <li><code class="language-plaintext highlighter-rouge">&amp;=</code></li>
  <li><code class="language-plaintext highlighter-rouge">|=</code></li>
  <li><code class="language-plaintext highlighter-rouge">^=</code></li>
</ul>

<h3 id="comma-operators">Comma Operators</h3>

<ul>
  <li><code class="language-plaintext highlighter-rouge">e1, e2</code>. The result is the value of e2; the value of e1 is ignored.</li>
</ul>

<h3 id="incrementdecrement-operators">Increment/Decrement Operators</h3>

<ul>
  <li>Prefix and postfix  <code class="language-plaintext highlighter-rouge">++</code></li>
  <li>Prefix and postfix <code class="language-plaintext highlighter-rouge">--</code></li>
</ul>

<p>This week’s tests don’t cover these, so it’s up to you whether to implement them or skip them.</p>

<h2 id="up-next">Up Next</h2>

<p>I’m going to switch to one blog post every two weeks. In the <a href="https://norasandler.com/2018/02/25/Write-a-Compiler-6.html">next post</a>, we’ll add <code class="language-plaintext highlighter-rouge">if</code> statements and conditional operators (<code class="language-plaintext highlighter-rouge">a ? b : c</code>). See you then!</p>

<h2 id="update-112">Update 1/12</h2>

<ul>
  <li>
    <p>Corrected the “Missing Return Statements” section, which previously said that the behavior of <code class="language-plaintext highlighter-rouge">main</code> is undefined when it’s missing a return statement. Also updated the test suite accordingly.</p>
  </li>
  <li>
    <p>Clarified that declaring a variable multiple times is sometimes legal at file scope.</p>
  </li>
</ul>

<p>Thanks to <a href="http://ouah.org/ogay/">Olivier Gay</a> for pointing out both those things.</p>

<p><em>If you have any questions, corrections, or other feedback, you can <a href="mailto:nora@norasandler.com">email me</a> or <a href="https://github.com/nlsandler/write_a_c_compiler/issues">open an issue</a>.</em></p>

<div class="footnote">
  <p><sup id="fn1">1</sup>
Keep in mind that the stack grows <em>down</em> towards lower addresses; we decrement ESP whenever we push things onto the stack, and ESP will always hold a lower value than EBP. So the top of the stack is really…on the bottom ¯_(ツ)_/¯ <a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#anchor1">↩</a></p>
</div>

<div class="footnote">
  <p><sup id="fn2">2</sup>
Even though <code class="language-plaintext highlighter-rouge">main</code> is the only function, it still has a caller: it’s called by the setup routine, <code class="language-plaintext highlighter-rouge">crt0</code>. <a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#anchor2">↩</a></p>
</div>

<div class="footnote">
  <p><sup id="fn3">3</sup>
We don’t really need to keep track of the stack index, since we can just derive it from the size of the variable map.
However, the stack index will come in handy once we add types other than <code class="language-plaintext highlighter-rouge">int</code>, since at that point our variables won’t all be the same size.
If you don’t want to keep track of it for now, that’s fine with me. <a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#anchor3">↩</a></p>
</div>

<div class="footnote">
  <p><sup id="fn4">4</sup>
This is not at all how real compilers work;
they usually allocate space for local variables all at once in the function prologue,
or just store them in registers.
Our way is less effort, though. <a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#anchor4">↩</a></p>
</div>

<div class="footnote">
  <p><sup id="fn5">5</sup>
It’s sometimes legal to declare a variable at file scope, per section 6.9.2 of the C11 specification. <a href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html#anchor5">↩</a></p>
</div>

  </div><a class="u-url" href="https://norasandler.com/2018/01/08/Write-a-Compiler-5.html" hidden=""></a>
</article>

      </div>
    </main>

    <footer class="site-footer">

  <div class="wrapper">
      <div class="footer-col-wrapper">
        <div class="footer-col footer-col-1">
            <div class="rc-scout" data-scout-rendered="true"><p class="rc-scout__text"><i class="rc-scout__logo"></i> Want to become a better programmer? <a class="rc-scout__link" href="https://www.recurse.com/scout/click?t=8f520efbc4be09fb83a71920f53a07b7">Join the Recurse Center!</a></p></div><script async="" defer="" src="./Writing a C Compiler, Part 5_files/loader.js"></script>
        </div>
      </div>
    <div class="footer-col-wrapper">
      <div class="footer-col footer-col-1">
        © 2023 Nora Sandler.
      </div>
    </div>
  </div>

</footer>


  


<script async="" src="./Writing a C Compiler, Part 5_files/scout-176be16681a03cbddd686f3cc96694d3aab338ea5cb65452f83d989309810528.js"></script><style class="rc-scout__style" type="text/css">.rc-scout {
  display: block;
  padding: 0;
  border: 0;
  margin: 0;
}
.rc-scout__text {
  display: block;
  padding: 0;
  border: 0;
  margin: 0;
  height: 100%;
  font-size: 100%;
}
.rc-scout__logo {
  display: inline-block;
  padding: 0;
  border: 0;
  margin: 0;
  width: 0.85em;
  height: 0.85em;
  background: no-repeat center url('data:image/svg+xml;utf8,%3Csvg%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%20viewBox%3D%220%200%2012%2015%22%3E%3Crect%20x%3D%220%22%20y%3D%220%22%20width%3D%2212%22%20height%3D%2210%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%221%22%20y%3D%221%22%20width%3D%2210%22%20height%3D%228%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%222%22%20y%3D%222%22%20width%3D%228%22%20height%3D%226%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%222%22%20y%3D%223%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%224%22%20y%3D%223%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%226%22%20y%3D%223%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%223%22%20y%3D%225%22%20width%3D%222%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%226%22%20y%3D%225%22%20width%3D%222%22%20height%3D%221%22%20fill%3D%22%233dc06c%22%3E%3C%2Frect%3E%3Crect%20x%3D%224%22%20y%3D%229%22%20width%3D%224%22%20height%3D%223%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%221%22%20y%3D%2211%22%20width%3D%2210%22%20height%3D%224%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%220%22%20y%3D%2212%22%20width%3D%2212%22%20height%3D%223%22%20fill%3D%22%23000%22%3E%3C%2Frect%3E%3Crect%20x%3D%222%22%20y%3D%2213%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%223%22%20y%3D%2212%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%224%22%20y%3D%2213%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%225%22%20y%3D%2212%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%226%22%20y%3D%2213%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%227%22%20y%3D%2212%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%228%22%20y%3D%2213%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3Crect%20x%3D%229%22%20y%3D%2212%22%20width%3D%221%22%20height%3D%221%22%20fill%3D%22%23fff%22%3E%3C%2Frect%3E%3C%2Fsvg%3E');
}
.rc-scout__link:link, .rc-scout__link:visited {
  color: #3dc06c;
  text-decoration: underline;
}
.rc-scout__link:hover, .rc-scout__link:active {
  color: #4e8b1d;
}
</style></body></html>