1 files changed, 679 insertions, 0 deletions
diff --git a/miniany/doc/blog.jeff.over.bz_assembly_compilers_jit_2017_01_15_x86-assembler.txt b/miniany/doc/blog.jeff.over.bz_assembly_compilers_jit_2017_01_15_x86-assembler.txt
new file mode 100644
index 0000000..f712d28
--- /dev/null
+++ b/miniany/doc/blog.jeff.over.bz_assembly_compilers_jit_2017_01_15_x86-assembler.txt
@@ -0,0 +1,679 @@
+   #[1]256LOL RSS Feed
+
+[2][headshot-circle.png]
+
+   256 LINES OR LESS
+   (jeff overbey's blog)
+     * [3]22 Apr 2018 Building a Go Doctor Refactoring
+     * [4]09 Sep 2017 Lexical Analysis
+     * [5]01 Jun 2017 On Performance Improvements
+     * [6]30 Mar 2017 Executing Dynamically Generated Machine Code: The
+       Start of a JIT
+     * [7]15 Feb 2017 Finding Machine Language Encodings
+     * [8]15 Jan 2017 An x86 Assembler in 256 LOC
+     * [9]15 Dec 2016 256 Lines or Less (the joverblog?)
+
+   [10]RSS Feed o [11]My Web Site
+   [sidebar-button.png]
+   [empty.jpg]
+
+An x86 Assembler in 256 LOC
+
+   15 Jan 2017
+
+   For the first "real" post in this blog, we'll build an x86 assembler in
+   less than 256 lines of C code. Obviously, it won't implement every x86
+   instruction, but it will implement a surprisingly useful subset: data
+   movement, control flow, integer arithmetic, bitwise operations, and
+   function calls. We won't be able to run the generated machine code yet
+   (that's coming in a later blog post), but we'll be in a good position
+   to do so.
+
+   I'll assume you're already familiar with x86 assembly language
+   (hopefully the table below will serve as a brief refresher), although I
+   won't assume you know about its machine language encodings. I'll also
+   assume that you're familiar with hexadecimal representation and
+   arithmetic (e.g., 9 + 1 = A and 10 - 1 = F).
+
+1. Which instructions will we support?
+
+   By the time we finish, we'll have an assembler that supports all of the
+   following x86 instructions (yes, I'm serious):
+   Instruction    Example    Description of the Example
+   nop   nop   No operation (do nothing)
+           -- Data Movement --
+   mov register, immediate   mov eax, 0F00Dh   Place the value F00D
+   (hexadecimal) in EAX
+   mov register, register   mov eax, ebx   Copy the value from the EBX
+   register into EAX
+   mov register, [register]   mov eax, [ebx]   Treat EBX as pointer; load
+   32-bit value from memory into EAX
+   mov [register], register   mov [eax], ebx   Treat EAX as pointer; store
+   32-bit value from EBX in memory
+           -- Arithmetic --
+   add register, register   add eax, ebx   EAX = EAX + EBX
+   cdq   cdq   Sign-extend EAX into EDX in preparation for idiv
+   dec register   dec eax   EAX = EAX - 1
+   div register   div ebx   Unsigned division: EDX:EAX ÷ EBX,
+   setting EAX = quotient, EDX = remainder
+   idiv register   idiv ebx   Signed division: EDX:EAX ÷ EBX,
+   setting EAX = quotient, EDX = remainder
+   imul register   imul ebx   Signed multiplication: EDX:EAX = EAX × EBX
+   inc register   inc eax   EAX = EAX + 1
+   neg register   neg eax   EAX = -EAX
+   mul register   mul ebx   Unsigned multiplication: EDX:EAX = EAX × EBX
+   sub register, register   sub eax, ebx   EAX = EAX - EBX
+           -- Bitwise Operations --
+   and register, register   and eax, ebx   EAX = EAX & EBX
+   not register   not eax   EAX = ~EAX
+   or register, register   or eax, ebx   EAX = EAX | EBX
+   sar register, immediate   sar eax, 2   Shift EAX right by 2 bits
+   (sign-fill)
+   sar register, cl   sar eax, cl   Shift EAX right by CL bits (sign-fill)
+   shl register, immediate   shl eax, 2   Shift EAX left by 2 bits
+   shl register, cl   shl eax, cl   Shift EAX left by number of bits in CL
+   shr register, immediate   shr eax, 2   Shift EAX right by 2 bits
+   (zero-fill)
+   shr register, cl   shr eax, cl   Shift EAX right by CL bits (zero-fill)
+   xor register, register   xor eax, ebx   EAX = EAX ^ EBX
+           -- Comparison --
+   cmp register, register   cmp eax, ebx   Compare EAX to EBX, setting
+   flags for conditional jump
+           -- Control Flow --
+   jmp bytes   jmp -10   Jump -10 bytes, i.e., move EIP backward by 10
+   bytes
+   ja bytes   ja -10   Jump if above (>, unsigned)
+   jae bytes   jae -10   Jump if above or equal (>=, unsigned)
+   jb bytes   jb -10   Jump if below (<, unsigned)
+   jbe bytes   jbe -10   Jump if below or equal (<=, unsigned)
+   je bytes   je -10   Jump if equal
+   jg bytes   jg -10   Jump if greater (>, signed)
+   jge bytes   jge -10   Jump if greater or equal (>=, signed)
+   jl bytes   jl -10   Jump if less (<, signed)
+   jle bytes   jle -10   Jump if less or equal (<=, signed)
+   jne bytes   jne -10   Jump if not equal
+           -- Function Calls --
+   call register   call eax   Call function at pointer stored in EAX
+   push register   push eax   Push value of EAX onto the stack
+   pop register   pop eax   Pop a value from the stack into EAX
+   ret immediate   ret 4   Return from function, removing 4 bytes of stack
+   arguments
+
+2. The API: x86asm.h
+
+   The header file, x86asm.h, defines the API that we intend for clients
+   to use. It provides
+     * an enumeration of the x86's 32-bit registers (reg32_t), and
+     * one function for each instruction form we can assemble.
+
+   Here's the header in its entirety. (There's more explanation in the
+   next section, but it will be helpful to read through the header file
+   first.)
+// x86 Subset Assembler - API
+//-----------------------------------------------------------------------------
+// Copyright (C) 2017 Jeffrey L. Overbey.  Use of this source code is governed
+// by a BSD-style license posted at http://blog.jeff.over.bz/license/
+
+#ifndef X86ASM_H
+#define X86ASM_H
+
+#include <stdint.h> // uint8_t, uint16_t, int32_t
+
+typedef enum { EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI } reg32_t;
+
+uint8_t *nop(uint8_t *buf);
+
+uint8_t *mov_immediate(reg32_t dest, int32_t value, uint8_t *buf);
+uint8_t * mov_from_ptr(reg32_t dest, reg32_t src, uint8_t *buf);
+uint8_t *   mov_to_ptr(reg32_t dest, reg32_t src, uint8_t *buf);
+uint8_t *          mov(reg32_t dest, reg32_t src, uint8_t *buf);
+
+uint8_t *  add(reg32_t dest, reg32_t src, uint8_t *buf);
+uint8_t *  sub(reg32_t dest, reg32_t src, uint8_t *buf);
+uint8_t *  and(reg32_t dest, reg32_t src, uint8_t *buf);
+uint8_t *   or(reg32_t dest, reg32_t src, uint8_t *buf);
+uint8_t *  xor(reg32_t dest, reg32_t src, uint8_t *buf);
+uint8_t *  cmp(reg32_t dest, reg32_t src, uint8_t *buf);
+uint8_t *  inc(reg32_t reg, uint8_t *buf);
+uint8_t *  dec(reg32_t reg, uint8_t *buf);
+uint8_t *  not(reg32_t reg, uint8_t *buf);
+uint8_t *  neg(reg32_t reg, uint8_t *buf);
+uint8_t *  mul(reg32_t reg, uint8_t *buf);
+uint8_t * imul(reg32_t reg, uint8_t *buf);
+uint8_t * div_(reg32_t reg, uint8_t *buf);
+uint8_t * idiv(reg32_t reg, uint8_t *buf);
+uint8_t *  cdq(uint8_t *buf);
+
+uint8_t *   shl(reg32_t reg, uint8_t bits, uint8_t *buf);
+uint8_t *shl_cl(reg32_t reg, uint8_t *buf);
+uint8_t *   shr(reg32_t reg, uint8_t bits, uint8_t *buf);
+uint8_t *shr_cl(reg32_t reg, uint8_t *buf);
+uint8_t *   sar(reg32_t reg, uint8_t bits, uint8_t *buf);
+uint8_t *sar_cl(reg32_t reg, uint8_t *buf);
+
+uint8_t *push(reg32_t reg, uint8_t *buf);
+uint8_t * pop(reg32_t reg, uint8_t *buf);
+uint8_t *call(reg32_t reg, uint8_t *buf);
+uint8_t * ret(uint16_t bytes, uint8_t *buf);
+
+uint8_t * jmp(int32_t relative_bytes, uint8_t *buf);
+uint8_t *  jb(int32_t relative_bytes, uint8_t *buf);
+uint8_t * jae(int32_t relative_bytes, uint8_t *buf);
+uint8_t *  je(int32_t relative_bytes, uint8_t *buf);
+uint8_t * jne(int32_t relative_bytes, uint8_t *buf);
+uint8_t * jbe(int32_t relative_bytes, uint8_t *buf);
+uint8_t *  ja(int32_t relative_bytes, uint8_t *buf);
+uint8_t *  jl(int32_t relative_bytes, uint8_t *buf);
+uint8_t * jge(int32_t relative_bytes, uint8_t *buf);
+uint8_t * jle(int32_t relative_bytes, uint8_t *buf);
+uint8_t *  jg(int32_t relative_bytes, uint8_t *buf);
+
+#endif
+
+3. The demo program: demo.c
+
+   Before delving into the implementation of the assembler, it's probably
+   helpful to show how this API is used.
+
+   Each function in our API takes a uint8_t pointer buf, writes the
+   byte(s) of machine code for a single assembly language instruction to
+   memory starting at that address, then returns a pointer to the next
+   byte after the instruction that was just assembled.
+
+   For example, the instruction mov eax, 12345678h is assembled into five
+   bytes of machine code: b8 78 56 34 12. Calling mov_immediate(EAX,
+   0x12345678, buf) stores these five bytes into memory at the location
+   pointed to by buf, then it returns buf+5, which is presumably where
+   you'll want to store the next instruction.
+
+   Next, suppose you want to assemble the following
+   three-instruction program.
+mov eax, 120h
+add eax, ecx
+shl eax, 4
+
+   The following program illustrates how to assemble this sequence of
+   three instructions, then write the byte values of the resulting machine
+   code to standard output:
+#include "x86asm.h"
+#include <stdio.h>
+
+int main() {
+        uint8_t bytes[64];
+        uint8_t *cur = bytes;
+        cur = mov_immediate(EAX, 0x120, cur);  // mov eax, 120h
+        cur = add(EAX, ECX, cur);              // add eax, ecx
+        cur = shl(EAX, 4, cur);                // shl eax, 4
+
+        for (uint8_t *p = bytes; p < cur; p++) {
+                printf("%02x ", *p);
+        }
+        printf("\n");
+        return 0;
+}
+
+   When you run this, the output is:
+b8 20 01 00 00 03 c1 c1 e0 04
+
+4. The implementation: x86asm.c
+
+   Now, we'll start implementing this API. For each instruction, I'll
+   describe its machine language encoding, and then the C function that
+   implements it.
+
+   The definitive, official reference for the x86 instruction set and its
+   machine language encoding is Volume 2 of the [12]Intel® 64 and IA-32
+   Architectures Software Developer Manuals. Unfortunately, Intel's
+   documentation is not easy to read, so for this small assembler, it will
+   be sufficient to simply describe the encodings by example.
+
+No operation - nop
+
+   The nop instruction assembles to a single byte of machine code: 90h.
+uint8_t *nop(uint8_t *buf) {
+        *buf++ = 0x90;
+        return buf;
+}
+
+Increment and decrement - inc, dec
+
+   The inc instruction adds 1 to a value in a register; dec subtracts 1.
+   Recall from the header file above (x86asm.h) that we defined an
+   enumeration with all of the x86's 32-bit registers.
+typedef enum { EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI } reg32_t;
+
+   There's a reason we listed the registers in this specific order: when
+   instructions take register operands, the encodings tend to follow this
+   same order. Notice the pattern in the encodings of the inc and dec
+   instructions:
+   Instruction    Encoding (hex)          Instruction    Encoding (hex)
+   inc eax        40                      dec eax        48
+   inc ecx        41                      dec ecx        49
+   inc edx        42                      dec edx        4A
+   ...                                    ...
+   inc edi        47                      dec edi        4F
+
+   Since our reg32_t enum assigns an integer value to each register name
+   (EAX=0, ECX=1, EDX=2, etc.), this means we can encode inc register by
+   simply adding the register number to hexadecimal 40.
+uint8_t *inc(reg32_t reg, uint8_t *buf) {
+        *buf++ = 0x40 + reg;
+        return buf;
+}
+
+uint8_t *dec(reg32_t reg, uint8_t *buf) {
+        *buf++ = 0x48 + reg;
+        return buf;
+}
+
+   (It's more conventional to describe encodings in terms of which bits in
+   the encoding represent the operand register. For example, see Volume 2,
+   Appendix B of the Intel documentation referenced above. From that
+   perspective, it might make more sense to build encodings using bitwise
+   operations. However, I'm writing this blog post from the perspective of
+   "look at the pattern and implement it;" adding values seems more
+   intuitive and produces the same result.)
+
+Move immediate value to register - mov reg, imm
+
+   The following table shows the encodings for mov reg, 1 and
+   mov reg, 12345678h. Notice the pattern?
+   Instruction    Encoding (hex)             Instruction        Encoding (hex)
+   mov eax, 1     B8 01 00 00 00          mov eax, 12345678h    B8 78 56 34 12
+   mov ecx, 1     B9 01 00 00 00          mov ecx, 12345678h    B9 78 56 34 12
+   mov edx, 1     BA 01 00 00 00          mov edx, 12345678h    BA 78 56 34 12
+   ...                                    ...
+   mov edi, 1     BF 01 00 00 00          mov edi, 12345678h    BF 78 56 34 12
+
+   While the inc and dec instructions had 1-byte encodings, the encoding
+   here is always 5 bytes. The first byte of the encoding is B8 + the
+   register number. The next four bytes are the immediate value in little
+   endian order, i.e., with the low-order byte first. Assuming the
+   assembler will be run on an x86/x64 processor, which uses little endian
+   byte ordering natively, nothing special needs to be done to reorder the
+   bytes--storing a 32-bit value in memory will store the bytes in little
+   endian order.
+uint8_t *mov_immediate(reg32_t dest, int32_t value, uint8_t *buf) {
+        *buf++ = 0xB8 + dest;
+        *((int32_t *)buf) = value; buf += sizeof(int32_t);
+        return buf;
+}
+
+Load value from memory - mov reg, DWORD PTR [reg]
+
+   So far, our instructions have had straightforward encodings with
+   reasonably obvious patterns. This one gets a bit more interesting.
+         Instruction           Encoding (hex)
+   mov eax, DWORD PTR [eax]    8B 00
+   mov eax, DWORD PTR [ecx]    8B 01
+   mov eax, DWORD PTR [edx]    8B 02
+   mov eax, DWORD PTR [ebx]    8B 03
+   mov eax, DWORD PTR [esp]    8B 04 24
+   mov eax, DWORD PTR [ebp]    8B 45 00
+   mov eax, DWORD PTR [esi]    8B 06
+   mov eax, DWORD PTR [edi]    8B 07
+
+   mov ecx, DWORD PTR [eax]    8B 08
+   mov ecx, DWORD PTR [ecx]    8B 09
+   mov ecx, DWORD PTR [edx]    8B 0A
+   mov ecx, DWORD PTR [ebx]    8B 0B
+   mov ecx, DWORD PTR [esp]    8B 0C 24
+   mov ecx, DWORD PTR [ebp]    8B 4D 00
+   mov ecx, DWORD PTR [esi]    8B 0E
+   mov ecx, DWORD PTR [edi]    8B 0F
+
+   mov edx, DWORD PTR [eax]    8B 10
+   mov edx, DWORD PTR [ecx]    8B 11
+   ...
+   mov edi, DWORD PTR [edi]    8B 3F
+
+   This form of the mov instruction has a two-byte encoding with a fairly
+   obvious pattern, except when the source operand is ESP or EBP... then
+   it's a three-byte encoding with a not-so-obvious pattern.^1
+uint8_t *mov_from_ptr(reg32_t dest, reg32_t src, uint8_t *buf) {
+        *buf++ = 0x8B;
+        if (src == ESP) {
+                *buf++ = 8*dest + src;
+                *buf++ = 0x24;
+        } else if (src == EBP) {
+                *buf++ = 0x45 + 8*dest;
+                *buf++ = 0x00;
+        } else {
+                *buf++ = 8*dest + src;
+        }
+        return buf;
+}
+
+Store value into memory - mov DWORD PTR [reg], reg
+
+   When mov is used to store a value in memory, the encodings are almost
+   identical to the encodings for loading a value from memory, except the
+   first byte is 89h and the source and destination operands are reversed
+   when encoding the second byte.
+         Instruction           Encoding (hex)
+   mov DWORD PTR [eax], eax    89 00
+   mov DWORD PTR [ecx], eax    89 01
+   mov DWORD PTR [edx], eax    89 02
+   mov DWORD PTR [ebx], eax    89 03
+   mov DWORD PTR [esp], eax    89 04 24
+   mov DWORD PTR [ebp], eax    89 45 00
+   mov DWORD PTR [esi], eax    89 06
+   mov DWORD PTR [edi], eax    89 07
+   mov DWORD PTR [eax], ecx    89 08
+   mov DWORD PTR [ecx], ecx    89 09
+   mov DWORD PTR [edx], ecx    89 0A
+   mov DWORD PTR [ebx], ecx    89 0B
+   mov DWORD PTR [esp], ecx    89 0C 24
+   mov DWORD PTR [ebp], ecx    89 4D 00
+   mov DWORD PTR [esi], ecx    89 0E
+   mov DWORD PTR [edi], ecx    89 0F
+   mov DWORD PTR [eax], edx    89 10
+   mov DWORD PTR [ecx], edx    89 11
+   ...
+   mov DWORD PTR [edi], edi    89 3F
+uint8_t *mov_to_ptr(reg32_t dest, reg32_t src, uint8_t *buf) {
+        *buf++ = 0x89;
+        if (dest == ESP) {
+                *buf++ = 8*src + dest;
+                *buf++ = 0x24;
+        } else if (dest == EBP) {
+                *buf++ = 0x45 + 8*src;
+                *buf++ = 0x00;
+        } else {
+                *buf++ = 8*src + dest;
+        }
+        return buf;
+}
+
+RM-encoded instructions: mov, add, sub, and, or, xor, cmp
+
+   Next, we will tackle register-register mov, as well as add, sub, and,
+   or, xor, and cmp. All of these instructions have a similar encoding: an
+   opcode byte (that differs from one instruction to the next - hence the
+   name, "operation code"), followed by a single byte indicating the
+   source and destination registers.
+
+   To see the pattern, consider mov and add:
+   Instruction     Encoding (hex)          Instruction     Encoding (hex)
+   mov eax, eax    8B C0                   add eax, eax    03 C0
+   mov eax, ecx    8B C1                   add eax, ecx    03 C1
+   mov eax, edx    8B C2                   add eax, edx    03 C2
+   ...                                     ...
+   mov eax, edi    8B C7                   add eax, edi    03 C7
+   mov ecx, eax    8B C8                   add ecx, eax    03 C8
+   mov ecx, ecx    8B C9                   add ecx, ecx    03 C9
+   ...                                     ...
+   mov ecx, edi    8B CF                   add ecx, edi    03 CF
+   mov edx, eax    8B D0                   add edx, eax    03 D0
+   ...                                     ...
+   mov edi, edi    8B FF                   add edi, edi    03 FF
+
+   The second byte of the encoding is hex C0, plus 8 times the destination
+   register number, plus the source register number.
+#define DEFINE_INSN_RM(mnemonic, opcode)                     \
+uint8_t *mnemonic(reg32_t dest, reg32_t src, uint8_t *buf) { \
+        *buf++ = opcode;                                     \
+        *buf++ = 8*dest + 0xC0 + src;                        \
+        return buf;                                          \
+}
+
+DEFINE_INSN_RM(mov, 0x8B)
+DEFINE_INSN_RM(add, 0x03)
+DEFINE_INSN_RM(sub, 0x2B)
+DEFINE_INSN_RM(and, 0x23)
+DEFINE_INSN_RM( or, 0x0B)
+DEFINE_INSN_RM(xor, 0x33)
+DEFINE_INSN_RM(cmp, 0x3B)
+
+Instructions with opcodes beginning with F7: not, neg, mul, imul, div, idiv
+
+   The not, neg, mul, imul, div, and idiv instructions also have similar
+   encodings. The first byte of the encoding is F7. The second byte
+   indicates both the operation and the operand (register).
+   Instruction    Encoding (hex)          Instruction    Encoding (hex)
+   not eax        F7 D0                   neg eax        F7 D8
+   not ecx        F7 D1                   neg ecx        F7 D9
+   ...
+   not edi        F7 D7                   neg edi        F7 DF
+
+   As a note, we named the C function for the div instruction div_, since
+   the C standard library's stdlib.h already declares the [13]div(3) function.
+#define DEFINE_INSN_F7(mnemonic, reg_base)     \
+uint8_t *mnemonic(reg32_t reg, uint8_t *buf) { \
+        *buf++ = 0xF7;                         \
+        *buf++ = reg_base + reg;               \
+        return buf;                            \
+}
+
+DEFINE_INSN_F7( not, 0xD0)
+DEFINE_INSN_F7( neg, 0xD8)
+DEFINE_INSN_F7( mul, 0xE0)
+DEFINE_INSN_F7(imul, 0xE8)
+DEFINE_INSN_F7(div_, 0xF0)
+DEFINE_INSN_F7(idiv, 0xF8)
+
+Convert doubleword to quadword - cdq
+
+   Both the div and idiv instructions take a 64-bit dividend (with the
+   high 32 bits in EDX and the low 32 bits in EAX) and divide it by a
+   32-bit divisor (the register operand). To divide two 32-bit values, the
+   dividend must be extended to 64 bits. For unsigned division (div), this
+   is easy: mov edx, 0. For signed division (idiv), the 32-bit value must
+   be sign-extended to 64 bits. This is done by the cdq instruction: it
+   copies the sign bit of EAX into all 32 bits of EDX.
+uint8_t *cdq(uint8_t *buf) {
+        *buf++ = 0x99;
+        return buf;
+}
+
+Bit shift instructions - shl, shr, sar
+
+   The bit shift instructions are interesting for two reasons:
+     * The number of bits to shift can be an immediate value (0-255), or
+       it can be stored in the CL register (another name for the lowest 8
+       bits of the ECX register).
+     * The encoding for a one-bit shift is different.
+
+   Using the left shift instruction as an example:
+    Instruction     Encoding (hex)          Instruction    Encoding (hex)
+   shl eax, 0       C1 E0 00                shl eax, cl    D3 E0
+   shl eax, 1       D1 E0                   shl ecx, cl    D3 E1
+   shl eax, 2       C1 E0 02                shl edx, cl    D3 E2
+   shl eax, 3       C1 E0 03                shl ebx, cl    D3 E3
+   ...                                      ...
+   shl eax, 0FFh    C1 E0 FF
+   shl ecx, 0       C1 E1 00
+   shl ecx, 1       D1 E1
+   shl ecx, 2       C1 E1 02
+   shl ecx, 3       C1 E1 03
+   ...
+   shl ecx, 0FFh    C1 E1 FF
+   ...
+
+   We can implement this in our assembler as follows.
+#define DEFINE_INSN_D1C1(mnemonic, reg_base)                  \
+uint8_t *mnemonic(reg32_t reg, uint8_t bits, uint8_t *buf) {  \
+        switch (bits) {                                       \
+        case 1: /* 1-bit shifts have a different opcode */    \
+                *buf++ = 0xD1;                                \
+                *buf++ = reg_base + reg;                      \
+                break;                                        \
+        default:                                              \
+                *buf++ = 0xC1;                                \
+                *buf++ = reg_base + reg;                      \
+                *buf++ = bits;                                \
+        }                                                     \
+        return buf;                                           \
+}                                                             \
+uint8_t *mnemonic##_cl(reg32_t reg, uint8_t *buf) {           \
+        *buf++ = 0xD3;                                        \
+        *buf++ = reg_base + reg;                              \
+        return buf;                                           \
+}
+
+DEFINE_INSN_D1C1(shl, 0xE0)
+DEFINE_INSN_D1C1(shr, 0xE8)
+DEFINE_INSN_D1C1(sar, 0xF8)
+
+Procedure calls: push, pop, call, ret
+
+   The push, pop, call, and ret instructions are the four essential
+   instructions for procedure calls. Their encodings follow similar
+   patterns to those we've already seen, except with different opcode
+   bytes.
+uint8_t *push(reg32_t reg, uint8_t *buf) {
+        *buf++ = 0x50 + reg;
+        return buf;
+}
+
+uint8_t *pop(reg32_t reg, uint8_t *buf) {
+        *buf++ = 0x58 + reg;
+        return buf;
+}
+
+uint8_t *call(reg32_t reg, uint8_t *buf) {
+        *buf++ = 0xFF;
+        *buf++ = 0xD0 + reg;
+        return buf;
+}
+
+   The encoding of ret is only slightly more interesting, since ret 0
+   (which is often written as ret with no operand) is encoded differently
+   than ret with a nonzero immediate operand, such as ret 4 or ret 16.
+uint8_t *ret(uint16_t bytes, uint8_t *buf) {
+        if (bytes == 0) {
+                *buf++ = 0xC3;
+        } else {
+                *buf++ = 0xC2;
+                *((uint16_t *)buf) = bytes; buf += sizeof(uint16_t);
+        }
+        return buf;
+}
+
+Jumps
+
+   In x86 assembly language, jumps are usually written with labels. For
+   example:
+there: mov eax, 12345678h    ; b8 78 56 34 12
+       jmp there             ; eb f9
+       nop                   ; 90
+
+   Recall that the EIP register is the instruction pointer. When the
+   processor fetches an instruction to execute, it increments EIP to point
+   to the following instruction. A jump changes the value of EIP. In our
+   example, the effect of the jump is to move EIP backward by 7 bytes, so
+   it will point to the start of the mov instruction.
+                            EIP is here after the processor
+                            fetches the  "jmp there" instruction
+                            v
+B8  78  56  34  12  EB  F9  90
+^___________________________|
+We want to move it 7 bytes backward
+to place it here
+
+   So, how is jmp encoded? Hex F9 is the two's complement representation
+   of -7... so the encoding above (EB F9) is in essence "jump -7 bytes."
+
+   Complicating things slightly, the jmp instruction is encoded with an EB
+   opcode byte if the jump distance is between -128 and 127 bytes,
+   inclusive, and with an E9 opcode if the jump distance is larger than
+   that.
+uint8_t *jmp(int32_t bytes, uint8_t *buf) {
+        if (INT8_MIN <= bytes && bytes <= INT8_MAX) {
+                *buf++ = 0xEB;
+                *buf++ = (int8_t)bytes;
+        } else {
+                *buf++ = 0xE9;
+                *((int32_t *)buf) = bytes; buf += sizeof(int32_t);
+        }
+        return buf;
+}
+
+   Conditional jumps are encoded similarly, except with different opcodes,
+   of course.
+#define DEFINE_INSN_JCC(mnemonic, byte_opcode)                     \
+uint8_t *mnemonic(int32_t bytes, uint8_t *buf) {                   \
+        if (INT8_MIN <= bytes && bytes <= INT8_MAX) {              \
+                *buf++ = byte_opcode;                              \
+                *buf++ = (int8_t)bytes;                            \
+        } else {                                                   \
+                *buf++ = 0x0F;                                     \
+                *buf++ = byte_opcode + 0x10;                       \
+                *((int32_t *)buf) = bytes; buf += sizeof(int32_t); \
+        }                                                          \
+        return buf;                                                \
+}
+
+DEFINE_INSN_JCC( jb, 0x72)
+DEFINE_INSN_JCC(jae, 0x73)
+DEFINE_INSN_JCC( je, 0x74)
+DEFINE_INSN_JCC(jne, 0x75)
+DEFINE_INSN_JCC(jbe, 0x76)
+DEFINE_INSN_JCC( ja, 0x77)
+DEFINE_INSN_JCC( jl, 0x7C)
+DEFINE_INSN_JCC(jge, 0x7D)
+DEFINE_INSN_JCC(jle, 0x7E)
+DEFINE_INSN_JCC( jg, 0x7F)
+
+5. What's next?
+
+   So, we have a working x86 assembler. Not bad for 256 lines of code. You
+   can download the complete source code below.
+
+   In the next few posts, we'll:
+     * show how to test this assembler (are you sure it actually works?).
+     * show how to find the encodings of other instructions (in case you
+       want to extend this assembler).
+     * show how to actually execute the generated machine code.
+
+   At some point in the future - maybe not right away - I'd like to
+     * show how the Builder design pattern can make the assembler easier
+       to use.
+     * build an x64 assembler (since you're probably not running a 32-bit
+       machine).
+
+   But there are plenty of other non-assembler-related topics I'd like to
+   blog about, so let's see what actually materializes.
+
+Download the source code
+
+   Source Code:    [14]x86asm.h      69 lines
+                   [15]x86asm.c      171 lines
+                   [16]demo.c        16 lines
+                                     Total: 256 lines
+   Makefiles:      [17]GNUmakefile   (GNU Make on Linux/macOS)
+                   [18]Makefile      (NMAKE on Windows)
+
+   ^1 If you're familiar with the x86 encoding scheme, [EBP] is actually
+   encoded as [EBP+0] (i.e., EBP with an 8-bit displacement), and ESP is
+   encoded using the SIB byte.
+
+   Published on 15 Jan 2017 | 4019 words | Comments? [19]E-mail me!
+
+   Copyright © 2017 Jeffrey L. Overbey. All rights reserved. Except for
+   source code where an explicit license is given, no part of this blog
+   may be copied, reproduced, published, translated, or distributed, in
+   whole or in part, without the written permission of the copyright
+   owner.
+
+References
+
+   1. http://blog.jeff.over.bz/rss.xml
+   2. http://jeff.over.bz/
+   3. http://blog.jeff.over.bz/refactoring/golang/godoctor/2018/04/22/building-a-godoctor-refactoring.html
+   4. http://blog.jeff.over.bz/compilers/lexers/2017/09/09/lexical-analysis.html
+   5. http://blog.jeff.over.bz/performance/statistics/2017/06/01/on-performance-improvements.html
+   6. http://blog.jeff.over.bz/assembly/compilers/jit/2017/03/30/executing-dynamically-generated-machine-code.html
+   7. http://blog.jeff.over.bz/assembly/2017/02/15/finding-machine-language-encodings.html
+   8. http://blog.jeff.over.bz/assembly/compilers/jit/2017/01/15/x86-assembler.html
+   9. http://blog.jeff.over.bz/blog/2016/12/15/first-post.html
+  10. http://blog.jeff.over.bz/rss.xml
+  11. http://jeff.over.bz/
+  12. https://software.intel.com/en-us/articles/intel-sdm
+  13. https://linux.die.net/man/3/div
+  14. http://blog.jeff.over.bz/_posts/code/x86-assembler/x86asm.h
+  15. http://blog.jeff.over.bz/_posts/code/x86-assembler/x86asm.c
+  16. http://blog.jeff.over.bz/_posts/code/x86-assembler/demo.c
+  17. http://blog.jeff.over.bz/_posts/code/x86-assembler/GNUmakefile
+  18. http://blog.jeff.over.bz/_posts/code/x86-assembler/Makefile
+  19. http://www.google.com/recaptcha/mailhide/d?k=01Y7tF2jw9w9xLZucF314wPA==&c=q6vUJyh7NztFNzwTojRuECzwRgJ8lixJT3LvIi58VCM=