summaryrefslogtreecommitdiff
path: root/miniany/doc/blog.packagecloud.io_eng_2016_04_05_the-definitive-guide-to-linux-system-calls.txt
blob: 33d9cff96a9e4b05802e23216ef1ef4238d5971b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
    #[1]Packagecloud Blog

   [2]packagecloud[3]:blog
   (BUTTON) close

Never miss an update!

   ____________________ Sign up!
   ____________________
   [4]Subscribe to our RSS feed Already signed up? [ ]

   [5]back to top [6]back to posts

The Definitive Guide to Linux System Calls

   Apr 5, 2016 o packagecloud

Tags:

     * [7]linux

TL;DR

   This blog post explains how Linux programs call functions in the Linux
   kernel.

   It will outline several different methods of making system calls, how
   to handcraft your own assembly to make system calls (examples
   included), kernel entry points into system calls, kernel exit points
   from system calls, glibc wrappers, bugs, and much, much more.

   Create a package repository in less than 10 seconds, free.
   [8](BUTTON) Sign up!
     * [9]TL;DR
     * [10]What is a system call?
     * [11]Prerequisite information
          + [12]Hardware and software
          + [13]User programs, the kernel, and CPU privilege levels
          + [14]Interrupts
          + [15]Model Specific Registers (MSRs)
          + [16]Calling system calls with assembly is a bad idea
     * [17]Legacy system calls
          + [18]Using legacy system calls with your own assembly
          + [19]Kernel-side: int $0x80 entry point
          + [20]Returning from a legacy system call with iret
     * [21]Fast system calls
          + [22]32-bit fast system calls
               o [23]sysenter/sysexit
               o [24]__kernel_vsyscall internals
               o [25]Using sysenter system calls with your own assembly
               o [26]Kernel-side: sysenter entry point
               o [27]Returning from a sysenter system call with sysexit
          + [28]64-bit fast system calls
               o [29]syscall/sysret
               o [30]Using syscall system calls with your own assembly
               o [31]Kernel-side: syscall entry point
               o [32]Returning from a syscall system call with sysret
     * [33]Calling a syscall semi-manually with syscall(2)
          + [34]glibc syscall wrapper internals
     * [35]Virtual system calls
          + [36]vDSO in the kernel
          + [37]Locating the vDSO in memory
          + [38]vDSO in glibc
     * [39]glibc system call wrappers
     * [40]Interesting syscall related bugs
          + [41]CVE-2010-3301
          + [42]Android sysenter ABI breakage
     * [43]Conclusion
     * [44]Related Posts

What is a system call?

   When you run a program which calls open, fork, read, write (and many
   others) you are making a system call.

   System calls are how a program enters the kernel to perform some task.
   Programs use system calls to perform a variety of operations such as:
   creating processes, doing network and file IO, and much more.

   You can find a list of system calls by checking the [45]man page for
   syscalls(2).

   There are several different ways for user programs to make system calls
   and the low-level instructions for making a system call vary among CPU
   architectures.

   As an application developer, you don't typically need to think about
   how exactly a system call is made. You simply include the appropriate
   header file and make the call as if it were a normal function.

   glibc provides wrapper code which abstracts you away from the
   underlying code which arranges the arguments you've passed and enters
   the kernel.

   Before we can dive into the details of how system calls are made, we'll
   need to define some terms and examine some core ideas that will appear
   later.

Prerequisite information

Hardware and software

   This blog post makes the following assumptions:
     * You are using a 32-bit or 64-bit Intel or AMD CPU. The discussion
       about the methods may be useful for people using other systems, but
       the code samples below contain CPU-specific code.
     * You are interested in the Linux kernel, version 3.13.0. Other
       kernel versions will be similar, but the exact line numbers,
       organization of code, and file paths will vary. Links to the 3.13.0
       kernel source tree on GitHub are provided.
     * You are interested in glibc or glibc derived libc implementations
       (e.g., eglibc).

   x86-64 in this blog post will refer to 64bit Intel and AMD CPUs that
   are based on the x86 architecture.

User programs, the kernel, and CPU privilege levels

   User programs (like your editor, terminal, ssh daemon, etc) need to
   interact with the Linux kernel so that the kernel can perform a set of
   operations on behalf of your user programs that they can't perform
   themselves.

   For example, if a user program needs to do some sort of IO (open, read,
   write, etc) or modify its address space (mmap, sbrk, etc) it must
   trigger the kernel to run to complete those actions on its behalf.

   What prevents user programs from performing these actions themselves?

   It turns out that the x86-64 CPUs have a concept called [46]privilege
   levels. Privilege levels are a complex topic suitable for their own
   blog post. For the purposes of this post, we can (greatly) simplify the
   concept of privilege levels by saying:
    1. Privilege levels are a means of access control. The current
       privilege level determines which CPU instructions and IO may be
       performed.
    2. The kernel runs at the most privileged level, called "Ring 0". User
       programs run at a lesser level, typically "Ring 3".

   In order for a user program to perform some privileged operation, it
   must cause a privilege level change (from "Ring 3" to "Ring 0") so that
   the kernel can execute.

   There are several ways to cause a privilege level change and trigger
   the kernel to perform some action.

   Let's start with a common way to cause the kernel to execute:
   interrupts.

Interrupts

   You can think of an interrupt as an event that is generated (or
   "raised") by hardware or software.

   A hardware interrupt is raised by a hardware device to notify the
   kernel that a particular event has occurred. A common example of this
   type of interrupt is an interrupt generated when a NIC receives a
   packet.

   A software interrupt is raised by executing a piece of code. On x86-64
   systems, a software interrupt can be raised by executing the int
   instruction.

   Interrupts usually have numbers assigned to them. Some of these
   interrupt numbers have a special meaning.

   You can imagine an array that lives in memory on the CPU. Each entry in
   this array maps to an interrupt number. Each entry contains the address
   of a function that the CPU will begin executing when that interrupt is
   received along with some options, like what privilege level the
   interrupt handler function should be executed in.

   Here's a photo from the Intel CPU manual showing the layout of an entry
   in this array:

   Screenshot of Interrupt Descriptor Table entry diagram for x86_64 CPUs

   If you look closely at the diagram, you can see a 2-bit field labeled
   DPL (Descriptor Privilege Level). The value in this field determines
   the minimum privilege level the CPU will be in when the handler
   function is executed.

   This is how the CPU knows which address it should execute when a
   particular type of event is received and what privilege level the
   handler for that event should execute in.

   In practice, there are lots of different ways to deal with interrupts
   on x86-64 systems. If you are interested in learning more, read about
   the [47]8259 Programmable Interrupt Controller, [48]Advanced Interrupt
   Controllers, and [49]IO Advanced Interrupt Controllers.

   There are other complexities involved with dealing with both hardware
   and software interrupts, such as interrupt number collisions and
   remapping.

   We don't need to concern ourselves with these details for this
   discussion about system calls.

Model Specific Registers (MSRs)

   Model Specific Registers (also known as MSRs) are control registers
   that have a specific purpose to control certain features of the CPU.
   The CPU documentation lists the addresses of each of the MSRs.

   You can use the CPU instructions rdmsr and wrmsr to read and write
   MSRs, respectively.

   There are also command line tools which allow you to read and write
   MSRs, but doing this is not recommended as changing these values
   (especially while a system is running) is dangerous unless you are
   really careful.

   If you don't mind potentially destabilizing your system or irreversibly
   corrupting your data, you can read and write MSRs by installing
   msr-tools and loading the msr kernel module:
% sudo apt-get install msr-tools
% sudo modprobe msr
% sudo rdmsr

   Some of the system call methods we'll see later make use of MSRs.

Calling system calls with assembly is a bad idea

   It's not a great idea to call system calls by writing your own assembly
   code.

   One big reason for this is that some system calls have additional code
   that runs in glibc before or after the system call runs.

   In the examples below, we'll be using the exit system call. It turns
   out that you can register functions to run when exit is called by a
   program by using [50]atexit.

   Those functions are called from glibc, not the kernel. So, if you write
   your own assembly to call exit as we show below, your registered
   handler functions won't be executed since you are bypassing glibc.

   Nevertheless, manually making system calls with assembly is a good
   learning experience.

Legacy system calls

   Create a package repository in less than 10 seconds, free.
   [51](BUTTON) Sign up!

   Using our prerequisite knowledge we know two things:
    1. We know that we can trigger the kernel to execute by generating a
       software interrupt.
    2. We can generate a software interrupt with the int assembly
       instruction.

   Combining these two concepts leads us to the legacy system call
   interface on Linux.

   The Linux kernel sets aside a specific software interrupt number that
   can be used by user space programs to enter the kernel and execute a
   system call.

   The Linux kernel registers an interrupt handler named ia32_syscall for
   the interrupt number: 128 (0x80). Let's take a look at the code that
   actually does this.

   From the trap_init function in the kernel 3.13.0 source in
   [52]arch/x86/kernel/traps.c:
void __init trap_init(void)
{
        /* ..... other code ... */

        set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);

   Where IA32_SYSCALL_VECTOR is defined as 0x80 in
   [53]arch/x86/include/asm/irq_vectors.h.

   But, if the kernel reserves a single software interrupt that userland
   programs can raise to trigger the kernel, how does the kernel know
   which of the many system calls it should execute?

   The userland program is expected to put the system call number in the
   eax register. The arguments for the syscall itself are to be placed in
   the remaining general purpose registers.

   One place this is documented is in a comment in
   [54]arch/x86/ia32/ia32entry.S:
 * Emulated IA32 system calls via int 0x80.
 *
 * Arguments:
 * %eax System call number.
 * %ebx Arg1
 * %ecx Arg2
 * %edx Arg3
 * %esi Arg4
 * %edi Arg5
 * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
 *

   Now that we know how to make a system call and where the arguments
   should live, let's try to make one by writing some inline assembly.

Using legacy system calls with your own assembly

   To make a legacy system call, you can write a small bit of inline
   assembly. While this is interesting from a learning perspective, I
   encourage readers to never make system calls by crafting their own
   assembly.

   In this example, we'll try calling the exit system call, which takes a
   single argument: the exit status.

   First, we need to find the system call number for exit. The Linux
   kernel includes a file which lists each system call in a table. This
   file is processed by various scripts at build time to generate header
   files which can be used by user programs.

   Let's look at the table found in [55]arch/x86/syscalls/syscall_32.tbl:
1 i386  exit      sys_exit

   The exit syscall is number 1. According to the interface described
   above, we just need to move the syscall number into the eax register
   and the first argument (the exit status) into ebx.

   Here's a piece of C code with some inline assembly that does this.
   Let's set the exit status to "42":

   (This example can be simplified, but I thought it would be interesting
   to make it a bit more wordy than necessary so that anyone who hasn't
   seen GCC inline assembly before can use this as an example or
   reference.)
int
main(int argc, char *argv[])
{
  unsigned int syscall_nr = 1;
  int exit_status = 42;

  asm ("movl %0, %%eax\n"
             "movl %1, %%ebx\n"
       "int $0x80"
    : /* output parameters, we aren't outputting anything, no none */
      /* (none) */
    : /* input parameters mapped to %0 and %1, repsectively */
      "m" (syscall_nr), "m" (exit_status)
    : /* registers that we are "clobbering", unneeded since we are calling exit
*/
      "eax", "ebx");
}

   Next, compile, execute, and check the exit status:
$ gcc -o test test.c
$ ./test
$ echo $?
42

   Success! We called the exit system call using the legacy system call
   method by raising a software interrupt.

Kernel-side: int $0x80 entry point

   So now that we've seen how to trigger a system call from a userland
   program, let's see how the kernel uses the system call number to
   execute the system call code.

   Recall from the previous section that the kernel registered a syscall
   handler function called ia32_syscall.

   This function is implemented in assembly in
   [56]arch/x86/ia32/ia32entry.S and we can see several things happening
   in this function, the most important of which is the call to the actual
   syscall itself:
ia32_do_call:
        IA32_ARG_FIXUP
        call *ia32_sys_call_table(,%rax,8) # xxx: rip relative

   IA32_ARG_FIXUP is a macro which rearranges the legacy arguments so that
   they may be properly understood by the current system call layer.

   The ia32_sys_call_table identifier refers to a table which is defined
   in [57]arch/x86/ia32/syscall_ia32.c. Note the #include line toward the
   end of the code:
const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
        /*
         * Smells like a compiler bug -- it doesn't work
         * when the & below is removed.
         */
        [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
#include <asm/syscalls_32.h>
};

   Recall earlier we saw the syscall table defined in
   [58]arch/x86/syscalls/syscall_32.tbl.

   There are a few scripts which run at compile time which take this table
   and generate the syscalls_32.h file from it. The generated header file
   is comprised of valid C code, which is simply inserted with the
   #include shown above to fill in ia32_sys_call_table with function
   addresses indexed by system call number.

   And this is how you enter the kernel via a legacy system call.

Returning from a legacy system call with iret

   We've seen how to enter the kernel with a software interrupt, but how
   does the kernel return back to the user program and drop the privilege
   level after it has finished running?

   If we turn to the (warning: large PDF) [59]Intel Software Developer's
   Manual we can find a helpful diagram that illustrates how the program
   stack will be arranged when a privilege level change occurs.

   Let's take a look:

   Screenshot of the Stack Usage on Transfers to Interrupt and
   Exception-Handling Routines

   When execution is transferred to the kernel function ia32_syscall via
   the execution of a software interrupt from a user program, a privilege
   level change occurs. The result is that the stack when ia32_syscall is
   entered will look like the diagram above.

   This means that the return address, the CPU flags which encode the
   privilege level (and other stuff), and more are all saved on the
   program stack before ia32_syscall executes.

   So, in order to resume execution the kernel just needs to copy these
   values from the program stack back into the registers where they belong
   and execution will resume back in userland.

   OK, so how do you do that?

   There are a few ways to do that, but one of the easiest ways is to use
   the iret instruction.

   The Intel instruction set manual explains that the iret instruction
   pops the return address and saved register values from the stack in the
   order they were prepared:

     As with a real-address mode interrupt return, the IRET instruction
     pops the return instruction pointer, return code segment selector,
     and EFLAGS image from the stack to the EIP, CS, and EFLAGS
     registers, respectively, and then resumes execution of the
     interrupted program or procedure.

   Finding this code in the Linux kernel is a bit difficult as it is
   hidden beneath several macros and there is extensive care taken to deal
   with things like signals and ptrace system call exit tracking.

   Eventually all the macros in the assembly stubs in the kernel reveal
   the iret which returns from a system call back to a user program.

   From irq_return in [60]arch/x86/kernel/entry_64.S:
irq_return:
  INTERRUPT_RETURN

   Where INTERRUPT_RETURN is defined in
   [61]arch/x86/include/asm/irqflags.h as iretq.

   And now you know how legacy system calls work.

Fast system calls

   The legacy method seems pretty reasonable, but there are newer ways to
   trigger a system call which don't involve a software interrupt and are
   [62]much faster than using a software interrupt.

   Each of the two faster methods is comprised of two instructions. One to
   enter the kernel and one to leave. Both methods are described in the
   Intel CPU documentation as "Fast System Call".

   Unfortunately, Intel and AMD implementations have some disagreement on
   which method is valid when a CPU is in 32bit or 64bit mode.

   In order to maximize compatibility across both Intel and AMD CPUs:
     * On 32bit systems use: sysenter and sysexit.
     * On 64bit systems use: syscall and sysret.

   Create a package repository in less than 10 seconds, free.
   [63](BUTTON) Sign up!

32-bit fast system calls

sysenter/sysexit

   Using sysenter to make a system call is more complicated than using the
   legacy interrupt method and involves more coordination between the user
   program (via glibc) and the kernel.

   Let's take it one step at a time and sort out the details. First, let's
   see what the documentation in the Intel Instruction Set Reference
   (warning: very large [64]PDF) says about sysenter and how to use it.

   Let's take a look:

     Prior to executing the SYSENTER instruction, software must specify
     the privilege level 0 code segment and code entry point, and the
     privilege level 0 stack segment and stack pointer by writing values
     to the following MSRs:

     o IA32_SYSENTER_CS (MSR address 174H) -- The lower 16 bits of this
     MSR are the segment selector for the privilege level 0 code segment.
     This value is also used to determine the segment selector of the
     privilege level 0 stack segment (see the Operation section). This
     value cannot indicate a null selector.

     o IA32_SYSENTER_EIP (MSR address 176H) -- The value of this MSR is
     loaded into RIP (thus, this value references the first instruction
     of the selected operating procedure or routine). In protected mode,
     only bits 31:0 are loaded.

     o IA32_SYSENTER_ESP (MSR address 175H) -- The value of this MSR is
     loaded into RSP (thus, this value contains the stack pointer for the
     privilege level 0 stack). This value cannot represent a
     non-canonical address. In protected mode, only bits 31:0 are loaded.

   In other words: in order for the kernel to receive incoming system
   calls with sysenter, the kernel must set 3 Model Specific Registers
   (MSRs). The most interesting MSR in our case is IA32_SYSENTER_EIP
   (which has the address 0x176). This MSR is where the kernel should
   specify the address of the function that will execute when a sysenter
   instruction is executed by a user program.

   We can find the code in the Linux kernel which writes to the MSR in
   [65]arch/x86/vdso/vdso32-setup.c:
void enable_sep_cpu(void)
{
        /* ... other code ... */

        wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);

   Where MSR_IA32_SYSENTER_EIP is defined as 0x00000176 in
   [66]arch/x86/include/uapi/asm/msr-index.h.

   Much like the legacy software interrupt syscalls, there is a defined
   convention for making system calls with sysenter.

   One place this is documented is in a comment in
   [67]arch/x86/ia32/ia32entry.S:
 * 32bit SYSENTER instruction entry.
 *
 * Arguments:
 * %eax System call number.
 * %ebx Arg1
 * %ecx Arg2
 * %edx Arg3
 * %esi Arg4
 * %edi Arg5
 * %ebp user stack
 * 0(%ebp) Arg6

   Recall that the legacy system call method includes a mechanism for
   returning back to the userland program which was interrupted: the iret
   instruction.

   Capturing the logic needed to make sysenter work properly is
   complicated because unlike software interrupts, sysenter does not store
   the return address.

   How, exactly, the kernel does this and other bookkeeping prior to
   executing a sysenter instruction can change over time (and it has
   changed, as you will see in the Bugs section below).

   In order to protect against future changes, user programs are intended
   to use a function called __kernel_vsyscall which is implemented in the
   kernel, but mapped into each user process when the process is started.

   This is a bit odd; it's code that comes with the kernel, but runs in
   userland.

   It turns out that __kernel_vsyscall is part of something called a
   virtual Dynamic Shared Object (vDSO) which exists to allow programs to
   execute kernel code in userland.

   We'll examine what the vDSO is, what it does, and how it works in depth
   later.

   For now, let's examine the __kernel_vsyscall internals.

__kernel_vsyscall internals

   The __kernel_vsyscall function that encapsulates the sysenter calling
   convention can be found in [68]arch/x86/vdso/vdso32/sysenter.S:
__kernel_vsyscall:
.LSTART_vsyscall:
        push %ecx
.Lpush_ecx:
        push %edx
.Lpush_edx:
        push %ebp
.Lenter_kernel:
        movl %esp,%ebp
        sysenter

   Since __kernel_vsyscall is part of a Dynamic Shared Object (also known
   as a shared library), how does a user program locate the address of
   that function at runtime?

   The address of the __kernel_vsyscall function is written into an
   [69]ELF auxiliary vector where a user program or library (typically
   glibc) can find it and use it.

   There are a few methods for searching ELF auxiliary vectors:
    1. By using [70]getauxval with the AT_SYSINFO argument.
    2. By iterating to the end of the environment variables and parsing
       them from memory.

   Option 1 is the simplest option, but does not exist on glibc prior to
   2.16. The example code shown below illustrates option 2.

   As we can see in the code above, __kernel_vsyscall does some
   bookkeeping before executing sysenter.

   So, all we need to do to manually enter the kernel with sysenter is:
     * Search the ELF auxiliary vectors for AT_SYSINFO where the address
       of __kernel_vsyscall is written.
     * Put the system call number and arguments into the registers as we
       would normally for legacy system calls
     * Call the __kernel_vsyscall function

   You should absolutely never write your own sysenter wrapper function as
   the convention the kernel uses to enter and leave system calls with
   sysenter can change and your code will break.

   You should always start a sysenter system call by calling through
   __kernel_vsyscall.

   So, let's do that.

Using sysenter system calls with your own assembly

   Keeping with our legacy system call example from earlier, we'll call
   exit with an exit status of 42.

   The exit syscall is number 1. According to the interface described
   above, we just need to move the syscall number into the eax register
   and the first argument (the exit status) into ebx.

   (This example can be simplified, but I thought it would be interesting
   to make it a bit more wordy than necessary so that anyone who hasn't
   seen GCC inline assembly before can use this as an example or
   reference.)
#include <stdlib.h>
#include <elf.h>

int
main(int argc, char* argv[], char* envp[])
{
  unsigned int syscall_nr = 1;
  int exit_status = 42;
  Elf32_auxv_t *auxv;

  /* auxilliary vectors are located after the end of the environment
   * variables
   *
   * check this helpful diagram: https://static.lwn.net/images/2012/auxvec.png
   */
  while(*envp++ != NULL);

  /* envp is now pointed at the auxilliary vectors, since we've iterated
   * through the environment variables.
   */
  for (auxv = (Elf32_auxv_t *)envp; auxv->a_type != AT_NULL; auxv++)
  {
    if( auxv->a_type == AT_SYSINFO) {
      break;
    }
  }

  /* NOTE: in glibc 2.16 and higher you can replace the above code with
   * a call to getauxval(3):  getauxval(AT_SYSINFO)
   */

  asm(
      "movl %0,  %%eax    \n"
      "movl %1, %%ebx    \n"
      "call *%2          \n"
      : /* output parameters, we aren't outputting anything, no none */
        /* (none) */
      : /* input parameters mapped to %0 and %1, repsectively */
        "m" (syscall_nr), "m" (exit_status), "m" (auxv->a_un.a_val)
      : /* registers that we are "clobbering", unneeded since we are calling exi
t */
        "eax", "ebx");
}

   Next, compile, execute, and check the exit status:
$ gcc -m32 -o test test.c
$ ./test
$ echo $?
42

   Success! We called the exit system call using the sysenter fast system
   call method without raising a software interrupt.

Kernel-side: sysenter entry point

   So now that we've seen how to trigger a system call from a userland
   program with sysenter via __kernel_vsyscall, let's see how the kernel
   uses the system call number to execute the system call code.

   Recall from the previous section that the kernel registered a syscall
   handler function called ia32_sysenter_target.

   This function is implemented in assembly in
   [71]arch/x86/ia32/ia32entry.S. Let's take a look at where the value in
   the eax register is used to execute the system call:
sysenter_dispatch:
        call    *ia32_sys_call_table(,%rax,8)

   This is identical to the code we saw in the legacy system call mode: a
   table named ia32_sys_call_table which is indexed into with the system
   call number.

   After all the needed bookkeeping is done both the legacy system call
   model and the sysenter system call model use the same mechanism and
   system call table for dispatching system calls.

   Refer to the [72]int $0x80 entry point section to learn where the
   ia32_sys_call_table is defined and how it is constructed.

   And this is how you enter the kernel via a sysenter system call.

Returning from a sysenter system call with sysexit

   The kernel can use the sysexit instruction to resume execution back to
   the user program.

   Using this instruction is not as straightforward as using iret. The
   caller is expected to put the address to return to into the rdx
   register, and to put the pointer to the program stack to use in the rcx
   register.

   This means that your software must compute the address where execution
   should be resumed, preserve that value, and restore it prior to calling
   sysexit.

   We can find the code which does this in: [73]arch/x86/ia32/ia32entry.S:
sysexit_from_sys_call:
        andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        /* clear IF, that popfq doesn't enable interrupts early */
        andl  $~0x200,EFLAGS-R11(%rsp)
        movl    RIP-R11(%rsp),%edx              /* User %eip */
        CFI_REGISTER rip,rdx
        RESTORE_ARGS 0,24,0,0,0,0
        xorq    %r8,%r8
        xorq    %r9,%r9
        xorq    %r10,%r10
        xorq    %r11,%r11
        popfq_cfi
        /*CFI_RESTORE rflags*/
        popq_cfi %rcx                           /* User %esp */
        CFI_REGISTER rsp,rcx
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS_SYSEXIT32

   ENABLE_INTERRUPTS_SYSEXIT32 is a macro which is defined in
   [74]arch/x86/include/asm/irqflags.h which contains the sysexit
   instruction.

   And now you know how 32-bit fast system calls work.

64-bit fast system calls

   Next up on our journey are 64-bit fast system calls. These system calls
   use the instructions syscall and sysret to enter and return from a
   system call, respectively.

syscall/sysret

   Create a package repository in less than 10 seconds, free.
   [75](BUTTON) Sign up!

   The documentation in the Intel Instruction Set Reference (very large
   [76]PDF) explains how the syscall instruction works:

     SYSCALL invokes an OS system-call handler at privilege level 0. It
     does so by loading RIP from the IA32_LSTAR MSR (after saving the
     address of the instruction following SYSCALL into RCX).

   In other words: for the kernel to receive incoming system calls, it
   must register the address of the code that will execute when a system
   call occurs by writing its address to the IA32_LSTAR MSR.

   We can find that code in the kernel in
   [77]arch/x86/kernel/cpu/common.c:
void syscall_init(void)
{
        /* ... other code ... */
        wrmsrl(MSR_LSTAR, system_call);

   Where MSR_LSTAR is defined as 0xc0000082 in
   [78]arch/x86/include/uapi/asm/msr-index.h.

   Much like the legacy software interrupt syscalls, there is a defined
   convention for making system calls with syscall.

   The userland program is expected to put the system call number in
   the rax register. The arguments to the syscall are expected to be
   placed in a subset of the general purpose registers.

   This is documented in the [79]x86-64 ABI in section A.2.1:

    1. User-level applications use as integer registers for passing the
       sequence %rdi, %rsi, %rdx, %rcx, %r8 and %r9. The kernel interface
       uses %rdi, %rsi, %rdx, %r10, %r8 and %r9.
    2. A system-call is done via the syscall instruction. The kernel
       destroys registers %rcx and %r11.
    3. The number of the syscall has to be passed in register %rax.
    4. System-calls are limited to six arguments, no argument is passed
       directly on the stack.
    5. Returning from the syscall, register %rax contains the result of
       the system-call. A value in the range between -4095 and -1
       indicates an error, it is -errno.
    6. Only values of class INTEGER or class MEMORY are passed to the
       kernel.

   This is also documented in a comment in [80]arch/x86/kernel/entry_64.S.

   Now that we know how to make a system call and where the arguments
   should live, let's try to make one by writing some inline assembly.

Using syscall system calls with your own assembly

   Building on the previous example, let's build a small C program with
   inline assembly which executes the exit system call passing the exit
   status of 42.

   First, we need to find the system call number for exit. In this case we
   need to read the table found in [81]arch/x86/syscalls/syscall_64.tbl:
60      common  exit                    sys_exit

   The exit syscall is number 60. According to the interface described
   above, we just need to move 60 into the rax register and the first
   argument (the exit status) into rdi.

   Here's a piece of C code with some inline assembly that does this. Like
   the previous example, this example is more wordy than necessary in the
   interest of clarity:
int
main(int argc, char *argv[])
{
  unsigned long syscall_nr = 60;
  long exit_status = 42;

  asm ("movq %0, %%rax\n"
       "movq %1, %%rdi\n"
       "syscall"
    : /* output parameters, we aren't outputting anything, no none */
      /* (none) */
    : /* input parameters mapped to %0 and %1, repsectively */
      "m" (syscall_nr), "m" (exit_status)
    : /* registers that we are "clobbering", unneeded since we are calling exit
*/
      "rax", "rdi");
}

   Next, compile, execute, and check the exit status:
$ gcc -o test test.c
$ ./test
$ echo $?
42

   Success! We called the exit system call using the syscall system call
   method. We avoided raising a software interrupt and (if we were timing
   a micro-benchmark) it executes much faster.

Kernel-side: syscall entry point

   Now that we've seen how to trigger a system call from a userland
   program, let's see how the kernel uses the system call number to
   execute the system call code.

   Recall from the previous section we saw the address of a function named
   system_call get written to the LSTAR MSR.

   Let's take a look at the code for this function and see how it uses rax
   to actually hand off execution to the system call, from
   [82]arch/x86/kernel/entry_64.S:
        call *sys_call_table(,%rax,8)  # XXX:    rip relative

   Much like the legacy system call method, sys_call_table is a table
   defined in a C file that uses #include to pull in C code generated by a
   script.

   From [83]arch/x86/kernel/syscall_64.c, note the #include at the bottom:
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
        /*
         * Smells like a compiler bug -- it doesn't work
         * when the & below is removed.
         */
        [0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};

   Earlier we saw the syscall table defined in
   [84]arch/x86/syscalls/syscall_64.tbl. Exactly like the legacy interrupt
   mode, a script runs at kernel compile time and generates the
   syscalls_64.h file from the table in syscall_64.tbl.

   The code above simply includes the generated C code producing an array
   of function pointers indexed by system call number.

   And this is how you enter the kernel via a syscall system call.

Returning from a syscall system call with sysret

   The kernel can use the sysret instruction to resume execution back to
   where execution left off when the user program used syscall.

   sysret is simpler than sysexit because the address where execution
   should be resumed is copied into the rcx register when syscall is used.

   As long as you preserve that value somewhere and restore it to rcx
   before calling sysret, execution will resume where it left off before
   the call to syscall.

   This is convenient because sysenter requires that you compute this
   address yourself in addition to clobbering an additional register.

   We can find the code which does this in [85]arch/x86/kernel/entry_64.S:
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER    rip,rcx
RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER  rflags,r11*/
movq    PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64

   USERGS_SYSRET64 is a macro which is defined in
   [86]arch/x86/include/asm/irqflags.h which contains the sysret
   instruction.

   And now you know how 64-bit fast system calls work.

Calling a syscall semi-manually with syscall(2)

   Great, we've seen how to call system calls manually by crafting
   assembly for a few different system call methods.

   Usually, you don't need to write your own assembly. Wrapper functions
   are provided by glibc that handle all of the assembly code for you.

   There are some system calls, however, for which no glibc wrapper
   exists. One example of a system call like this is futex, the fast
   userspace locking system call.

   But, wait, why does [87]no system call wrapper exist for futex?

   futex is intended only to be called by libraries, not application code,
   and thus in order to call futex you must do it by:
    1. Generating assembly stubs for every platform you want to support
    2. Using the syscall wrapper provided by glibc

   If you find yourself in the situation of needing to call a system call
   for which no wrapper exists, you should definitely choose option 2: use
   the function syscall from glibc.

   Let's use syscall from glibc to call exit with exit status of 42:
#include <unistd.h>

int
main(int argc, char *argv[])
{
  unsigned long syscall_nr = 60;
  long exit_status = 42;

  syscall(syscall_nr, exit_status);
}

   Next, compile, execute, and check the exit status:
$ gcc -o test test.c
$ ./test
$ echo $?
42

   Success! We called the exit system call using the syscall wrapper from
   glibc.

glibc syscall wrapper internals

   Create a package repository in less than 10 seconds, free.
   [88](BUTTON) Sign up!

   Let's take a look at the syscall wrapper function we used in the
   previous example to see how it works in glibc.

   From [89]sysdeps/unix/sysv/linux/x86_64/syscall.S:
/* Usage: long syscall (syscall_number, arg1, arg2, arg3, arg4, arg5, arg6)
   We need to do some arg shifting, the syscall_number will be in
   rax.  */


        .text
ENTRY (syscall)
        movq %rdi, %rax         /* Syscall number -> rax.  */
        movq %rsi, %rdi         /* shift arg1 - arg5.  */
        movq %rdx, %rsi
        movq %rcx, %rdx
        movq %r8, %r10
        movq %r9, %r8
        movq 8(%rsp),%r9        /* arg6 is on the stack.  */
        syscall                 /* Do the system call.  */
        cmpq $-4095, %rax       /* Check %rax for error.  */
        jae SYSCALL_ERROR_LABEL /* Jump to error handler if error.  */
L(pseudo_end):
        ret                     /* Return to caller.  */

   Earlier we showed an excerpt from the x86_64 ABI document that
   describes both userland and kernel calling conventions.

   This assembly stub is cool because it shows both calling conventions.
   The arguments passed into this function follow the userland calling
   convention, but are then moved to a different set of registers to obey
   the kernel calling convention prior to entering the kernel with
   syscall.

   This is how the glibc syscall wrapper works when you use it to call
   system calls that do not come with a wrapper by default.

Virtual system calls

   We've now covered all the methods of making a system call by entering
   the kernel and shown how you can make those calls manually (or
   semi-manually) to transition the system from userland to the kernel.

   What if programs could call certain system calls without entering the
   kernel at all?

   That's precisely why the Linux virtual Dynamic Shared Object (vDSO)
   exists. The Linux vDSO is a set of code that is part of the kernel, but
   is mapped into the address space of a user program to be run in
   userland.

   The idea is that some system calls can be used without entering the
   kernel. One such call is: gettimeofday.

   Programs calling the gettimeofday system call do not actually enter the
   kernel. They instead make a simple function call to a piece of code
   that was provided by the kernel, but is run in userland.

   No software interrupt is raised, no complicated sysenter or syscall
   bookkeeping is required. gettimeofday is just a normal function call.

   You can see the vDSO listed as the first entry when you use ldd:
$ ldd `which bash`
  linux-vdso.so.1 =>  (0x00007fff667ff000)
  libtinfo.so.5 => /lib/x86_64-linux-gnu/libtinfo.so.5 (0x00007f623df7d000)
  libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f623dd79000)
  libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f623d9ba000)
  /lib64/ld-linux-x86-64.so.2 (0x00007f623e1ae000)

   Let's see how the vDSO is set up in the kernel.

   Create a package repository in less than 10 seconds, free.
   [90](BUTTON) Sign up!

vDSO in the kernel

   You can find the vDSO source in [91]arch/x86/vdso/. There are a few
   assembly and C source files along with a linker script.

   The [92]linker script is a cool thing to take a look at.

   From [93]arch/x86/vdso/vdso.lds.S:
/*
 * This controls what userland symbols we export from the vDSO.
 */
VERSION {
        LINUX_2.6 {
        global:
                clock_gettime;
                __vdso_clock_gettime;
                gettimeofday;
                __vdso_gettimeofday;
                getcpu;
                __vdso_getcpu;
                time;
                __vdso_time;
        local: *;
        };
}

   Linker scripts are pretty useful, but not particularly well known.
   This linker script arranges the symbols that are going to be exported
   in the vDSO.

   We can see that vDSO exports 4 different functions, each with two
   names. You can find the source for these functions in the C files in
   this directory.

   For example, the source for gettimeofday is found in
   [94]arch/x86/vdso/vclock_gettime.c:
int gettimeofday(struct timeval *, struct timezone *)
        __attribute__((weak, alias("__vdso_gettimeofday")));

   This is defining gettimeofday to be a [95]weak alias for
   __vdso_gettimeofday.

   The __vdso_gettimeofday function [96]in the same file contains the
   actual source which will be executed in user land when a user program
   calls the gettimeofday system call.

Locating the vDSO in memory

   Due to [97]address space layout randomization the vDSO will be loaded
   at a random address when a program is started.

   How can user programs find the vDSO if it's loaded at a random address?

   If you recall earlier when examining the sysenter system call method we
   saw that user programs should call __kernel_vsyscall instead of writing
   their own sysenter assembly code themselves.

   This function is part of the vDSO, as well.

   The sample code provided located __kernel_vsyscall by searching the
   [98]ELF auxiliary headers to find a header with type AT_SYSINFO which
   contained the address of __kernel_vsyscall.

   Similarly, to locate the vDSO, a user program can search for an ELF
   auxiliary header of type AT_SYSINFO_EHDR. It will contain the address
   of the start of the ELF header for the vDSO that was generated by a
   linker script.

   In both cases, the kernel writes the address into the ELF auxiliary
   header when the program is loaded. That's how the correct addresses
   always end up in AT_SYSINFO_EHDR and AT_SYSINFO.

   Once that header is located, user programs can parse the ELF object
   (perhaps using [99]libelf) and call the functions in the ELF object as
   needed.

   This is nice because this means that the vDSO can take advantage of
   some useful ELF features like [100]symbol versioning.

   An example of parsing and calling functions in the vDSO is provided in
   the kernel documentation in [101]Documentation/vDSO/.

vDSO in glibc

   Most of the time, people access the vDSO without knowing it because
   glibc abstracts this away from them by using the interface described in
   the previous section.

   When a program is loaded, the [102]dynamic linker and loader loads the
   DSOs that the program depends on, including the vDSO.

   glibc stores some data about the location of the vDSO when it parses
   the ELF headers of the program that is being loaded. It also includes
   short stub functions that will search the vDSO for a symbol name prior
   to making an actual system call.

   For example, the gettimeofday function in glibc, from
   [103]sysdeps/unix/sysv/linux/x86_64/gettimeofday.c:
void *gettimeofday_ifunc (void) __asm__ ("__gettimeofday");

void *
gettimeofday_ifunc (void)
{
  PREPARE_VERSION (linux26, "LINUX_2.6", 61765110);

  /* If the vDSO is not available we fall back on the old vsyscall.  */
  return (_dl_vdso_vsym ("gettimeofday", &linux26)
          ?: (void *) VSYSCALL_ADDR_vgettimeofday);
}
__asm (".type __gettimeofday, %gnu_indirect_function");

   This code in glibc searches the vDSO for the gettimeofday function and
   returns the address. This is wrapped up nicely with an [104]indirect
   function.

   That's how programs calling gettimeofday pass through glibc and hit the
   vDSO all without switching into kernel mode, incurring a privilege
   level change, or raising a software interrupt.

   And, that concludes the showcase of every single system call method
   available on Linux for 32-bit and 64-bit Intel and AMD CPUs.

glibc system call wrappers

   While we're talking about system calls ;) it makes sense to briefly
   mention how glibc deals with system calls.

   For many system calls, glibc simply needs a wrapper function where it
   moves arguments into the proper registers and then executes the syscall
   or int $0x80 instructions, or calls __kernel_vsyscall.

   It does this by using a series of tables defined in text files, which
   are processed by scripts that output C code.

   For example, the [105]sysdeps/unix/syscalls.list file describes some
   common system calls:
access          -       access          i:si    __access        access
acct            -       acct            i:S     acct
chdir           -       chdir           i:s     __chdir         chdir
chmod           -       chmod           i:si    __chmod         chmod

   To learn more about each column, check the comments in the script which
   processes this file: [106]sysdeps/unix/make-syscalls.sh.

   More complex system calls, like exit, which invokes handlers, have
   actual implementations in C or assembly code and will not be found in
   a templated text file like this.

   Future blog posts will explore the implementation in glibc and the
   Linux kernel for interesting system calls.

Interesting syscall related bugs

   Create a package repository in less than 10 seconds, free.
   [107](BUTTON) Sign up!

   It would be unfortunate not to take this opportunity to mention two
   fabulous bugs related to system calls in Linux.

   So, let's take a look!

CVE-2010-3301

   [108]This security exploit allows local users to gain root access.

   The cause is a small bug in the assembly code which allows user
   programs to make legacy system calls on x86-64 systems.

   The exploit code is pretty clever: it generates a region of memory with
   mmap at a particular address and uses an integer overflow to cause this
   code:

   (Remember this code from the legacy interrupts section above?)
call *ia32_sys_call_table(,%rax,8)

   to hand execution off to an arbitrary address which runs as kernel code
   and can escalate the running process to root.

Android sysenter ABI breakage

   Remember the part about not hardcoding the sysenter ABI in your
   application code?

   Unfortunately, the android-x86 folks made this mistake. The kernel ABI
   changed and suddenly android-x86 stopped working.

   The kernel folks ended up restoring the old sysenter ABI to avoid
   breaking the Android devices in the wild with stale hardcoded sysenter
   sequences.

   [109]Here's the fix that was added to the Linux kernel. You can find a
   link to the offending commit in the android source in the commit
   message.

   Remember: never write your own sysenter assembly code. If you have to
   implement it directly for some reason, use a piece of code like the
   example above and go through __kernel_vsyscall at the very least.

Conclusion

   The system call infrastructure in the Linux kernel is incredibly
   complex. There are many different methods for making system calls,
   each with its own advantages and disadvantages.

   Calling system calls by crafting your own assembly is generally a bad
   idea as the ABI may break underneath you. Your kernel and libc
   implementation will (probably) choose the fastest method for making
   system calls on your system.

   If you can't use the glibc-provided wrappers (or if one doesn't exist),
   you should at the very least use the syscall wrapper function, or try
   to go through the vDSO-provided __kernel_vsyscall.

   Stay tuned for future blog posts investigating individual system calls
   and their implementations.

Related Posts

   If you enjoyed this post, you may also enjoy other low level technical
   posts such as:
     * [110]How does strace work?
     * [111]How does ltrace work?
     * [112]APT Hash sum mismatch
     * [113]HOWTO: GPG sign and verify deb packages and APT repositories
     * [114]HOWTO: GPG sign and verify RPM packages and yum repositories

Share this post:

   (BUTTON) close

Never miss an update!

   ____________________ Sign up!
   ____________________
   [115]Subscribe to our RSS feed Already signed up? [ ]

     * Features
     * [116]Travis CI
     * [117]Jenkins
     * [118]Buildkite
     * [119]Public Package Repository
     * [120]Private Package Repository
     * [121]GPG Signatures

     * Info
     * [122]Pricing
     * [123]Private NPM registry
     * [124]Private DEB repository
     * [125]Private RPM repository
     * [126]Private RubyGem server
     * [127]Private PyPI server
     * [128]Private Maven repository

     * HOWTOs
     * [129]NPM/NodeJS HOWTO
     * [130]Maven HOWTO
     * [131]Java HOWTO
     * [132]Debian HOWTO
     * [133]RPM HOWTO
     * [134]RubyGem HOWTO
     * [135]Python HOWTO
     * [136]Linux HOWTO

     * Guides
     * [137]Maven Guide
     * [138]Debian Guide
     * [139]RPM Guide
     * [140]RubyGem Guide
     * [141]Python Guide
     * [142]Linux Guide

     * Docs
     * [143]General Docs
     * [144]API Docs
     * [145]Command Line Interface

     * Community
     * [146]Blog
     * [147]Slack
     * [148]Status
     * [149]Contact

     * Legal
     * [150]Terms of Service
     * [151]Privacy Policy

   We use cookies to enhance the user experience on packagecloud.
   By using our site, you acknowledge that you have read and understand
   our
   [152]Cookie Policy, [153]Privacy Policy, and our [154]Terms of Service.

References

   Visible links:
   1. http://0.0.0.0:4000/feed.xml
   2. https://packagecloud.io?utm_campaign=cmkt&utm_medium=header&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls
   3. https://blog.packagecloud.io/
   4. http://feeds.feedburner.com/PackagecloudBlog
   5. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/
   6. https://blog.packagecloud.io/
   7. https://blog.packagecloud.io/tag/linux
   8. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
   9. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#tldr
  10. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#what-is-a-system-call
  11. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#prerequisite-information
  12. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#hardware-and-software
  13. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#user-programs-the-kernel-and-cpu-privilege-levels
  14. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#interrupts
  15. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#model-specific-registers-msrs
  16. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#calling-system-calls-with-assembly-is-a-bad-idea
  17. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#legacy-system-calls
  18. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#using-legacy-system-calls-with-your-own-assembly
  19. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#kernel-side-int-0x80-entry-point
  20. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#returning-from-a-legacy-system-call-with-iret
  21. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#fast-system-calls
  22. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#32-bit-fast-system-calls
  23. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#sysentersysexit
  24. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#__kernel_vsyscall-internals
  25. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#using-sysenter-system-calls-with-your-own-assembly
  26. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#kernel-side-sysenter-entry-point
  27. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#returning-from-a-sysenter-system-call-with-sysexit
  28. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#64-bit-fast-system-calls
  29. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#syscallsysret
  30. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#using-syscall-system-calls-with-your-own-assembly
  31. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#kernel-side-syscall-entry-point
  32. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#returning-from-a-syscall-system-call-with-sysret
  33. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#calling-a-syscall-semi-manually-with-syscall2
  34. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#glibc-syscall-wrapper-internals
  35. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#virtual-system-calls
  36. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#vdso-in-the-kernel
  37. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#locating-the-vdso-in-memory
  38. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#vdso-in-glibc
  39. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#glibc-system-call-wrappers
  40. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#interesting-syscall-related-bugs
  41. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#cve-2010-3301
  42. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#android-sysenter-abi-breakage
  43. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#conclusion
  44. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#related-posts
  45. http://man7.org/linux/man-pages/man2/syscalls.2.html
  46. https://en.wikipedia.org/wiki/Privilege_level
  47. http://wiki.osdev.org/8259_PIC
  48. http://wiki.osdev.org/APIC
  49. http://wiki.osdev.org/IOAPIC
  50. http://man7.org/linux/man-pages/man3/atexit.3.html
  51. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
  52. https://github.com/torvalds/linux/blob/v3.13/arch/x86/kernel/traps.c#L770
  53. https://github.com/torvalds/linux/blob/v3.13/arch/x86/kernel/traps.c#L770
  54. https://github.com/torvalds/linux/blob/v3.13/arch/x86/ia32/ia32entry.S#L378-L397
  55. https://github.com/torvalds/linux/blob/v3.13/arch/x86/syscalls/syscall_32.tbl
  56. https://github.com/torvalds/linux/blob/v3.13/arch/x86/ia32/ia32entry.S#L426
  57. https://github.com/torvalds/linux/blob/v3.13/arch/x86/ia32/syscall_ia32.c#L18-L25
  58. https://github.com/torvalds/linux/blob/v3.13/arch/x86/syscalls/syscall_32.tbl
  59. ftp://download.intel.com/design/processor/manuals/253668.pdf
  60. https://github.com/torvalds/linux/blob/v3.13/arch/x86/kernel/entry_64.S#L1042-L1043
  61. https://github.com/torvalds/linux/blob/v3.13/arch/x86/include/asm/irqflags.h#L132
  62. https://lkml.org/lkml/2002/12/9/13
  63. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
  64. http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2b-manual.pdf
  65. https://github.com/torvalds/linux/blob/v3.13/arch/x86/vdso/vdso32-setup.c#L240
  66. https://github.com/torvalds/linux/blob/v3.13/arch/x86/include/uapi/asm/msr-index.h#L54
  67. https://github.com/torvalds/linux/blob/v3.13/arch/x86/ia32/ia32entry.S#L99-L117
  68. https://github.com/torvalds/linux/blob/v3.13/arch/x86/vdso/vdso32/sysenter.S#L31-L40
  69. https://www.gnu.org/software/libc/manual/html_node/Auxiliary-Vector.html
  70. http://man7.org/linux/man-pages/man3/getauxval.3.html
  71. https://github.com/torvalds/linux/blob/v3.13/arch/x86/ia32/ia32entry.S#L162-L163
  72. https://blog.packagecloud.io/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/#kernel-side-int-0x80-entry-point
  73. https://github.com/torvalds/linux/blob/v3.13/arch/x86/ia32/ia32entry.S#L169-L185
  74. https://github.com/torvalds/linux/blob/v3.13/arch/x86/include/asm/irqflags.h#L139-L143
  75. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
  76. http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2b-manual.pdf
  77. https://github.com/torvalds/linux/blob/v3.13/arch/x86/kernel/cpu/common.c#L1128
  78. https://github.com/torvalds/linux/blob/v3.13/arch/x86/include/uapi/asm/msr-index.h#L9
  79. http://www.x86-64.org/documentation/abi.pdf
  80. https://github.com/torvalds/linux/blob/v3.13/arch/x86/kernel/entry_64.S#L569-L591
  81. https://github.com/torvalds/linux/blob/v3.13/arch/x86/syscalls/syscall_64.tbl#L69
  82. https://github.com/torvalds/linux/blob/v3.13/arch/x86/kernel/entry_64.S#L629
  83. https://github.com/torvalds/linux/blob/v3.13/arch/x86/kernel/syscall_64.c#L25-L32
  84. https://github.com/torvalds/linux/blob/v3.13/arch/x86/syscalls/syscall_64.tbl
  85. https://github.com/torvalds/linux/blob/v3.13/arch/x86/kernel/entry_64.S#L650-L655
  86. https://github.com/torvalds/linux/blob/v3.13/arch/x86/include/asm/irqflags.h#L133-L135
  87. http://man7.org/linux/man-pages/man7/futex.7.html#NOTES
  88. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
  89. https://github.molgen.mpg.de/git-mirror/glibc/blob/glibc-2.15/sysdeps/unix/sysv/linux/x86_64/syscall.S#L24-L42
  90. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
  91. https://github.com/torvalds/linux/tree/v3.13/arch/x86/vdso
  92. https://sourceware.org/binutils/docs/ld/Scripts.html
  93. https://github.com/torvalds/linux/blob/v3.13/arch/x86/vdso/vdso.lds.S
  94. https://github.com/torvalds/linux/blob/v3.13/arch/x86/vdso/vclock_gettime.c#L281-L282
  95. https://gcc.gnu.org/onlinedocs/gcc-4.3.5/gcc/Function-Attributes.html
  96. https://github.com/torvalds/linux/blob/v3.13/arch/x86/vdso/vclock_gettime.c#L260-L280
  97. https://en.wikipedia.org/wiki/Address_space_layout_randomization
  98. https://www.gnu.org/software/libc/manual/html_node/Auxiliary-Vector.html
  99. http://www.mr511.de/software/english.html
 100. https://www.akkadia.org/drepper/symbol-versioning
 101. https://github.com/torvalds/linux/tree/v3.13/Documentation/vDSO
 102. http://man7.org/linux/man-pages/man8/ld.so.8.html
 103. https://github.molgen.mpg.de/git-mirror/glibc/blob/glibc-2.15/sysdeps/unix/sysv/linux/x86_64/gettimeofday.c#L26-L37
 104. http://willnewton.name/uncategorized/using-gnu-indirect-functions/
 105. https://github.molgen.mpg.de/git-mirror/glibc/blob/glibc-2.15/sysdeps/unix/syscalls.list
 106. https://github.molgen.mpg.de/git-mirror/glibc/blob/glibc-2.15/sysdeps/unix/make-syscalls.sh
 107. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
 108. http://cve.mitre.org/cgi-bin/cvename.cgi?name=2010-3301
 109. http://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/commit/?id=30bfa7b3488bfb1bb75c9f50a5fcac1832970c60
 110. https://blog.packagecloud.io/eng/2016/02/29/how-does-strace-work/
 111. https://blog.packagecloud.io/eng/2016/03/14/how-does-ltrace-work/
 112. https://blog.packagecloud.io/eng/2016/03/21/apt-hash-sum-mismatch/
 113. https://blog.packagecloud.io/eng/2014/10/28/howto-gpg-sign-verify-deb-packages-apt-repositories/
 114. https://blog.packagecloud.io/eng/2014/11/24/howto-gpg-sign-verify-rpm-packages-yum-repositories/
 115. http://feeds.feedburner.com/PackagecloudBlog
 116. https://packagecloud.io/docs#travis
 117. https://packagecloud.io/docs#jenkins
 118. https://packagecloud.io/docs#buildkite
 119. https://packagecloud.io/docs#public_private_repos
 120. https://packagecloud.io/docs#public_private_repos
 121. https://packagecloud.io/docs#security_features
 122. https://packagecloud.io/pricing
 123. https://packagecloud.io/l/npm-registry?utm_campaign=cmkt&utm_medium=footer&utm_source=blog
 124. https://packagecloud.io/l/apt-repository?utm_campaign=cmkt&utm_medium=footer&utm_source=blog
 125. https://packagecloud.io/l/yum-repository?utm_campaign=cmkt&utm_medium=footer&utm_source=blog
 126. https://packagecloud.io/l/rubygem-repository?utm_campaign=cmkt&utm_medium=footer&utm_source=blog
 127. https://packagecloud.io/l/pypi-repository?utm_campaign=cmkt&utm_medium=footer&utm_source=blog
 128. https://packagecloud.io/l/maven-repository?utm_campaign=cmkt&utm_medium=footer&utm_source=blog
 129. https://blog.packagecloud.io/tag/npm-howto
 130. https://blog.packagecloud.io/tag/maven-howto
 131. https://blog.packagecloud.io/tag/java-howto
 132. https://blog.packagecloud.io/tag/debian-howto
 133. https://blog.packagecloud.io/tag/rpm-howto
 134. https://blog.packagecloud.io/tag/rubygem-howto
 135. https://blog.packagecloud.io/tag/python-howto
 136. https://blog.packagecloud.io/tag/linux-howto
 137. https://blog.packagecloud.io/tag/maven-guide
 138. https://blog.packagecloud.io/tag/debian-guide
 139. https://blog.packagecloud.io/tag/rpm-guide
 140. https://blog.packagecloud.io/tag/rubygem-guide
 141. https://blog.packagecloud.io/tag/python-guide
 142. https://blog.packagecloud.io/tag/linux
 143. https://packagecloud.io/docs
 144. https://packagecloud.io/docs/api
 145. https://packagecloud.io/docs#cli
 146. https://packagecloud.io/blog
 147. http://bit.ly/packagecloud-users
 148. http://www.packagecloudstatus.io/
 149. https://packagecloud.io/contact
 150. https://packagecloud.io/legal/tos
 151. https://packagecloud.io/legal/privacy
 152. https://packagecloud.io/legal/cookies
 153. https://packagecloud.io/legal/privacy
 154. https://packagecloud.io/legal/tos

   Hidden links:
 156. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
 157. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
 158. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
 159. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
 160. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
 161. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
 162. https://packagecloud.io/?utm_campaign=cmkt&utm_medium=callout&utm_source=/eng/2016/04/05/the-definitive-guide-to-linux-system-calls&utm_term=Create%20a%20package%20repository%20in%20less%20than%2010%20seconds,%20free.
 163. https://facebook.com/sharer/sharer.php?u=http://0.0.0.0:4000/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/
 164. https://twitter.com/intent/tweet/?text=The%20Definitive%20Guide%20to%20Linux%20System%20Calls&url=http://0.0.0.0:4000/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/
 165. https://plus.google.com/share?url=http://0.0.0.0:4000/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/
 166. https://www.tumblr.com/widgets/share/tool?posttype=link&title=The%20Definitive%20Guide%20to%20Linux%20System%20Calls&caption=The%20Definitive%20Guide%20to%20Linux%20System%20Calls&content=http://0.0.0.0:4000/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/&canonicalUrl=http://0.0.0.0:4000/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/&shareSource=tumblr_share_button
 167. mailto:?subject=The%20Definitive%20Guide%20to%20Linux%20System%20Calls&body=http://0.0.0.0:4000/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/
 168. https://reddit.com/submit/?url=http://0.0.0.0:4000/eng/2016/04/05/the-definitive-guide-to-linux-system-calls/
 169. javascript:;