summaryrefslogtreecommitdiffstats
path: root/sandbox/linux/seccomp/syscall.cc
blob: 681fec993b0147a3b934146539a2f0a0f18e5c2d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "debug.h"
#include "sandbox_impl.h"
#include "syscall_table.h"

namespace playground {

// TODO(markus): change this into a function that returns the address of the
//               assembly code. If that isn't possible for sandbox_clone, then
//               move that function into a *.S file
//
// File-scope assembly that emits two function symbols into the .text section:
//
//   playground$sandbox_clone
//     Computes the address of the register values that the system call
//     wrapper saved on the stack, passes it on as an additional argument and
//     jumps to playground$sandbox__clone.
//
//   playground$syscallWrapper
//     Entry point used by instrumented code in place of a system call
//     instruction. The system call number is expected in %rax / %eax and the
//     arguments in the usual system call registers. Signal-return calls are
//     issued directly; everything else is dispatched through
//     playground$syscallTable or playground$defaultSystemCallHandler. All
//     registers other than the result register are restored before returning.
//
// Only x86-64 and i386 are implemented; any other target fails to compile.
asm(
    ".pushsection .text, \"ax\", @progbits\n"

    // This is the special wrapper for the clone() system call. The code
    // relies on the stack layout of the system call wrapper (c.f. below). It
    // passes the stack pointer as an additional argument to sandbox__clone(),
    // so that upon starting the child, register values can be restored and
    // the child can start executing at the correct IP, instead of trying to
    // run in the trusted thread.
    "playground$sandbox_clone:"
    ".globl playground$sandbox_clone\n"
    ".type playground$sandbox_clone, @function\n"
    #if defined(__x86_64__)
    // Skip the 8 byte return address into the system call wrapper. The
    // following bytes are the saved register values that we need to restore
    // upon return from clone() in the new thread.
    // The resulting pointer is handed over in %r9.
    "lea 8(%rsp), %r9\n"
    "jmp playground$sandbox__clone\n"
    #elif defined(__i386__)
    // As i386 passes function arguments on the stack, we need to skip a few
    // more values before we can get to the saved registers.
    // The pointer is written into the stack slot at 24(%esp) before jumping.
    "lea 28(%esp), %eax\n"
    "mov %eax, 24(%esp)\n"
    "jmp playground$sandbox__clone\n"
    #else
    #error Unsupported target platform
    #endif
    ".size playground$sandbox_clone, .-playground$sandbox_clone\n"


    // This is the wrapper which is called by the untrusted code, trying to
    // make a system call.
    "playground$syscallWrapper:"
    ".internal playground$syscallWrapper\n"
    ".globl playground$syscallWrapper\n"
    ".type playground$syscallWrapper, @function\n"
    #if defined(__x86_64__)
    // Check for rt_sigreturn(). It needs to be handled specially.
    "cmp  $15, %rax\n"             // NR_rt_sigreturn
    "jnz  1f\n"
    "add  $0x90, %rsp\n"           // pop return addresses and red zone
  "0:syscall\n"                    // rt_sigreturn() is unrestricted
    "mov  $66, %edi\n"             // rt_sigreturn() should never return
    "mov  $231, %eax\n"            // NR_exit_group
    "jmp  0b\n"

    // Save all registers
  "1:push %rbp\n"
    "mov  %rsp, %rbp\n"
    "push %rbx\n"
    "push %rcx\n"
    "push %rdx\n"
    "push %rsi\n"
    "push %rdi\n"
    "push %r8\n"
    "push %r9\n"
    "push %r10\n"
    "push %r11\n"
    "push %r12\n"
    "push %r13\n"
    "push %r14\n"
    "push %r15\n"

    // Convert from syscall calling conventions to C calling conventions.
    // System calls have a subtly different register ordering than the user-
    // space x86-64 ABI.
    "mov %r10, %rcx\n"

    // Check range of system call
    // NOTE(review): the comparison only looks at the low 32 bits (%eax),
    // while the table index below is computed from all of %rax. Confirm that
    // callers can never arrive here with the upper half of %rax set.
    "cmp playground$maxSyscall(%rip), %eax\n"
    "ja  3f\n"

    // Retrieve function call from system call table (c.f. syscall_table.c).
    // Table entries are 16 bytes apart and the handler is the first word of
    // each entry. A handler value of zero or one is not called directly but
    // routed to defaultSystemCallHandler(), which tells denied system calls
    // apart from unrestricted ones that need to be forwarded to the trusted
    // thread. Any other value is a pointer to a specific handler function.
    "mov %rax, %r10\n"
    "shl $4, %r10\n"
    "lea playground$syscallTable(%rip), %r11\n"
    "add %r11, %r10\n"
    "mov 0(%r10), %r10\n"

    // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
    // jump to fallback handler.
    "cmp $1, %r10\n"
    "jbe 3f\n"
    "call *%r10\n"
  "2:"

    // Restore CPU registers, except for %rax which was set by the system call.
    "pop %r15\n"
    "pop %r14\n"
    "pop %r13\n"
    "pop %r12\n"
    "pop %r11\n"
    "pop %r10\n"
    "pop %r9\n"
    "pop %r8\n"
    "pop %rdi\n"
    "pop %rsi\n"
    "pop %rdx\n"
    "pop %rcx\n"
    "pop %rbx\n"
    "pop %rbp\n"

    // Remove fake return address. This is added in the patching code in
    // library.cc and it makes stack traces a little cleaner.
    "add $8, %rsp\n"

    // Return to caller
    "ret\n"

  "3:"
    // If we end up calling a specific handler, we don't need to know the
    // system call number. However, in the generic case, we do. Shift
    // registers so that the system call number becomes visible as the
    // first function argument.
    // The sixth system call argument no longer fits into a register and is
    // therefore passed on the stack.
    "push %r9\n"
    "mov  %r8, %r9\n"
    "mov  %rcx, %r8\n"
    "mov  %rdx, %rcx\n"
    "mov  %rsi, %rdx\n"
    "mov  %rdi, %rsi\n"
    "mov  %rax, %rdi\n"

    // Call default handler.
    "call playground$defaultSystemCallHandler\n"
    "pop  %r9\n"
    "jmp 2b\n"
    #elif defined(__i386__)
    "cmp  $119, %eax\n"            // NR_sigreturn
    "jnz  1f\n"
    "add  $0x4, %esp\n"            // pop return address
  "0:int  $0x80\n"                 // sigreturn() is unrestricted
    "mov  $66, %ebx\n"             // sigreturn() should never return
    // Load the system call number for exit() explicitly. Copying %ebx into
    // %eax would select system call 66 (setsid on i386) instead of exit.
    "mov  $1, %eax\n"              // NR_exit; the exit status is in %ebx
    "jmp  0b\n"
  "1:cmp  $173, %eax\n"            // NR_rt_sigreturn
    "jnz  3f\n"

    // Convert rt_sigframe into sigframe, allowing us to call sigreturn().
    // This is possible since the first part of signal stack frames have
    // stayed very stable since the earliest kernel versions. While never
    // officially documented, lots of user space applications rely on this
    // part of the ABI, and kernel developers have been careful to maintain
    // backwards compatibility.
    // In general, the rt_sigframe includes a lot of extra information that
    // the signal handler can look at. Most notably, this means a complete
    // siginfo record.
    // Fortunately though, the kernel doesn't look at any of this extra data
    // when returning from a signal handler. So, we can safely convert an
    // rt_sigframe to a legacy sigframe, discarding the extra data in the
    // process. Interestingly, the legacy signal frame is actually larger than
    // the rt signal frame, as it includes a lot more padding.
    "sub  $0x1C8, %esp\n"          // a legacy signal stack is much larger
    "mov  0x1CC(%esp), %eax\n"     // push signal number
    "push %eax\n"
    "lea  0x270(%esp), %esi\n"     // copy siginfo register values
    "lea  0x4(%esp), %edi\n"       //     into new location
    "mov  $0x16, %ecx\n"
    "cld\n"
    "rep movsl\n"
    "mov  0x2C8(%esp), %ebx\n"     // copy first half of signal mask
    "mov  %ebx, 0x54(%esp)\n"
    "lea  2f, %esi\n"
    "push %esi\n"                  // push restorer function
    "lea  0x2D4(%esp), %edi\n"     // patch up retcode magic numbers
    "movb $2, %cl\n"               // %ecx is zero after the previous "rep"
    "rep movsl\n"                  // copies the two words found at label 2
    "ret\n"                        // return to restorer function
  "2:pop  %eax\n"                  // remove dummy argument (signo)
    "mov  $119, %eax\n"            // NR_sigreturn
    "int  $0x80\n"


    // Preserve all registers
  "3:push %ebx\n"
    "push %ecx\n"
    "push %edx\n"
    "push %esi\n"
    "push %edi\n"
    "push %ebp\n"

    // Convert from syscall calling conventions to C calling conventions
    // This pushes seven words (28 bytes): the system call number followed by
    // the six system call arguments.
    "push %ebp\n"
    "push %edi\n"
    "push %esi\n"
    "push %edx\n"
    "push %ecx\n"
    "push %ebx\n"
    "push %eax\n"

    // Check range of system call
    "cmp playground$maxSyscall, %eax\n"
    "ja  9f\n"

    // We often have long sequences of calls to gettimeofday(). This is
    // needlessly expensive. Coalesce them into a single call.
    //
    // We keep track of state in TLS storage that we can access through
    // the %fs segment register. See trusted_thread.cc for the exact
    // memory layout.
    //
    // TODO(markus): maybe, we should proactively call gettimeofday() and
    //               clock_gettime(), whenever we talk to the trusted thread?
    //               or maybe, if we have recently seen requests to compute
    //               the time. There might be a repeated pattern of those.
    "cmp  $78, %eax\n"             // __NR_gettimeofday
    "jnz  6f\n"
    "cmp  %eax, %fs:0x102C-0x58\n" // last system call
    "jnz  4f\n"

    // This system call and the last system call prior to this one both are
    // calls to gettimeofday(). Try to avoid making the new call and just
    // return the same result as in the previous call.
    // Just in case the caller is spinning on the result from gettimeofday(),
    // every so often, call the actual system call.
    "decl %fs:0x1030-0x58\n"       // countdown calls to gettimeofday()
    "jz   4f\n"

    // Atomically read the 64bit word representing last-known timestamp and
    // return it to the caller. On x86-32 this is a little more complicated and
    // requires the use of the cmpxchg8b instruction.
    // Loading %edx:%eax with the same values as %ecx:%ebx means that the
    // instruction either rewrites an identical value or just loads the
    // current one; either way %edx:%eax ends up holding the cached timestamp.
    // NOTE(review): the result is stored through %ebx (the first system call
    // argument) without a NULL check -- confirm callers never pass NULL.
    "mov  %ebx, %eax\n"
    "mov  %ecx, %edx\n"
    "lock; cmpxchg8b 100f\n"
    "mov  %eax, 0(%ebx)\n"
    "mov  %edx, 4(%ebx)\n"
    "xor  %eax, %eax\n"            // report success
    "add  $28, %esp\n"             // drop the seven words pushed above
    "jmp  8f\n"

    // This is a call to gettimeofday(), but we don't have a valid cached
    // result, yet.
  "4:mov  %eax, %fs:0x102C-0x58\n" // remember syscall number
    "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations
    "call playground$defaultSystemCallHandler\n"

    // Returned from gettimeofday(). Remember return value, in case the
    // application calls us again right away.
    // Again, this has to happen atomically and requires cmpxchg8b.
    // NOTE(review): the value returned by the handler is overwritten with
    // zero below, so a failing gettimeofday() is reported as success --
    // confirm that this is intended.
    "mov 4(%ebx), %ecx\n"
    "mov 0(%ebx), %ebx\n"
    "mov 100f, %eax\n"
    "mov 101f, %edx\n"
  "5:lock; cmpxchg8b 100f\n"
    "jnz 5b\n"
    "xor %eax, %eax\n"
    "jmp 10f\n"

    // Remember the number of the last system call made. We deliberately do
    // not remember calls to gettid(), as we have often seen long sequences
    // of calls to just gettimeofday() and gettid(). In that situation, we
    // would still like to coalesce the gettimeofday() calls.
  "6:cmp $224, %eax\n"             // __NR_gettid
    "jz  7f\n"
    "mov  %eax, %fs:0x102C-0x58\n" // remember syscall number

    // Retrieve function call from system call table (c.f. syscall_table.c).
    // Table entries are 8 bytes apart and the handler is the first word of
    // each entry. A handler value of zero or one is not called directly but
    // routed to defaultSystemCallHandler(), which tells denied system calls
    // apart from unrestricted ones that need to be forwarded to the trusted
    // thread. Any other value is a pointer to a specific handler function.
  "7:shl  $3, %eax\n"
    "lea  playground$syscallTable, %ebx\n"
    "add  %ebx, %eax\n"
    "mov  0(%eax), %eax\n"

    // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
    // jump to fallback handler.
    "cmp  $1, %eax\n"
    "jbe  9f\n"
    "add  $4, %esp\n"              // specific handlers don't get the number
    "call *%eax\n"
    "add  $24, %esp\n"             // drop the six argument words

    // Restore CPU registers, except for %eax which was set by the system call.
  "8:pop  %ebp\n"
    "pop  %edi\n"
    "pop  %esi\n"
    "pop  %edx\n"
    "pop  %ecx\n"
    "pop  %ebx\n"

    // Return to caller
    "ret\n"

    // Call default handler.
  "9:call playground$defaultSystemCallHandler\n"
 "10:add  $28, %esp\n"             // drop system call number and arguments
    "jmp 8b\n"

    // Eight bytes of zero-initialized storage for the cached gettimeofday()
    // result; label 100 is the low word and label 101 the high word.
    ".pushsection \".bss\"\n"
    ".balign 8\n"
"100:.byte 0, 0, 0, 0\n"
"101:.byte 0, 0, 0, 0\n"
    ".popsection\n"

    #else
    #error Unsupported target platform
    #endif
    ".size playground$syscallWrapper, .-playground$syscallWrapper\n"
    ".popsection\n"
);


void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,
                                        void* arg2, void* arg3, void* arg4,
                                        void* arg5) {
  // TODO(markus): The following comment is currently not true, we do intercept these system calls. Try to fix that.

  // We try to avoid intercepting read(), and write(), as these system calls
  // are not restricted in Seccomp mode. But depending on the exact
  // instruction sequence in libc, we might not be able to reliably
  // filter out these system calls at the time when we instrument the code.
  SysCalls  sys;
  long      rc;
  long long tm;
  switch (syscallNum) {
    case __NR_read:
      Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
      rc             = sys.read((long)arg0, arg1, (size_t)arg2);
      break;
    case __NR_write:
      Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
      rc             = sys.write((long)arg0, arg1, (size_t)arg2);
      break;
    default:
      if (Debug::isEnabled()) {
        // In debug mode, prevent stderr from being closed
        if (syscallNum == __NR_close && arg0 == (void *)2)
          return 0;
      }

      if ((unsigned)syscallNum <= maxSyscall &&
          syscallTable[syscallNum].handler == UNRESTRICTED_SYSCALL) {
        Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
     perform_unrestricted:
        struct {
          int          sysnum;
          void*        unrestricted_req[6];
        } __attribute__((packed)) request = {
          syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } };

        int   thread = threadFdPub();
        void* rc;
        if (write(sys, thread, &request, sizeof(request)) != sizeof(request) ||
            read(sys, thread, &rc, sizeof(rc)) != sizeof(rc)) {
          die("Failed to forward unrestricted system call");
        }
        Debug::elapsed(tm, syscallNum);
        return rc;
      } else if (Debug::isEnabled()) {
        Debug::syscall(&tm, syscallNum,
                       "In production mode, this call would be disallowed");
        goto perform_unrestricted;
      } else {
        return (void *)-ENOSYS;
      }
  }
  if (rc < 0) {
    rc               = -sys.my_errno;
  }
  Debug::elapsed(tm, syscallNum);
  return (void *)rc;
}

} // namespace