diff options
author | markus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-04-20 18:05:23 +0000 |
---|---|---|
committer | markus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-04-20 18:05:23 +0000 |
commit | 00248036bdbbcd913adb34193cf21982a2fdc82a (patch) | |
tree | 15007b2ef02f6291e2a3f312c928ab2f6f0229bf /sandbox/linux | |
parent | ae654689c8fae6eb523054f073ce8baabf4ec2c5 (diff) | |
download | chromium_src-00248036bdbbcd913adb34193cf21982a2fdc82a.zip chromium_src-00248036bdbbcd913adb34193cf21982a2fdc82a.tar.gz chromium_src-00248036bdbbcd913adb34193cf21982a2fdc82a.tar.bz2 |
Make the use of signals inside of the sandbox safe.
We previously assumed that no signals would ever be enabled in the sandbox
and thus there was no way to trick the trusted thread into executing potentially
untrusted code.
In an attempt to lift this restriction, this changelist modifies the trusted
thread so that
- it has an invalid stack pointer at all times. Any attempt to handle a
signal would result in the kernel trying to push a signal stack, which
would immediately result in a SEGV and then terminate the application.
- all signals are blocked while outside of trusted code. If a signal is
triggered, it either gets handled on one of the sandboxed threads (for
asynchronous signals), or it results in the application getting terminated
by the kernel (for synchronous signals).
This changelist is difficult not only because eliminating all uses of the
stack pointer requires some very careful assembly coding, but more importantly
because we have to restore signals after we enter seccomp mode.
As sigprocmask() is a restricted system call, the only way to restore the
signal mask is by calling sigreturn() with a suitably tweaked signal
stack frame. While the first couple of bytes of the signal stack frame are
well-defined and unlikely to change, the entire signal stack frame is not
documented as part of the stable ABI. The exact format depends on the number of modified CPU registers (e.g. SSE, MMX, floating point, ...).
The only way for us to get a valid signal stack frame is to trigger a
signal, and to create a (possibly adjusted) copy of the signal frame. We
obviously have to do this _before_ we block all signals upon entering
trusted code.
The two places where this needs to happen are upon start of the sandbox when
launching the initial trusted thread, and upon any call to clone().
BUG=37728
TEST=Run chrome and verify that /proc/$PID/status shows the correct signal mask for trusted threads. The latter can be identified with strace.
Review URL: http://codereview.chromium.org/1594040
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@45055 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'sandbox/linux')
-rw-r--r-- | sandbox/linux/seccomp/clone.cc | 92 | ||||
-rw-r--r-- | sandbox/linux/seccomp/debug.cc | 18 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox.cc | 154 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox_impl.h | 136 | ||||
-rw-r--r-- | sandbox/linux/seccomp/securemem.cc | 18 | ||||
-rw-r--r-- | sandbox/linux/seccomp/securemem.h | 7 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sigprocmask.cc | 120 | ||||
-rw-r--r-- | sandbox/linux/seccomp/syscall.cc | 10 | ||||
-rw-r--r-- | sandbox/linux/seccomp/syscall_table.c | 12 | ||||
-rw-r--r-- | sandbox/linux/seccomp/trusted_process.cc | 2 | ||||
-rw-r--r-- | sandbox/linux/seccomp/trusted_thread.cc | 804 |
11 files changed, 964 insertions, 409 deletions
diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc index 148bae5..0bf91c1 100644 --- a/sandbox/linux/seccomp/clone.cc +++ b/sandbox/linux/seccomp/clone.cc @@ -7,7 +7,7 @@ namespace playground { -int Sandbox::sandbox_clone(int flags, void* stack, int* pid, int* ctid, +int Sandbox::sandbox_clone(int flags, char* stack, int* pid, int* ctid, void* tls, void *wrapper_sp) { long long tm; Debug::syscall(&tm, __NR_clone, "Executing handler"); @@ -24,25 +24,77 @@ int Sandbox::sandbox_clone(int flags, void* stack, int* pid, int* ctid, request.clone_req.ctid = ctid; request.clone_req.tls = tls; - // Pass along the address on the stack where syscallWrapper() stored the - // original CPU registers. These registers will be restored in the newly - // created thread prior to returning from the wrapped system call. - #if defined(__x86_64__) - memcpy(&request.clone_req.regs64, wrapper_sp, - sizeof(request.clone_req.regs64) + sizeof(void *)); - #elif defined(__i386__) - memcpy(&request.clone_req.regs32, wrapper_sp, - sizeof(request.clone_req.regs32) + sizeof(void *)); - #else - #error Unsupported target platform - #endif - + // TODO(markus): Passing stack == 0 currently does not do the same thing + // that the kernel would do without the sandbox. This is just going to + // cause a crash. We should detect this case, and replace the stack pointer + // with the correct value, instead. + // This is complicated by the fact that we will temporarily be executing + // both threads from the same stack. Some synchronization will be necessary. + // Fortunately, this complication also explains why hardly anybody ever + // does this. + // See trusted_thread.cc for more information. 
long rc; - SysCalls sys; - if (write(sys, processFdPub(), &request, sizeof(request)) != - sizeof(request) || - read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { - die("Failed to forward clone() request [sandbox]"); + if (stack == 0) { + rc = -EINVAL; + } else { + // Pass along the address on the stack where syscallWrapper() stored the + // original CPU registers. These registers will be restored in the newly + // created thread prior to returning from the wrapped system call. + #if defined(__x86_64__) + memcpy(&request.clone_req.regs64, wrapper_sp, + sizeof(request.clone_req.regs64) + sizeof(void *)); + #elif defined(__i386__) + memcpy(&request.clone_req.regs32, wrapper_sp, + sizeof(request.clone_req.regs32) + sizeof(void *)); + #else + #error Unsupported target platform + #endif + + // In order to unblock the signal mask in the newly created thread and + // after entering Seccomp mode, we have to call sigreturn(). But that + // requires access to a proper stack frame describing a valid signal. + // We trigger a signal now and make sure the stack frame ends up on the + // new stack. Our segv() handler (in sandbox.cc) does that for us. + // See trusted_thread.cc for more details on how threads get created. + // + // In general we rely on the kernel for generating the signal stack + // frame, as the exact binary format has been extended several times over + // the course of the kernel's development. Fortunately, the kernel + // developers treat the initial part of the stack frame as a stable part + // of the ABI. So, we can rely on fixed, well-defined offsets for accessing + // register values and for accessing the signal mask. + #if defined(__x86_64__) || defined(__i386__) + #if defined(__x86_64__) + // Red zone compensation. The instrumented system call will remove 128 + // bytes from the thread's stack prior to returning to the original + // call site. 
+ stack -= 128; + request.clone_req.stack = stack; + #endif + asm("int $0" + : "=m"(request.clone_req.stack) + : "a"(__NR_clone + 0xF000), "d"(&request.clone_req.stack) + : "memory"); + #else + #error Unsupported target platform + #endif + + // Adjust the signal stack frame so that it contains the correct stack + // pointer upon returning from sigreturn(). + #if defined(__x86_64__) + *(char **)(request.clone_req.stack + 0xA0) = stack; + #elif defined(__i386__) + *(char **)(request.clone_req.stack + 0x1C) = stack; + #else + #error Unsupported target platform + #endif + + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward clone() request [sandbox]"); + } } Debug::elapsed(tm, __NR_clone); return static_cast<int>(rc); @@ -64,7 +116,7 @@ bool Sandbox::process_clone(int parentMapsFd, int sandboxFd, int threadFdPub, SecureMem::abandonSystemCall(threadFd, -EPERM); return false; } else { - SecureMem::Args* newMem = getSecureMem(); + SecureMem::Args* newMem = getNewSecureMem(); if (!newMem) { SecureMem::abandonSystemCall(threadFd, -ENOMEM); return false; diff --git a/sandbox/linux/seccomp/debug.cc b/sandbox/linux/seccomp/debug.cc index e4d6410..5d6de49 100644 --- a/sandbox/linux/seccomp/debug.cc +++ b/sandbox/linux/seccomp/debug.cc @@ -140,8 +140,8 @@ bool Debug::enter() { asm volatile("mov %%gs, %0\n" "test %0, %0\n" "jz 1f\n" - "movl %%gs:0x1050-0xD8, %0\n" - "incl %%gs:0x1050-0xD8\n" + "movl %%gs:0x1050-0xE0, %0\n" + "incl %%gs:0x1050-0xE0\n" "1:\n" : "=r"(level) : @@ -150,8 +150,8 @@ bool Debug::enter() { asm volatile("mov %%fs, %0\n" "test %0, %0\n" "jz 1f\n" - "movl %%fs:0x1034-0x54, %0\n" - "incl %%fs:0x1034-0x54\n" + "movl %%fs:0x1034-0x58, %0\n" + "incl %%fs:0x1034-0x58\n" "1:\n" : "=r"(level) : @@ -178,8 +178,8 @@ bool Debug::leave() { asm volatile("mov %%gs, %0\n" "test %0, %0\n" "jz 1f\n" - "decl %%gs:0x1050-0xD8\n" - "movl 
%%gs:0x1050-0xD8, %0\n" + "decl %%gs:0x1050-0xE0\n" + "movl %%gs:0x1050-0xE0, %0\n" "1:\n" : "=r"(level) : @@ -188,8 +188,8 @@ bool Debug::leave() { asm volatile("mov %%fs, %0\n" "test %0, %0\n" "jz 1f\n" - "decl %%fs:0x1034-0x54\n" - "movl %%fs:0x1034-0x54, %0\n" + "decl %%fs:0x1034-0x58\n" + "movl %%fs:0x1034-0x58, %0\n" "1:\n" : "=r"(level) : @@ -234,7 +234,7 @@ void Debug::gettimeofday(long long* tm) { // Zero out the lastSyscallNum, so that we don't try to coalesce // calls to gettimeofday(). For debugging purposes, we need the // exact time. - asm volatile("movl $0, %fs:0x102C-0x54"); + asm volatile("movl $0, %fs:0x102C-0x58"); #elif !defined(__x86_64__) #error Unsupported target platform #endif diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc index 12f0c0f..b7a249e 100644 --- a/sandbox/linux/seccomp/sandbox.cc +++ b/sandbox/linux/seccomp/sandbox.cc @@ -112,25 +112,23 @@ bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) { } void Sandbox::setupSignalHandlers() { + // Set SIGCHLD to SIG_DFL so that waitpid() can work SysCalls sys; struct SysCalls::kernel_sigaction sa; memset(&sa, 0, sizeof(sa)); sa.sa_handler_ = SIG_DFL; sys.sigaction(SIGCHLD, &sa, NULL); - // Set up SEGV handler for dealing with RDTSC instructions + // Set up SEGV handler for dealing with RDTSC instructions, system calls + // that have been rewritten to use INT0, and for sigpending() emulation. sa.sa_handler_ = segv(); sys.sigaction(SIGSEGV, &sa, NULL); - // Block all asynchronous signals, except for SIGCHLD which needs to be - // set to SIG_DFL for waitpid() to work. 
+ // Unblock SIGSEGV and SIGCHLD SysCalls::kernel_sigset_t mask; - memset(&mask, 0xFF, sizeof(mask)); - mask.sig[0] &= ~((1 << (SIGSEGV - 1)) | (1 << (SIGINT - 1)) | - (1 << (SIGTERM - 1)) | (1 << (SIGQUIT - 1)) | - (1 << (SIGHUP - 1)) | (1 << (SIGABRT - 1)) | - (1 << (SIGCHLD - 1))); - sys.sigprocmask(SIG_SETMASK, &mask, 0); + memset(&mask, 0x00, sizeof(mask)); + mask.sig[0] |= (1 << (SIGSEGV - 1)) | (1 << (SIGCHLD - 1)); + sys.sigprocmask(SIG_UNBLOCK, &mask, 0); } void (*Sandbox::segv())(int signo) { @@ -158,7 +156,7 @@ void (*Sandbox::segv())(int signo) { "sub $4, %%rsp\n" "push %%r14\n" "mov %%gs:16, %%edi\n" // fd = threadFdPub - "mov %%rsp, %%rsi\n" // buf = %esp + "mov %%rsp, %%rsi\n" // buf = %rsp "mov $4, %%edx\n" // len = sizeof(int) "1:mov $1, %%eax\n" // NR_write "syscall\n" @@ -199,8 +197,8 @@ void (*Sandbox::segv())(int signo) { // of playground::Library being unable to find a way to safely // rewrite the system call instruction. Retrieve the CPU register // at the time of the segmentation fault and invoke syscallWrapper(). 
- "8:cmpw $0xCD, (%%r15)\n" // INT $0x0 - "jnz 9f\n" + "8:cmpw $0x00CD, (%%r15)\n" // INT $0x0 + "jnz 14f\n" #ifndef NDEBUG "lea 200f(%%rip), %%rdi\n" "call playground$debugMessage\n" @@ -212,7 +210,53 @@ void (*Sandbox::segv())(int signo) { "mov 0x40(%%rsp), %%r10\n" // %r10 at time of segmentation fault "mov 0x30(%%rsp), %%r8\n" // %r8 at time of segmentation fault "mov 0x38(%%rsp), %%r9\n" // %r9 at time of segmentation fault - "lea 7b(%%rip), %%rcx\n" + + // Handle rt_sigprocmask() + "cmp $14, %%rax\n" // NR_rt_sigprocmask + "jnz 12f\n" + "mov $-22, %%rax\n" // -EINVAL + "cmp $8, %%r10\n" // %r10 = sigsetsize (8 bytes = 64 signals) + "jl 7b\n" + "mov 0x130(%%rsp), %%r10\n" // signal mask at time of segmentation fault + "test %%rsi, %%rsi\n" // only set mask, if set is non-NULL + "jz 11f\n" + "mov 0(%%rsi), %%rsi\n" + "cmp $0, %%rdi\n" // %rdi = how (SIG_BLOCK) + "jnz 9f\n" + "or %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault + "jmp 11f\n" + "9:cmp $1, %%rdi\n" // %rdi = how (SIG_UNBLOCK) + "jnz 10f\n" + "xor $-1, %%rsi\n" + "and %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault + "jmp 11f\n" + "10:cmp $2, %%rdi\n" // %rdi = how (SIG_SETMASK) + "jnz 7b\n" + "mov %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault + "11:xor %%rax, %%rax\n" + "test %%rdx, %%rdx\n" // only return old mask, if set is non-NULL + "jz 7b\n" + "mov %%r10, 0(%%rdx)\n" // old_set + "jmp 7b\n" + + + // Copy signal frame onto new stack. 
See clone.cc for details + "12:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000 + "jnz 13f\n" + "mov 0xA8(%%rsp), %%rcx\n" // %rsp at time of segmentation fault + "sub %%rsp, %%rcx\n" // %rcx = size of stack frame + "sub $8, %%rcx\n" // skip return address + "mov %%rcx, %%rax\n" // return size of signal stack frame + "mov 0(%%rdx), %%rdi\n" // stack for newly clone()'d thread + "sub %%rcx, %%rdi\n" // copy onto new stack + "mov %%rdi, 0(%%rdx)\n" // allocate space on new stack + "lea 8(%%rsp), %%rsi\n" // copy from current stack + "cld\n" + "rep movsb\n" + "jmp 7b\n" + + // Forward system call to syscallWrapper() + "13:lea 7b(%%rip), %%rcx\n" "push %%rcx\n" "push 0xB8(%%rsp)\n" // %rip at time of segmentation fault "lea playground$syscallWrapper(%%rip), %%rcx\n" @@ -221,7 +265,7 @@ void (*Sandbox::segv())(int signo) { // This was a genuine segmentation fault. Trigger the kernel's default // signal disposition. The only way we can do this from seccomp mode // is by blocking the signal and retriggering it. - "9:mov $2, %%edi\n" // stderr + "14:mov $2, %%edi\n" // stderr "lea 300f(%%rip), %%rsi\n" // "Segmentation fault\n" "mov $301f-300f, %%edx\n" "mov $1, %%eax\n" // NR_write @@ -293,8 +337,8 @@ void (*Sandbox::segv())(int signo) { // of playground::Library being unable to find a way to safely // rewrite the system call instruction. Retrieve the CPU register // at the time of the segmentation fault and invoke syscallWrapper(). 
- "8:cmpw $0xCD, (%%ebp)\n" // INT $0x0 - "jnz 9f\n" + "8:cmpw $0x00CD, (%%ebp)\n" // INT $0x0 + "jnz 16f\n" #ifndef NDEBUG "lea 200f, %%eax\n" "push %%eax\n" @@ -308,13 +352,69 @@ void (*Sandbox::segv())(int signo) { "mov 0x1C(%%esp), %%esi\n" // %esi at time of segmentation fault "mov 0x18(%%esp), %%edi\n" // %edi at time of segmentation fault "mov 0x20(%%esp), %%ebp\n" // %ebp at time of segmentation fault - "call playground$syscallWrapper\n" + + // Handle sigprocmask() and rt_sigprocmask() + "cmp $175, %%eax\n" // NR_rt_sigprocmask + "jnz 9f\n" + "mov $-22, %%eax\n" // -EINVAL + "cmp $8, %%esi\n" // %esi = sigsetsize (8 bytes = 64 signals) + "jl 7b\n" + "jmp 10f\n" + "9:cmp $126, %%eax\n" // NR_sigprocmask + "jnz 14f\n" + "mov $-22, %%eax\n" + "10:mov 0x58(%%esp), %%edi\n" // signal mask at time of segmentation fault + "mov 0x5C(%%esp), %%ebp\n" + "test %%ecx, %%ecx\n" // only set mask, if set is non-NULL + "jz 13f\n" + "mov 0(%%ecx), %%esi\n" + "mov 4(%%ecx), %%ecx\n" + "cmp $0, %%ebx\n" // %ebx = how (SIG_BLOCK) + "jnz 11f\n" + "or %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault + "or %%ecx, 0x5C(%%esp)\n" + "jmp 13f\n" + "11:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK) + "jnz 12f\n" + "xor $-1, %%esi\n" + "xor $-1, %%ecx\n" + "and %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault + "and %%ecx, 0x5C(%%esp)\n" + "jmp 13f\n" + "12:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK) + "jnz 7b\n" + "mov %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault + "mov %%ecx, 0x5C(%%esp)\n" + "13:xor %%eax, %%eax\n" + "test %%edx, %%edx\n" // only return old mask, if set is non-NULL + "jz 7b\n" + "mov %%edi, 0(%%edx)\n" // old_set + "mov %%ebp, 4(%%edx)\n" + "jmp 7b\n" + + // Copy signal frame onto new stack. 
See clone.cc for details + "14:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000 + "jnz 15f\n" + "mov 0x24(%%esp), %%ecx\n" // %esp at time of segmentation fault + "sub %%esp, %%ecx\n" // %ecx = size of stack frame + "sub $8, %%ecx\n" // skip return address and dummy + "mov %%ecx, %%eax\n" // return size of signal stack frame + "mov 0(%%edx), %%edi\n" // stack for newly clone()'d thread + "sub %%ecx, %%edi\n" // copy onto new stack + "mov %%edi, 0(%%edx)\n" // allocate space on new stack + "lea 8(%%esp), %%esi\n" // copy from current stack + "cld\n" + "rep movsb\n" + "jmp 7b\n" + + // Forward system call to syscallWrapper() + "15:call playground$syscallWrapper\n" "jmp 7b\n" // This was a genuine segmentation fault. Trigger the kernel's default // signal disposition. The only way we can do this from seccomp mode // is by blocking the signal and retriggering it. - "9:mov $2, %%ebx\n" // stderr + "16:mov $2, %%ebx\n" // stderr "lea 300f, %%ecx\n" // "Segmentation fault\n" "mov $301f-300f, %%edx\n" "mov $4, %%eax\n" // NR_write @@ -345,6 +445,24 @@ void (*Sandbox::segv())(int signo) { return fnc; } +SecureMem::Args* Sandbox::getSecureMem() { + // Check trusted_thread.cc for the magic offset that gets us from the TLS + // to the beginning of the secure memory area. 
+ SecureMem::Args* ret; +#if defined(__x86_64__) + asm volatile( + "movq %%gs:-0xE0, %0\n" + : "=q"(ret)); +#elif defined(__i386__) + asm volatile( + "movl %%fs:-0x58, %0\n" + : "=r"(ret)); +#else +#error Unsupported target platform +#endif + return ret; +} + void Sandbox::snapshotMemoryMappings(int processFd, int proc_self_maps) { SysCalls sys; if (sys.lseek(proc_self_maps, 0, SEEK_SET) || diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h index 18a359c..36f01c8 100644 --- a/sandbox/linux/seccomp/sandbox_impl.h +++ b/sandbox/linux/seccomp/sandbox_impl.h @@ -56,7 +56,7 @@ class Sandbox { // "proc_fd" should be a file descriptor for "/proc", or -1 if not provided // by the caller. static int supportsSeccompSandbox(int proc_fd) - asm("SupportsSeccompSandbox"); + asm("SupportsSeccompSandbox"); // The sandbox needs to be able to access "/proc/self/maps". If this file // is not accessible when "startSandbox()" gets called, the caller can @@ -64,12 +64,12 @@ class Sandbox { // The sandbox becomes the newer owner of this file descriptor and will // eventually close it when "startSandbox()" executes. static void setProcSelfMaps(int proc_self_maps) - asm("SeccompSandboxSetProcSelfMaps"); + asm("SeccompSandboxSetProcSelfMaps"); // This is the main public entry point. It finds all system calls that // need rewriting, sets up the resources needed by the sandbox, and // enters Seccomp mode. - static void startSandbox() asm("StartSeccompSandbox"); + static void startSandbox() asm("StartSeccompSandbox"); private: // syscall_table.c has to be implemented in C, as C++ does not support @@ -84,7 +84,7 @@ class Sandbox { // Clone() is special as it has a wrapper in syscall_table.c. The wrapper // adds one extra argument (the pointer to the saved registers) and then // calls playground$sandbox__clone(). 
- static int sandbox_clone(int flags, void* stack, int* pid, int* ctid, + static int sandbox_clone(int flags, char* stack, int* pid, int* ctid, void* tls, void* wrapper_sp) asm("playground$sandbox__clone") #if defined(__x86_64__) @@ -96,130 +96,142 @@ class Sandbox { #define bool int #define SecureMemArgs void // This is the wrapper entry point that is found in the syscall_table. - int sandbox_clone(int flags, void* stack, int* pid, int* ctid, void* tls) - asm("playground$sandbox_clone"); + int sandbox_clone(int flags, char* stack, int* pid, int* ctid, void* tls) + asm("playground$sandbox_clone"); #endif // Entry points for sandboxed code that is attempting to make system calls STATIC int sandbox_access(const char*, int) - asm("playground$sandbox_access"); - STATIC int sandbox_exit(int status) asm("playground$sandbox_exit"); - STATIC int sandbox_getpid() asm("playground$sandbox_getpid"); + asm("playground$sandbox_access"); + STATIC int sandbox_exit(int status) asm("playground$sandbox_exit"); + STATIC int sandbox_getpid() asm("playground$sandbox_getpid"); #if defined(__NR_getsockopt) STATIC int sandbox_getsockopt(int, int, int, void*, socklen_t*) - asm("playground$sandbox_getsockopt"); + asm("playground$sandbox_getsockopt"); #endif - STATIC int sandbox_gettid() asm("playground$sandbox_gettid"); + STATIC int sandbox_gettid() asm("playground$sandbox_gettid"); STATIC int sandbox_ioctl(int d, int req, void* arg) - asm("playground$sandbox_ioctl"); + asm("playground$sandbox_ioctl"); #if defined(__NR_ipc) STATIC int sandbox_ipc(unsigned, int, int, int, void*, long) - asm("playground$sandbox_ipc"); + asm("playground$sandbox_ipc"); #endif STATIC int sandbox_lstat(const char* path, void* buf) - asm("playground$sandbox_lstat"); + asm("playground$sandbox_lstat"); #if defined(__NR_lstat64) STATIC int sandbox_lstat64(const char *path, void* b) - asm("playground$sandbox_lstat64"); + asm("playground$sandbox_lstat64"); #endif STATIC int sandbox_madvise(void*, size_t, int) - 
asm("playground$sandbox_madvise"); + asm("playground$sandbox_madvise"); STATIC void *sandbox_mmap(void* start, size_t length, int prot, int flags, int fd, off_t offset) - asm("playground$sandbox_mmap"); + asm("playground$sandbox_mmap"); STATIC int sandbox_mprotect(const void*, size_t, int) - asm("playground$sandbox_mprotect"); + asm("playground$sandbox_mprotect"); STATIC int sandbox_munmap(void* start, size_t length) - asm("playground$sandbox_munmap"); + asm("playground$sandbox_munmap"); STATIC int sandbox_open(const char*, int, mode_t) - asm("playground$sandbox_open"); + asm("playground$sandbox_open"); #if defined(__NR_recvfrom) STATIC ssize_t sandbox_recvfrom(int, void*, size_t, int, void*, socklen_t*) - asm("playground$sandbox_recvfrom"); + asm("playground$sandbox_recvfrom"); STATIC ssize_t sandbox_recvmsg(int, struct msghdr*, int) - asm("playground$sandbox_recvmsg"); + asm("playground$sandbox_recvmsg"); + #endif + #if defined(__NR_rt_sigprocmask) + STATIC int sandbox_rt_sigprocmask(int how, const void*, void*, size_t) + asm("playground$sandbox_rt_sigprocmask"); + #endif + #if defined(__NR_sendmsg) STATIC size_t sandbox_sendmsg(int, const struct msghdr*, int) - asm("playground$sandbox_sendmsg"); + asm("playground$sandbox_sendmsg"); STATIC ssize_t sandbox_sendto(int, const void*, size_t, int, const void*, socklen_t)asm("playground$sandbox_sendto"); + #endif #if defined(__NR_shmat) STATIC void* sandbox_shmat(int, const void*, int) - asm("playground$sandbox_shmat"); + asm("playground$sandbox_shmat"); STATIC int sandbox_shmctl(int, int, void*) - asm("playground$sandbox_shmctl"); - STATIC int sandbox_shmdt(const void*) asm("playground$sandbox_shmdt"); + asm("playground$sandbox_shmctl"); + STATIC int sandbox_shmdt(const void*) asm("playground$sandbox_shmdt"); STATIC int sandbox_shmget(int, size_t, int) - asm("playground$sandbox_shmget"); + asm("playground$sandbox_shmget"); #endif + #if defined(__NR_setsockopt) STATIC int sandbox_setsockopt(int, int, int, const void*, 
socklen_t) - asm("playground$sandbox_setsockopt"); + asm("playground$sandbox_setsockopt"); + #endif + #if defined(__NR_sigprocmask) + STATIC int sandbox_sigprocmask(int how, const void*, void*) + asm("playground$sandbox_sigprocmask"); #endif #if defined(__NR_socketcall) STATIC int sandbox_socketcall(int call, void* args) - asm("playground$sandbox_socketcall"); + asm("playground$sandbox_socketcall"); #endif STATIC int sandbox_stat(const char* path, void* buf) - asm("playground$sandbox_stat"); + asm("playground$sandbox_stat"); #if defined(__NR_stat64) STATIC int sandbox_stat64(const char *path, void* b) - asm("playground$sandbox_stat64"); + asm("playground$sandbox_stat64"); #endif // Functions for system calls that need to be handled in the trusted process STATIC bool process_access(int, int, int, int, SecureMemArgs*) - asm("playground$process_access"); + asm("playground$process_access"); STATIC bool process_clone(int, int, int, int, SecureMemArgs*) - asm("playground$process_clone"); + asm("playground$process_clone"); STATIC bool process_exit(int, int, int, int, SecureMemArgs*) - asm("playground$process_exit"); + asm("playground$process_exit"); #if defined(__NR_getsockopt) STATIC bool process_getsockopt(int, int, int, int, SecureMemArgs*) - asm("playground$process_getsockopt"); + asm("playground$process_getsockopt"); #endif STATIC bool process_ioctl(int, int, int, int, SecureMemArgs*) - asm("playground$process_ioctl"); + asm("playground$process_ioctl"); #if defined(__NR_ipc) STATIC bool process_ipc(int, int, int, int, SecureMemArgs*) - asm("playground$process_ipc"); + asm("playground$process_ipc"); #endif STATIC bool process_madvise(int, int, int, int, SecureMemArgs*) - asm("playground$process_madvise"); + asm("playground$process_madvise"); STATIC bool process_mmap(int, int, int, int, SecureMemArgs*) - asm("playground$process_mmap"); + asm("playground$process_mmap"); STATIC bool process_mprotect(int, int, int, int, SecureMemArgs*) - 
asm("playground$process_mprotect"); + asm("playground$process_mprotect"); STATIC bool process_munmap(int, int, int, int, SecureMemArgs*) - asm("playground$process_munmap"); + asm("playground$process_munmap"); STATIC bool process_open(int, int, int, int, SecureMemArgs*) - asm("playground$process_open"); + asm("playground$process_open"); #if defined(__NR_recvfrom) STATIC bool process_recvfrom(int, int, int, int, SecureMemArgs*) - asm("playground$process_recvfrom"); + asm("playground$process_recvfrom"); STATIC bool process_recvmsg(int, int, int, int, SecureMemArgs*) - asm("playground$process_recvmsg"); + asm("playground$process_recvmsg"); STATIC bool process_sendmsg(int, int, int, int, SecureMemArgs*) - asm("playground$process_sendmsg"); + asm("playground$process_sendmsg"); STATIC bool process_sendto(int, int, int, int, SecureMemArgs*) - asm("playground$process_sendto"); + asm("playground$process_sendto"); STATIC bool process_setsockopt(int, int, int, int, SecureMemArgs*) - asm("playground$process_setsockopt"); + asm("playground$process_setsockopt"); #endif #if defined(__NR_shmat) STATIC bool process_shmat(int, int, int, int, SecureMemArgs*) - asm("playground$process_shmat"); + asm("playground$process_shmat"); STATIC bool process_shmctl(int, int, int, int, SecureMemArgs*) - asm("playground$process_shmctl"); + asm("playground$process_shmctl"); STATIC bool process_shmdt(int, int, int, int, SecureMemArgs*) - asm("playground$process_shmdt"); + asm("playground$process_shmdt"); STATIC bool process_shmget(int, int, int, int, SecureMemArgs*) - asm("playground$process_shmget"); + asm("playground$process_shmget"); #endif #if defined(__NR_socketcall) STATIC bool process_socketcall(int, int, int, int, SecureMemArgs*) - asm("playground$process_socketcall"); + asm("playground$process_socketcall"); #endif STATIC bool process_stat(int, int, int, int, SecureMemArgs*) - asm("playground$process_stat"); + asm("playground$process_stat"); #ifdef __cplusplus friend class Debug; @@ -294,13 
+306,11 @@ class Sandbox { } // Sends a file handle to another process. + // N.B. trusted_thread.cc has an assembly version of this function that + // is safe to use without a call stack. If the wire-format is changed, + /// make sure to update the assembly code. static bool sendFd(int transport, int fd0, int fd1, const void* buf, - size_t len) - asm("playground$sendFd") - #if defined(__x86_64__) - __attribute__((visibility("internal"))) - #endif - ; + size_t len); // If getFd() fails, it will set the first valid fd slot (e.g. fd0) to // -errno. @@ -334,7 +344,7 @@ class Sandbox { struct Clone { int flags; - void* stack; + char* stack; int* pid; int* ctid; void* tls; @@ -584,6 +594,7 @@ class Sandbox { static int tid() { return TLS::getTLSValue<int>(TLS_TID); } static int threadFdPub() { return TLS::getTLSValue<int>(TLS_THREAD_FD); } static int processFdPub() { return processFdPub_; } + static kernel_sigset_t* signalMask() { return &getSecureMem()->signalMask; } // The SEGV handler knows how to handle RDTSC instructions static void setupSignalHandlers(); @@ -601,9 +612,12 @@ class Sandbox { #endif ; + // Return the current secure memory structure for this thread. + static SecureMem::Args* getSecureMem(); + // Return a secure memory structure that can be used by a newly created // thread. - static SecureMem::Args* getSecureMem(); + static SecureMem::Args* getNewSecureMem(); // This functions runs in the trusted process at startup and finds all the // memory mappings that existed when the sandbox was first enabled. 
Going diff --git a/sandbox/linux/seccomp/securemem.cc b/sandbox/linux/seccomp/securemem.cc index 0071c45..5f07bbe 100644 --- a/sandbox/linux/seccomp/securemem.cc +++ b/sandbox/linux/seccomp/securemem.cc @@ -72,13 +72,14 @@ void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentMapsFd, : "q"(&mem->sequence) : "memory"); } - mem->syscallNum = syscallNum; - mem->arg1 = arg1; - mem->arg2 = arg2; - mem->arg3 = arg3; - mem->arg4 = arg4; - mem->arg5 = arg5; - mem->arg6 = arg6; + mem->callType = locked ? -2 : -1; + mem->syscallNum = syscallNum; + mem->arg1 = arg1; + mem->arg2 = arg2; + mem->arg3 = arg3; + mem->arg4 = arg4; + mem->arg5 = arg5; + mem->arg6 = arg6; asm volatile( #if defined(__x86_64__) "lock; incq (%0)\n" @@ -90,9 +91,8 @@ void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentMapsFd, : : "q"(&mem->sequence) : "memory"); - int data = locked ? -2 : -1; Sandbox::SysCalls sys; - if (Sandbox::write(sys, fd, &data, sizeof(data)) != sizeof(data)) { + if (Sandbox::write(sys, fd, &mem->callType, sizeof(int)) != sizeof(int)) { Sandbox::die("Failed to send system call"); } if (parentMapsFd >= 0) { diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h index ac7823e..dc035ff 100644 --- a/sandbox/linux/seccomp/securemem.h +++ b/sandbox/linux/seccomp/securemem.h @@ -6,6 +6,7 @@ #define SECURE_MEM_H__ #include <stdlib.h> +#include "linux_syscall_support.h" namespace playground { @@ -28,6 +29,7 @@ class SecureMem { struct { struct Args* self; long sequence; + long callType; long syscallNum; void* arg1; void* arg2; @@ -92,7 +94,7 @@ class SecureMem { struct { // This scratch space is used by the trusted thread to read parameters // for unrestricted system calls. - long tmpSyscallNum; + int tmpSyscallNum; void* tmpArg1; void* tmpArg2; void* tmpArg3; @@ -115,6 +117,9 @@ class SecureMem { // result in additional system calls. Make sure that we don't trigger // logging of those recursive calls. 
int recursionLevel; + + // Computing the signal mask is expensive. Keep a cached copy. + kernel_sigset_t signalMask; } __attribute__((packed)); char scratchPage[4096]; }; diff --git a/sandbox/linux/seccomp/sigprocmask.cc b/sandbox/linux/seccomp/sigprocmask.cc new file mode 100644 index 0000000..f3ad1fb --- /dev/null +++ b/sandbox/linux/seccomp/sigprocmask.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +// If the sandboxed process tries to mask SIGSEGV, there is a good chance +// the process will eventually get terminated. If this is really ever a +// problem, we can hide the fact that SIGSEGV is unmasked. But I don't think +// we really need this. Masking of synchronous signals is rarely necessary. + +#if defined(__NR_sigprocmask) +int Sandbox::sandbox_sigprocmask(int how, const void* set, void* old_set) { + long long tm; + Debug::syscall(&tm, __NR_sigprocmask, "Executing handler"); + + // Access the signal mask by triggering a SEGV and modifying the signal state + // prior to calling rt_sigreturn(). + long res = -ENOSYS; + #if defined(__x86_64__) + #error x86-64 does not support sigprocmask(); use rt_sigprocmask() instead + #elif defined(__i386__) + asm volatile( + "push %%ebx\n" + "movl %2, %%ebx\n" + "int $0\n" + "pop %%ebx\n" + : "=a"(res) + : "0"(__NR_sigprocmask), "ri"((long)how), + "c"((long)set), "d"((long)old_set) + : "esp", "memory"); + #else + #error Unsupported target platform + #endif + + // Update our shadow signal mask, so that we can copy it upon creation of + // new threads. 
+ if (res == 0 && set != NULL) { + SecureMem::Args* args = getSecureMem(); + switch (how) { + case SIG_BLOCK: + *(unsigned long long *)&args->signalMask |= *(unsigned long long *)set; + break; + case SIG_UNBLOCK: + *(unsigned long long *)&args->signalMask &= ~*(unsigned long long *)set; + break; + case SIG_SETMASK: + *(unsigned long long *)&args->signalMask = *(unsigned long long *)set; + break; + default: + break; + } + } + + Debug::elapsed(tm, __NR_sigprocmask); + + return (int)res; +} +#endif + +#if defined(__NR_rt_sigprocmask) +int Sandbox::sandbox_rt_sigprocmask(int how, const void* set, void* old_set, + size_t bytes) { + long long tm; + Debug::syscall(&tm, __NR_rt_sigprocmask, "Executing handler"); + + // Access the signal mask by triggering a SEGV and modifying the signal state + // prior to calling rt_sigreturn(). + long res = -ENOSYS; + #if defined(__x86_64__) + asm volatile( + "movq %5, %%r10\n" + "int $0\n" + : "=a"(res) + : "0"(__NR_rt_sigprocmask), "D"((long)how), + "S"((long)set), "d"((long)old_set), "r"((long)bytes) + : "r10", "r11", "rcx", "memory"); + #elif defined(__i386__) + asm volatile( + "push %%ebx\n" + "movl %2, %%ebx\n" + "int $0\n" + "pop %%ebx\n" + : "=a"(res) + : "0"(__NR_rt_sigprocmask), "ri"((long)how), + "c"((long)set), "d"((long)old_set), "S"((long)bytes) + : "esp", "memory"); + #else + #error Unsupported target platform + #endif + + // Update our shadow signal mask, so that we can copy it upon creation of + // new threads. 
+ if (res == 0 && set != NULL && bytes >= 8) { + SecureMem::Args* args = getSecureMem(); + switch (how) { + case SIG_BLOCK: + *(unsigned long long *)&args->signalMask |= *(unsigned long long *)set; + break; + case SIG_UNBLOCK: + *(unsigned long long *)&args->signalMask &= ~*(unsigned long long *)set; + break; + case SIG_SETMASK: + *(unsigned long long *)&args->signalMask = *(unsigned long long *)set; + break; + default: + break; + } + } + + Debug::elapsed(tm, __NR_rt_sigprocmask); + + return (int)res; +} +#endif + +} // namespace diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc index 7f431a3..76e96e4 100644 --- a/sandbox/linux/seccomp/syscall.cc +++ b/sandbox/linux/seccomp/syscall.cc @@ -165,7 +165,7 @@ asm( // the time. There might be a repeated pattern of those. "cmp $78, %eax\n" // __NR_gettimeofday "jnz 2f\n" - "cmp %eax, %fs:0x102C-0x54\n" // last system call + "cmp %eax, %fs:0x102C-0x58\n" // last system call "jnz 0f\n" // This system call and the last system call prior to this one both are @@ -173,7 +173,7 @@ asm( // return the same result as in the previous call. // Just in case the caller is spinning on the result from gettimeofday(), // every so often, call the actual system call. - "decl %fs:0x1030-0x54\n" // countdown calls to gettimeofday() + "decl %fs:0x1030-0x58\n" // countdown calls to gettimeofday() "jz 0f\n" // Atomically read the 64bit word representing last-known timestamp and @@ -190,8 +190,8 @@ asm( // This is a call to gettimeofday(), but we don't have a valid cached // result, yet. - "0:mov %eax, %fs:0x102C-0x54\n" // remember syscall number - "movl $500, %fs:0x1030-0x54\n" // make system call, each 500 invocations + "0:mov %eax, %fs:0x102C-0x58\n" // remember syscall number + "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations "call playground$defaultSystemCallHandler\n" // Returned from gettimeofday(). 
Remember return value, in case the @@ -212,7 +212,7 @@ asm( // would still like to coalesce the gettimeofday() calls. "2:cmp $224, %eax\n" // __NR_gettid "jz 3f\n" - "mov %eax, %fs:0x102C-0x54\n" // remember syscall number + "mov %eax, %fs:0x102C-0x58\n" // remember syscall number // Retrieve function call from system call table (c.f. syscall_table.c). // We have three different types of entries; zero for denied system calls, diff --git a/sandbox/linux/seccomp/syscall_table.c b/sandbox/linux/seccomp/syscall_table.c index 2f66ca3..454ffa9 100644 --- a/sandbox/linux/seccomp/syscall_table.c +++ b/sandbox/linux/seccomp/syscall_table.c @@ -96,19 +96,31 @@ const struct SyscallTable syscallTable[] __attribute__(( #if defined(__NR_recvfrom) [ __NR_recvfrom ] = { (void*)&sandbox_recvfrom, process_recvfrom }, [ __NR_recvmsg ] = { (void*)&sandbox_recvmsg, process_recvmsg }, + #endif + #if defined(__NR_rt_sigprocmask) + [ __NR_rt_sigprocmask ] = { (void*)&sandbox_rt_sigprocmask, 0 }, + #endif + #if defined(__NR_sendmsg) [ __NR_sendmsg ] = { (void*)&sandbox_sendmsg, process_sendmsg }, [ __NR_sendto ] = { (void*)&sandbox_sendto, process_sendto }, #endif [ __NR_set_robust_list ] = { UNRESTRICTED_SYSCALL, 0 }, #if defined(__NR_setsockopt) [ __NR_setsockopt ] = { (void*)&sandbox_setsockopt,process_setsockopt }, + #endif #if defined(__NR_shmat) [ __NR_shmat ] = { (void*)&sandbox_shmat, process_shmat }, [ __NR_shmctl ] = { (void*)&sandbox_shmctl, process_shmctl }, [ __NR_shmdt ] = { (void*)&sandbox_shmdt, process_shmdt }, [ __NR_shmget ] = { (void*)&sandbox_shmget, process_shmget }, #endif + #if defined(__NR_shutdown) [ __NR_shutdown ] = { UNRESTRICTED_SYSCALL, 0 }, + #endif + #if defined(__NR_sigprocmask) + [ __NR_sigprocmask ] = { (void*)&sandbox_sigprocmask, 0 }, + #endif + #if defined(__NR_socketpair) [ __NR_socketpair ] = { UNRESTRICTED_SYSCALL, 0 }, #endif #if defined(__NR_socketcall) diff --git a/sandbox/linux/seccomp/trusted_process.cc 
b/sandbox/linux/seccomp/trusted_process.cc index 1320839..80adbf6 100644 --- a/sandbox/linux/seccomp/trusted_process.cc +++ b/sandbox/linux/seccomp/trusted_process.cc @@ -16,7 +16,7 @@ struct Thread { SecureMem::Args* mem; }; -SecureMem::Args* Sandbox::getSecureMem() { +SecureMem::Args* Sandbox::getNewSecureMem() { if (!secureMemPool_.empty()) { SecureMem::Args* rc = secureMemPool_.back(); secureMemPool_.pop_back(); diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc index c73091c..240e65f 100644 --- a/sandbox/linux/seccomp/trusted_thread.cc +++ b/sandbox/linux/seccomp/trusted_thread.cc @@ -21,6 +21,44 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov %0, %%rbp\n" // %rbp = args "xor %%rbx, %%rbx\n" // initial sequence number "lea 999f(%%rip), %%r15\n" // continue in same thread + + // Signal handlers are process-wide. This means that for security + // reasons, we cannot allow that the trusted thread ever executes any + // signal handlers. + // We prevent the execution of signal handlers by setting a signal + // mask that blocks all signals. In addition, we make sure that the + // stack pointer is invalid. + // We cannot reset the signal mask until after we have enabled + // Seccomp mode. Our sigprocmask() wrapper would normally do this by + // raising a signal, modifying the signal mask in the kernel-generated + // signal frame, and then calling sigreturn(). This presents a bit of + // a Catch-22, as all signals are masked and we can therefore not + // raise any signal that would allow us to generate the signal stack + // frame. + // Instead, we have to create the signal stack frame prior to entering + // Seccomp mode. This incidentally also helps us to restore the + // signal mask to the same value that it had prior to entering the + // sandbox. + // The signal wrapper for clone() is the second entry point into this + // code (by means of sending an IPC to its trusted thread). 
It goes + // through the same steps of creating a signal stack frame on the + // newly created thread's stacks prior to cloning. See clone.cc for + // details. + "mov $56+0xF000, %%eax\n" // __NR_clone + 0xF000 + "sub $8, %%rsp\n" + "mov %%rsp, %%rdx\n" // push a signal stack frame (see clone.cc) + "mov %%rsp, 0(%%rsp)\n" + "int $0\n" + "mov 0(%%rsp), %%r9\n" + "add $8, 0xA0(%%r9)\n" // pop stack upon call to sigreturn() + "mov $2, %%rdi\n" // how = SIG_SETMASK + "movq $-1, 0(%%rsp)\n" + "mov %%rsp, %%rsi\n" // set = full mask + "xor %%rdx, %%rdx\n" // old_set = NULL + "mov $8, %%r10\n" // mask all 64 signals + "mov $14, %%eax\n" // NR_rt_sigprocmask + "syscall\n" + "xor %%rsp, %%rsp\n" // invalidate the stack in all trusted code "jmp 20f\n" // create trusted thread // TODO(markus): Coalesce the read() operations by reading into a bigger @@ -36,42 +74,44 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // %rbx: sequence number for trusted calls // Temporary variables: - // %r9: system call number + // %r8: child stack + // %r9: system call number, child stack // %rbp: secure memory of previous thread // Layout of secure shared memory region (c.f. securemem.h): // 0x00: pointer to the secure shared memory region (i.e. 
self) // 0x08: sequence number; must match %rbx - // 0x10: system call number; passed to syscall in %rax - // 0x18: first argument; passed to syscall in %rdi - // 0x20: second argument; passed to syscall in %rsi - // 0x28: third argument; passed to syscall in %rdx - // 0x30: fourth argument; passed to syscall in %r10 - // 0x38: fifth argument; passed to syscall in %r8 - // 0x40: sixth argument; passed to syscall in %r9 - // 0x48: stored return address for clone() system call - // 0x50: stored %rbp value for clone() system call - // 0x58: stored %rbx value for clone() system call - // 0x60: stored %rcx value for clone() system call - // 0x68: stored %rdx value for clone() system call - // 0x70: stored %rsi value for clone() system call - // 0x78: stored %rdi value for clone() system call - // 0x80: stored %r8 value for clone() system call - // 0x88: stored %r9 value for clone() system call - // 0x90: stored %r10 value for clone() system call - // 0x98: stored %r11 value for clone() system call - // 0xA0: stored %r12 value for clone() system call - // 0xA8: stored %r13 value for clone() system call - // 0xB0: stored %r14 value for clone() system call - // 0xB8: stored %r15 value for clone() system call - // 0xC0: new shared memory for clone() - // 0xC8: processFdPub for talking to trusted process - // 0xCC: cloneFdPub for talking to trusted process - // 0xD0: set to non-zero, if in debugging mode - // 0xD4: most recent SHM id returned by shmget(IPC_PRIVATE) - // 0xD8: cookie assigned to us by the trusted process (TLS_COOKIE) - // 0xE0: thread id (TLS_TID) - // 0xE8: threadFdPub (TLS_THREAD_FD) + // 0x10: call type; must match %eax, iff %eax == -1 || %eax == -2 + // 0x18: system call number; passed to syscall in %rax + // 0x20: first argument; passed to syscall in %rdi + // 0x28: second argument; passed to syscall in %rsi + // 0x30: third argument; passed to syscall in %rdx + // 0x38: fourth argument; passed to syscall in %r10 + // 0x40: fifth argument; passed to 
syscall in %r8 + // 0x48: sixth argument; passed to syscall in %r9 + // 0x50: stored return address for clone() system call + // 0x58: stored %rbp value for clone() system call + // 0x60: stored %rbx value for clone() system call + // 0x68: stored %rcx value for clone() system call + // 0x70: stored %rdx value for clone() system call + // 0x78: stored %rsi value for clone() system call + // 0x80: stored %rdi value for clone() system call + // 0x88: stored %r8 value for clone() system call + // 0x90: stored %r9 value for clone() system call + // 0x98: stored %r10 value for clone() system call + // 0xA0: stored %r11 value for clone() system call + // 0xA8: stored %r12 value for clone() system call + // 0xB0: stored %r13 value for clone() system call + // 0xB8: stored %r14 value for clone() system call + // 0xC0: stored %r15 value for clone() system call + // 0xC8: new shared memory for clone() + // 0xD0: processFdPub for talking to trusted process + // 0xD4: cloneFdPub for talking to trusted process + // 0xD8: set to non-zero, if in debugging mode + // 0xDC: most recent SHM id returned by shmget(IPC_PRIVATE) + // 0xE0: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0xE8: thread id (TLS_TID) + // 0xF0: threadFdPub (TLS_THREAD_FD) // 0x200-0x1000: securely passed verified file name(s) // Layout of (untrusted) scratch space: @@ -89,6 +129,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // 0x48: last system call (not used on x86-64) // 0x4C: number of consecutive calls to a time fnc (not used on x86-64) // 0x50: nesting level of system calls (for debugging purposes only) + // 0x54: signal mask // We use the %fs register for accessing the secure read-only page, and // the untrusted scratch space immediately following it. 
The segment @@ -103,7 +144,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // read(threadFd, &scratch, 4) "1:xor %%rax, %%rax\n" // NR_read "mov %%r13, %%rdi\n" // fd = threadFd - "mov %%fs:0x0, %%rsi\n" + "mov %%fs:0x0, %%rsi\n" // secure_mem "add $0x1000, %%rsi\n" // buf = &scratch "mov $4, %%edx\n" // len = 4 "2:syscall\n" @@ -123,13 +164,15 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "jnz 5f\n" "3:cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process - "mov %%fs:0x10, %%rax\n" - "mov %%fs:0x18, %%rdi\n" - "mov %%fs:0x20, %%rsi\n" - "mov %%fs:0x28, %%rdx\n" - "mov %%fs:0x30, %%r10\n" - "mov %%fs:0x38, %%r8\n" - "mov %%fs:0x40, %%r9\n" + "cmp %%fs:0x10, %%eax\n" + "jne 25f\n" // exit process + "mov %%fs:0x18, %%rax\n" + "mov %%fs:0x20, %%rdi\n" + "mov %%fs:0x28, %%rsi\n" + "mov %%fs:0x30, %%rdx\n" + "mov %%fs:0x38, %%r10\n" + "mov %%fs:0x40, %%r8\n" + "mov %%fs:0x48, %%r9\n" "cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process "add $2, %%rbx\n" @@ -153,13 +196,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE "mov $10, %%eax\n" // NR_mprotect "syscall\n" - "mov %%r8d, 0xD4(%%rdi)\n" // set most recently returned SysV shm id + "mov %%r8d, 0xDC(%%rdi)\n" // set most recently returned SysV shm id "xor %%rdi, %%rdi\n" // When debugging messages are enabled, warn about expensive system calls #ifndef NDEBUG - "cmpw $0, %%fs:0xD0\n" // debug mode - "jz 26f\n" + "cmpw $0, %%fs:0xD8\n" // debug mode + "jz 27f\n" "mov $1, %%eax\n" // NR_write "mov $2, %%edi\n" // fd = stderr "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call" @@ -168,7 +211,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "xor %%rdi, %%rdi\n" #endif - "jmp 26f\n" // exit program, no message + "jmp 27f\n" // exit program, no message "4:syscall\n" "jmp 15f\n" // return result @@ -179,10 +222,12 @@ void Sandbox::createTrustedThread(int processFdPub, 
int cloneFdPub, "jnz 9f\n" "cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process + "cmp %%eax, %%fs:0x10\n" + "jne 25f\n" // exit process // When debugging messages are enabled, warn about expensive system calls #ifndef NDEBUG - "cmpw $0, %%fs:0xD0\n" // debug mode + "cmpw $0, %%fs:0xD8\n" // debug mode "jz 6f\n" "mov $1, %%eax\n" // NR_write "mov $2, %%edi\n" // fd = stderr @@ -192,13 +237,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "6:" #endif - "mov %%fs:0x10, %%rax\n" - "mov %%fs:0x18, %%rdi\n" - "mov %%fs:0x20, %%rsi\n" - "mov %%fs:0x28, %%rdx\n" - "mov %%fs:0x30, %%r10\n" - "mov %%fs:0x38, %%r8\n" - "mov %%fs:0x40, %%r9\n" + "mov %%fs:0x18, %%rax\n" + "mov %%fs:0x20, %%rdi\n" + "mov %%fs:0x28, %%rsi\n" + "mov %%fs:0x30, %%rdx\n" + "mov %%fs:0x38, %%r10\n" + "mov %%fs:0x40, %%r8\n" + "mov %%fs:0x48, %%r9\n" "cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process @@ -255,7 +300,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Check in syscallTable whether this system call is unrestricted "12:mov %%rax, %%r9\n" #ifndef NDEBUG - "cmpw $0, %%fs:0xD0\n" // debug mode + "cmpw $0, %%fs:0xD8\n" // debug mode "jnz 13f\n" #endif "cmp playground$maxSyscall(%%rip), %%eax\n" @@ -287,11 +332,11 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov 0x2C(%%r8), %%r9\n" "mov 0x24(%%r8), %%r8\n" "cmp $231, %%rax\n" // NR_exit_group - "jz 26f\n" // exit program, no message + "jz 27f\n" // exit program, no message "syscall\n" // Return result of system call to sandboxed thread - "15:mov %%fs:0x0, %%rsi\n" + "15:mov %%fs:0x0, %%rsi\n" // secure_mem "add $0x1034, %%rsi\n" // buf = &scratch + 52 "mov %%rax, (%%rsi)\n" "mov $8, %%edx\n" // len = 8 @@ -306,8 +351,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // NR_exit: // Exit trusted thread after cleaning up resources - "18:mov %%fs:0x0, %%rsi\n" - "mov 0xE8(%%rsi), %%rdi\n" // fd = threadFdPub + "18:mov %%fs:0x0, %%rsi\n" // 
secure_mem + "mov 0xF0(%%rsi), %%rdi\n" // fd = threadFdPub "mov $3, %%eax\n" // NR_close "syscall\n" "mov %%rsi, %%rdi\n" // start = secure_mem @@ -324,7 +369,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "syscall\n" "mov %%rax, %%rdi\n" "test %%rax, %%rax\n" - "js 26f\n" // exit process + "js 27f\n" // exit process "jne 21f\n" // reap helper, exit thread "jmp 22f\n" // unlock mutex @@ -334,12 +379,9 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // resources with the caller (i.e. the previous trusted thread), // and by extension it shares all resources with the sandbox'd // threads. - // N.B. It is possible to make the thread creation code crash before - // it releases seccomp privileges. This is generally OK, as it just - // terminates the program. But if we ever support signal handling, - // we have to be careful that the user cannot install a SIGSEGV - // handler that gets executed with elevated privileges. - "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem + "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem + "mov %%rsi, %%r15\n" // remember child stack + "mov $1, %%rsi\n" // stack = 1 "syscall\n" // calls NR_clone "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values "jae 7b\n" // unlock mutex, return result @@ -349,6 +391,23 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // In nascent thread, now. "sub $2, %%rbx\n" + + // We want to maintain an invalid %rsp whenever we access untrusted + // memory. This ensures that even if an attacker can trick us into + // triggering a SIGSEGV, we will never successfully execute a signal + // handler. + // Signal handlers are inherently dangerous, as an attacker could trick + // us into returning to the wrong address by adjusting the signal stack + // right before the handler returns. + // N.B. While POSIX is curiously silent about this, it appears that on + // Linux, alternate signal stacks are a per-thread property. 
That is + // good. It means that this security mechanism works, even if the + // sandboxed thread manages to set up an alternate signal stack. + // + // TODO(markus): We currently do not support emulating calls to + // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc + // for a discussion on how to fix this, if this ever becomes necessary. + "mov %%r15, %%r9\n" // %r9 = child_stack "xor %%r15, %%r15\n" // Request to return from clone() when done // Get thread id of nascent thread @@ -358,19 +417,19 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Nascent thread creates socketpair() for sending requests to // trusted thread. - // We can create the filehandles on the stack. Filehandles are + // We can create the filehandles on the child's stack. Filehandles are // always treated as untrusted. // socketpair(AF_UNIX, SOCK_STREAM, 0, fds) - "push %%r15\n" + "sub $0x10, %%r9\n" + "mov %%r15, 8(%%r9)\n" // preserve return address on child stack "mov $53, %%eax\n" // NR_socketpair "mov $1, %%edi\n" // domain = AF_UNIX "mov $1, %%esi\n" // type = SOCK_STREAM "xor %%rdx, %%rdx\n" // protocol = 0 - "sub $8, %%rsp\n" // sv = %rsp - "mov %%rsp, %%r10\n" + "mov %%r9, %%r10\n" // sv = child_stack "syscall\n" "test %%rax, %%rax\n" - "jz 27f\n" + "jz 28f\n" // If things went wrong, we don't have an (easy) way of signaling // the parent. 
For our purposes, it is sufficient to fail with a @@ -403,12 +462,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "lea 100f(%%rip), %%rsi\n" // "Sandbox violation detected" "mov $101f-100f, %%edx\n" // len = strlen(msg) "syscall\n" - "mov $1, %%edi\n" - "26:mov $231, %%eax\n" // NR_exit_group + "26:mov $1, %%edi\n" + "27:mov $231, %%eax\n" // NR_exit_group "jmp 24b\n" // The first page is mapped read-only for use as securely shared memory - "27:mov 0xC0(%%rbp), %%r12\n" // %r12 = secure shared memory + "28:mov 0xC8(%%rbp), %%r12\n" // %r12 = secure shared memory "cmp %%rbx, 8(%%rbp)\n" "jne 25b\n" // exit process "mov $10, %%eax\n" // NR_mprotect @@ -428,12 +487,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| // CLONE_SYSVSEM|CLONE_UNTRACED|CLONE_SETTLS, stack, NULL, NULL, // tls) - "mov 4(%%rsp), %%r13d\n" // %r13 = threadFd + "mov 4(%%r9), %%r13d\n" // %r13 = threadFd (on child's stack) "mov $56, %%eax\n" // NR_clone "mov $0x8D0F00, %%edi\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR|TLS "mov $1, %%rsi\n" // stack = 1 "mov %%r12, %%r8\n" // tls = new_secure_mem - "mov 0xC8(%%rbp), %%r15d\n" // %r15 = processFdPub + "mov 0xD0(%%rbp), %%r15d\n" // %r15 = processFdPub "cmp %%rbx, 8(%%rbp)\n" "jne 25b\n" // exit process "syscall\n" @@ -441,13 +500,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "js 25b\n" // exit process "jz 0b\n" // invoke trustedThreadFnc() + // Copy the caller's signal mask + "mov 0x1054(%%rbp), %%rax\n" + "mov %%rax, 0x1054(%%r12)\n" + // Done creating trusted thread. We can now get ready to return to caller - "mov 0(%%rsp), %%r9d\n" // %r9 = threadFdPub - "add $8, %%rsp\n" + "mov %%r9, %%r8\n" // %r8 = child_stack + "mov 0(%%r9), %%r9d\n" // %r9 = threadFdPub // Set up thread local storage with information on how to talk to // trusted thread and trusted process. 
- "lea 0xD8(%%r12), %%rsi\n" // args = &secure_mem.TLS; + "lea 0xE0(%%r12), %%rsi\n" // args = &secure_mem.TLS; "mov $158, %%eax\n" // NR_arch_prctl "mov $0x1001, %%edi\n" // option = ARCH_SET_GS "syscall\n" @@ -459,73 +522,121 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // the very top of this function, you will find that we push 999(%rip) // on the stack. That is the signal that we should return on the same // stack rather than return to where clone was called. - "pop %%r15\n" + "mov 8(%%r8), %%r15\n" + "add $0x10, %%r8\n" "test %%r15, %%r15\n" - "jne 28f\n" + "jne 29f\n" // Returning from clone() into the newly created thread is special. We // cannot unroll the stack, as we just set up a new stack for this // thread. We have to explicitly restore CPU registers to the values // that they had when the program originally called clone(). - "sub $0x80, %%rsp\n" // redzone compensation - "mov 0x48(%%rbp), %%rax\n" - "push %%rax\n" + // We patch the register values in the signal stack frame so that we + // can ask sigreturn() to restore all registers for us. 
+ "sub $0x8, %%r8\n" "mov 0x50(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x00(%%r8)\n" // return address + "xor %%rax, %%rax\n" + "mov %%rax, 0x98(%%r8)\n" // %rax = 0 "mov 0x58(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x80(%%r8)\n" // %rbp "mov 0x60(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x88(%%r8)\n" // %rbx "mov 0x68(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0xA0(%%r8)\n" // %rcx "mov 0x70(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x90(%%r8)\n" // %rdx "mov 0x78(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x78(%%r8)\n" // %rsi "mov 0x80(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x70(%%r8)\n" // %rdi "mov 0x88(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x30(%%r8)\n" // %r8 "mov 0x90(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x38(%%r8)\n" // %r9 "mov 0x98(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x40(%%r8)\n" // %r10 "mov 0xA0(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x48(%%r8)\n" // %r11 "mov 0xA8(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x50(%%r8)\n" // %r12 "mov 0xB0(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x58(%%r8)\n" // %r13 "mov 0xB8(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x60(%%r8)\n" // %r14 + "mov 0xC0(%%rbp), %%rax\n" + "mov %%rax, 0x68(%%r8)\n" // %r15 "cmp %%rbx, 8(%%rbp)\n" "jne 25b\n" // exit process // Nascent thread launches a helper that doesn't share any of our // resources, except for pages mapped as MAP_SHARED. - // clone(0, %rsp) - "28:mov $56, %%eax\n" // NR_clone + // clone(SIGCHLD, stack=1) + "29:mov $56, %%eax\n" // NR_clone "mov $17, %%rdi\n" // flags = SIGCHLD - "mov %%rsp, %%rsi\n" // stack = %rsp + "mov $1, %%rsi\n" // stack = 1 "syscall\n" "test %%rax, %%rax\n" "js 25b\n" // exit process - "jne 29f\n" + "jne 31f\n" // Use sendmsg() to send to the trusted process the file handles for // communicating with the new trusted thread. We also send the address // of the secure memory area (for sanity checks) and the thread id. 
- "mov 0xCC(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub() + "mov 0xD4(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub() "cmp %%rbx, 8(%%rbp)\n" "jne 25b\n" // exit process - "mov %%r9, %%rsi\n" // fd0 = threadFdPub - "mov %%r13, %%rdx\n" // fd1 = threadFd - "push %%r14\n" // threadId - "mov %%esi, 4(%%rsp)\n" // threadFdPub - "push %%r12\n" // secure_mem - "mov %%rsp, %%rcx\n" // buf = &data - "mov $16, %%r8\n" // len = sizeof(void*) + 2*sizeof(int) - "call playground$sendFd\n" + + // 0x00 msg: + // 0x00 msg_name ($0) + // 0x08 msg_namelen ($0) + // 0x10 msg_iov (%r8 + 0x44) + // 0x18 msg_iovlen ($1) + // 0x20 msg_control (%r8 + 0x54) + // 0x28 msg_controllen ($0x18) + // 0x30 data: + // 0x30 msg_flags/err ($0) + // 0x34 secure_mem (%r12) + // 0x3C threadId (%r14d) + // 0x40 threadFdPub (%r9d) + // 0x44 iov: + // 0x44 iov_base (%r8 + 0x30) + // 0x4C iov_len ($0x14) + // 0x54 cmsg: + // 0x54 cmsg_len ($0x18) + // 0x5C cmsg_level ($1, SOL_SOCKET) + // 0x60 cmsg_type ($1, SCM_RIGHTS) + // 0x64 threadFdPub (%r9d) + // 0x68 threadFd (%r13d) + // 0x6C + "sub $0x6C, %%r8\n" + "xor %%rdx, %%rdx\n" // flags = 0 + "mov %%rdx, 0x00(%%r8)\n" // msg_name + "mov %%edx, 0x08(%%r8)\n" // msg_namelen + "mov %%edx, 0x30(%%r8)\n" // msg_flags + "mov $1, %%r11d\n" + "mov %%r11, 0x18(%%r8)\n" // msg_iovlen + "mov %%r11d, 0x5C(%%r8)\n" // cmsg_level + "mov %%r11d, 0x60(%%r8)\n" // cmsg_type + "lea 0x30(%%r8), %%r11\n" + "mov %%r11, 0x44(%%r8)\n" // iov_base + "add $0x14, %%r11\n" + "mov %%r11, 0x10(%%r8)\n" // msg_iov + "add $0x10, %%r11\n" + "mov %%r11, 0x20(%%r8)\n" // msg_control + "mov $0x14, %%r11d\n" + "mov %%r11, 0x4C(%%r8)\n" // iov_len + "add $4, %%r11d\n" + "mov %%r11, 0x28(%%r8)\n" // msg_controllen + "mov %%r11, 0x54(%%r8)\n" // cmsg_len + "mov %%r12, 0x34(%%r8)\n" // secure_mem + "mov %%r14d, 0x3C(%%r8)\n" // threadId + "mov %%r9d, 0x40(%%r8)\n" // threadFdPub + "mov %%r9d, 0x64(%%r8)\n" // threadFdPub + "mov %%r13d, 0x68(%%r8)\n" // threadFd + "mov $46, 
%%eax\n" // NR_sendmsg + "mov %%r8, %%rsi\n" // msg + "syscall\n" // Release syscall_mutex_. This signals the trusted process that // it can write into the original thread's secure memory again. @@ -534,23 +645,29 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $4096, %%esi\n" "mov $3, %%edx\n" // PROT_READ | PROT_WRITE "syscall\n" + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process "lock; addl $0x80000000, (%%rdi)\n" - "jz 26b\n" // exit process (no error message) + "jz 30f\n" // exit process (no error message) "mov $1, %%edx\n" "mov %%rdx, %%rsi\n" // FUTEX_WAKE "mov $202, %%eax\n" // NR_futex "syscall\n" - "jmp 26b\n" // exit process (no error message) + "30:xor %%rdi, %%rdi\n" + "jmp 27b\n" // exit process (no error message) // Reap helper - "29:mov %%rax, %%rdi\n" - "30:xor %%rsi, %%rsi\n" + "31:mov %%rax, %%rdi\n" + "32:lea -4(%%r8), %%rsi\n" "xor %%rdx, %%rdx\n" "xor %%r10, %%r10\n" "mov $61, %%eax\n" // NR_wait4 "syscall\n" "cmp $-4, %%eax\n" // EINTR - "jz 30b\n" + "jz 32b\n" + "mov -4(%%r8), %%eax\n" + "test %%rax, %%rax\n" + "jnz 26b\n" // exit process (no error message) // Release privileges by entering seccomp mode. "mov $157, %%eax\n" // NR_prctl @@ -560,6 +677,10 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "test %%rax, %%rax\n" "jnz 25b\n" // exit process + // We can finally start using the stack. Signal handlers no longer pose + // a threat to us. + "mov %%r8, %%rsp\n" + // Back in the newly created sandboxed thread, wait for trusted process // to receive request. It is possible for an attacker to make us // continue even before the trusted process is done. This is OK. 
It'll @@ -569,10 +690,10 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $1, %%edx\n" // len = 1 "mov %%rsp, %%rsi\n" // buf = %rsp "mov %%r9, %%rdi\n" // fd = threadFdPub - "31:xor %%rax, %%rax\n" // NR_read + "33:xor %%rax, %%rax\n" // NR_read "syscall\n" "cmp $-4, %%rax\n" // EINTR - "jz 31b\n" + "jz 33b\n" "cmp %%rdx, %%rax\n" "jne 25b\n" // exit process "pop %%rax\n" @@ -580,27 +701,16 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Return to caller. We are in the new thread, now. "xor %%rax, %%rax\n" "test %%r15, %%r15\n" - - // Returning to createTrustedThread() - "jz 32f\n" - "jmp *%%r15\n" - - // Returning to the place where clone() had been called - "32:pop %%r15\n" - "pop %%r14\n" - "pop %%r13\n" - "pop %%r12\n" - "pop %%r11\n" - "pop %%r10\n" - "pop %%r9\n" - "pop %%r8\n" - "pop %%rdi\n" - "pop %%rsi\n" - "pop %%rdx\n" - "pop %%rcx\n" - "pop %%rbx\n" - "pop %%rbp\n" - "ret\n" + "jnz 34f\n" // Returning to createTrustedThread() + + // Returning to the place where clone() had been called. We rely on + // using rt_sigreturn() for restoring our registers. The caller already + // created a signal stack frame, and we patched the register values + // with the ones that were in effect prior to calling sandbox_clone(). + "pop %%r15\n" + "34:mov %%r15, 0xA8(%%rsp)\n" // compute new %rip + "mov $15, %%eax\n" // NR_rt_sigreturn + "syscall\n" ".pushsection \".rodata\"\n" "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" @@ -638,19 +748,60 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "movd %0, %%mm6\n" // %mm6 = args "lea 999f, %%ebx\n" // continue in same thread "movd %%ebx, %%mm3\n" - "xor %%ebx, %%ebx\n" // initial sequence number - "movd %%ebx, %%mm2\n" + "xor %%edi, %%edi\n" // initial sequence number + "movd %%edi, %%mm2\n" + + // Signal handlers are process-wide. 
This means that for security + // reasons, we cannot allow that the trusted thread ever executes any + // signal handlers. + // We prevent the execution of signal handlers by setting a signal + // mask that blocks all signals. In addition, we make sure that the + // stack pointer is invalid. + // We cannot reset the signal mask until after we have enabled + // Seccomp mode. Our sigprocmask() wrapper would normally do this by + // raising a signal, modifying the signal mask in the kernel-generated + // signal frame, and then calling sigreturn(). This presents a bit of + // a Catch-22, as all signals are masked and we can therefore not + // raise any signal that would allow us to generate the signal stack + // frame. + // Instead, we have to create the signal stack frame prior to entering + // Seccomp mode. This incidentally also helps us to restore the + // signal mask to the same value that it had prior to entering the + // sandbox. + // The signal wrapper for clone() is the second entry point into this + // code (by means of sending an IPC to its trusted thread). It goes + // through the same steps of creating a signal stack frame on the + // newly created thread's stacks prior to cloning. See clone.cc for + // details. 
+ "mov $120+0xF000, %%eax\n" // __NR_clone + 0xF000 + "sub $8, %%esp\n" + "mov %%esp, %%edx\n" // push a signal stack frame (see clone.cc) + "mov %%esp, 0(%%esp)\n" + "int $0\n" + "mov 0(%%esp), %%ebp\n" + "add $8, 0x1C(%%ebp)\n" // pop stack upon call to sigreturn() + "mov $2, %%ebx\n" // how = SIG_SETMASK + "movl $-1, 0(%%esp)\n" + "movl $-1, 4(%%esp)\n" + "mov %%esp, %%ecx\n" // set = full mask + "xor %%edx, %%edx\n" // old_set = NULL + "mov $8, %%esi\n" // mask all 64 signals + "mov $175, %%eax\n" // NR_rt_sigprocmask + "int $0x80\n" + "mov $126, %%eax\n" // NR_sigprocmask + "int $0x80\n" + "xor %%esp, %%esp\n" // invalidate the stack in all trusted code "jmp 20f\n" // create trusted thread // TODO(markus): Coalesce the read() operations by reading into a bigger // buffer. // Parameters: - // %mm5: secure memory region - // the page following this one contains the scratch space // %mm0: thread's side of threadFd // %mm1: processFdPub // %mm3: return address after creation of new trusted thread + // %mm5: secure memory region + // the page following this one contains the scratch space // Local variables: // %mm2: sequence number for trusted calls @@ -664,28 +815,29 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Layout of secure shared memory region (c.f. securemem.h): // 0x00: pointer to the secure shared memory region (i.e. 
self) // 0x04: sequence number; must match %mm2 - // 0x08: system call number; passed to syscall in %eax - // 0x0C: first argument; passed to syscall in %ebx - // 0x10: second argument; passed to syscall in %ecx - // 0x14: third argument; passed to syscall in %edx - // 0x18: fourth argument; passed to syscall in %esi - // 0x1C: fifth argument; passed to syscall in %edi - // 0x20: sixth argument; passed to syscall in %ebp - // 0x24: stored return address for clone() system call - // 0x28: stored %ebp value for clone() system call - // 0x2C: stored %edi value for clone() system call - // 0x30: stored %esi value for clone() system call - // 0x34: stored %edx value for clone() system call - // 0x38: stored %ecx value for clone() system call - // 0x3C: stored %ebx value for clone() system call - // 0x40: new shared memory for clone() - // 0x44: processFdPub for talking to trusted process - // 0x48: cloneFdPub for talking to trusted process - // 0x4C: set to non-zero, if in debugging mode - // 0x50: most recent SHM id returned by shmget(IPC_PRIVATE) - // 0x54: cookie assigned to us by the trusted process (TLS_COOKIE) - // 0x5C: thread id (TLS_TID) - // 0x64: threadFdPub (TLS_THREAD_FD) + // 0x08: call type; must match %eax, iff %eax == -1 || %eax == -2 + // 0x0C: system call number; passed to syscall in %eax + // 0x10: first argument; passed to syscall in %ebx + // 0x14: second argument; passed to syscall in %ecx + // 0x18: third argument; passed to syscall in %edx + // 0x1C: fourth argument; passed to syscall in %esi + // 0x20: fifth argument; passed to syscall in %edi + // 0x24: sixth argument; passed to syscall in %ebp + // 0x28: stored return address for clone() system call + // 0x2C: stored %ebp value for clone() system call + // 0x30: stored %edi value for clone() system call + // 0x34: stored %esi value for clone() system call + // 0x38: stored %edx value for clone() system call + // 0x3C: stored %ecx value for clone() system call + // 0x40: stored %ebx value for 
clone() system call + // 0x44: new shared memory for clone() + // 0x48: processFdPub for talking to trusted process + // 0x4C: cloneFdPub for talking to trusted process + // 0x50: set to non-zero, if in debugging mode + // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE) + // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0x60: thread id (TLS_TID) + // 0x68: threadFdPub (TLS_THREAD_FD) // 0x200-0x1000: securely passed verified file name(s) // Layout of (untrusted) scratch space: @@ -703,6 +855,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // 0x2C: last system call (updated in syscall.cc) // 0x30: number of consecutive calls to a time fnc. (e.g. gettimeofday) // 0x34: nesting level of system calls (for debugging purposes only) + // 0x38: signal mask "0:xor %%esp, %%esp\n" "mov $2, %%eax\n" // %mm2 = initial sequence number @@ -713,7 +866,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // read(threadFd, &scratch, 4) "1:mov $3, %%eax\n" // NR_read "movd %%mm0, %%ebx\n" // fd = threadFd - "movd %%mm5, %%ecx\n" + "movd %%mm5, %%ecx\n" // secure_mem "add $0x1000, %%ecx\n" // buf = &scratch "mov $4, %%edx\n" // len = 4 "2:int $0x80\n" @@ -734,13 +887,15 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "3:movd %%mm2, %%ebp\n" "cmp %%ebp, 0x4-0x1000(%%ecx)\n" "jne 25f\n" // exit process - "mov 0x08-0x1000(%%ecx), %%eax\n" - "mov 0x0C-0x1000(%%ecx), %%ebx\n" - "mov 0x14-0x1000(%%ecx), %%edx\n" - "mov 0x18-0x1000(%%ecx), %%esi\n" - "mov 0x1C-0x1000(%%ecx), %%edi\n" - "mov 0x20-0x1000(%%ecx), %%ebp\n" - "mov 0x10-0x1000(%%ecx), %%ecx\n" + "cmp 0x08-0x1000(%%ecx), %%eax\n" + "jne 25f\n" // exit process + "mov 0x0C-0x1000(%%ecx), %%eax\n" + "mov 0x10-0x1000(%%ecx), %%ebx\n" + "mov 0x18-0x1000(%%ecx), %%edx\n" + "mov 0x1C-0x1000(%%ecx), %%esi\n" + "mov 0x20-0x1000(%%ecx), %%edi\n" + "mov 0x24-0x1000(%%ecx), %%ebp\n" + "mov 0x14-0x1000(%%ecx), %%ecx\n" "movd %%edi, %%mm4\n" 
"movd %%ebp, %%mm7\n" "movd %%mm2, %%ebp\n" @@ -773,14 +928,14 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE "mov $125, %%eax\n" // NR_mprotect "int $0x80\n" - "mov %%ebp, 0x50(%%ebx)\n" // set most recently returned SysV shm id + "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id "xor %%ebx, %%ebx\n" // When debugging messages are enabled, warn about expensive system calls #ifndef NDEBUG "movd %%mm5, %%ecx\n" - "cmpw $0, 0x4C(%%ecx)\n" // debug mode - "jz 26f\n" + "cmpw $0, 0x50(%%ecx)\n" // debug mode + "jz 27f\n" "mov $4, %%eax\n" // NR_write "mov $2, %%ebx\n" // fd = stderr "lea 101f, %%ecx\n" // "This is an expensive system call" @@ -789,7 +944,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "xor %%ebx, %%ebx\n" #endif - "jmp 26f\n" // exit program, no message + "jmp 27f\n" // exit program, no message "4:int $0x80\n" "jmp 15f\n" // return result @@ -801,10 +956,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "movd %%mm2, %%ebp\n" "cmp %%ebp, 0x4-0x1000(%%ecx)\n" "jne 25f\n" // exit process + "cmp %%eax, 0x8-0x1000(%%ecx)\n" + "jne 25f\n" // exit process // When debugging messages are enabled, warn about expensive system calls #ifndef NDEBUG - "cmpw $0, 0x4C-0x1000(%%ecx)\n" + "cmpw $0, 0x50-0x1000(%%ecx)\n" "jz 6f\n" // debug mode "mov %%ecx, %%ebp\n" "mov $4, %%eax\n" // NR_write @@ -816,13 +973,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "6:" #endif - "mov 0x08-0x1000(%%ecx), %%eax\n" - "mov 0x0C-0x1000(%%ecx), %%ebx\n" - "mov 0x14-0x1000(%%ecx), %%edx\n" - "mov 0x18-0x1000(%%ecx), %%esi\n" - "mov 0x1C-0x1000(%%ecx), %%edi\n" - "mov 0x20-0x1000(%%ecx), %%ebp\n" - "mov 0x10-0x1000(%%ecx), %%ecx\n" + "mov 0x0C-0x1000(%%ecx), %%eax\n" + "mov 0x10-0x1000(%%ecx), %%ebx\n" + "mov 0x18-0x1000(%%ecx), %%edx\n" + "mov 0x1C-0x1000(%%ecx), %%esi\n" + "mov 0x20-0x1000(%%ecx), %%edi\n" + "mov 
0x24-0x1000(%%ecx), %%ebp\n" + "mov 0x14-0x1000(%%ecx), %%ecx\n" "movd %%edi, %%mm4\n" "movd %%ebp, %%mm7\n" "movd %%mm2, %%ebp\n" @@ -864,7 +1021,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $7, %%eax\n" // NR_waitpid "int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 6\n" + "jz 8b\n" "mov %%ebp, %%eax\n" "jmp 15f\n" // return result @@ -889,7 +1046,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Check in syscallTable whether this system call is unrestricted "12:mov %%eax, %%ebp\n" #ifndef NDEBUG - "cmpw $0, 0x4C-0x1000(%%ecx)\n" + "cmpw $0, 0x50-0x1000(%%ecx)\n" "jnz 13f\n" // debug mode #endif "cmp playground$maxSyscall, %%eax\n" @@ -919,11 +1076,11 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov 0x14(%%ecx), %%ebp\n" "mov 0x04(%%ecx), %%ecx\n" "cmp $252, %%eax\n" // NR_exit_group - "jz 26f\n" // exit program, no message + "jz 27f\n" // exit program, no message "int $0x80\n" // Return result of system call to sandboxed thread - "15:movd %%mm5, %%ecx\n" + "15:movd %%mm5, %%ecx\n" // secure_mem "add $0x101C, %%ecx\n" // buf = &scratch + 28 "mov %%eax, (%%ecx)\n" "mov $4, %%edx\n" // len = 4 @@ -938,8 +1095,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // NR_exit: // Exit trusted thread after cleaning up resources - "18:mov %%edi, %%ecx\n" - "mov 0x64(%%ecx), %%ebx\n" // fd = threadFdPub + "18:mov %%edi, %%ecx\n" // secure_mem + "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub "mov $6, %%eax\n" // NR_close "int $0x80\n" "mov %%ecx, %%ebx\n" // start = secure_mem @@ -966,14 +1123,10 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // resources with the caller (i.e. the previous trusted thread), // and by extension it shares all resources with the sandbox'd // threads. - // N.B. It is possible to make the thread creation code crash before - // it releases seccomp privileges. This is generally OK, as it just - // terminates the program. 
But if we ever support signal handling, - // we have to be careful that the user cannot install a SIGSEGV - // handler that gets executed with elevated privileges. - "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem - "movd %%mm4, %%edi\n" - "movd %%mm7, %%ebp\n" + "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem + "movd %%mm4, %%edi\n" // child_tidptr + "mov %%ecx, %%ebp\n" // remember child stack + "mov $1, %%ecx\n" // stack = 1 "int $0x80\n" // calls NR_clone "cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values "jae 7b\n" // unlock mutex, return result @@ -986,6 +1139,22 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // In nascent thread, now. "sub $2, %%edi\n" "movd %%edi, %%mm2\n" + + // We want to maintain an invalid %esp whenever we access untrusted + // memory. This ensures that even if an attacker can trick us into + // triggering a SIGSEGV, we will never successfully execute a signal + // handler. + // Signal handlers are inherently dangerous, as an attacker could trick + // us into returning to the wrong address by adjusting the signal stack + // right before the handler returns. + // N.B. While POSIX is curiously silent about this, it appears that on + // Linux, alternate signal stacks are a per-thread property. That is + // good. It means that this security mechanism works, even if the + // sandboxed thread manages to set up an alternate signal stack. + // + // TODO(markus): We currently do not support emulating calls to + // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc + // for a discussion on how to fix this, if this ever becomes necessary. "movd %%eax, %%mm3\n" // Request to return from clone() when done // Get thread id of nascent thread @@ -995,23 +1164,20 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Nascent thread creates socketpair() for sending requests to // trusted thread. - // We can create the filehandles on the stack. 
Filehandles are + // We can create the filehandles on the child's stack. Filehandles are // always treated as untrusted. // socketpair(AF_UNIX, SOCK_STREAM, 0, fds) "mov $102, %%eax\n" // NR_socketcall "mov $8, %%ebx\n" // socketpair - "sub $8, %%esp\n" // sv = %rsp - "push %%esp\n" - "xor %%ecx, %%ecx\n" // protocol = 0 - "push %%ecx\n" - "mov $1, %%ecx\n" // type = SOCK_STREAM - "push %%ecx\n" - "push %%ecx\n" // domain = AF_UNIX - "mov %%esp, %%ecx\n" + "sub $8, %%ebp\n" // sv = child_stack + "mov %%ebp, -0x04(%%ebp)\n" + "movl $0, -0x08(%%ebp)\n" // protocol = 0 + "movl $1, -0x0C(%%ebp)\n" // type = SOCK_STREAM + "movl $1, -0x10(%%ebp)\n" // domain = AF_UNIX + "lea -0x10(%%ebp), %%ecx\n" "int $0x80\n" - "add $0x10, %%esp\n" "test %%eax, %%eax\n" - "jz 27f\n" + "jz 28f\n" // If things went wrong, we don't have an (easy) way of signaling // the parent. For our purposes, it is sufficient to fail with a @@ -1043,19 +1209,18 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "lea 100f, %%ecx\n" // "Sandbox violation detected" "mov $101f-100f, %%edx\n" // len = strlen(msg) "int $0x80\n" - "mov $1, %%ebx\n" - "26:mov $252, %%eax\n" // NR_exit_group + "26:mov $1, %%ebx\n" + "27:mov $252, %%eax\n" // NR_exit_group "jmp 24b\n" // The first page is mapped read-only for use as securely shared memory - "27:movd %%mm6, %%ebp\n" - "mov 0x40(%%ebp), %%esi\n" - "movd %%esi, %%mm5\n" // %mm5 = secure shared memory - "movd %%mm2, %%edi\n" - "cmp %%edi, 4(%%ebp)\n" + "28:movd %%mm6, %%edi\n" // %edi = old_shared_mem + "mov 0x44(%%edi), %%ebx\n" // addr = secure_mem + "movd %%ebx, %%mm5\n" // %mm5 = secure_mem + "movd %%mm2, %%esi\n" + "cmp %%esi, 4(%%edi)\n" "jne 25b\n" // exit process "mov $125, %%eax\n" // NR_mprotect - "mov %%esi, %%ebx\n" "mov $4096, %%ecx\n" // len = 4096 "mov $1, %%edx\n" // prot = PROT_READ "int $0x80\n" @@ -1070,13 +1235,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Call clone() to create new trusted 
thread(). // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| // CLONE_SYSVSEM|CLONE_UNTRACED, stack, NULL, NULL, NULL) - "mov 4(%%esp), %%eax\n" + "mov 4(%%ebp), %%eax\n" // threadFd (on child's stack) "movd %%eax, %%mm0\n" // %mm0 = threadFd "mov $120, %%eax\n" // NR_clone "mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR "mov $1, %%ecx\n" // stack = 1 - "movd 0x44(%%ebp), %%mm1\n" // %mm1 = processFdPub - "cmp %%edi, 4(%%ebp)\n" + "movd 0x48(%%edi), %%mm1\n" // %mm1 = processFdPub + "cmp %%esi, 4(%%edi)\n" "jne 25b\n" // exit process "int $0x80\n" "test %%eax, %%eax\n" @@ -1085,86 +1250,146 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Set up thread local storage "mov $0x51, %%eax\n" // seg_32bit, limit_in_pages, useable - "push %%eax\n" + "mov %%eax, -0x04(%%ebp)\n" "mov $0xFFFFF, %%eax\n" // limit - "push %%eax\n" - "add $0x54, %%esi\n" - "push %%esi\n" // base_addr = &secure_mem.TLS + "mov %%eax, -0x08(%%ebp)\n" + "movd %%mm5, %%eax\n" + "add $0x58, %%eax\n" + "mov %%eax, -0x0C(%%ebp)\n" // base_addr = &secure_mem.TLS "mov %%fs, %%eax\n" "shr $3, %%eax\n" - "push %%eax\n" // entry_number + "mov %%eax, -0x10(%%ebp)\n" // entry_number "mov $243, %%eax\n" // NR_set_thread_area - "mov %%esp, %%ebx\n" + "lea -0x10(%%ebp), %%ebx\n" "int $0x80\n" "test %%eax, %%eax\n" "jnz 25b\n" // exit process - "add $16, %%esp\n" + + // Copy the caller's signal mask + "movd %%mm5, %%edx\n" + "mov 0x1038(%%edi), %%eax\n" + "mov %%eax, 0x1038(%%edx)\n" + "mov 0x103C(%%edi), %%eax\n" + "mov %%eax, 0x103C(%%edx)\n" // Done creating trusted thread. We can now get ready to return to caller - "mov 0(%%esp), %%esi\n" // %esi = threadFdPub - "add $8, %%esp\n" + "mov 0(%%ebp), %%esi\n" // %esi = threadFdPub + "add $8, %%ebp\n" // Check whether this is the initial thread, or a newly created one. // At startup we run the same code as when we create a new thread. 
At - // the very top of this function, you will find that we store 999(%rip) + // the very top of this function, you will find that we store 999f // in %%mm3. That is the signal that we should return on the same // stack rather than return to where clone was called. "movd %%mm3, %%eax\n" + "movd %%mm2, %%edx\n" "test %%eax, %%eax\n" - "jne 28f\n" + "jne 29f\n" // Returning from clone() into the newly created thread is special. We // cannot unroll the stack, as we just set up a new stack for this // thread. We have to explicitly restore CPU registers to the values // that they had when the program originally called clone(). - "mov 0x24(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x28(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x2C(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x30(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x34(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x38(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x3C(%%ebp), %%eax\n" - "push %%eax\n" - "cmp %%edi, 4(%%ebp)\n" + // We patch the register values in the signal stack frame so that we + // can ask sigreturn() to restore all registers for us. + "sub $0x4, %%ebp\n" + "mov 0x28(%%edi), %%eax\n" + "mov %%eax, 0x00(%%ebp)\n" // return address + "xor %%eax, %%eax\n" + "mov %%eax, 0x30(%%ebp)\n" // %eax = 0 + "mov 0x2C(%%edi), %%eax\n" + "mov %%eax, 0x1C(%%ebp)\n" // %ebp + "mov 0x30(%%edi), %%eax\n" + "mov %%eax, 0x14(%%ebp)\n" // %edi + "mov 0x34(%%edi), %%eax\n" + "mov %%eax, 0x18(%%ebp)\n" // %esi + "mov 0x38(%%edi), %%eax\n" + "mov %%eax, 0x28(%%ebp)\n" // %edx + "mov 0x3C(%%edi), %%eax\n" + "mov %%eax, 0x2C(%%ebp)\n" // %ecx + "mov 0x40(%%edi), %%eax\n" + "mov %%eax, 0x24(%%ebp)\n" // %ebx + "cmp %%edx, 4(%%edi)\n" "jne 25b\n" // exit process // Nascent thread launches a helper that doesn't share any of our // resources, except for pages mapped as MAP_SHARED. 
- // clone(0, %esp) - "28:mov $120, %%eax\n" // NR_clone + // clone(SIGCHLD, stack=1) + "29:mov $120, %%eax\n" // NR_clone "mov $17, %%ebx\n" // flags = SIGCHLD - "mov %%esp, %%ecx\n" // stack = %esp + "mov $1, %%ecx\n" // stack = 1 "int $0x80\n" "test %%eax, %%eax\n" "js 25b\n" // exit process - "jne 29f\n" + "jne 31f\n" // Use sendmsg() to send to the trusted process the file handles for // communicating with the new trusted thread. We also send the address // of the secure memory area (for sanity checks) and the thread id. - "push %%esi\n" // threadFdPub - "movd %%mm4, %%eax\n" // threadId - "push %%eax\n" - "movd %%mm5, %%eax\n" // secure_mem - "push %%eax\n" - "mov %%esp, %%ebx\n" // buf = &data - "mov $12, %%eax\n" // len = sizeof(void*) + 2*sizeof(int) - "push %%eax\n" - "push %%ebx\n" - "movd %%mm0, %%eax\n" // fd1 = threadFd - "push %%eax\n" - "push %%esi\n" // fd0 = threadFdPub - "mov 0x48(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub() - "cmp %%edi, 4(%%ebp)\n" + "cmp %%edx, 4(%%edi)\n" "jne 25b\n" // exit process - "push %%eax\n" - "call playground$sendFd\n" + + // 0x00 socketcall: + // 0x00 socket (0x4C(%edi)) + // 0x04 msg (%ecx + 0x0C) + // 0x08 flags ($0) + // 0x0C msg: + // 0x0C msg_name ($0) + // 0x10 msg_namelen ($0) + // 0x14 msg_iov (%ecx + 0x34) + // 0x18 msg_iovlen ($1) + // 0x1C msg_control (%ecx + 0x3C) + // 0x20 msg_controllen ($0x14) + // 0x24 data: + // 0x24 msg_flags/err ($0) + // 0x28 secure_mem (%mm5) + // 0x2C threadId (%mm4) + // 0x30 threadFdPub (%esi) + // 0x34 iov: + // 0x34 iov_base (%ecx + 0x24) + // 0x38 iov_len ($0x10) + // 0x3C cmsg: + // 0x3C cmsg_len ($0x14) + // 0x40 cmsg_level ($1, SOL_SOCKET) + // 0x44 cmsg_type ($1, SCM_RIGHTS) + // 0x48 threadFdPub (%esi) + // 0x4C threadFd (%mm0) + // 0x50 + "lea -0x50(%%ebp), %%ecx\n" + "xor %%eax, %%eax\n" + "mov %%eax, 0x08(%%ecx)\n" // flags + "mov %%eax, 0x0C(%%ecx)\n" // msg_name + "mov %%eax, 0x10(%%ecx)\n" // msg_namelen + "mov %%eax, 0x24(%%ecx)\n" // msg_flags + "inc 
%%eax\n" + "mov %%eax, 0x18(%%ecx)\n" // msg_iovlen + "mov %%eax, 0x40(%%ecx)\n" // cmsg_level + "mov %%eax, 0x44(%%ecx)\n" // cmsg_type + "movl $0x10, 0x38(%%ecx)\n" // iov_len + "mov $0x14, %%eax\n" + "mov %%eax, 0x20(%%ecx)\n" // msg_controllen + "mov %%eax, 0x3C(%%ecx)\n" // cmsg_len + "mov 0x4C(%%edi), %%eax\n" // cloneFdPub + "mov %%eax, 0x00(%%ecx)\n" // socket + "lea 0x0C(%%ecx), %%eax\n" + "mov %%eax, 0x04(%%ecx)\n" // msg + "add $0x18, %%eax\n" + "mov %%eax, 0x34(%%ecx)\n" // iov_base + "add $0x10, %%eax\n" + "mov %%eax, 0x14(%%ecx)\n" // msg_iov + "add $8, %%eax\n" + "mov %%eax, 0x1C(%%ecx)\n" // msg_control + "mov %%esi, 0x30(%%ecx)\n" // threadFdPub + "mov %%esi, 0x48(%%ecx)\n" // threadFdPub + "movd %%mm5, %%eax\n" + "mov %%eax, 0x28(%%ecx)\n" // secure_mem + "movd %%mm4, %%eax\n" + "mov %%eax, 0x2C(%%ecx)\n" // threadId + "movd %%mm0, %%eax\n" + "mov %%eax, 0x4C(%%ecx)\n" // threadFd + "mov $16, %%ebx\n" // sendmsg() + "mov $102, %%eax\n" // NR_socketcall + "int $0x80\n" // Release syscall_mutex_. This signals the trusted process that // it can write into the original thread's secure memory again. 
@@ -1173,31 +1398,42 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $4096, %%ecx\n" "mov $3, %%edx\n" // PROT_READ | PROT_WRITE "int $0x80\n" + "movd %%mm2, %%edx\n" + "cmp %%edx, 0x4(%%edi)\n" + "jnz 25b\n" // exit process "lock; addl $0x80000000, (%%ebx)\n" - "jz 26b\n" // exit process (no error message) + "jz 30f\n" // exit process (no error message) "mov $1, %%edx\n" "mov %%edx, %%ecx\n" // FUTEX_WAKE "mov $240, %%eax\n" // NR_futex "int $0x80\n" - "jmp 26b\n" // exit process (no error message) + "30:xor %%ebx, %%ebx\n" + "jmp 27b\n" // exit process (no error message) // Reap helper - "29:mov %%eax, %%ebx\n" - "30:xor %%ecx, %%ecx\n" + "31:mov %%eax, %%ebx\n" + "32:lea -4(%%ebp), %%ecx\n" "xor %%edx, %%edx\n" "mov $7, %%eax\n" // NR_waitpid "int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 30b\n" + "jz 32b\n" + "mov -4(%%ebp), %%eax\n" + "test %%eax, %%eax\n" + "jnz 26b\n" // exit process (no error message) // Release privileges by entering seccomp mode. - "mov $172, %%eax\n" // NR_prctl + "33:mov $172, %%eax\n" // NR_prctl "mov $22, %%ebx\n" // PR_SET_SECCOMP "mov $1, %%ecx\n" "int $0x80\n" "test %%eax, %%eax\n" "jnz 25b\n" // exit process + // We can finally start using the stack. Signal handlers no longer pose + // a threat to us. + "mov %%ebp, %%esp\n" + // Back in the newly created sandboxed thread, wait for trusted process // to receive request. It is possible for an attacker to make us // continue even before the trusted process is done. This is OK. It'll @@ -1205,12 +1441,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // data is considered untrusted anyway. 
"push %%eax\n" "mov $1, %%edx\n" // len = 1 - "mov %%esp, %%ecx\n" // buf = %rsp + "mov %%esp, %%ecx\n" // buf = %esp "mov %%esi, %%ebx\n" // fd = threadFdPub - "31:mov $3, %%eax\n" // NR_read + "34:mov $3, %%eax\n" // NR_read "int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 31b\n" + "jz 34b\n" "cmp %%edx, %%eax\n" "jne 25b\n" // exit process "pop %%eax\n" @@ -1223,19 +1459,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // operations. "emms\n" - // Returning to createTrustedThread() "test %%ebx, %%ebx\n" - "jz 32f\n" - "jmp *%%ebx\n" - - // Returning to the place where clone() had been called - "32:pop %%ebx\n" - "pop %%ecx\n" - "pop %%edx\n" - "pop %%esi\n" - "pop %%edi\n" - "pop %%ebp\n" - "ret\n" + "jnz 35f\n" // Returning to createTrustedThread() + + // Returning to the place where clone() had been called. We rely on + // using sigreturn() for restoring our registers. The caller already + // created a signal stack frame, and we patched the register values + // with the ones that were in effect prior to calling sandbox_clone(). + "pop %%ebx\n" + "35:mov %%ebx, 0x38(%%esp)\n" // compute new %eip + "mov $119, %%eax\n" // NR_sigreturn + "int $0x80\n" ".pushsection \".rodata\"\n" "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" |