diff options
Diffstat (limited to 'sandbox')
-rw-r--r-- | sandbox/linux/seccomp/clone.cc | 92 | ||||
-rw-r--r-- | sandbox/linux/seccomp/debug.cc | 18 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox.cc | 154 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox_impl.h | 136 | ||||
-rw-r--r-- | sandbox/linux/seccomp/securemem.cc | 18 | ||||
-rw-r--r-- | sandbox/linux/seccomp/securemem.h | 7 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sigprocmask.cc | 120 | ||||
-rw-r--r-- | sandbox/linux/seccomp/syscall.cc | 10 | ||||
-rw-r--r-- | sandbox/linux/seccomp/syscall_table.c | 12 | ||||
-rw-r--r-- | sandbox/linux/seccomp/trusted_process.cc | 2 | ||||
-rw-r--r-- | sandbox/linux/seccomp/trusted_thread.cc | 804 | ||||
-rw-r--r-- | sandbox/sandbox.gyp | 1 |
12 files changed, 965 insertions, 409 deletions
diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc index 148bae5..0bf91c1 100644 --- a/sandbox/linux/seccomp/clone.cc +++ b/sandbox/linux/seccomp/clone.cc @@ -7,7 +7,7 @@ namespace playground { -int Sandbox::sandbox_clone(int flags, void* stack, int* pid, int* ctid, +int Sandbox::sandbox_clone(int flags, char* stack, int* pid, int* ctid, void* tls, void *wrapper_sp) { long long tm; Debug::syscall(&tm, __NR_clone, "Executing handler"); @@ -24,25 +24,77 @@ int Sandbox::sandbox_clone(int flags, void* stack, int* pid, int* ctid, request.clone_req.ctid = ctid; request.clone_req.tls = tls; - // Pass along the address on the stack where syscallWrapper() stored the - // original CPU registers. These registers will be restored in the newly - // created thread prior to returning from the wrapped system call. - #if defined(__x86_64__) - memcpy(&request.clone_req.regs64, wrapper_sp, - sizeof(request.clone_req.regs64) + sizeof(void *)); - #elif defined(__i386__) - memcpy(&request.clone_req.regs32, wrapper_sp, - sizeof(request.clone_req.regs32) + sizeof(void *)); - #else - #error Unsupported target platform - #endif - + // TODO(markus): Passing stack == 0 currently does not do the same thing + // that the kernel would do without the sandbox. This is just going to + // cause a crash. We should detect this case, and replace the stack pointer + // with the correct value, instead. + // This is complicated by the fact that we will temporarily be executing + // both threads from the same stack. Some synchronization will be necessary. + // Fortunately, this complication also explains why hardly anybody ever + // does this. + // See trusted_thread.cc for more information. 
long rc; - SysCalls sys; - if (write(sys, processFdPub(), &request, sizeof(request)) != - sizeof(request) || - read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { - die("Failed to forward clone() request [sandbox]"); + if (stack == 0) { + rc = -EINVAL; + } else { + // Pass along the address on the stack where syscallWrapper() stored the + // original CPU registers. These registers will be restored in the newly + // created thread prior to returning from the wrapped system call. + #if defined(__x86_64__) + memcpy(&request.clone_req.regs64, wrapper_sp, + sizeof(request.clone_req.regs64) + sizeof(void *)); + #elif defined(__i386__) + memcpy(&request.clone_req.regs32, wrapper_sp, + sizeof(request.clone_req.regs32) + sizeof(void *)); + #else + #error Unsupported target platform + #endif + + // In order to unblock the signal mask in the newly created thread and + // after entering Seccomp mode, we have to call sigreturn(). But that + // requires access to a proper stack frame describing a valid signal. + // We trigger a signal now and make sure the stack frame ends up on the + // new stack. Our segv() handler (in sandbox.cc) does that for us. + // See trusted_thread.cc for more details on how threads get created. + // + // In general we rely on the kernel for generating the signal stack + // frame, as the exact binary format has been extended several times over + // the course of the kernel's development. Fortunately, the kernel + // developers treat the initial part of the stack frame as a stable part + // of the ABI. So, we can rely on fixed, well-defined offsets for accessing + // register values and for accessing the signal mask. + #if defined(__x86_64__) || defined(__i386__) + #if defined(__x86_64__) + // Red zone compensation. The instrumented system call will remove 128 + // bytes from the thread's stack prior to returning to the original + // call site. 
+ stack -= 128; + request.clone_req.stack = stack; + #endif + asm("int $0" + : "=m"(request.clone_req.stack) + : "a"(__NR_clone + 0xF000), "d"(&request.clone_req.stack) + : "memory"); + #else + #error Unsupported target platform + #endif + + // Adjust the signal stack frame so that it contains the correct stack + // pointer upon returning from sigreturn(). + #if defined(__x86_64__) + *(char **)(request.clone_req.stack + 0xA0) = stack; + #elif defined(__i386__) + *(char **)(request.clone_req.stack + 0x1C) = stack; + #else + #error Unsupported target platform + #endif + + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward clone() request [sandbox]"); + } } Debug::elapsed(tm, __NR_clone); return static_cast<int>(rc); @@ -64,7 +116,7 @@ bool Sandbox::process_clone(int parentMapsFd, int sandboxFd, int threadFdPub, SecureMem::abandonSystemCall(threadFd, -EPERM); return false; } else { - SecureMem::Args* newMem = getSecureMem(); + SecureMem::Args* newMem = getNewSecureMem(); if (!newMem) { SecureMem::abandonSystemCall(threadFd, -ENOMEM); return false; diff --git a/sandbox/linux/seccomp/debug.cc b/sandbox/linux/seccomp/debug.cc index e4d6410..5d6de49 100644 --- a/sandbox/linux/seccomp/debug.cc +++ b/sandbox/linux/seccomp/debug.cc @@ -140,8 +140,8 @@ bool Debug::enter() { asm volatile("mov %%gs, %0\n" "test %0, %0\n" "jz 1f\n" - "movl %%gs:0x1050-0xD8, %0\n" - "incl %%gs:0x1050-0xD8\n" + "movl %%gs:0x1050-0xE0, %0\n" + "incl %%gs:0x1050-0xE0\n" "1:\n" : "=r"(level) : @@ -150,8 +150,8 @@ bool Debug::enter() { asm volatile("mov %%fs, %0\n" "test %0, %0\n" "jz 1f\n" - "movl %%fs:0x1034-0x54, %0\n" - "incl %%fs:0x1034-0x54\n" + "movl %%fs:0x1034-0x58, %0\n" + "incl %%fs:0x1034-0x58\n" "1:\n" : "=r"(level) : @@ -178,8 +178,8 @@ bool Debug::leave() { asm volatile("mov %%gs, %0\n" "test %0, %0\n" "jz 1f\n" - "decl %%gs:0x1050-0xD8\n" - "movl 
%%gs:0x1050-0xD8, %0\n" + "decl %%gs:0x1050-0xE0\n" + "movl %%gs:0x1050-0xE0, %0\n" "1:\n" : "=r"(level) : @@ -188,8 +188,8 @@ bool Debug::leave() { asm volatile("mov %%fs, %0\n" "test %0, %0\n" "jz 1f\n" - "decl %%fs:0x1034-0x54\n" - "movl %%fs:0x1034-0x54, %0\n" + "decl %%fs:0x1034-0x58\n" + "movl %%fs:0x1034-0x58, %0\n" "1:\n" : "=r"(level) : @@ -234,7 +234,7 @@ void Debug::gettimeofday(long long* tm) { // Zero out the lastSyscallNum, so that we don't try to coalesce // calls to gettimeofday(). For debugging purposes, we need the // exact time. - asm volatile("movl $0, %fs:0x102C-0x54"); + asm volatile("movl $0, %fs:0x102C-0x58"); #elif !defined(__x86_64__) #error Unsupported target platform #endif diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc index 12f0c0f..b7a249e 100644 --- a/sandbox/linux/seccomp/sandbox.cc +++ b/sandbox/linux/seccomp/sandbox.cc @@ -112,25 +112,23 @@ bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) { } void Sandbox::setupSignalHandlers() { + // Set SIGCHLD to SIG_DFL so that waitpid() can work SysCalls sys; struct SysCalls::kernel_sigaction sa; memset(&sa, 0, sizeof(sa)); sa.sa_handler_ = SIG_DFL; sys.sigaction(SIGCHLD, &sa, NULL); - // Set up SEGV handler for dealing with RDTSC instructions + // Set up SEGV handler for dealing with RDTSC instructions, system calls + // that have been rewritten to use INT0, and for sigpending() emulation. sa.sa_handler_ = segv(); sys.sigaction(SIGSEGV, &sa, NULL); - // Block all asynchronous signals, except for SIGCHLD which needs to be - // set to SIG_DFL for waitpid() to work. 
+ // Unblock SIGSEGV and SIGCHLD SysCalls::kernel_sigset_t mask; - memset(&mask, 0xFF, sizeof(mask)); - mask.sig[0] &= ~((1 << (SIGSEGV - 1)) | (1 << (SIGINT - 1)) | - (1 << (SIGTERM - 1)) | (1 << (SIGQUIT - 1)) | - (1 << (SIGHUP - 1)) | (1 << (SIGABRT - 1)) | - (1 << (SIGCHLD - 1))); - sys.sigprocmask(SIG_SETMASK, &mask, 0); + memset(&mask, 0x00, sizeof(mask)); + mask.sig[0] |= (1 << (SIGSEGV - 1)) | (1 << (SIGCHLD - 1)); + sys.sigprocmask(SIG_UNBLOCK, &mask, 0); } void (*Sandbox::segv())(int signo) { @@ -158,7 +156,7 @@ void (*Sandbox::segv())(int signo) { "sub $4, %%rsp\n" "push %%r14\n" "mov %%gs:16, %%edi\n" // fd = threadFdPub - "mov %%rsp, %%rsi\n" // buf = %esp + "mov %%rsp, %%rsi\n" // buf = %rsp "mov $4, %%edx\n" // len = sizeof(int) "1:mov $1, %%eax\n" // NR_write "syscall\n" @@ -199,8 +197,8 @@ void (*Sandbox::segv())(int signo) { // of playground::Library being unable to find a way to safely // rewrite the system call instruction. Retrieve the CPU register // at the time of the segmentation fault and invoke syscallWrapper(). 
- "8:cmpw $0xCD, (%%r15)\n" // INT $0x0 - "jnz 9f\n" + "8:cmpw $0x00CD, (%%r15)\n" // INT $0x0 + "jnz 14f\n" #ifndef NDEBUG "lea 200f(%%rip), %%rdi\n" "call playground$debugMessage\n" @@ -212,7 +210,53 @@ void (*Sandbox::segv())(int signo) { "mov 0x40(%%rsp), %%r10\n" // %r10 at time of segmentation fault "mov 0x30(%%rsp), %%r8\n" // %r8 at time of segmentation fault "mov 0x38(%%rsp), %%r9\n" // %r9 at time of segmentation fault - "lea 7b(%%rip), %%rcx\n" + + // Handle rt_sigprocmask() + "cmp $14, %%rax\n" // NR_rt_sigprocmask + "jnz 12f\n" + "mov $-22, %%rax\n" // -EINVAL + "cmp $8, %%r10\n" // %r10 = sigsetsize (8 bytes = 64 signals) + "jl 7b\n" + "mov 0x130(%%rsp), %%r10\n" // signal mask at time of segmentation fault + "test %%rsi, %%rsi\n" // only set mask, if set is non-NULL + "jz 11f\n" + "mov 0(%%rsi), %%rsi\n" + "cmp $0, %%rdi\n" // %rdi = how (SIG_BLOCK) + "jnz 9f\n" + "or %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault + "jmp 11f\n" + "9:cmp $1, %%rdi\n" // %rdi = how (SIG_UNBLOCK) + "jnz 10f\n" + "xor $-1, %%rsi\n" + "and %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault + "jmp 11f\n" + "10:cmp $2, %%rdi\n" // %rdi = how (SIG_SETMASK) + "jnz 7b\n" + "mov %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault + "11:xor %%rax, %%rax\n" + "test %%rdx, %%rdx\n" // only return old mask, if set is non-NULL + "jz 7b\n" + "mov %%r10, 0(%%rdx)\n" // old_set + "jmp 7b\n" + + + // Copy signal frame onto new stack. 
See clone.cc for details + "12:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000 + "jnz 13f\n" + "mov 0xA8(%%rsp), %%rcx\n" // %rsp at time of segmentation fault + "sub %%rsp, %%rcx\n" // %rcx = size of stack frame + "sub $8, %%rcx\n" // skip return address + "mov %%rcx, %%rax\n" // return size of signal stack frame + "mov 0(%%rdx), %%rdi\n" // stack for newly clone()'d thread + "sub %%rcx, %%rdi\n" // copy onto new stack + "mov %%rdi, 0(%%rdx)\n" // allocate space on new stack + "lea 8(%%rsp), %%rsi\n" // copy from current stack + "cld\n" + "rep movsb\n" + "jmp 7b\n" + + // Forward system call to syscallWrapper() + "13:lea 7b(%%rip), %%rcx\n" "push %%rcx\n" "push 0xB8(%%rsp)\n" // %rip at time of segmentation fault "lea playground$syscallWrapper(%%rip), %%rcx\n" @@ -221,7 +265,7 @@ void (*Sandbox::segv())(int signo) { // This was a genuine segmentation fault. Trigger the kernel's default // signal disposition. The only way we can do this from seccomp mode // is by blocking the signal and retriggering it. - "9:mov $2, %%edi\n" // stderr + "14:mov $2, %%edi\n" // stderr "lea 300f(%%rip), %%rsi\n" // "Segmentation fault\n" "mov $301f-300f, %%edx\n" "mov $1, %%eax\n" // NR_write @@ -293,8 +337,8 @@ void (*Sandbox::segv())(int signo) { // of playground::Library being unable to find a way to safely // rewrite the system call instruction. Retrieve the CPU register // at the time of the segmentation fault and invoke syscallWrapper(). 
- "8:cmpw $0xCD, (%%ebp)\n" // INT $0x0 - "jnz 9f\n" + "8:cmpw $0x00CD, (%%ebp)\n" // INT $0x0 + "jnz 16f\n" #ifndef NDEBUG "lea 200f, %%eax\n" "push %%eax\n" @@ -308,13 +352,69 @@ void (*Sandbox::segv())(int signo) { "mov 0x1C(%%esp), %%esi\n" // %esi at time of segmentation fault "mov 0x18(%%esp), %%edi\n" // %edi at time of segmentation fault "mov 0x20(%%esp), %%ebp\n" // %ebp at time of segmentation fault - "call playground$syscallWrapper\n" + + // Handle sigprocmask() and rt_sigprocmask() + "cmp $175, %%eax\n" // NR_rt_sigprocmask + "jnz 9f\n" + "mov $-22, %%eax\n" // -EINVAL + "cmp $8, %%esi\n" // %esi = sigsetsize (8 bytes = 64 signals) + "jl 7b\n" + "jmp 10f\n" + "9:cmp $126, %%eax\n" // NR_sigprocmask + "jnz 14f\n" + "mov $-22, %%eax\n" + "10:mov 0x58(%%esp), %%edi\n" // signal mask at time of segmentation fault + "mov 0x5C(%%esp), %%ebp\n" + "test %%ecx, %%ecx\n" // only set mask, if set is non-NULL + "jz 13f\n" + "mov 0(%%ecx), %%esi\n" + "mov 4(%%ecx), %%ecx\n" + "cmp $0, %%ebx\n" // %ebx = how (SIG_BLOCK) + "jnz 11f\n" + "or %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault + "or %%ecx, 0x5C(%%esp)\n" + "jmp 13f\n" + "11:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK) + "jnz 12f\n" + "xor $-1, %%esi\n" + "xor $-1, %%ecx\n" + "and %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault + "and %%ecx, 0x5C(%%esp)\n" + "jmp 13f\n" + "12:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK) + "jnz 7b\n" + "mov %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault + "mov %%ecx, 0x5C(%%esp)\n" + "13:xor %%eax, %%eax\n" + "test %%edx, %%edx\n" // only return old mask, if set is non-NULL + "jz 7b\n" + "mov %%edi, 0(%%edx)\n" // old_set + "mov %%ebp, 4(%%edx)\n" + "jmp 7b\n" + + // Copy signal frame onto new stack. 
See clone.cc for details + "14:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000 + "jnz 15f\n" + "mov 0x24(%%esp), %%ecx\n" // %esp at time of segmentation fault + "sub %%esp, %%ecx\n" // %ecx = size of stack frame + "sub $8, %%ecx\n" // skip return address and dummy + "mov %%ecx, %%eax\n" // return size of signal stack frame + "mov 0(%%edx), %%edi\n" // stack for newly clone()'d thread + "sub %%ecx, %%edi\n" // copy onto new stack + "mov %%edi, 0(%%edx)\n" // allocate space on new stack + "lea 8(%%esp), %%esi\n" // copy from current stack + "cld\n" + "rep movsb\n" + "jmp 7b\n" + + // Forward system call to syscallWrapper() + "15:call playground$syscallWrapper\n" "jmp 7b\n" // This was a genuine segmentation fault. Trigger the kernel's default // signal disposition. The only way we can do this from seccomp mode // is by blocking the signal and retriggering it. - "9:mov $2, %%ebx\n" // stderr + "16:mov $2, %%ebx\n" // stderr "lea 300f, %%ecx\n" // "Segmentation fault\n" "mov $301f-300f, %%edx\n" "mov $4, %%eax\n" // NR_write @@ -345,6 +445,24 @@ void (*Sandbox::segv())(int signo) { return fnc; } +SecureMem::Args* Sandbox::getSecureMem() { + // Check trusted_thread.cc for the magic offset that gets us from the TLS + // to the beginning of the secure memory area. 
+ SecureMem::Args* ret; +#if defined(__x86_64__) + asm volatile( + "movq %%gs:-0xE0, %0\n" + : "=q"(ret)); +#elif defined(__i386__) + asm volatile( + "movl %%fs:-0x58, %0\n" + : "=r"(ret)); +#else +#error Unsupported target platform +#endif + return ret; +} + void Sandbox::snapshotMemoryMappings(int processFd, int proc_self_maps) { SysCalls sys; if (sys.lseek(proc_self_maps, 0, SEEK_SET) || diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h index 18a359c..36f01c8 100644 --- a/sandbox/linux/seccomp/sandbox_impl.h +++ b/sandbox/linux/seccomp/sandbox_impl.h @@ -56,7 +56,7 @@ class Sandbox { // "proc_fd" should be a file descriptor for "/proc", or -1 if not provided // by the caller. static int supportsSeccompSandbox(int proc_fd) - asm("SupportsSeccompSandbox"); + asm("SupportsSeccompSandbox"); // The sandbox needs to be able to access "/proc/self/maps". If this file // is not accessible when "startSandbox()" gets called, the caller can @@ -64,12 +64,12 @@ class Sandbox { // The sandbox becomes the newer owner of this file descriptor and will // eventually close it when "startSandbox()" executes. static void setProcSelfMaps(int proc_self_maps) - asm("SeccompSandboxSetProcSelfMaps"); + asm("SeccompSandboxSetProcSelfMaps"); // This is the main public entry point. It finds all system calls that // need rewriting, sets up the resources needed by the sandbox, and // enters Seccomp mode. - static void startSandbox() asm("StartSeccompSandbox"); + static void startSandbox() asm("StartSeccompSandbox"); private: // syscall_table.c has to be implemented in C, as C++ does not support @@ -84,7 +84,7 @@ class Sandbox { // Clone() is special as it has a wrapper in syscall_table.c. The wrapper // adds one extra argument (the pointer to the saved registers) and then // calls playground$sandbox__clone(). 
- static int sandbox_clone(int flags, void* stack, int* pid, int* ctid, + static int sandbox_clone(int flags, char* stack, int* pid, int* ctid, void* tls, void* wrapper_sp) asm("playground$sandbox__clone") #if defined(__x86_64__) @@ -96,130 +96,142 @@ class Sandbox { #define bool int #define SecureMemArgs void // This is the wrapper entry point that is found in the syscall_table. - int sandbox_clone(int flags, void* stack, int* pid, int* ctid, void* tls) - asm("playground$sandbox_clone"); + int sandbox_clone(int flags, char* stack, int* pid, int* ctid, void* tls) + asm("playground$sandbox_clone"); #endif // Entry points for sandboxed code that is attempting to make system calls STATIC int sandbox_access(const char*, int) - asm("playground$sandbox_access"); - STATIC int sandbox_exit(int status) asm("playground$sandbox_exit"); - STATIC int sandbox_getpid() asm("playground$sandbox_getpid"); + asm("playground$sandbox_access"); + STATIC int sandbox_exit(int status) asm("playground$sandbox_exit"); + STATIC int sandbox_getpid() asm("playground$sandbox_getpid"); #if defined(__NR_getsockopt) STATIC int sandbox_getsockopt(int, int, int, void*, socklen_t*) - asm("playground$sandbox_getsockopt"); + asm("playground$sandbox_getsockopt"); #endif - STATIC int sandbox_gettid() asm("playground$sandbox_gettid"); + STATIC int sandbox_gettid() asm("playground$sandbox_gettid"); STATIC int sandbox_ioctl(int d, int req, void* arg) - asm("playground$sandbox_ioctl"); + asm("playground$sandbox_ioctl"); #if defined(__NR_ipc) STATIC int sandbox_ipc(unsigned, int, int, int, void*, long) - asm("playground$sandbox_ipc"); + asm("playground$sandbox_ipc"); #endif STATIC int sandbox_lstat(const char* path, void* buf) - asm("playground$sandbox_lstat"); + asm("playground$sandbox_lstat"); #if defined(__NR_lstat64) STATIC int sandbox_lstat64(const char *path, void* b) - asm("playground$sandbox_lstat64"); + asm("playground$sandbox_lstat64"); #endif STATIC int sandbox_madvise(void*, size_t, int) - 
asm("playground$sandbox_madvise"); + asm("playground$sandbox_madvise"); STATIC void *sandbox_mmap(void* start, size_t length, int prot, int flags, int fd, off_t offset) - asm("playground$sandbox_mmap"); + asm("playground$sandbox_mmap"); STATIC int sandbox_mprotect(const void*, size_t, int) - asm("playground$sandbox_mprotect"); + asm("playground$sandbox_mprotect"); STATIC int sandbox_munmap(void* start, size_t length) - asm("playground$sandbox_munmap"); + asm("playground$sandbox_munmap"); STATIC int sandbox_open(const char*, int, mode_t) - asm("playground$sandbox_open"); + asm("playground$sandbox_open"); #if defined(__NR_recvfrom) STATIC ssize_t sandbox_recvfrom(int, void*, size_t, int, void*, socklen_t*) - asm("playground$sandbox_recvfrom"); + asm("playground$sandbox_recvfrom"); STATIC ssize_t sandbox_recvmsg(int, struct msghdr*, int) - asm("playground$sandbox_recvmsg"); + asm("playground$sandbox_recvmsg"); + #endif + #if defined(__NR_rt_sigprocmask) + STATIC int sandbox_rt_sigprocmask(int how, const void*, void*, size_t) + asm("playground$sandbox_rt_sigprocmask"); + #endif + #if defined(__NR_sendmsg) STATIC size_t sandbox_sendmsg(int, const struct msghdr*, int) - asm("playground$sandbox_sendmsg"); + asm("playground$sandbox_sendmsg"); STATIC ssize_t sandbox_sendto(int, const void*, size_t, int, const void*, socklen_t)asm("playground$sandbox_sendto"); + #endif #if defined(__NR_shmat) STATIC void* sandbox_shmat(int, const void*, int) - asm("playground$sandbox_shmat"); + asm("playground$sandbox_shmat"); STATIC int sandbox_shmctl(int, int, void*) - asm("playground$sandbox_shmctl"); - STATIC int sandbox_shmdt(const void*) asm("playground$sandbox_shmdt"); + asm("playground$sandbox_shmctl"); + STATIC int sandbox_shmdt(const void*) asm("playground$sandbox_shmdt"); STATIC int sandbox_shmget(int, size_t, int) - asm("playground$sandbox_shmget"); + asm("playground$sandbox_shmget"); #endif + #if defined(__NR_setsockopt) STATIC int sandbox_setsockopt(int, int, int, const void*, 
socklen_t) - asm("playground$sandbox_setsockopt"); + asm("playground$sandbox_setsockopt"); + #endif + #if defined(__NR_sigprocmask) + STATIC int sandbox_sigprocmask(int how, const void*, void*) + asm("playground$sandbox_sigprocmask"); #endif #if defined(__NR_socketcall) STATIC int sandbox_socketcall(int call, void* args) - asm("playground$sandbox_socketcall"); + asm("playground$sandbox_socketcall"); #endif STATIC int sandbox_stat(const char* path, void* buf) - asm("playground$sandbox_stat"); + asm("playground$sandbox_stat"); #if defined(__NR_stat64) STATIC int sandbox_stat64(const char *path, void* b) - asm("playground$sandbox_stat64"); + asm("playground$sandbox_stat64"); #endif // Functions for system calls that need to be handled in the trusted process STATIC bool process_access(int, int, int, int, SecureMemArgs*) - asm("playground$process_access"); + asm("playground$process_access"); STATIC bool process_clone(int, int, int, int, SecureMemArgs*) - asm("playground$process_clone"); + asm("playground$process_clone"); STATIC bool process_exit(int, int, int, int, SecureMemArgs*) - asm("playground$process_exit"); + asm("playground$process_exit"); #if defined(__NR_getsockopt) STATIC bool process_getsockopt(int, int, int, int, SecureMemArgs*) - asm("playground$process_getsockopt"); + asm("playground$process_getsockopt"); #endif STATIC bool process_ioctl(int, int, int, int, SecureMemArgs*) - asm("playground$process_ioctl"); + asm("playground$process_ioctl"); #if defined(__NR_ipc) STATIC bool process_ipc(int, int, int, int, SecureMemArgs*) - asm("playground$process_ipc"); + asm("playground$process_ipc"); #endif STATIC bool process_madvise(int, int, int, int, SecureMemArgs*) - asm("playground$process_madvise"); + asm("playground$process_madvise"); STATIC bool process_mmap(int, int, int, int, SecureMemArgs*) - asm("playground$process_mmap"); + asm("playground$process_mmap"); STATIC bool process_mprotect(int, int, int, int, SecureMemArgs*) - 
asm("playground$process_mprotect"); + asm("playground$process_mprotect"); STATIC bool process_munmap(int, int, int, int, SecureMemArgs*) - asm("playground$process_munmap"); + asm("playground$process_munmap"); STATIC bool process_open(int, int, int, int, SecureMemArgs*) - asm("playground$process_open"); + asm("playground$process_open"); #if defined(__NR_recvfrom) STATIC bool process_recvfrom(int, int, int, int, SecureMemArgs*) - asm("playground$process_recvfrom"); + asm("playground$process_recvfrom"); STATIC bool process_recvmsg(int, int, int, int, SecureMemArgs*) - asm("playground$process_recvmsg"); + asm("playground$process_recvmsg"); STATIC bool process_sendmsg(int, int, int, int, SecureMemArgs*) - asm("playground$process_sendmsg"); + asm("playground$process_sendmsg"); STATIC bool process_sendto(int, int, int, int, SecureMemArgs*) - asm("playground$process_sendto"); + asm("playground$process_sendto"); STATIC bool process_setsockopt(int, int, int, int, SecureMemArgs*) - asm("playground$process_setsockopt"); + asm("playground$process_setsockopt"); #endif #if defined(__NR_shmat) STATIC bool process_shmat(int, int, int, int, SecureMemArgs*) - asm("playground$process_shmat"); + asm("playground$process_shmat"); STATIC bool process_shmctl(int, int, int, int, SecureMemArgs*) - asm("playground$process_shmctl"); + asm("playground$process_shmctl"); STATIC bool process_shmdt(int, int, int, int, SecureMemArgs*) - asm("playground$process_shmdt"); + asm("playground$process_shmdt"); STATIC bool process_shmget(int, int, int, int, SecureMemArgs*) - asm("playground$process_shmget"); + asm("playground$process_shmget"); #endif #if defined(__NR_socketcall) STATIC bool process_socketcall(int, int, int, int, SecureMemArgs*) - asm("playground$process_socketcall"); + asm("playground$process_socketcall"); #endif STATIC bool process_stat(int, int, int, int, SecureMemArgs*) - asm("playground$process_stat"); + asm("playground$process_stat"); #ifdef __cplusplus friend class Debug; @@ -294,13 
+306,11 @@ class Sandbox { } // Sends a file handle to another process. + // N.B. trusted_thread.cc has an assembly version of this function that + // is safe to use without a call stack. If the wire-format is changed, + /// make sure to update the assembly code. static bool sendFd(int transport, int fd0, int fd1, const void* buf, - size_t len) - asm("playground$sendFd") - #if defined(__x86_64__) - __attribute__((visibility("internal"))) - #endif - ; + size_t len); // If getFd() fails, it will set the first valid fd slot (e.g. fd0) to // -errno. @@ -334,7 +344,7 @@ class Sandbox { struct Clone { int flags; - void* stack; + char* stack; int* pid; int* ctid; void* tls; @@ -584,6 +594,7 @@ class Sandbox { static int tid() { return TLS::getTLSValue<int>(TLS_TID); } static int threadFdPub() { return TLS::getTLSValue<int>(TLS_THREAD_FD); } static int processFdPub() { return processFdPub_; } + static kernel_sigset_t* signalMask() { return &getSecureMem()->signalMask; } // The SEGV handler knows how to handle RDTSC instructions static void setupSignalHandlers(); @@ -601,9 +612,12 @@ class Sandbox { #endif ; + // Return the current secure memory structure for this thread. + static SecureMem::Args* getSecureMem(); + // Return a secure memory structure that can be used by a newly created // thread. - static SecureMem::Args* getSecureMem(); + static SecureMem::Args* getNewSecureMem(); // This functions runs in the trusted process at startup and finds all the // memory mappings that existed when the sandbox was first enabled. 
Going diff --git a/sandbox/linux/seccomp/securemem.cc b/sandbox/linux/seccomp/securemem.cc index 0071c45..5f07bbe 100644 --- a/sandbox/linux/seccomp/securemem.cc +++ b/sandbox/linux/seccomp/securemem.cc @@ -72,13 +72,14 @@ void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentMapsFd, : "q"(&mem->sequence) : "memory"); } - mem->syscallNum = syscallNum; - mem->arg1 = arg1; - mem->arg2 = arg2; - mem->arg3 = arg3; - mem->arg4 = arg4; - mem->arg5 = arg5; - mem->arg6 = arg6; + mem->callType = locked ? -2 : -1; + mem->syscallNum = syscallNum; + mem->arg1 = arg1; + mem->arg2 = arg2; + mem->arg3 = arg3; + mem->arg4 = arg4; + mem->arg5 = arg5; + mem->arg6 = arg6; asm volatile( #if defined(__x86_64__) "lock; incq (%0)\n" @@ -90,9 +91,8 @@ void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentMapsFd, : : "q"(&mem->sequence) : "memory"); - int data = locked ? -2 : -1; Sandbox::SysCalls sys; - if (Sandbox::write(sys, fd, &data, sizeof(data)) != sizeof(data)) { + if (Sandbox::write(sys, fd, &mem->callType, sizeof(int)) != sizeof(int)) { Sandbox::die("Failed to send system call"); } if (parentMapsFd >= 0) { diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h index ac7823e..dc035ff 100644 --- a/sandbox/linux/seccomp/securemem.h +++ b/sandbox/linux/seccomp/securemem.h @@ -6,6 +6,7 @@ #define SECURE_MEM_H__ #include <stdlib.h> +#include "linux_syscall_support.h" namespace playground { @@ -28,6 +29,7 @@ class SecureMem { struct { struct Args* self; long sequence; + long callType; long syscallNum; void* arg1; void* arg2; @@ -92,7 +94,7 @@ class SecureMem { struct { // This scratch space is used by the trusted thread to read parameters // for unrestricted system calls. - long tmpSyscallNum; + int tmpSyscallNum; void* tmpArg1; void* tmpArg2; void* tmpArg3; @@ -115,6 +117,9 @@ class SecureMem { // result in additional system calls. Make sure that we don't trigger // logging of those recursive calls. 
int recursionLevel; + + // Computing the signal mask is expensive. Keep a cached copy. + kernel_sigset_t signalMask; } __attribute__((packed)); char scratchPage[4096]; }; diff --git a/sandbox/linux/seccomp/sigprocmask.cc b/sandbox/linux/seccomp/sigprocmask.cc new file mode 100644 index 0000000..f3ad1fb --- /dev/null +++ b/sandbox/linux/seccomp/sigprocmask.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +// If the sandboxed process tries to mask SIGSEGV, there is a good chance +// the process will eventually get terminated. If this is really ever a +// problem, we can hide the fact that SIGSEGV is unmasked. But I don't think +// we really need this. Masking of synchronous signals is rarely necessary. + +#if defined(__NR_sigprocmask) +int Sandbox::sandbox_sigprocmask(int how, const void* set, void* old_set) { + long long tm; + Debug::syscall(&tm, __NR_sigprocmask, "Executing handler"); + + // Access the signal mask by triggering a SEGV and modifying the signal state + // prior to calling rt_sigreturn(). + long res = -ENOSYS; + #if defined(__x86_64__) + #error x86-64 does not support sigprocmask(); use rt_sigprocmask() instead + #elif defined(__i386__) + asm volatile( + "push %%ebx\n" + "movl %2, %%ebx\n" + "int $0\n" + "pop %%ebx\n" + : "=a"(res) + : "0"(__NR_sigprocmask), "ri"((long)how), + "c"((long)set), "d"((long)old_set) + : "esp", "memory"); + #else + #error Unsupported target platform + #endif + + // Update our shadow signal mask, so that we can copy it upon creation of + // new threads. 
+ if (res == 0 && set != NULL) { + SecureMem::Args* args = getSecureMem(); + switch (how) { + case SIG_BLOCK: + *(unsigned long long *)&args->signalMask |= *(unsigned long long *)set; + break; + case SIG_UNBLOCK: + *(unsigned long long *)&args->signalMask &= ~*(unsigned long long *)set; + break; + case SIG_SETMASK: + *(unsigned long long *)&args->signalMask = *(unsigned long long *)set; + break; + default: + break; + } + } + + Debug::elapsed(tm, __NR_sigprocmask); + + return (int)res; +} +#endif + +#if defined(__NR_rt_sigprocmask) +int Sandbox::sandbox_rt_sigprocmask(int how, const void* set, void* old_set, + size_t bytes) { + long long tm; + Debug::syscall(&tm, __NR_rt_sigprocmask, "Executing handler"); + + // Access the signal mask by triggering a SEGV and modifying the signal state + // prior to calling rt_sigreturn(). + long res = -ENOSYS; + #if defined(__x86_64__) + asm volatile( + "movq %5, %%r10\n" + "int $0\n" + : "=a"(res) + : "0"(__NR_rt_sigprocmask), "D"((long)how), + "S"((long)set), "d"((long)old_set), "r"((long)bytes) + : "r10", "r11", "rcx", "memory"); + #elif defined(__i386__) + asm volatile( + "push %%ebx\n" + "movl %2, %%ebx\n" + "int $0\n" + "pop %%ebx\n" + : "=a"(res) + : "0"(__NR_rt_sigprocmask), "ri"((long)how), + "c"((long)set), "d"((long)old_set), "S"((long)bytes) + : "esp", "memory"); + #else + #error Unsupported target platform + #endif + + // Update our shadow signal mask, so that we can copy it upon creation of + // new threads. 
+ if (res == 0 && set != NULL && bytes >= 8) { + SecureMem::Args* args = getSecureMem(); + switch (how) { + case SIG_BLOCK: + *(unsigned long long *)&args->signalMask |= *(unsigned long long *)set; + break; + case SIG_UNBLOCK: + *(unsigned long long *)&args->signalMask &= ~*(unsigned long long *)set; + break; + case SIG_SETMASK: + *(unsigned long long *)&args->signalMask = *(unsigned long long *)set; + break; + default: + break; + } + } + + Debug::elapsed(tm, __NR_rt_sigprocmask); + + return (int)res; +} +#endif + +} // namespace diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc index 7f431a3..76e96e4 100644 --- a/sandbox/linux/seccomp/syscall.cc +++ b/sandbox/linux/seccomp/syscall.cc @@ -165,7 +165,7 @@ asm( // the time. There might be a repeated pattern of those. "cmp $78, %eax\n" // __NR_gettimeofday "jnz 2f\n" - "cmp %eax, %fs:0x102C-0x54\n" // last system call + "cmp %eax, %fs:0x102C-0x58\n" // last system call "jnz 0f\n" // This system call and the last system call prior to this one both are @@ -173,7 +173,7 @@ asm( // return the same result as in the previous call. // Just in case the caller is spinning on the result from gettimeofday(), // every so often, call the actual system call. - "decl %fs:0x1030-0x54\n" // countdown calls to gettimofday() + "decl %fs:0x1030-0x58\n" // countdown calls to gettimofday() "jz 0f\n" // Atomically read the 64bit word representing last-known timestamp and @@ -190,8 +190,8 @@ asm( // This is a call to gettimeofday(), but we don't have a valid cached // result, yet. - "0:mov %eax, %fs:0x102C-0x54\n" // remember syscall number - "movl $500, %fs:0x1030-0x54\n" // make system call, each 500 invocations + "0:mov %eax, %fs:0x102C-0x58\n" // remember syscall number + "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations "call playground$defaultSystemCallHandler\n" // Returned from gettimeofday(). 
Remember return value, in case the @@ -212,7 +212,7 @@ asm( // would still like to coalesce the gettimeofday() calls. "2:cmp $224, %eax\n" // __NR_gettid "jz 3f\n" - "mov %eax, %fs:0x102C-0x54\n" // remember syscall number + "mov %eax, %fs:0x102C-0x58\n" // remember syscall number // Retrieve function call from system call table (c.f. syscall_table.c). // We have three different types of entries; zero for denied system calls, diff --git a/sandbox/linux/seccomp/syscall_table.c b/sandbox/linux/seccomp/syscall_table.c index 2f66ca3..454ffa9 100644 --- a/sandbox/linux/seccomp/syscall_table.c +++ b/sandbox/linux/seccomp/syscall_table.c @@ -96,19 +96,31 @@ const struct SyscallTable syscallTable[] __attribute__(( #if defined(__NR_recvfrom) [ __NR_recvfrom ] = { (void*)&sandbox_recvfrom, process_recvfrom }, [ __NR_recvmsg ] = { (void*)&sandbox_recvmsg, process_recvmsg }, + #endif + #if defined(__NR_rt_sigprocmask) + [ __NR_rt_sigprocmask ] = { (void*)&sandbox_rt_sigprocmask, 0 }, + #endif + #if defined(__NR_sendmsg) [ __NR_sendmsg ] = { (void*)&sandbox_sendmsg, process_sendmsg }, [ __NR_sendto ] = { (void*)&sandbox_sendto, process_sendto }, #endif [ __NR_set_robust_list ] = { UNRESTRICTED_SYSCALL, 0 }, #if defined(__NR_setsockopt) [ __NR_setsockopt ] = { (void*)&sandbox_setsockopt,process_setsockopt }, + #endif #if defined(__NR_shmat) [ __NR_shmat ] = { (void*)&sandbox_shmat, process_shmat }, [ __NR_shmctl ] = { (void*)&sandbox_shmctl, process_shmctl }, [ __NR_shmdt ] = { (void*)&sandbox_shmdt, process_shmdt }, [ __NR_shmget ] = { (void*)&sandbox_shmget, process_shmget }, #endif + #if defined(__NR_shutdown) [ __NR_shutdown ] = { UNRESTRICTED_SYSCALL, 0 }, + #endif + #if defined(__NR_sigprocmask) + [ __NR_sigprocmask ] = { (void*)&sandbox_sigprocmask, 0 }, + #endif + #if defined(__NR_socketpair) [ __NR_socketpair ] = { UNRESTRICTED_SYSCALL, 0 }, #endif #if defined(__NR_socketcall) diff --git a/sandbox/linux/seccomp/trusted_process.cc 
b/sandbox/linux/seccomp/trusted_process.cc index 1320839..80adbf6 100644 --- a/sandbox/linux/seccomp/trusted_process.cc +++ b/sandbox/linux/seccomp/trusted_process.cc @@ -16,7 +16,7 @@ struct Thread { SecureMem::Args* mem; }; -SecureMem::Args* Sandbox::getSecureMem() { +SecureMem::Args* Sandbox::getNewSecureMem() { if (!secureMemPool_.empty()) { SecureMem::Args* rc = secureMemPool_.back(); secureMemPool_.pop_back(); diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc index c73091c..240e65f 100644 --- a/sandbox/linux/seccomp/trusted_thread.cc +++ b/sandbox/linux/seccomp/trusted_thread.cc @@ -21,6 +21,44 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov %0, %%rbp\n" // %rbp = args "xor %%rbx, %%rbx\n" // initial sequence number "lea 999f(%%rip), %%r15\n" // continue in same thread + + // Signal handlers are process-wide. This means that for security + // reasons, we cannot allow that the trusted thread ever executes any + // signal handlers. + // We prevent the execution of signal handlers by setting a signal + // mask that blocks all signals. In addition, we make sure that the + // stack pointer is invalid. + // We cannot reset the signal mask until after we have enabled + // Seccomp mode. Our sigprocmask() wrapper would normally do this by + // raising a signal, modifying the signal mask in the kernel-generated + // signal frame, and then calling sigreturn(). This presents a bit of + // a Catch-22, as all signals are masked and we can therefore not + // raise any signal that would allow us to generate the signal stack + // frame. + // Instead, we have to create the signal stack frame prior to entering + // Seccomp mode. This incidentally also helps us to restore the + // signal mask to the same value that it had prior to entering the + // sandbox. + // The signal wrapper for clone() is the second entry point into this + // code (by means of sending an IPC to its trusted thread). 
It goes + // through the same steps of creating a signal stack frame on the + // newly created thread's stacks prior to cloning. See clone.cc for + // details. + "mov $56+0xF000, %%eax\n" // __NR_clone + 0xF000 + "sub $8, %%rsp\n" + "mov %%rsp, %%rdx\n" // push a signal stack frame (see clone.cc) + "mov %%rsp, 0(%%rsp)\n" + "int $0\n" + "mov 0(%%rsp), %%r9\n" + "add $8, 0xA0(%%r9)\n" // pop stack upon call to sigreturn() + "mov $2, %%rdi\n" // how = SIG_SETMASK + "movq $-1, 0(%%rsp)\n" + "mov %%rsp, %%rsi\n" // set = full mask + "xor %%rdx, %%rdx\n" // old_set = NULL + "mov $8, %%r10\n" // mask all 64 signals + "mov $14, %%eax\n" // NR_rt_sigprocmask + "syscall\n" + "xor %%rsp, %%rsp\n" // invalidate the stack in all trusted code "jmp 20f\n" // create trusted thread // TODO(markus): Coalesce the read() operations by reading into a bigger @@ -36,42 +74,44 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // %rbx: sequence number for trusted calls // Temporary variables: - // %r9: system call number + // %r8: child stack + // %r9: system call number, child stack // %rbp: secure memory of previous thread // Layout of secure shared memory region (c.f. securemem.h): // 0x00: pointer to the secure shared memory region (i.e. 
self) // 0x08: sequence number; must match %rbx - // 0x10: system call number; passed to syscall in %rax - // 0x18: first argument; passed to syscall in %rdi - // 0x20: second argument; passed to syscall in %rsi - // 0x28: third argument; passed to syscall in %rdx - // 0x30: fourth argument; passed to syscall in %r10 - // 0x38: fifth argument; passed to syscall in %r8 - // 0x40: sixth argument; passed to syscall in %r9 - // 0x48: stored return address for clone() system call - // 0x50: stored %rbp value for clone() system call - // 0x58: stored %rbx value for clone() system call - // 0x60: stored %rcx value for clone() system call - // 0x68: stored %rdx value for clone() system call - // 0x70: stored %rsi value for clone() system call - // 0x78: stored %rdi value for clone() system call - // 0x80: stored %r8 value for clone() system call - // 0x88: stored %r9 value for clone() system call - // 0x90: stored %r10 value for clone() system call - // 0x98: stored %r11 value for clone() system call - // 0xA0: stored %r12 value for clone() system call - // 0xA8: stored %r13 value for clone() system call - // 0xB0: stored %r14 value for clone() system call - // 0xB8: stored %r15 value for clone() system call - // 0xC0: new shared memory for clone() - // 0xC8: processFdPub for talking to trusted process - // 0xCC: cloneFdPub for talking to trusted process - // 0xD0: set to non-zero, if in debugging mode - // 0xD4: most recent SHM id returned by shmget(IPC_PRIVATE) - // 0xD8: cookie assigned to us by the trusted process (TLS_COOKIE) - // 0xE0: thread id (TLS_TID) - // 0xE8: threadFdPub (TLS_THREAD_FD) + // 0x10: call type; must match %eax, iff %eax == -1 || %eax == -2 + // 0x18: system call number; passed to syscall in %rax + // 0x20: first argument; passed to syscall in %rdi + // 0x28: second argument; passed to syscall in %rsi + // 0x30: third argument; passed to syscall in %rdx + // 0x38: fourth argument; passed to syscall in %r10 + // 0x40: fifth argument; passed to 
syscall in %r8 + // 0x48: sixth argument; passed to syscall in %r9 + // 0x50: stored return address for clone() system call + // 0x58: stored %rbp value for clone() system call + // 0x60: stored %rbx value for clone() system call + // 0x68: stored %rcx value for clone() system call + // 0x70: stored %rdx value for clone() system call + // 0x78: stored %rsi value for clone() system call + // 0x80: stored %rdi value for clone() system call + // 0x88: stored %r8 value for clone() system call + // 0x90: stored %r9 value for clone() system call + // 0x98: stored %r10 value for clone() system call + // 0xA0: stored %r11 value for clone() system call + // 0xA8: stored %r12 value for clone() system call + // 0xB0: stored %r13 value for clone() system call + // 0xB8: stored %r14 value for clone() system call + // 0xC0: stored %r15 value for clone() system call + // 0xC8: new shared memory for clone() + // 0xD0: processFdPub for talking to trusted process + // 0xD4: cloneFdPub for talking to trusted process + // 0xD8: set to non-zero, if in debugging mode + // 0xDC: most recent SHM id returned by shmget(IPC_PRIVATE) + // 0xE0: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0xE8: thread id (TLS_TID) + // 0xF0: threadFdPub (TLS_THREAD_FD) // 0x200-0x1000: securely passed verified file name(s) // Layout of (untrusted) scratch space: @@ -89,6 +129,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // 0x48: last system call (not used on x86-64) // 0x4C: number of consecutive calls to a time fnc (not used on x86-64) // 0x50: nesting level of system calls (for debugging purposes only) + // 0x54: signal mask // We use the %fs register for accessing the secure read-only page, and // the untrusted scratch space immediately following it. 
The segment @@ -103,7 +144,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // read(threadFd, &scratch, 4) "1:xor %%rax, %%rax\n" // NR_read "mov %%r13, %%rdi\n" // fd = threadFd - "mov %%fs:0x0, %%rsi\n" + "mov %%fs:0x0, %%rsi\n" // secure_mem "add $0x1000, %%rsi\n" // buf = &scratch "mov $4, %%edx\n" // len = 4 "2:syscall\n" @@ -123,13 +164,15 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "jnz 5f\n" "3:cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process - "mov %%fs:0x10, %%rax\n" - "mov %%fs:0x18, %%rdi\n" - "mov %%fs:0x20, %%rsi\n" - "mov %%fs:0x28, %%rdx\n" - "mov %%fs:0x30, %%r10\n" - "mov %%fs:0x38, %%r8\n" - "mov %%fs:0x40, %%r9\n" + "cmp %%fs:0x10, %%eax\n" + "jne 25f\n" // exit process + "mov %%fs:0x18, %%rax\n" + "mov %%fs:0x20, %%rdi\n" + "mov %%fs:0x28, %%rsi\n" + "mov %%fs:0x30, %%rdx\n" + "mov %%fs:0x38, %%r10\n" + "mov %%fs:0x40, %%r8\n" + "mov %%fs:0x48, %%r9\n" "cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process "add $2, %%rbx\n" @@ -153,13 +196,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE "mov $10, %%eax\n" // NR_mprotect "syscall\n" - "mov %%r8d, 0xD4(%%rdi)\n" // set most recently returned SysV shm id + "mov %%r8d, 0xDC(%%rdi)\n" // set most recently returned SysV shm id "xor %%rdi, %%rdi\n" // When debugging messages are enabled, warn about expensive system calls #ifndef NDEBUG - "cmpw $0, %%fs:0xD0\n" // debug mode - "jz 26f\n" + "cmpw $0, %%fs:0xD8\n" // debug mode + "jz 27f\n" "mov $1, %%eax\n" // NR_write "mov $2, %%edi\n" // fd = stderr "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call" @@ -168,7 +211,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "xor %%rdi, %%rdi\n" #endif - "jmp 26f\n" // exit program, no message + "jmp 27f\n" // exit program, no message "4:syscall\n" "jmp 15f\n" // return result @@ -179,10 +222,12 @@ void Sandbox::createTrustedThread(int processFdPub, 
int cloneFdPub, "jnz 9f\n" "cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process + "cmp %%eax, %%fs:0x10\n" + "jne 25f\n" // exit process // When debugging messages are enabled, warn about expensive system calls #ifndef NDEBUG - "cmpw $0, %%fs:0xD0\n" // debug mode + "cmpw $0, %%fs:0xD8\n" // debug mode "jz 6f\n" "mov $1, %%eax\n" // NR_write "mov $2, %%edi\n" // fd = stderr @@ -192,13 +237,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "6:" #endif - "mov %%fs:0x10, %%rax\n" - "mov %%fs:0x18, %%rdi\n" - "mov %%fs:0x20, %%rsi\n" - "mov %%fs:0x28, %%rdx\n" - "mov %%fs:0x30, %%r10\n" - "mov %%fs:0x38, %%r8\n" - "mov %%fs:0x40, %%r9\n" + "mov %%fs:0x18, %%rax\n" + "mov %%fs:0x20, %%rdi\n" + "mov %%fs:0x28, %%rsi\n" + "mov %%fs:0x30, %%rdx\n" + "mov %%fs:0x38, %%r10\n" + "mov %%fs:0x40, %%r8\n" + "mov %%fs:0x48, %%r9\n" "cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process @@ -255,7 +300,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Check in syscallTable whether this system call is unrestricted "12:mov %%rax, %%r9\n" #ifndef NDEBUG - "cmpw $0, %%fs:0xD0\n" // debug mode + "cmpw $0, %%fs:0xD8\n" // debug mode "jnz 13f\n" #endif "cmp playground$maxSyscall(%%rip), %%eax\n" @@ -287,11 +332,11 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov 0x2C(%%r8), %%r9\n" "mov 0x24(%%r8), %%r8\n" "cmp $231, %%rax\n" // NR_exit_group - "jz 26f\n" // exit program, no message + "jz 27f\n" // exit program, no message "syscall\n" // Return result of system call to sandboxed thread - "15:mov %%fs:0x0, %%rsi\n" + "15:mov %%fs:0x0, %%rsi\n" // secure_mem "add $0x1034, %%rsi\n" // buf = &scratch + 52 "mov %%rax, (%%rsi)\n" "mov $8, %%edx\n" // len = 8 @@ -306,8 +351,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // NR_exit: // Exit trusted thread after cleaning up resources - "18:mov %%fs:0x0, %%rsi\n" - "mov 0xE8(%%rsi), %%rdi\n" // fd = threadFdPub + "18:mov %%fs:0x0, %%rsi\n" // 
secure_mem + "mov 0xF0(%%rsi), %%rdi\n" // fd = threadFdPub "mov $3, %%eax\n" // NR_close "syscall\n" "mov %%rsi, %%rdi\n" // start = secure_mem @@ -324,7 +369,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "syscall\n" "mov %%rax, %%rdi\n" "test %%rax, %%rax\n" - "js 26f\n" // exit process + "js 27f\n" // exit process "jne 21f\n" // reap helper, exit thread "jmp 22f\n" // unlock mutex @@ -334,12 +379,9 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // resources with the caller (i.e. the previous trusted thread), // and by extension it shares all resources with the sandbox'd // threads. - // N.B. It is possible to make the thread creation code crash before - // it releases seccomp privileges. This is generally OK, as it just - // terminates the program. But if we ever support signal handling, - // we have to be careful that the user cannot install a SIGSEGV - // handler that gets executed with elevated privileges. - "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem + "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem + "mov %%rsi, %%r15\n" // remember child stack + "mov $1, %%rsi\n" // stack = 1 "syscall\n" // calls NR_clone "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values "jae 7b\n" // unlock mutex, return result @@ -349,6 +391,23 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // In nascent thread, now. "sub $2, %%rbx\n" + + // We want to maintain an invalid %rsp whenever we access untrusted + // memory. This ensures that even if an attacker can trick us into + // triggering a SIGSEGV, we will never successfully execute a signal + // handler. + // Signal handlers are inherently dangerous, as an attacker could trick + // us into returning to the wrong address by adjusting the signal stack + // right before the handler returns. + // N.B. While POSIX is curiously silent about this, it appears that on + // Linux, alternate signal stacks are a per-thread property. 
That is + // good. It means that this security mechanism works, even if the + // sandboxed thread manages to set up an alternate signal stack. + // + // TODO(markus): We currently do not support emulating calls to + // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc + // for a discussion on how to fix this, if this ever becomes necessary. + "mov %%r15, %%r9\n" // %r9 = child_stack "xor %%r15, %%r15\n" // Request to return from clone() when done // Get thread id of nascent thread @@ -358,19 +417,19 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Nascent thread creates socketpair() for sending requests to // trusted thread. - // We can create the filehandles on the stack. Filehandles are + // We can create the filehandles on the child's stack. Filehandles are // always treated as untrusted. // socketpair(AF_UNIX, SOCK_STREAM, 0, fds) - "push %%r15\n" + "sub $0x10, %%r9\n" + "mov %%r15, 8(%%r9)\n" // preserve return address on child stack "mov $53, %%eax\n" // NR_socketpair "mov $1, %%edi\n" // domain = AF_UNIX "mov $1, %%esi\n" // type = SOCK_STREAM "xor %%rdx, %%rdx\n" // protocol = 0 - "sub $8, %%rsp\n" // sv = %rsp - "mov %%rsp, %%r10\n" + "mov %%r9, %%r10\n" // sv = child_stack "syscall\n" "test %%rax, %%rax\n" - "jz 27f\n" + "jz 28f\n" // If things went wrong, we don't have an (easy) way of signaling // the parent. 
For our purposes, it is sufficient to fail with a @@ -403,12 +462,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "lea 100f(%%rip), %%rsi\n" // "Sandbox violation detected" "mov $101f-100f, %%edx\n" // len = strlen(msg) "syscall\n" - "mov $1, %%edi\n" - "26:mov $231, %%eax\n" // NR_exit_group + "26:mov $1, %%edi\n" + "27:mov $231, %%eax\n" // NR_exit_group "jmp 24b\n" // The first page is mapped read-only for use as securely shared memory - "27:mov 0xC0(%%rbp), %%r12\n" // %r12 = secure shared memory + "28:mov 0xC8(%%rbp), %%r12\n" // %r12 = secure shared memory "cmp %%rbx, 8(%%rbp)\n" "jne 25b\n" // exit process "mov $10, %%eax\n" // NR_mprotect @@ -428,12 +487,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| // CLONE_SYSVSEM|CLONE_UNTRACED|CLONE_SETTLS, stack, NULL, NULL, // tls) - "mov 4(%%rsp), %%r13d\n" // %r13 = threadFd + "mov 4(%%r9), %%r13d\n" // %r13 = threadFd (on child's stack) "mov $56, %%eax\n" // NR_clone "mov $0x8D0F00, %%edi\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR|TLS "mov $1, %%rsi\n" // stack = 1 "mov %%r12, %%r8\n" // tls = new_secure_mem - "mov 0xC8(%%rbp), %%r15d\n" // %r15 = processFdPub + "mov 0xD0(%%rbp), %%r15d\n" // %r15 = processFdPub "cmp %%rbx, 8(%%rbp)\n" "jne 25b\n" // exit process "syscall\n" @@ -441,13 +500,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "js 25b\n" // exit process "jz 0b\n" // invoke trustedThreadFnc() + // Copy the caller's signal mask + "mov 0x1054(%%rbp), %%rax\n" + "mov %%rax, 0x1054(%%r12)\n" + // Done creating trusted thread. We can now get ready to return to caller - "mov 0(%%rsp), %%r9d\n" // %r9 = threadFdPub - "add $8, %%rsp\n" + "mov %%r9, %%r8\n" // %r8 = child_stack + "mov 0(%%r9), %%r9d\n" // %r9 = threadFdPub // Set up thread local storage with information on how to talk to // trusted thread and trusted process. 
- "lea 0xD8(%%r12), %%rsi\n" // args = &secure_mem.TLS; + "lea 0xE0(%%r12), %%rsi\n" // args = &secure_mem.TLS; "mov $158, %%eax\n" // NR_arch_prctl "mov $0x1001, %%edi\n" // option = ARCH_SET_GS "syscall\n" @@ -459,73 +522,121 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // the very top of this function, you will find that we push 999(%rip) // on the stack. That is the signal that we should return on the same // stack rather than return to where clone was called. - "pop %%r15\n" + "mov 8(%%r8), %%r15\n" + "add $0x10, %%r8\n" "test %%r15, %%r15\n" - "jne 28f\n" + "jne 29f\n" // Returning from clone() into the newly created thread is special. We // cannot unroll the stack, as we just set up a new stack for this // thread. We have to explicitly restore CPU registers to the values // that they had when the program originally called clone(). - "sub $0x80, %%rsp\n" // redzone compensation - "mov 0x48(%%rbp), %%rax\n" - "push %%rax\n" + // We patch the register values in the signal stack frame so that we + // can ask sigreturn() to restore all registers for us. 
+ "sub $0x8, %%r8\n" "mov 0x50(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x00(%%r8)\n" // return address + "xor %%rax, %%rax\n" + "mov %%rax, 0x98(%%r8)\n" // %rax = 0 "mov 0x58(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x80(%%r8)\n" // %rbp "mov 0x60(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x88(%%r8)\n" // %rbx "mov 0x68(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0xA0(%%r8)\n" // %rcx "mov 0x70(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x90(%%r8)\n" // %rdx "mov 0x78(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x78(%%r8)\n" // %rsi "mov 0x80(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x70(%%r8)\n" // %rdi "mov 0x88(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x30(%%r8)\n" // %r8 "mov 0x90(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x38(%%r8)\n" // %r9 "mov 0x98(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x40(%%r8)\n" // %r10 "mov 0xA0(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x48(%%r8)\n" // %r11 "mov 0xA8(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x50(%%r8)\n" // %r12 "mov 0xB0(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x58(%%r8)\n" // %r13 "mov 0xB8(%%rbp), %%rax\n" - "push %%rax\n" + "mov %%rax, 0x60(%%r8)\n" // %r14 + "mov 0xC0(%%rbp), %%rax\n" + "mov %%rax, 0x68(%%r8)\n" // %r15 "cmp %%rbx, 8(%%rbp)\n" "jne 25b\n" // exit process // Nascent thread launches a helper that doesn't share any of our // resources, except for pages mapped as MAP_SHARED. - // clone(0, %rsp) - "28:mov $56, %%eax\n" // NR_clone + // clone(SIGCHLD, stack=1) + "29:mov $56, %%eax\n" // NR_clone "mov $17, %%rdi\n" // flags = SIGCHLD - "mov %%rsp, %%rsi\n" // stack = %rsp + "mov $1, %%rsi\n" // stack = 1 "syscall\n" "test %%rax, %%rax\n" "js 25b\n" // exit process - "jne 29f\n" + "jne 31f\n" // Use sendmsg() to send to the trusted process the file handles for // communicating with the new trusted thread. We also send the address // of the secure memory area (for sanity checks) and the thread id. 
- "mov 0xCC(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub() + "mov 0xD4(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub() "cmp %%rbx, 8(%%rbp)\n" "jne 25b\n" // exit process - "mov %%r9, %%rsi\n" // fd0 = threadFdPub - "mov %%r13, %%rdx\n" // fd1 = threadFd - "push %%r14\n" // threadId - "mov %%esi, 4(%%rsp)\n" // threadFdPub - "push %%r12\n" // secure_mem - "mov %%rsp, %%rcx\n" // buf = &data - "mov $16, %%r8\n" // len = sizeof(void*) + 2*sizeof(int) - "call playground$sendFd\n" + + // 0x00 msg: + // 0x00 msg_name ($0) + // 0x08 msg_namelen ($0) + // 0x10 msg_iov (%r8 + 0x44) + // 0x18 msg_iovlen ($1) + // 0x20 msg_control (%r8 + 0x54) + // 0x28 msg_controllen ($0x18) + // 0x30 data: + // 0x30 msg_flags/err ($0) + // 0x34 secure_mem (%r12) + // 0x3C threadId (%r14d) + // 0x40 threadFdPub (%r9d) + // 0x44 iov: + // 0x44 iov_base (%r8 + 0x30) + // 0x4C iov_len ($0x14) + // 0x54 cmsg: + // 0x54 cmsg_len ($0x18) + // 0x5C cmsg_level ($1, SOL_SOCKET) + // 0x60 cmsg_type ($1, SCM_RIGHTS) + // 0x64 threadFdPub (%r9d) + // 0x68 threadFd (%r13d) + // 0x6C + "sub $0x6C, %%r8\n" + "xor %%rdx, %%rdx\n" // flags = 0 + "mov %%rdx, 0x00(%%r8)\n" // msg_name + "mov %%edx, 0x08(%%r8)\n" // msg_namelen + "mov %%edx, 0x30(%%r8)\n" // msg_flags + "mov $1, %%r11d\n" + "mov %%r11, 0x18(%%r8)\n" // msg_iovlen + "mov %%r11d, 0x5C(%%r8)\n" // cmsg_level + "mov %%r11d, 0x60(%%r8)\n" // cmsg_type + "lea 0x30(%%r8), %%r11\n" + "mov %%r11, 0x44(%%r8)\n" // iov_base + "add $0x14, %%r11\n" + "mov %%r11, 0x10(%%r8)\n" // msg_iov + "add $0x10, %%r11\n" + "mov %%r11, 0x20(%%r8)\n" // msg_control + "mov $0x14, %%r11d\n" + "mov %%r11, 0x4C(%%r8)\n" // iov_len + "add $4, %%r11d\n" + "mov %%r11, 0x28(%%r8)\n" // msg_controllen + "mov %%r11, 0x54(%%r8)\n" // cmsg_len + "mov %%r12, 0x34(%%r8)\n" // secure_mem + "mov %%r14d, 0x3C(%%r8)\n" // threadId + "mov %%r9d, 0x40(%%r8)\n" // threadFdPub + "mov %%r9d, 0x64(%%r8)\n" // threadFdPub + "mov %%r13d, 0x68(%%r8)\n" // threadFd + "mov $46, 
%%eax\n" // NR_sendmsg + "mov %%r8, %%rsi\n" // msg + "syscall\n" // Release syscall_mutex_. This signals the trusted process that // it can write into the original thread's secure memory again. @@ -534,23 +645,29 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $4096, %%esi\n" "mov $3, %%edx\n" // PROT_READ | PROT_WRITE "syscall\n" + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process "lock; addl $0x80000000, (%%rdi)\n" - "jz 26b\n" // exit process (no error message) + "jz 30f\n" // exit process (no error message) "mov $1, %%edx\n" "mov %%rdx, %%rsi\n" // FUTEX_WAKE "mov $202, %%eax\n" // NR_futex "syscall\n" - "jmp 26b\n" // exit process (no error message) + "30:xor %%rdi, %%rdi\n" + "jmp 27b\n" // exit process (no error message) // Reap helper - "29:mov %%rax, %%rdi\n" - "30:xor %%rsi, %%rsi\n" + "31:mov %%rax, %%rdi\n" + "32:lea -4(%%r8), %%rsi\n" "xor %%rdx, %%rdx\n" "xor %%r10, %%r10\n" "mov $61, %%eax\n" // NR_wait4 "syscall\n" "cmp $-4, %%eax\n" // EINTR - "jz 30b\n" + "jz 32b\n" + "mov -4(%%r8), %%eax\n" + "test %%rax, %%rax\n" + "jnz 26b\n" // exit process (no error message) // Release privileges by entering seccomp mode. "mov $157, %%eax\n" // NR_prctl @@ -560,6 +677,10 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "test %%rax, %%rax\n" "jnz 25b\n" // exit process + // We can finally start using the stack. Signal handlers no longer pose + // a threat to us. + "mov %%r8, %%rsp\n" + // Back in the newly created sandboxed thread, wait for trusted process // to receive request. It is possible for an attacker to make us // continue even before the trusted process is done. This is OK. 
It'll @@ -569,10 +690,10 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $1, %%edx\n" // len = 1 "mov %%rsp, %%rsi\n" // buf = %rsp "mov %%r9, %%rdi\n" // fd = threadFdPub - "31:xor %%rax, %%rax\n" // NR_read + "33:xor %%rax, %%rax\n" // NR_read "syscall\n" "cmp $-4, %%rax\n" // EINTR - "jz 31b\n" + "jz 33b\n" "cmp %%rdx, %%rax\n" "jne 25b\n" // exit process "pop %%rax\n" @@ -580,27 +701,16 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Return to caller. We are in the new thread, now. "xor %%rax, %%rax\n" "test %%r15, %%r15\n" - - // Returning to createTrustedThread() - "jz 32f\n" - "jmp *%%r15\n" - - // Returning to the place where clone() had been called - "32:pop %%r15\n" - "pop %%r14\n" - "pop %%r13\n" - "pop %%r12\n" - "pop %%r11\n" - "pop %%r10\n" - "pop %%r9\n" - "pop %%r8\n" - "pop %%rdi\n" - "pop %%rsi\n" - "pop %%rdx\n" - "pop %%rcx\n" - "pop %%rbx\n" - "pop %%rbp\n" - "ret\n" + "jnz 34f\n" // Returning to createTrustedThread() + + // Returning to the place where clone() had been called. We rely on + // using rt_sigreturn() for restoring our registers. The caller already + // created a signal stack frame, and we patched the register values + // with the ones that were in effect prior to calling sandbox_clone(). + "pop %%r15\n" + "34:mov %%r15, 0xA8(%%rsp)\n" // compute new %rip + "mov $15, %%eax\n" // NR_rt_sigreturn + "syscall\n" ".pushsection \".rodata\"\n" "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" @@ -638,19 +748,60 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "movd %0, %%mm6\n" // %mm6 = args "lea 999f, %%ebx\n" // continue in same thread "movd %%ebx, %%mm3\n" - "xor %%ebx, %%ebx\n" // initial sequence number - "movd %%ebx, %%mm2\n" + "xor %%edi, %%edi\n" // initial sequence number + "movd %%edi, %%mm2\n" + + // Signal handlers are process-wide. 
This means that for security + // reasons, we cannot allow that the trusted thread ever executes any + // signal handlers. + // We prevent the execution of signal handlers by setting a signal + // mask that blocks all signals. In addition, we make sure that the + // stack pointer is invalid. + // We cannot reset the signal mask until after we have enabled + // Seccomp mode. Our sigprocmask() wrapper would normally do this by + // raising a signal, modifying the signal mask in the kernel-generated + // signal frame, and then calling sigreturn(). This presents a bit of + // a Catch-22, as all signals are masked and we can therefore not + // raise any signal that would allow us to generate the signal stack + // frame. + // Instead, we have to create the signal stack frame prior to entering + // Seccomp mode. This incidentally also helps us to restore the + // signal mask to the same value that it had prior to entering the + // sandbox. + // The signal wrapper for clone() is the second entry point into this + // code (by means of sending an IPC to its trusted thread). It goes + // through the same steps of creating a signal stack frame on the + // newly created thread's stacks prior to cloning. See clone.cc for + // details. 
+ "mov $120+0xF000, %%eax\n" // __NR_clone + 0xF000 + "sub $8, %%esp\n" + "mov %%esp, %%edx\n" // push a signal stack frame (see clone.cc) + "mov %%esp, 0(%%esp)\n" + "int $0\n" + "mov 0(%%esp), %%ebp\n" + "add $8, 0x1C(%%ebp)\n" // pop stack upon call to sigreturn() + "mov $2, %%ebx\n" // how = SIG_SETMASK + "movl $-1, 0(%%esp)\n" + "movl $-1, 4(%%esp)\n" + "mov %%esp, %%ecx\n" // set = full mask + "xor %%edx, %%edx\n" // old_set = NULL + "mov $8, %%esi\n" // mask all 64 signals + "mov $175, %%eax\n" // NR_rt_sigprocmask + "int $0x80\n" + "mov $126, %%eax\n" // NR_sigprocmask + "int $0x80\n" + "xor %%esp, %%esp\n" // invalidate the stack in all trusted code "jmp 20f\n" // create trusted thread // TODO(markus): Coalesce the read() operations by reading into a bigger // buffer. // Parameters: - // %mm5: secure memory region - // the page following this one contains the scratch space // %mm0: thread's side of threadFd // %mm1: processFdPub // %mm3: return address after creation of new trusted thread + // %mm5: secure memory region + // the page following this one contains the scratch space // Local variables: // %mm2: sequence number for trusted calls @@ -664,28 +815,29 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Layout of secure shared memory region (c.f. securemem.h): // 0x00: pointer to the secure shared memory region (i.e. 
self) // 0x04: sequence number; must match %mm2 - // 0x08: system call number; passed to syscall in %eax - // 0x0C: first argument; passed to syscall in %ebx - // 0x10: second argument; passed to syscall in %ecx - // 0x14: third argument; passed to syscall in %edx - // 0x18: fourth argument; passed to syscall in %esi - // 0x1C: fifth argument; passed to syscall in %edi - // 0x20: sixth argument; passed to syscall in %ebp - // 0x24: stored return address for clone() system call - // 0x28: stored %ebp value for clone() system call - // 0x2C: stored %edi value for clone() system call - // 0x30: stored %esi value for clone() system call - // 0x34: stored %edx value for clone() system call - // 0x38: stored %ecx value for clone() system call - // 0x3C: stored %ebx value for clone() system call - // 0x40: new shared memory for clone() - // 0x44: processFdPub for talking to trusted process - // 0x48: cloneFdPub for talking to trusted process - // 0x4C: set to non-zero, if in debugging mode - // 0x50: most recent SHM id returned by shmget(IPC_PRIVATE) - // 0x54: cookie assigned to us by the trusted process (TLS_COOKIE) - // 0x5C: thread id (TLS_TID) - // 0x64: threadFdPub (TLS_THREAD_FD) + // 0x08: call type; must match %eax, iff %eax == -1 || %eax == -2 + // 0x0C: system call number; passed to syscall in %eax + // 0x10: first argument; passed to syscall in %ebx + // 0x14: second argument; passed to syscall in %ecx + // 0x18: third argument; passed to syscall in %edx + // 0x1C: fourth argument; passed to syscall in %esi + // 0x20: fifth argument; passed to syscall in %edi + // 0x24: sixth argument; passed to syscall in %ebp + // 0x28: stored return address for clone() system call + // 0x2C: stored %ebp value for clone() system call + // 0x30: stored %edi value for clone() system call + // 0x34: stored %esi value for clone() system call + // 0x38: stored %edx value for clone() system call + // 0x3C: stored %ecx value for clone() system call + // 0x40: stored %ebx value for 
clone() system call + // 0x44: new shared memory for clone() + // 0x48: processFdPub for talking to trusted process + // 0x4C: cloneFdPub for talking to trusted process + // 0x50: set to non-zero, if in debugging mode + // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE) + // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0x60: thread id (TLS_TID) + // 0x68: threadFdPub (TLS_THREAD_FD) // 0x200-0x1000: securely passed verified file name(s) // Layout of (untrusted) scratch space: @@ -703,6 +855,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // 0x2C: last system call (updated in syscall.cc) // 0x30: number of consecutive calls to a time fnc. (e.g. gettimeofday) // 0x34: nesting level of system calls (for debugging purposes only) + // 0x38: signal mask "0:xor %%esp, %%esp\n" "mov $2, %%eax\n" // %mm2 = initial sequence number @@ -713,7 +866,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // read(threadFd, &scratch, 4) "1:mov $3, %%eax\n" // NR_read "movd %%mm0, %%ebx\n" // fd = threadFd - "movd %%mm5, %%ecx\n" + "movd %%mm5, %%ecx\n" // secure_mem "add $0x1000, %%ecx\n" // buf = &scratch "mov $4, %%edx\n" // len = 4 "2:int $0x80\n" @@ -734,13 +887,15 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "3:movd %%mm2, %%ebp\n" "cmp %%ebp, 0x4-0x1000(%%ecx)\n" "jne 25f\n" // exit process - "mov 0x08-0x1000(%%ecx), %%eax\n" - "mov 0x0C-0x1000(%%ecx), %%ebx\n" - "mov 0x14-0x1000(%%ecx), %%edx\n" - "mov 0x18-0x1000(%%ecx), %%esi\n" - "mov 0x1C-0x1000(%%ecx), %%edi\n" - "mov 0x20-0x1000(%%ecx), %%ebp\n" - "mov 0x10-0x1000(%%ecx), %%ecx\n" + "cmp 0x08-0x1000(%%ecx), %%eax\n" + "jne 25f\n" // exit process + "mov 0x0C-0x1000(%%ecx), %%eax\n" + "mov 0x10-0x1000(%%ecx), %%ebx\n" + "mov 0x18-0x1000(%%ecx), %%edx\n" + "mov 0x1C-0x1000(%%ecx), %%esi\n" + "mov 0x20-0x1000(%%ecx), %%edi\n" + "mov 0x24-0x1000(%%ecx), %%ebp\n" + "mov 0x14-0x1000(%%ecx), %%ecx\n" "movd %%edi, %%mm4\n" 
"movd %%ebp, %%mm7\n" "movd %%mm2, %%ebp\n" @@ -773,14 +928,14 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE "mov $125, %%eax\n" // NR_mprotect "int $0x80\n" - "mov %%ebp, 0x50(%%ebx)\n" // set most recently returned SysV shm id + "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id "xor %%ebx, %%ebx\n" // When debugging messages are enabled, warn about expensive system calls #ifndef NDEBUG "movd %%mm5, %%ecx\n" - "cmpw $0, 0x4C(%%ecx)\n" // debug mode - "jz 26f\n" + "cmpw $0, 0x50(%%ecx)\n" // debug mode + "jz 27f\n" "mov $4, %%eax\n" // NR_write "mov $2, %%ebx\n" // fd = stderr "lea 101f, %%ecx\n" // "This is an expensive system call" @@ -789,7 +944,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "xor %%ebx, %%ebx\n" #endif - "jmp 26f\n" // exit program, no message + "jmp 27f\n" // exit program, no message "4:int $0x80\n" "jmp 15f\n" // return result @@ -801,10 +956,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "movd %%mm2, %%ebp\n" "cmp %%ebp, 0x4-0x1000(%%ecx)\n" "jne 25f\n" // exit process + "cmp %%eax, 0x8-0x1000(%%ecx)\n" + "jne 25f\n" // exit process // When debugging messages are enabled, warn about expensive system calls #ifndef NDEBUG - "cmpw $0, 0x4C-0x1000(%%ecx)\n" + "cmpw $0, 0x50-0x1000(%%ecx)\n" "jz 6f\n" // debug mode "mov %%ecx, %%ebp\n" "mov $4, %%eax\n" // NR_write @@ -816,13 +973,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "6:" #endif - "mov 0x08-0x1000(%%ecx), %%eax\n" - "mov 0x0C-0x1000(%%ecx), %%ebx\n" - "mov 0x14-0x1000(%%ecx), %%edx\n" - "mov 0x18-0x1000(%%ecx), %%esi\n" - "mov 0x1C-0x1000(%%ecx), %%edi\n" - "mov 0x20-0x1000(%%ecx), %%ebp\n" - "mov 0x10-0x1000(%%ecx), %%ecx\n" + "mov 0x0C-0x1000(%%ecx), %%eax\n" + "mov 0x10-0x1000(%%ecx), %%ebx\n" + "mov 0x18-0x1000(%%ecx), %%edx\n" + "mov 0x1C-0x1000(%%ecx), %%esi\n" + "mov 0x20-0x1000(%%ecx), %%edi\n" + "mov 
0x24-0x1000(%%ecx), %%ebp\n" + "mov 0x14-0x1000(%%ecx), %%ecx\n" "movd %%edi, %%mm4\n" "movd %%ebp, %%mm7\n" "movd %%mm2, %%ebp\n" @@ -864,7 +1021,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $7, %%eax\n" // NR_waitpid "int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 6\n" + "jz 8b\n" "mov %%ebp, %%eax\n" "jmp 15f\n" // return result @@ -889,7 +1046,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Check in syscallTable whether this system call is unrestricted "12:mov %%eax, %%ebp\n" #ifndef NDEBUG - "cmpw $0, 0x4C-0x1000(%%ecx)\n" + "cmpw $0, 0x50-0x1000(%%ecx)\n" "jnz 13f\n" // debug mode #endif "cmp playground$maxSyscall, %%eax\n" @@ -919,11 +1076,11 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov 0x14(%%ecx), %%ebp\n" "mov 0x04(%%ecx), %%ecx\n" "cmp $252, %%eax\n" // NR_exit_group - "jz 26f\n" // exit program, no message + "jz 27f\n" // exit program, no message "int $0x80\n" // Return result of system call to sandboxed thread - "15:movd %%mm5, %%ecx\n" + "15:movd %%mm5, %%ecx\n" // secure_mem "add $0x101C, %%ecx\n" // buf = &scratch + 28 "mov %%eax, (%%ecx)\n" "mov $4, %%edx\n" // len = 4 @@ -938,8 +1095,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // NR_exit: // Exit trusted thread after cleaning up resources - "18:mov %%edi, %%ecx\n" - "mov 0x64(%%ecx), %%ebx\n" // fd = threadFdPub + "18:mov %%edi, %%ecx\n" // secure_mem + "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub "mov $6, %%eax\n" // NR_close "int $0x80\n" "mov %%ecx, %%ebx\n" // start = secure_mem @@ -966,14 +1123,10 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // resources with the caller (i.e. the previous trusted thread), // and by extension it shares all resources with the sandbox'd // threads. - // N.B. It is possible to make the thread creation code crash before - // it releases seccomp privileges. This is generally OK, as it just - // terminates the program. 
But if we ever support signal handling, - // we have to be careful that the user cannot install a SIGSEGV - // handler that gets executed with elevated privileges. - "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem - "movd %%mm4, %%edi\n" - "movd %%mm7, %%ebp\n" + "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem + "movd %%mm4, %%edi\n" // child_tidptr + "mov %%ecx, %%ebp\n" // remember child stack + "mov $1, %%ecx\n" // stack = 1 "int $0x80\n" // calls NR_clone "cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values "jae 7b\n" // unlock mutex, return result @@ -986,6 +1139,22 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // In nascent thread, now. "sub $2, %%edi\n" "movd %%edi, %%mm2\n" + + // We want to maintain an invalid %esp whenever we access untrusted + // memory. This ensures that even if an attacker can trick us into + // triggering a SIGSEGV, we will never successfully execute a signal + // handler. + // Signal handlers are inherently dangerous, as an attacker could trick + // us into returning to the wrong address by adjusting the signal stack + // right before the handler returns. + // N.B. While POSIX is curiously silent about this, it appears that on + // Linux, alternate signal stacks are a per-thread property. That is + // good. It means that this security mechanism works, even if the + // sandboxed thread manages to set up an alternate signal stack. + // + // TODO(markus): We currently do not support emulating calls to + // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc + // for a discussion on how to fix this, if this ever becomes necessary. "movd %%eax, %%mm3\n" // Request to return from clone() when done // Get thread id of nascent thread @@ -995,23 +1164,20 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Nascent thread creates socketpair() for sending requests to // trusted thread. - // We can create the filehandles on the stack. 
Filehandles are + // We can create the filehandles on the child's stack. Filehandles are // always treated as untrusted. // socketpair(AF_UNIX, SOCK_STREAM, 0, fds) "mov $102, %%eax\n" // NR_socketcall "mov $8, %%ebx\n" // socketpair - "sub $8, %%esp\n" // sv = %rsp - "push %%esp\n" - "xor %%ecx, %%ecx\n" // protocol = 0 - "push %%ecx\n" - "mov $1, %%ecx\n" // type = SOCK_STREAM - "push %%ecx\n" - "push %%ecx\n" // domain = AF_UNIX - "mov %%esp, %%ecx\n" + "sub $8, %%ebp\n" // sv = child_stack + "mov %%ebp, -0x04(%%ebp)\n" + "movl $0, -0x08(%%ebp)\n" // protocol = 0 + "movl $1, -0x0C(%%ebp)\n" // type = SOCK_STREAM + "movl $1, -0x10(%%ebp)\n" // domain = AF_UNIX + "lea -0x10(%%ebp), %%ecx\n" "int $0x80\n" - "add $0x10, %%esp\n" "test %%eax, %%eax\n" - "jz 27f\n" + "jz 28f\n" // If things went wrong, we don't have an (easy) way of signaling // the parent. For our purposes, it is sufficient to fail with a @@ -1043,19 +1209,18 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "lea 100f, %%ecx\n" // "Sandbox violation detected" "mov $101f-100f, %%edx\n" // len = strlen(msg) "int $0x80\n" - "mov $1, %%ebx\n" - "26:mov $252, %%eax\n" // NR_exit_group + "26:mov $1, %%ebx\n" + "27:mov $252, %%eax\n" // NR_exit_group "jmp 24b\n" // The first page is mapped read-only for use as securely shared memory - "27:movd %%mm6, %%ebp\n" - "mov 0x40(%%ebp), %%esi\n" - "movd %%esi, %%mm5\n" // %mm5 = secure shared memory - "movd %%mm2, %%edi\n" - "cmp %%edi, 4(%%ebp)\n" + "28:movd %%mm6, %%edi\n" // %edi = old_shared_mem + "mov 0x44(%%edi), %%ebx\n" // addr = secure_mem + "movd %%ebx, %%mm5\n" // %mm5 = secure_mem + "movd %%mm2, %%esi\n" + "cmp %%esi, 4(%%edi)\n" "jne 25b\n" // exit process "mov $125, %%eax\n" // NR_mprotect - "mov %%esi, %%ebx\n" "mov $4096, %%ecx\n" // len = 4096 "mov $1, %%edx\n" // prot = PROT_READ "int $0x80\n" @@ -1070,13 +1235,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Call clone() to create new trusted 
thread(). // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| // CLONE_SYSVSEM|CLONE_UNTRACED, stack, NULL, NULL, NULL) - "mov 4(%%esp), %%eax\n" + "mov 4(%%ebp), %%eax\n" // threadFd (on child's stack) "movd %%eax, %%mm0\n" // %mm0 = threadFd "mov $120, %%eax\n" // NR_clone "mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR "mov $1, %%ecx\n" // stack = 1 - "movd 0x44(%%ebp), %%mm1\n" // %mm1 = processFdPub - "cmp %%edi, 4(%%ebp)\n" + "movd 0x48(%%edi), %%mm1\n" // %mm1 = processFdPub + "cmp %%esi, 4(%%edi)\n" "jne 25b\n" // exit process "int $0x80\n" "test %%eax, %%eax\n" @@ -1085,86 +1250,146 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Set up thread local storage "mov $0x51, %%eax\n" // seg_32bit, limit_in_pages, useable - "push %%eax\n" + "mov %%eax, -0x04(%%ebp)\n" "mov $0xFFFFF, %%eax\n" // limit - "push %%eax\n" - "add $0x54, %%esi\n" - "push %%esi\n" // base_addr = &secure_mem.TLS + "mov %%eax, -0x08(%%ebp)\n" + "movd %%mm5, %%eax\n" + "add $0x58, %%eax\n" + "mov %%eax, -0x0C(%%ebp)\n" // base_addr = &secure_mem.TLS "mov %%fs, %%eax\n" "shr $3, %%eax\n" - "push %%eax\n" // entry_number + "mov %%eax, -0x10(%%ebp)\n" // entry_number "mov $243, %%eax\n" // NR_set_thread_area - "mov %%esp, %%ebx\n" + "lea -0x10(%%ebp), %%ebx\n" "int $0x80\n" "test %%eax, %%eax\n" "jnz 25b\n" // exit process - "add $16, %%esp\n" + + // Copy the caller's signal mask + "movd %%mm5, %%edx\n" + "mov 0x1038(%%edi), %%eax\n" + "mov %%eax, 0x1038(%%edx)\n" + "mov 0x103C(%%edi), %%eax\n" + "mov %%eax, 0x103C(%%edx)\n" // Done creating trusted thread. We can now get ready to return to caller - "mov 0(%%esp), %%esi\n" // %esi = threadFdPub - "add $8, %%esp\n" + "mov 0(%%ebp), %%esi\n" // %esi = threadFdPub + "add $8, %%ebp\n" // Check whether this is the initial thread, or a newly created one. // At startup we run the same code as when we create a new thread. 
At - // the very top of this function, you will find that we store 999(%rip) + // the very top of this function, you will find that we store 999f // in %%mm3. That is the signal that we should return on the same // stack rather than return to where clone was called. "movd %%mm3, %%eax\n" + "movd %%mm2, %%edx\n" "test %%eax, %%eax\n" - "jne 28f\n" + "jne 29f\n" // Returning from clone() into the newly created thread is special. We // cannot unroll the stack, as we just set up a new stack for this // thread. We have to explicitly restore CPU registers to the values // that they had when the program originally called clone(). - "mov 0x24(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x28(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x2C(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x30(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x34(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x38(%%ebp), %%eax\n" - "push %%eax\n" - "mov 0x3C(%%ebp), %%eax\n" - "push %%eax\n" - "cmp %%edi, 4(%%ebp)\n" + // We patch the register values in the signal stack frame so that we + // can ask sigreturn() to restore all registers for us. + "sub $0x4, %%ebp\n" + "mov 0x28(%%edi), %%eax\n" + "mov %%eax, 0x00(%%ebp)\n" // return address + "xor %%eax, %%eax\n" + "mov %%eax, 0x30(%%ebp)\n" // %eax = 0 + "mov 0x2C(%%edi), %%eax\n" + "mov %%eax, 0x1C(%%ebp)\n" // %ebp + "mov 0x30(%%edi), %%eax\n" + "mov %%eax, 0x14(%%ebp)\n" // %edi + "mov 0x34(%%edi), %%eax\n" + "mov %%eax, 0x18(%%ebp)\n" // %esi + "mov 0x38(%%edi), %%eax\n" + "mov %%eax, 0x28(%%ebp)\n" // %edx + "mov 0x3C(%%edi), %%eax\n" + "mov %%eax, 0x2C(%%ebp)\n" // %ecx + "mov 0x40(%%edi), %%eax\n" + "mov %%eax, 0x24(%%ebp)\n" // %ebx + "cmp %%edx, 4(%%edi)\n" "jne 25b\n" // exit process // Nascent thread launches a helper that doesn't share any of our // resources, except for pages mapped as MAP_SHARED. 
- // clone(0, %esp) - "28:mov $120, %%eax\n" // NR_clone + // clone(SIGCHLD, stack=1) + "29:mov $120, %%eax\n" // NR_clone "mov $17, %%ebx\n" // flags = SIGCHLD - "mov %%esp, %%ecx\n" // stack = %esp + "mov $1, %%ecx\n" // stack = 1 "int $0x80\n" "test %%eax, %%eax\n" "js 25b\n" // exit process - "jne 29f\n" + "jne 31f\n" // Use sendmsg() to send to the trusted process the file handles for // communicating with the new trusted thread. We also send the address // of the secure memory area (for sanity checks) and the thread id. - "push %%esi\n" // threadFdPub - "movd %%mm4, %%eax\n" // threadId - "push %%eax\n" - "movd %%mm5, %%eax\n" // secure_mem - "push %%eax\n" - "mov %%esp, %%ebx\n" // buf = &data - "mov $12, %%eax\n" // len = sizeof(void*) + 2*sizeof(int) - "push %%eax\n" - "push %%ebx\n" - "movd %%mm0, %%eax\n" // fd1 = threadFd - "push %%eax\n" - "push %%esi\n" // fd0 = threadFdPub - "mov 0x48(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub() - "cmp %%edi, 4(%%ebp)\n" + "cmp %%edx, 4(%%edi)\n" "jne 25b\n" // exit process - "push %%eax\n" - "call playground$sendFd\n" + + // 0x00 socketcall: + // 0x00 socket (0x4C(%edi)) + // 0x04 msg (%ecx + 0x0C) + // 0x08 flags ($0) + // 0x0C msg: + // 0x0C msg_name ($0) + // 0x10 msg_namelen ($0) + // 0x14 msg_iov (%ecx + 0x34) + // 0x18 msg_iovlen ($1) + // 0x1C msg_control (%ecx + 0x3C) + // 0x20 msg_controllen ($0x14) + // 0x24 data: + // 0x24 msg_flags/err ($0) + // 0x28 secure_mem (%mm5) + // 0x2C threadId (%mm4) + // 0x30 threadFdPub (%esi) + // 0x34 iov: + // 0x34 iov_base (%ecx + 0x24) + // 0x38 iov_len ($0x10) + // 0x3C cmsg: + // 0x3C cmsg_len ($0x14) + // 0x40 cmsg_level ($1, SOL_SOCKET) + // 0x44 cmsg_type ($1, SCM_RIGHTS) + // 0x48 threadFdPub (%esi) + // 0x4C threadFd (%mm0) + // 0x50 + "lea -0x50(%%ebp), %%ecx\n" + "xor %%eax, %%eax\n" + "mov %%eax, 0x08(%%ecx)\n" // flags + "mov %%eax, 0x0C(%%ecx)\n" // msg_name + "mov %%eax, 0x10(%%ecx)\n" // msg_namelen + "mov %%eax, 0x24(%%ecx)\n" // msg_flags + "inc 
%%eax\n" + "mov %%eax, 0x18(%%ecx)\n" // msg_iovlen + "mov %%eax, 0x40(%%ecx)\n" // cmsg_level + "mov %%eax, 0x44(%%ecx)\n" // cmsg_type + "movl $0x10, 0x38(%%ecx)\n" // iov_len + "mov $0x14, %%eax\n" + "mov %%eax, 0x20(%%ecx)\n" // msg_controllen + "mov %%eax, 0x3C(%%ecx)\n" // cmsg_len + "mov 0x4C(%%edi), %%eax\n" // cloneFdPub + "mov %%eax, 0x00(%%ecx)\n" // socket + "lea 0x0C(%%ecx), %%eax\n" + "mov %%eax, 0x04(%%ecx)\n" // msg + "add $0x18, %%eax\n" + "mov %%eax, 0x34(%%ecx)\n" // iov_base + "add $0x10, %%eax\n" + "mov %%eax, 0x14(%%ecx)\n" // msg_iov + "add $8, %%eax\n" + "mov %%eax, 0x1C(%%ecx)\n" // msg_control + "mov %%esi, 0x30(%%ecx)\n" // threadFdPub + "mov %%esi, 0x48(%%ecx)\n" // threadFdPub + "movd %%mm5, %%eax\n" + "mov %%eax, 0x28(%%ecx)\n" // secure_mem + "movd %%mm4, %%eax\n" + "mov %%eax, 0x2C(%%ecx)\n" // threadId + "movd %%mm0, %%eax\n" + "mov %%eax, 0x4C(%%ecx)\n" // threadFd + "mov $16, %%ebx\n" // sendmsg() + "mov $102, %%eax\n" // NR_socketcall + "int $0x80\n" // Release syscall_mutex_. This signals the trusted process that // it can write into the original thread's secure memory again. 
@@ -1173,31 +1398,42 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $4096, %%ecx\n" "mov $3, %%edx\n" // PROT_READ | PROT_WRITE "int $0x80\n" + "movd %%mm2, %%edx\n" + "cmp %%edx, 0x4(%%edi)\n" + "jnz 25b\n" // exit process "lock; addl $0x80000000, (%%ebx)\n" - "jz 26b\n" // exit process (no error message) + "jz 30f\n" // exit process (no error message) "mov $1, %%edx\n" "mov %%edx, %%ecx\n" // FUTEX_WAKE "mov $240, %%eax\n" // NR_futex "int $0x80\n" - "jmp 26b\n" // exit process (no error message) + "30:xor %%ebx, %%ebx\n" + "jmp 27b\n" // exit process (no error message) // Reap helper - "29:mov %%eax, %%ebx\n" - "30:xor %%ecx, %%ecx\n" + "31:mov %%eax, %%ebx\n" + "32:lea -4(%%ebp), %%ecx\n" "xor %%edx, %%edx\n" "mov $7, %%eax\n" // NR_waitpid "int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 30b\n" + "jz 32b\n" + "mov -4(%%ebp), %%eax\n" + "test %%eax, %%eax\n" + "jnz 26b\n" // exit process (no error message) // Release privileges by entering seccomp mode. - "mov $172, %%eax\n" // NR_prctl + "33:mov $172, %%eax\n" // NR_prctl "mov $22, %%ebx\n" // PR_SET_SECCOMP "mov $1, %%ecx\n" "int $0x80\n" "test %%eax, %%eax\n" "jnz 25b\n" // exit process + // We can finally start using the stack. Signal handlers no longer pose + // a threat to us. + "mov %%ebp, %%esp\n" + // Back in the newly created sandboxed thread, wait for trusted process // to receive request. It is possible for an attacker to make us // continue even before the trusted process is done. This is OK. It'll @@ -1205,12 +1441,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // data is considered untrusted anyway. 
"push %%eax\n" "mov $1, %%edx\n" // len = 1 - "mov %%esp, %%ecx\n" // buf = %rsp + "mov %%esp, %%ecx\n" // buf = %esp "mov %%esi, %%ebx\n" // fd = threadFdPub - "31:mov $3, %%eax\n" // NR_read + "34:mov $3, %%eax\n" // NR_read "int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 31b\n" + "jz 34b\n" "cmp %%edx, %%eax\n" "jne 25b\n" // exit process "pop %%eax\n" @@ -1223,19 +1459,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // operations. "emms\n" - // Returning to createTrustedThread() "test %%ebx, %%ebx\n" - "jz 32f\n" - "jmp *%%ebx\n" - - // Returning to the place where clone() had been called - "32:pop %%ebx\n" - "pop %%ecx\n" - "pop %%edx\n" - "pop %%esi\n" - "pop %%edi\n" - "pop %%ebp\n" - "ret\n" + "jnz 35f\n" // Returning to createTrustedThread() + + // Returning to the place where clone() had been called. We rely on + // using sigreturn() for restoring our registers. The caller already + // created a signal stack frame, and we patched the register values + // with the ones that were in effect prior to calling sandbox_clone(). + "pop %%ebx\n" + "35:mov %%ebx, 0x38(%%esp)\n" // compute new %eip + "mov $119, %%eax\n" // NR_sigreturn + "int $0x80\n" ".pushsection \".rodata\"\n" "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" diff --git a/sandbox/sandbox.gyp b/sandbox/sandbox.gyp index a835089..b73c1e5 100644 --- a/sandbox/sandbox.gyp +++ b/sandbox/sandbox.gyp @@ -181,6 +181,7 @@ 'linux/seccomp/sandbox_impl.h', 'linux/seccomp/securemem.cc', 'linux/seccomp/securemem.h', + 'linux/seccomp/sigprocmask.cc', 'linux/seccomp/socketcall.cc', 'linux/seccomp/stat.cc', 'linux/seccomp/syscall.cc', |