summaryrefslogtreecommitdiffstats
path: root/sandbox
diff options
context:
space:
mode:
Diffstat (limited to 'sandbox')
-rw-r--r--sandbox/linux/seccomp/clone.cc92
-rw-r--r--sandbox/linux/seccomp/debug.cc18
-rw-r--r--sandbox/linux/seccomp/sandbox.cc154
-rw-r--r--sandbox/linux/seccomp/sandbox_impl.h136
-rw-r--r--sandbox/linux/seccomp/securemem.cc18
-rw-r--r--sandbox/linux/seccomp/securemem.h7
-rw-r--r--sandbox/linux/seccomp/sigprocmask.cc120
-rw-r--r--sandbox/linux/seccomp/syscall.cc10
-rw-r--r--sandbox/linux/seccomp/syscall_table.c12
-rw-r--r--sandbox/linux/seccomp/trusted_process.cc2
-rw-r--r--sandbox/linux/seccomp/trusted_thread.cc804
-rw-r--r--sandbox/sandbox.gyp1
12 files changed, 965 insertions, 409 deletions
diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc
index 148bae5..0bf91c1 100644
--- a/sandbox/linux/seccomp/clone.cc
+++ b/sandbox/linux/seccomp/clone.cc
@@ -7,7 +7,7 @@
namespace playground {
-int Sandbox::sandbox_clone(int flags, void* stack, int* pid, int* ctid,
+int Sandbox::sandbox_clone(int flags, char* stack, int* pid, int* ctid,
void* tls, void *wrapper_sp) {
long long tm;
Debug::syscall(&tm, __NR_clone, "Executing handler");
@@ -24,25 +24,77 @@ int Sandbox::sandbox_clone(int flags, void* stack, int* pid, int* ctid,
request.clone_req.ctid = ctid;
request.clone_req.tls = tls;
- // Pass along the address on the stack where syscallWrapper() stored the
- // original CPU registers. These registers will be restored in the newly
- // created thread prior to returning from the wrapped system call.
- #if defined(__x86_64__)
- memcpy(&request.clone_req.regs64, wrapper_sp,
- sizeof(request.clone_req.regs64) + sizeof(void *));
- #elif defined(__i386__)
- memcpy(&request.clone_req.regs32, wrapper_sp,
- sizeof(request.clone_req.regs32) + sizeof(void *));
- #else
- #error Unsupported target platform
- #endif
-
+ // TODO(markus): Passing stack == 0 currently does not do the same thing
+ // that the kernel would do without the sandbox. This is just going to
+ // cause a crash. We should detect this case, and replace the stack pointer
+ // with the correct value, instead.
+ // This is complicated by the fact that we will temporarily be executing
+ // both threads from the same stack. Some synchronization will be necessary.
+ // Fortunately, this complication also explains why hardly anybody ever
+ // does this.
+ // See trusted_thread.cc for more information.
long rc;
- SysCalls sys;
- if (write(sys, processFdPub(), &request, sizeof(request)) !=
- sizeof(request) ||
- read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
- die("Failed to forward clone() request [sandbox]");
+ if (stack == 0) {
+ rc = -EINVAL;
+ } else {
+ // Pass along the address on the stack where syscallWrapper() stored the
+ // original CPU registers. These registers will be restored in the newly
+ // created thread prior to returning from the wrapped system call.
+ #if defined(__x86_64__)
+ memcpy(&request.clone_req.regs64, wrapper_sp,
+ sizeof(request.clone_req.regs64) + sizeof(void *));
+ #elif defined(__i386__)
+ memcpy(&request.clone_req.regs32, wrapper_sp,
+ sizeof(request.clone_req.regs32) + sizeof(void *));
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // In order to unblock the signal mask in the newly created thread and
+ // after entering Seccomp mode, we have to call sigreturn(). But that
+ // requires access to a proper stack frame describing a valid signal.
+ // We trigger a signal now and make sure the stack frame ends up on the
+ // new stack. Our segv() handler (in sandbox.cc) does that for us.
+ // See trusted_thread.cc for more details on how threads get created.
+ //
+ // In general we rely on the kernel for generating the signal stack
+ // frame, as the exact binary format has been extended several times over
+ // the course of the kernel's development. Fortunately, the kernel
+ // developers treat the initial part of the stack frame as a stable part
+ // of the ABI. So, we can rely on fixed, well-defined offsets for accessing
+ // register values and for accessing the signal mask.
+ #if defined(__x86_64__) || defined(__i386__)
+ #if defined(__x86_64__)
+ // Red zone compensation. The instrumented system call will remove 128
+ // bytes from the thread's stack prior to returning to the original
+ // call site.
+ stack -= 128;
+ request.clone_req.stack = stack;
+ #endif
+ asm("int $0"
+ : "=m"(request.clone_req.stack)
+ : "a"(__NR_clone + 0xF000), "d"(&request.clone_req.stack)
+ : "memory");
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Adjust the signal stack frame so that it contains the correct stack
+ // pointer upon returning from sigreturn().
+ #if defined(__x86_64__)
+ *(char **)(request.clone_req.stack + 0xA0) = stack;
+ #elif defined(__i386__)
+ *(char **)(request.clone_req.stack + 0x1C) = stack;
+ #else
+ #error Unsupported target platform
+ #endif
+
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward clone() request [sandbox]");
+ }
}
Debug::elapsed(tm, __NR_clone);
return static_cast<int>(rc);
@@ -64,7 +116,7 @@ bool Sandbox::process_clone(int parentMapsFd, int sandboxFd, int threadFdPub,
SecureMem::abandonSystemCall(threadFd, -EPERM);
return false;
} else {
- SecureMem::Args* newMem = getSecureMem();
+ SecureMem::Args* newMem = getNewSecureMem();
if (!newMem) {
SecureMem::abandonSystemCall(threadFd, -ENOMEM);
return false;
diff --git a/sandbox/linux/seccomp/debug.cc b/sandbox/linux/seccomp/debug.cc
index e4d6410..5d6de49 100644
--- a/sandbox/linux/seccomp/debug.cc
+++ b/sandbox/linux/seccomp/debug.cc
@@ -140,8 +140,8 @@ bool Debug::enter() {
asm volatile("mov %%gs, %0\n"
"test %0, %0\n"
"jz 1f\n"
- "movl %%gs:0x1050-0xD8, %0\n"
- "incl %%gs:0x1050-0xD8\n"
+ "movl %%gs:0x1050-0xE0, %0\n"
+ "incl %%gs:0x1050-0xE0\n"
"1:\n"
: "=r"(level)
:
@@ -150,8 +150,8 @@ bool Debug::enter() {
asm volatile("mov %%fs, %0\n"
"test %0, %0\n"
"jz 1f\n"
- "movl %%fs:0x1034-0x54, %0\n"
- "incl %%fs:0x1034-0x54\n"
+ "movl %%fs:0x1034-0x58, %0\n"
+ "incl %%fs:0x1034-0x58\n"
"1:\n"
: "=r"(level)
:
@@ -178,8 +178,8 @@ bool Debug::leave() {
asm volatile("mov %%gs, %0\n"
"test %0, %0\n"
"jz 1f\n"
- "decl %%gs:0x1050-0xD8\n"
- "movl %%gs:0x1050-0xD8, %0\n"
+ "decl %%gs:0x1050-0xE0\n"
+ "movl %%gs:0x1050-0xE0, %0\n"
"1:\n"
: "=r"(level)
:
@@ -188,8 +188,8 @@ bool Debug::leave() {
asm volatile("mov %%fs, %0\n"
"test %0, %0\n"
"jz 1f\n"
- "decl %%fs:0x1034-0x54\n"
- "movl %%fs:0x1034-0x54, %0\n"
+ "decl %%fs:0x1034-0x58\n"
+ "movl %%fs:0x1034-0x58, %0\n"
"1:\n"
: "=r"(level)
:
@@ -234,7 +234,7 @@ void Debug::gettimeofday(long long* tm) {
// Zero out the lastSyscallNum, so that we don't try to coalesce
// calls to gettimeofday(). For debugging purposes, we need the
// exact time.
- asm volatile("movl $0, %fs:0x102C-0x54");
+ asm volatile("movl $0, %fs:0x102C-0x58");
#elif !defined(__x86_64__)
#error Unsupported target platform
#endif
diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc
index 12f0c0f..b7a249e 100644
--- a/sandbox/linux/seccomp/sandbox.cc
+++ b/sandbox/linux/seccomp/sandbox.cc
@@ -112,25 +112,23 @@ bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) {
}
void Sandbox::setupSignalHandlers() {
+ // Set SIGCHLD to SIG_DFL so that waitpid() can work
SysCalls sys;
struct SysCalls::kernel_sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler_ = SIG_DFL;
sys.sigaction(SIGCHLD, &sa, NULL);
- // Set up SEGV handler for dealing with RDTSC instructions
+ // Set up SEGV handler for dealing with RDTSC instructions, system calls
+ // that have been rewritten to use INT0, and for sigprocmask() emulation.
sa.sa_handler_ = segv();
sys.sigaction(SIGSEGV, &sa, NULL);
- // Block all asynchronous signals, except for SIGCHLD which needs to be
- // set to SIG_DFL for waitpid() to work.
+ // Unblock SIGSEGV and SIGCHLD
SysCalls::kernel_sigset_t mask;
- memset(&mask, 0xFF, sizeof(mask));
- mask.sig[0] &= ~((1 << (SIGSEGV - 1)) | (1 << (SIGINT - 1)) |
- (1 << (SIGTERM - 1)) | (1 << (SIGQUIT - 1)) |
- (1 << (SIGHUP - 1)) | (1 << (SIGABRT - 1)) |
- (1 << (SIGCHLD - 1)));
- sys.sigprocmask(SIG_SETMASK, &mask, 0);
+ memset(&mask, 0x00, sizeof(mask));
+ mask.sig[0] |= (1 << (SIGSEGV - 1)) | (1 << (SIGCHLD - 1));
+ sys.sigprocmask(SIG_UNBLOCK, &mask, 0);
}
void (*Sandbox::segv())(int signo) {
@@ -158,7 +156,7 @@ void (*Sandbox::segv())(int signo) {
"sub $4, %%rsp\n"
"push %%r14\n"
"mov %%gs:16, %%edi\n" // fd = threadFdPub
- "mov %%rsp, %%rsi\n" // buf = %esp
+ "mov %%rsp, %%rsi\n" // buf = %rsp
"mov $4, %%edx\n" // len = sizeof(int)
"1:mov $1, %%eax\n" // NR_write
"syscall\n"
@@ -199,8 +197,8 @@ void (*Sandbox::segv())(int signo) {
// of playground::Library being unable to find a way to safely
// rewrite the system call instruction. Retrieve the CPU register
// at the time of the segmentation fault and invoke syscallWrapper().
- "8:cmpw $0xCD, (%%r15)\n" // INT $0x0
- "jnz 9f\n"
+ "8:cmpw $0x00CD, (%%r15)\n" // INT $0x0
+ "jnz 14f\n"
#ifndef NDEBUG
"lea 200f(%%rip), %%rdi\n"
"call playground$debugMessage\n"
@@ -212,7 +210,53 @@ void (*Sandbox::segv())(int signo) {
"mov 0x40(%%rsp), %%r10\n" // %r10 at time of segmentation fault
"mov 0x30(%%rsp), %%r8\n" // %r8 at time of segmentation fault
"mov 0x38(%%rsp), %%r9\n" // %r9 at time of segmentation fault
- "lea 7b(%%rip), %%rcx\n"
+
+ // Handle rt_sigprocmask()
+ "cmp $14, %%rax\n" // NR_rt_sigprocmask
+ "jnz 12f\n"
+ "mov $-22, %%rax\n" // -EINVAL
+ "cmp $8, %%r10\n" // %r10 = sigsetsize (8 bytes = 64 signals)
+ "jl 7b\n"
+ "mov 0x130(%%rsp), %%r10\n" // signal mask at time of segmentation fault
+ "test %%rsi, %%rsi\n" // only set mask, if set is non-NULL
+ "jz 11f\n"
+ "mov 0(%%rsi), %%rsi\n"
+ "cmp $0, %%rdi\n" // %rdi = how (SIG_BLOCK)
+ "jnz 9f\n"
+ "or %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault
+ "jmp 11f\n"
+ "9:cmp $1, %%rdi\n" // %rdi = how (SIG_UNBLOCK)
+ "jnz 10f\n"
+ "xor $-1, %%rsi\n"
+ "and %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault
+ "jmp 11f\n"
+ "10:cmp $2, %%rdi\n" // %rdi = how (SIG_SETMASK)
+ "jnz 7b\n"
+ "mov %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault
+ "11:xor %%rax, %%rax\n"
+ "test %%rdx, %%rdx\n" // only return old mask, if old_set is non-NULL
+ "jz 7b\n"
+ "mov %%r10, 0(%%rdx)\n" // old_set
+ "jmp 7b\n"
+
+
+ // Copy signal frame onto new stack. See clone.cc for details
+ "12:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000
+ "jnz 13f\n"
+ "mov 0xA8(%%rsp), %%rcx\n" // %rsp at time of segmentation fault
+ "sub %%rsp, %%rcx\n" // %rcx = size of stack frame
+ "sub $8, %%rcx\n" // skip return address
+ "mov %%rcx, %%rax\n" // return size of signal stack frame
+ "mov 0(%%rdx), %%rdi\n" // stack for newly clone()'d thread
+ "sub %%rcx, %%rdi\n" // copy onto new stack
+ "mov %%rdi, 0(%%rdx)\n" // allocate space on new stack
+ "lea 8(%%rsp), %%rsi\n" // copy from current stack
+ "cld\n"
+ "rep movsb\n"
+ "jmp 7b\n"
+
+ // Forward system call to syscallWrapper()
+ "13:lea 7b(%%rip), %%rcx\n"
"push %%rcx\n"
"push 0xB8(%%rsp)\n" // %rip at time of segmentation fault
"lea playground$syscallWrapper(%%rip), %%rcx\n"
@@ -221,7 +265,7 @@ void (*Sandbox::segv())(int signo) {
// This was a genuine segmentation fault. Trigger the kernel's default
// signal disposition. The only way we can do this from seccomp mode
// is by blocking the signal and retriggering it.
- "9:mov $2, %%edi\n" // stderr
+ "14:mov $2, %%edi\n" // stderr
"lea 300f(%%rip), %%rsi\n" // "Segmentation fault\n"
"mov $301f-300f, %%edx\n"
"mov $1, %%eax\n" // NR_write
@@ -293,8 +337,8 @@ void (*Sandbox::segv())(int signo) {
// of playground::Library being unable to find a way to safely
// rewrite the system call instruction. Retrieve the CPU register
// at the time of the segmentation fault and invoke syscallWrapper().
- "8:cmpw $0xCD, (%%ebp)\n" // INT $0x0
- "jnz 9f\n"
+ "8:cmpw $0x00CD, (%%ebp)\n" // INT $0x0
+ "jnz 16f\n"
#ifndef NDEBUG
"lea 200f, %%eax\n"
"push %%eax\n"
@@ -308,13 +352,69 @@ void (*Sandbox::segv())(int signo) {
"mov 0x1C(%%esp), %%esi\n" // %esi at time of segmentation fault
"mov 0x18(%%esp), %%edi\n" // %edi at time of segmentation fault
"mov 0x20(%%esp), %%ebp\n" // %ebp at time of segmentation fault
- "call playground$syscallWrapper\n"
+
+ // Handle sigprocmask() and rt_sigprocmask()
+ "cmp $175, %%eax\n" // NR_rt_sigprocmask
+ "jnz 9f\n"
+ "mov $-22, %%eax\n" // -EINVAL
+ "cmp $8, %%esi\n" // %esi = sigsetsize (8 bytes = 64 signals)
+ "jl 7b\n"
+ "jmp 10f\n"
+ "9:cmp $126, %%eax\n" // NR_sigprocmask
+ "jnz 14f\n"
+ "mov $-22, %%eax\n"
+ "10:mov 0x58(%%esp), %%edi\n" // signal mask at time of segmentation fault
+ "mov 0x5C(%%esp), %%ebp\n"
+ "test %%ecx, %%ecx\n" // only set mask, if set is non-NULL
+ "jz 13f\n"
+ "mov 0(%%ecx), %%esi\n"
+ "mov 4(%%ecx), %%ecx\n"
+ "cmp $0, %%ebx\n" // %ebx = how (SIG_BLOCK)
+ "jnz 11f\n"
+ "or %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault
+ "or %%ecx, 0x5C(%%esp)\n"
+ "jmp 13f\n"
+ "11:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK)
+ "jnz 12f\n"
+ "xor $-1, %%esi\n"
+ "xor $-1, %%ecx\n"
+ "and %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault
+ "and %%ecx, 0x5C(%%esp)\n"
+ "jmp 13f\n"
+ "12:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK)
+ "jnz 7b\n"
+ "mov %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault
+ "mov %%ecx, 0x5C(%%esp)\n"
+ "13:xor %%eax, %%eax\n"
+ "test %%edx, %%edx\n" // only return old mask, if old_set is non-NULL
+ "jz 7b\n"
+ "mov %%edi, 0(%%edx)\n" // old_set
+ "mov %%ebp, 4(%%edx)\n"
+ "jmp 7b\n"
+
+ // Copy signal frame onto new stack. See clone.cc for details
+ "14:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000
+ "jnz 15f\n"
+ "mov 0x24(%%esp), %%ecx\n" // %esp at time of segmentation fault
+ "sub %%esp, %%ecx\n" // %ecx = size of stack frame
+ "sub $8, %%ecx\n" // skip return address and dummy
+ "mov %%ecx, %%eax\n" // return size of signal stack frame
+ "mov 0(%%edx), %%edi\n" // stack for newly clone()'d thread
+ "sub %%ecx, %%edi\n" // copy onto new stack
+ "mov %%edi, 0(%%edx)\n" // allocate space on new stack
+ "lea 8(%%esp), %%esi\n" // copy from current stack
+ "cld\n"
+ "rep movsb\n"
+ "jmp 7b\n"
+
+ // Forward system call to syscallWrapper()
+ "15:call playground$syscallWrapper\n"
"jmp 7b\n"
// This was a genuine segmentation fault. Trigger the kernel's default
// signal disposition. The only way we can do this from seccomp mode
// is by blocking the signal and retriggering it.
- "9:mov $2, %%ebx\n" // stderr
+ "16:mov $2, %%ebx\n" // stderr
"lea 300f, %%ecx\n" // "Segmentation fault\n"
"mov $301f-300f, %%edx\n"
"mov $4, %%eax\n" // NR_write
@@ -345,6 +445,24 @@ void (*Sandbox::segv())(int signo) {
return fnc;
}
+SecureMem::Args* Sandbox::getSecureMem() {
+ // Fetch the secure memory pointer stored at a magic offset from the TLS base
+ // (%gs on x86-64, %fs on i386). Check trusted_thread.cc for that offset.
+ SecureMem::Args* ret;
+#if defined(__x86_64__)
+ asm volatile(
+ "movq %%gs:-0xE0, %0\n"
+ : "=q"(ret));
+#elif defined(__i386__)
+ asm volatile(
+ "movl %%fs:-0x58, %0\n"
+ : "=r"(ret));
+#else
+#error Unsupported target platform
+#endif
+ return ret;
+}
+
void Sandbox::snapshotMemoryMappings(int processFd, int proc_self_maps) {
SysCalls sys;
if (sys.lseek(proc_self_maps, 0, SEEK_SET) ||
diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h
index 18a359c..36f01c8 100644
--- a/sandbox/linux/seccomp/sandbox_impl.h
+++ b/sandbox/linux/seccomp/sandbox_impl.h
@@ -56,7 +56,7 @@ class Sandbox {
// "proc_fd" should be a file descriptor for "/proc", or -1 if not provided
// by the caller.
static int supportsSeccompSandbox(int proc_fd)
- asm("SupportsSeccompSandbox");
+ asm("SupportsSeccompSandbox");
// The sandbox needs to be able to access "/proc/self/maps". If this file
// is not accessible when "startSandbox()" gets called, the caller can
@@ -64,12 +64,12 @@ class Sandbox {
// The sandbox becomes the newer owner of this file descriptor and will
// eventually close it when "startSandbox()" executes.
static void setProcSelfMaps(int proc_self_maps)
- asm("SeccompSandboxSetProcSelfMaps");
+ asm("SeccompSandboxSetProcSelfMaps");
// This is the main public entry point. It finds all system calls that
// need rewriting, sets up the resources needed by the sandbox, and
// enters Seccomp mode.
- static void startSandbox() asm("StartSeccompSandbox");
+ static void startSandbox() asm("StartSeccompSandbox");
private:
// syscall_table.c has to be implemented in C, as C++ does not support
@@ -84,7 +84,7 @@ class Sandbox {
// Clone() is special as it has a wrapper in syscall_table.c. The wrapper
// adds one extra argument (the pointer to the saved registers) and then
// calls playground$sandbox__clone().
- static int sandbox_clone(int flags, void* stack, int* pid, int* ctid,
+ static int sandbox_clone(int flags, char* stack, int* pid, int* ctid,
void* tls, void* wrapper_sp)
asm("playground$sandbox__clone")
#if defined(__x86_64__)
@@ -96,130 +96,142 @@ class Sandbox {
#define bool int
#define SecureMemArgs void
// This is the wrapper entry point that is found in the syscall_table.
- int sandbox_clone(int flags, void* stack, int* pid, int* ctid, void* tls)
- asm("playground$sandbox_clone");
+ int sandbox_clone(int flags, char* stack, int* pid, int* ctid, void* tls)
+ asm("playground$sandbox_clone");
#endif
// Entry points for sandboxed code that is attempting to make system calls
STATIC int sandbox_access(const char*, int)
- asm("playground$sandbox_access");
- STATIC int sandbox_exit(int status) asm("playground$sandbox_exit");
- STATIC int sandbox_getpid() asm("playground$sandbox_getpid");
+ asm("playground$sandbox_access");
+ STATIC int sandbox_exit(int status) asm("playground$sandbox_exit");
+ STATIC int sandbox_getpid() asm("playground$sandbox_getpid");
#if defined(__NR_getsockopt)
STATIC int sandbox_getsockopt(int, int, int, void*, socklen_t*)
- asm("playground$sandbox_getsockopt");
+ asm("playground$sandbox_getsockopt");
#endif
- STATIC int sandbox_gettid() asm("playground$sandbox_gettid");
+ STATIC int sandbox_gettid() asm("playground$sandbox_gettid");
STATIC int sandbox_ioctl(int d, int req, void* arg)
- asm("playground$sandbox_ioctl");
+ asm("playground$sandbox_ioctl");
#if defined(__NR_ipc)
STATIC int sandbox_ipc(unsigned, int, int, int, void*, long)
- asm("playground$sandbox_ipc");
+ asm("playground$sandbox_ipc");
#endif
STATIC int sandbox_lstat(const char* path, void* buf)
- asm("playground$sandbox_lstat");
+ asm("playground$sandbox_lstat");
#if defined(__NR_lstat64)
STATIC int sandbox_lstat64(const char *path, void* b)
- asm("playground$sandbox_lstat64");
+ asm("playground$sandbox_lstat64");
#endif
STATIC int sandbox_madvise(void*, size_t, int)
- asm("playground$sandbox_madvise");
+ asm("playground$sandbox_madvise");
STATIC void *sandbox_mmap(void* start, size_t length, int prot, int flags,
int fd, off_t offset)
- asm("playground$sandbox_mmap");
+ asm("playground$sandbox_mmap");
STATIC int sandbox_mprotect(const void*, size_t, int)
- asm("playground$sandbox_mprotect");
+ asm("playground$sandbox_mprotect");
STATIC int sandbox_munmap(void* start, size_t length)
- asm("playground$sandbox_munmap");
+ asm("playground$sandbox_munmap");
STATIC int sandbox_open(const char*, int, mode_t)
- asm("playground$sandbox_open");
+ asm("playground$sandbox_open");
#if defined(__NR_recvfrom)
STATIC ssize_t sandbox_recvfrom(int, void*, size_t, int, void*, socklen_t*)
- asm("playground$sandbox_recvfrom");
+ asm("playground$sandbox_recvfrom");
STATIC ssize_t sandbox_recvmsg(int, struct msghdr*, int)
- asm("playground$sandbox_recvmsg");
+ asm("playground$sandbox_recvmsg");
+ #endif
+ #if defined(__NR_rt_sigprocmask)
+ STATIC int sandbox_rt_sigprocmask(int how, const void*, void*, size_t)
+ asm("playground$sandbox_rt_sigprocmask");
+ #endif
+ #if defined(__NR_sendmsg)
STATIC size_t sandbox_sendmsg(int, const struct msghdr*, int)
- asm("playground$sandbox_sendmsg");
+ asm("playground$sandbox_sendmsg");
STATIC ssize_t sandbox_sendto(int, const void*, size_t, int, const void*,
socklen_t)asm("playground$sandbox_sendto");
+ #endif
#if defined(__NR_shmat)
STATIC void* sandbox_shmat(int, const void*, int)
- asm("playground$sandbox_shmat");
+ asm("playground$sandbox_shmat");
STATIC int sandbox_shmctl(int, int, void*)
- asm("playground$sandbox_shmctl");
- STATIC int sandbox_shmdt(const void*) asm("playground$sandbox_shmdt");
+ asm("playground$sandbox_shmctl");
+ STATIC int sandbox_shmdt(const void*) asm("playground$sandbox_shmdt");
STATIC int sandbox_shmget(int, size_t, int)
- asm("playground$sandbox_shmget");
+ asm("playground$sandbox_shmget");
#endif
+ #if defined(__NR_setsockopt)
STATIC int sandbox_setsockopt(int, int, int, const void*, socklen_t)
- asm("playground$sandbox_setsockopt");
+ asm("playground$sandbox_setsockopt");
+ #endif
+ #if defined(__NR_sigprocmask)
+ STATIC int sandbox_sigprocmask(int how, const void*, void*)
+ asm("playground$sandbox_sigprocmask");
#endif
#if defined(__NR_socketcall)
STATIC int sandbox_socketcall(int call, void* args)
- asm("playground$sandbox_socketcall");
+ asm("playground$sandbox_socketcall");
#endif
STATIC int sandbox_stat(const char* path, void* buf)
- asm("playground$sandbox_stat");
+ asm("playground$sandbox_stat");
#if defined(__NR_stat64)
STATIC int sandbox_stat64(const char *path, void* b)
- asm("playground$sandbox_stat64");
+ asm("playground$sandbox_stat64");
#endif
// Functions for system calls that need to be handled in the trusted process
STATIC bool process_access(int, int, int, int, SecureMemArgs*)
- asm("playground$process_access");
+ asm("playground$process_access");
STATIC bool process_clone(int, int, int, int, SecureMemArgs*)
- asm("playground$process_clone");
+ asm("playground$process_clone");
STATIC bool process_exit(int, int, int, int, SecureMemArgs*)
- asm("playground$process_exit");
+ asm("playground$process_exit");
#if defined(__NR_getsockopt)
STATIC bool process_getsockopt(int, int, int, int, SecureMemArgs*)
- asm("playground$process_getsockopt");
+ asm("playground$process_getsockopt");
#endif
STATIC bool process_ioctl(int, int, int, int, SecureMemArgs*)
- asm("playground$process_ioctl");
+ asm("playground$process_ioctl");
#if defined(__NR_ipc)
STATIC bool process_ipc(int, int, int, int, SecureMemArgs*)
- asm("playground$process_ipc");
+ asm("playground$process_ipc");
#endif
STATIC bool process_madvise(int, int, int, int, SecureMemArgs*)
- asm("playground$process_madvise");
+ asm("playground$process_madvise");
STATIC bool process_mmap(int, int, int, int, SecureMemArgs*)
- asm("playground$process_mmap");
+ asm("playground$process_mmap");
STATIC bool process_mprotect(int, int, int, int, SecureMemArgs*)
- asm("playground$process_mprotect");
+ asm("playground$process_mprotect");
STATIC bool process_munmap(int, int, int, int, SecureMemArgs*)
- asm("playground$process_munmap");
+ asm("playground$process_munmap");
STATIC bool process_open(int, int, int, int, SecureMemArgs*)
- asm("playground$process_open");
+ asm("playground$process_open");
#if defined(__NR_recvfrom)
STATIC bool process_recvfrom(int, int, int, int, SecureMemArgs*)
- asm("playground$process_recvfrom");
+ asm("playground$process_recvfrom");
STATIC bool process_recvmsg(int, int, int, int, SecureMemArgs*)
- asm("playground$process_recvmsg");
+ asm("playground$process_recvmsg");
STATIC bool process_sendmsg(int, int, int, int, SecureMemArgs*)
- asm("playground$process_sendmsg");
+ asm("playground$process_sendmsg");
STATIC bool process_sendto(int, int, int, int, SecureMemArgs*)
- asm("playground$process_sendto");
+ asm("playground$process_sendto");
STATIC bool process_setsockopt(int, int, int, int, SecureMemArgs*)
- asm("playground$process_setsockopt");
+ asm("playground$process_setsockopt");
#endif
#if defined(__NR_shmat)
STATIC bool process_shmat(int, int, int, int, SecureMemArgs*)
- asm("playground$process_shmat");
+ asm("playground$process_shmat");
STATIC bool process_shmctl(int, int, int, int, SecureMemArgs*)
- asm("playground$process_shmctl");
+ asm("playground$process_shmctl");
STATIC bool process_shmdt(int, int, int, int, SecureMemArgs*)
- asm("playground$process_shmdt");
+ asm("playground$process_shmdt");
STATIC bool process_shmget(int, int, int, int, SecureMemArgs*)
- asm("playground$process_shmget");
+ asm("playground$process_shmget");
#endif
#if defined(__NR_socketcall)
STATIC bool process_socketcall(int, int, int, int, SecureMemArgs*)
- asm("playground$process_socketcall");
+ asm("playground$process_socketcall");
#endif
STATIC bool process_stat(int, int, int, int, SecureMemArgs*)
- asm("playground$process_stat");
+ asm("playground$process_stat");
#ifdef __cplusplus
friend class Debug;
@@ -294,13 +306,11 @@ class Sandbox {
}
// Sends a file handle to another process.
+ // N.B. trusted_thread.cc has an assembly version of this function that
+ // is safe to use without a call stack. If the wire-format is changed,
+ // make sure to update the assembly code.
static bool sendFd(int transport, int fd0, int fd1, const void* buf,
- size_t len)
- asm("playground$sendFd")
- #if defined(__x86_64__)
- __attribute__((visibility("internal")))
- #endif
- ;
+ size_t len);
// If getFd() fails, it will set the first valid fd slot (e.g. fd0) to
// -errno.
@@ -334,7 +344,7 @@ class Sandbox {
struct Clone {
int flags;
- void* stack;
+ char* stack;
int* pid;
int* ctid;
void* tls;
@@ -584,6 +594,7 @@ class Sandbox {
static int tid() { return TLS::getTLSValue<int>(TLS_TID); }
static int threadFdPub() { return TLS::getTLSValue<int>(TLS_THREAD_FD); }
static int processFdPub() { return processFdPub_; }
+ static kernel_sigset_t* signalMask() { return &getSecureMem()->signalMask; }
// The SEGV handler knows how to handle RDTSC instructions
static void setupSignalHandlers();
@@ -601,9 +612,12 @@ class Sandbox {
#endif
;
+ // Return the current secure memory structure for this thread.
+ static SecureMem::Args* getSecureMem();
+
// Return a secure memory structure that can be used by a newly created
// thread.
- static SecureMem::Args* getSecureMem();
+ static SecureMem::Args* getNewSecureMem();
// This functions runs in the trusted process at startup and finds all the
// memory mappings that existed when the sandbox was first enabled. Going
diff --git a/sandbox/linux/seccomp/securemem.cc b/sandbox/linux/seccomp/securemem.cc
index 0071c45..5f07bbe 100644
--- a/sandbox/linux/seccomp/securemem.cc
+++ b/sandbox/linux/seccomp/securemem.cc
@@ -72,13 +72,14 @@ void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentMapsFd,
: "q"(&mem->sequence)
: "memory");
}
- mem->syscallNum = syscallNum;
- mem->arg1 = arg1;
- mem->arg2 = arg2;
- mem->arg3 = arg3;
- mem->arg4 = arg4;
- mem->arg5 = arg5;
- mem->arg6 = arg6;
+ mem->callType = locked ? -2 : -1;
+ mem->syscallNum = syscallNum;
+ mem->arg1 = arg1;
+ mem->arg2 = arg2;
+ mem->arg3 = arg3;
+ mem->arg4 = arg4;
+ mem->arg5 = arg5;
+ mem->arg6 = arg6;
asm volatile(
#if defined(__x86_64__)
"lock; incq (%0)\n"
@@ -90,9 +91,8 @@ void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentMapsFd,
:
: "q"(&mem->sequence)
: "memory");
- int data = locked ? -2 : -1;
Sandbox::SysCalls sys;
- if (Sandbox::write(sys, fd, &data, sizeof(data)) != sizeof(data)) {
+ if (Sandbox::write(sys, fd, &mem->callType, sizeof(int)) != sizeof(int)) {
Sandbox::die("Failed to send system call");
}
if (parentMapsFd >= 0) {
diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h
index ac7823e..dc035ff 100644
--- a/sandbox/linux/seccomp/securemem.h
+++ b/sandbox/linux/seccomp/securemem.h
@@ -6,6 +6,7 @@
#define SECURE_MEM_H__
#include <stdlib.h>
+#include "linux_syscall_support.h"
namespace playground {
@@ -28,6 +29,7 @@ class SecureMem {
struct {
struct Args* self;
long sequence;
+ long callType;
long syscallNum;
void* arg1;
void* arg2;
@@ -92,7 +94,7 @@ class SecureMem {
struct {
// This scratch space is used by the trusted thread to read parameters
// for unrestricted system calls.
- long tmpSyscallNum;
+ int tmpSyscallNum;
void* tmpArg1;
void* tmpArg2;
void* tmpArg3;
@@ -115,6 +117,9 @@ class SecureMem {
// result in additional system calls. Make sure that we don't trigger
// logging of those recursive calls.
int recursionLevel;
+
+ // Computing the signal mask is expensive. Keep a cached copy.
+ kernel_sigset_t signalMask;
} __attribute__((packed));
char scratchPage[4096];
};
diff --git a/sandbox/linux/seccomp/sigprocmask.cc b/sandbox/linux/seccomp/sigprocmask.cc
new file mode 100644
index 0000000..f3ad1fb
--- /dev/null
+++ b/sandbox/linux/seccomp/sigprocmask.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+// If the sandboxed process tries to mask SIGSEGV, there is a good chance
+// the process will eventually get terminated. If this is really ever a
+// problem, we can hide the fact that SIGSEGV is unmasked. But I don't think
+// we really need this. Masking of synchronous signals is rarely necessary.
+
+#if defined(__NR_sigprocmask)
+int Sandbox::sandbox_sigprocmask(int how, const void* set, void* old_set) {
+ long long tm;
+ Debug::syscall(&tm, __NR_sigprocmask, "Executing handler");
+
+ // "int $0" raises SIGSEGV. The segv() handler in sandbox.cc then edits the
+ // signal mask saved in the signal frame, prior to calling rt_sigreturn().
+ long res = -ENOSYS;
+ #if defined(__x86_64__)
+ #error x86-64 does not support sigprocmask(); use rt_sigprocmask() instead
+ #elif defined(__i386__)
+ asm volatile(
+ "push %%ebx\n"
+ "movl %2, %%ebx\n"
+ "int $0\n"
+ "pop %%ebx\n"
+ : "=a"(res)
+ : "0"(__NR_sigprocmask), "ri"((long)how),
+ "c"((long)set), "d"((long)old_set)
+ : "esp", "memory");
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Mirror the change in our shadow signal mask, which is copied to new threads.
+ // NOTE(review): reads 8 bytes of "set"; i386 old_sigset_t may be 4 -- confirm.
+ if (res == 0 && set != NULL) {
+ SecureMem::Args* args = getSecureMem();
+ switch (how) {
+ case SIG_BLOCK:
+ *(unsigned long long *)&args->signalMask |= *(unsigned long long *)set;
+ break;
+ case SIG_UNBLOCK:
+ *(unsigned long long *)&args->signalMask &= ~*(unsigned long long *)set;
+ break;
+ case SIG_SETMASK:
+ *(unsigned long long *)&args->signalMask = *(unsigned long long *)set;
+ break;
+ default:
+ break;
+ }
+ }
+
+ Debug::elapsed(tm, __NR_sigprocmask);
+
+ return (int)res;
+}
+#endif
+
+#if defined(__NR_rt_sigprocmask)
+int Sandbox::sandbox_rt_sigprocmask(int how, const void* set, void* old_set,
+ size_t bytes) {
+ long long tm;
+ Debug::syscall(&tm, __NR_rt_sigprocmask, "Executing handler");
+
+ // "int $0" raises SIGSEGV. The segv() handler in sandbox.cc then edits the
+ // signal mask saved in the signal frame, prior to calling rt_sigreturn().
+ long res = -ENOSYS;
+ #if defined(__x86_64__)
+ asm volatile(
+ "movq %5, %%r10\n"
+ "int $0\n"
+ : "=a"(res)
+ : "0"(__NR_rt_sigprocmask), "D"((long)how),
+ "S"((long)set), "d"((long)old_set), "r"((long)bytes)
+ : "r10", "r11", "rcx", "memory");
+ #elif defined(__i386__)
+ asm volatile(
+ "push %%ebx\n"
+ "movl %2, %%ebx\n"
+ "int $0\n"
+ "pop %%ebx\n"
+ : "=a"(res)
+ : "0"(__NR_rt_sigprocmask), "ri"((long)how),
+ "c"((long)set), "d"((long)old_set), "S"((long)bytes)
+ : "esp", "memory");
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Mirror the change in our shadow signal mask, which is copied to new threads.
+ // Only the leading unsigned long long of "set" is consulted here.
+ if (res == 0 && set != NULL && bytes >= 8) {
+ SecureMem::Args* args = getSecureMem();
+ switch (how) {
+ case SIG_BLOCK:
+ *(unsigned long long *)&args->signalMask |= *(unsigned long long *)set;
+ break;
+ case SIG_UNBLOCK:
+ *(unsigned long long *)&args->signalMask &= ~*(unsigned long long *)set;
+ break;
+ case SIG_SETMASK:
+ *(unsigned long long *)&args->signalMask = *(unsigned long long *)set;
+ break;
+ default:
+ break;
+ }
+ }
+
+ Debug::elapsed(tm, __NR_rt_sigprocmask);
+
+ return (int)res;
+}
+#endif
+
+} // namespace
diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc
index 7f431a3..76e96e4 100644
--- a/sandbox/linux/seccomp/syscall.cc
+++ b/sandbox/linux/seccomp/syscall.cc
@@ -165,7 +165,7 @@ asm(
// the time. There might be a repeated pattern of those.
"cmp $78, %eax\n" // __NR_gettimeofday
"jnz 2f\n"
- "cmp %eax, %fs:0x102C-0x54\n" // last system call
+ "cmp %eax, %fs:0x102C-0x58\n" // last system call
"jnz 0f\n"
// This system call and the last system call prior to this one both are
@@ -173,7 +173,7 @@ asm(
// return the same result as in the previous call.
// Just in case the caller is spinning on the result from gettimeofday(),
// every so often, call the actual system call.
- "decl %fs:0x1030-0x54\n" // countdown calls to gettimofday()
+ "decl %fs:0x1030-0x58\n" // countdown calls to gettimofday()
"jz 0f\n"
// Atomically read the 64bit word representing last-known timestamp and
@@ -190,8 +190,8 @@ asm(
// This is a call to gettimeofday(), but we don't have a valid cached
// result, yet.
- "0:mov %eax, %fs:0x102C-0x54\n" // remember syscall number
- "movl $500, %fs:0x1030-0x54\n" // make system call, each 500 invocations
+ "0:mov %eax, %fs:0x102C-0x58\n" // remember syscall number
+ "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations
"call playground$defaultSystemCallHandler\n"
// Returned from gettimeofday(). Remember return value, in case the
@@ -212,7 +212,7 @@ asm(
// would still like to coalesce the gettimeofday() calls.
"2:cmp $224, %eax\n" // __NR_gettid
"jz 3f\n"
- "mov %eax, %fs:0x102C-0x54\n" // remember syscall number
+ "mov %eax, %fs:0x102C-0x58\n" // remember syscall number
// Retrieve function call from system call table (c.f. syscall_table.c).
// We have three different types of entries; zero for denied system calls,
diff --git a/sandbox/linux/seccomp/syscall_table.c b/sandbox/linux/seccomp/syscall_table.c
index 2f66ca3..454ffa9 100644
--- a/sandbox/linux/seccomp/syscall_table.c
+++ b/sandbox/linux/seccomp/syscall_table.c
@@ -96,19 +96,31 @@ const struct SyscallTable syscallTable[] __attribute__((
#if defined(__NR_recvfrom)
[ __NR_recvfrom ] = { (void*)&sandbox_recvfrom, process_recvfrom },
[ __NR_recvmsg ] = { (void*)&sandbox_recvmsg, process_recvmsg },
+ #endif
+ #if defined(__NR_rt_sigprocmask)
+ [ __NR_rt_sigprocmask ] = { (void*)&sandbox_rt_sigprocmask, 0 },
+ #endif
+ #if defined(__NR_sendmsg)
[ __NR_sendmsg ] = { (void*)&sandbox_sendmsg, process_sendmsg },
[ __NR_sendto ] = { (void*)&sandbox_sendto, process_sendto },
#endif
[ __NR_set_robust_list ] = { UNRESTRICTED_SYSCALL, 0 },
#if defined(__NR_setsockopt)
[ __NR_setsockopt ] = { (void*)&sandbox_setsockopt,process_setsockopt },
+ #endif
#if defined(__NR_shmat)
[ __NR_shmat ] = { (void*)&sandbox_shmat, process_shmat },
[ __NR_shmctl ] = { (void*)&sandbox_shmctl, process_shmctl },
[ __NR_shmdt ] = { (void*)&sandbox_shmdt, process_shmdt },
[ __NR_shmget ] = { (void*)&sandbox_shmget, process_shmget },
#endif
+ #if defined(__NR_shutdown)
[ __NR_shutdown ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ #if defined(__NR_sigprocmask)
+ [ __NR_sigprocmask ] = { (void*)&sandbox_sigprocmask, 0 },
+ #endif
+ #if defined(__NR_socketpair)
[ __NR_socketpair ] = { UNRESTRICTED_SYSCALL, 0 },
#endif
#if defined(__NR_socketcall)
diff --git a/sandbox/linux/seccomp/trusted_process.cc b/sandbox/linux/seccomp/trusted_process.cc
index 1320839..80adbf6 100644
--- a/sandbox/linux/seccomp/trusted_process.cc
+++ b/sandbox/linux/seccomp/trusted_process.cc
@@ -16,7 +16,7 @@ struct Thread {
SecureMem::Args* mem;
};
-SecureMem::Args* Sandbox::getSecureMem() {
+SecureMem::Args* Sandbox::getNewSecureMem() {
if (!secureMemPool_.empty()) {
SecureMem::Args* rc = secureMemPool_.back();
secureMemPool_.pop_back();
diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc
index c73091c..240e65f 100644
--- a/sandbox/linux/seccomp/trusted_thread.cc
+++ b/sandbox/linux/seccomp/trusted_thread.cc
@@ -21,6 +21,44 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov %0, %%rbp\n" // %rbp = args
"xor %%rbx, %%rbx\n" // initial sequence number
"lea 999f(%%rip), %%r15\n" // continue in same thread
+
+ // Signal handlers are process-wide. This means that for security
+ // reasons, we cannot allow that the trusted thread ever executes any
+ // signal handlers.
+ // We prevent the execution of signal handlers by setting a signal
+ // mask that blocks all signals. In addition, we make sure that the
+ // stack pointer is invalid.
+ // We cannot reset the signal mask until after we have enabled
+ // Seccomp mode. Our sigprocmask() wrapper would normally do this by
+ // raising a signal, modifying the signal mask in the kernel-generated
+ // signal frame, and then calling sigreturn(). This presents a bit of
+ // a Catch-22, as all signals are masked and we can therefore not
+ // raise any signal that would allow us to generate the signal stack
+ // frame.
+ // Instead, we have to create the signal stack frame prior to entering
+ // Seccomp mode. This incidentally also helps us to restore the
+ // signal mask to the same value that it had prior to entering the
+ // sandbox.
+ // The signal wrapper for clone() is the second entry point into this
+ // code (by means of sending an IPC to its trusted thread). It goes
+ // through the same steps of creating a signal stack frame on the
+ // newly created thread's stack prior to cloning. See clone.cc for
+ // details.
+ "mov $56+0xF000, %%eax\n" // __NR_clone + 0xF000
+ "sub $8, %%rsp\n"
+ "mov %%rsp, %%rdx\n" // push a signal stack frame (see clone.cc)
+ "mov %%rsp, 0(%%rsp)\n"
+ "int $0\n"
+ "mov 0(%%rsp), %%r9\n"
+ "add $8, 0xA0(%%r9)\n" // pop stack upon call to sigreturn()
+ "mov $2, %%rdi\n" // how = SIG_SETMASK
+ "movq $-1, 0(%%rsp)\n"
+ "mov %%rsp, %%rsi\n" // set = full mask
+ "xor %%rdx, %%rdx\n" // old_set = NULL
+ "mov $8, %%r10\n" // mask all 64 signals
+ "mov $14, %%eax\n" // NR_rt_sigprocmask
+ "syscall\n"
+ "xor %%rsp, %%rsp\n" // invalidate the stack in all trusted code
"jmp 20f\n" // create trusted thread
// TODO(markus): Coalesce the read() operations by reading into a bigger
@@ -36,42 +74,44 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// %rbx: sequence number for trusted calls
// Temporary variables:
- // %r9: system call number
+ // %r8: child stack
+ // %r9: system call number, child stack
// %rbp: secure memory of previous thread
// Layout of secure shared memory region (c.f. securemem.h):
// 0x00: pointer to the secure shared memory region (i.e. self)
// 0x08: sequence number; must match %rbx
- // 0x10: system call number; passed to syscall in %rax
- // 0x18: first argument; passed to syscall in %rdi
- // 0x20: second argument; passed to syscall in %rsi
- // 0x28: third argument; passed to syscall in %rdx
- // 0x30: fourth argument; passed to syscall in %r10
- // 0x38: fifth argument; passed to syscall in %r8
- // 0x40: sixth argument; passed to syscall in %r9
- // 0x48: stored return address for clone() system call
- // 0x50: stored %rbp value for clone() system call
- // 0x58: stored %rbx value for clone() system call
- // 0x60: stored %rcx value for clone() system call
- // 0x68: stored %rdx value for clone() system call
- // 0x70: stored %rsi value for clone() system call
- // 0x78: stored %rdi value for clone() system call
- // 0x80: stored %r8 value for clone() system call
- // 0x88: stored %r9 value for clone() system call
- // 0x90: stored %r10 value for clone() system call
- // 0x98: stored %r11 value for clone() system call
- // 0xA0: stored %r12 value for clone() system call
- // 0xA8: stored %r13 value for clone() system call
- // 0xB0: stored %r14 value for clone() system call
- // 0xB8: stored %r15 value for clone() system call
- // 0xC0: new shared memory for clone()
- // 0xC8: processFdPub for talking to trusted process
- // 0xCC: cloneFdPub for talking to trusted process
- // 0xD0: set to non-zero, if in debugging mode
- // 0xD4: most recent SHM id returned by shmget(IPC_PRIVATE)
- // 0xD8: cookie assigned to us by the trusted process (TLS_COOKIE)
- // 0xE0: thread id (TLS_TID)
- // 0xE8: threadFdPub (TLS_THREAD_FD)
+ // 0x10: call type; must match %eax, iff %eax == -1 || %eax == -2
+ // 0x18: system call number; passed to syscall in %rax
+ // 0x20: first argument; passed to syscall in %rdi
+ // 0x28: second argument; passed to syscall in %rsi
+ // 0x30: third argument; passed to syscall in %rdx
+ // 0x38: fourth argument; passed to syscall in %r10
+ // 0x40: fifth argument; passed to syscall in %r8
+ // 0x48: sixth argument; passed to syscall in %r9
+ // 0x50: stored return address for clone() system call
+ // 0x58: stored %rbp value for clone() system call
+ // 0x60: stored %rbx value for clone() system call
+ // 0x68: stored %rcx value for clone() system call
+ // 0x70: stored %rdx value for clone() system call
+ // 0x78: stored %rsi value for clone() system call
+ // 0x80: stored %rdi value for clone() system call
+ // 0x88: stored %r8 value for clone() system call
+ // 0x90: stored %r9 value for clone() system call
+ // 0x98: stored %r10 value for clone() system call
+ // 0xA0: stored %r11 value for clone() system call
+ // 0xA8: stored %r12 value for clone() system call
+ // 0xB0: stored %r13 value for clone() system call
+ // 0xB8: stored %r14 value for clone() system call
+ // 0xC0: stored %r15 value for clone() system call
+ // 0xC8: new shared memory for clone()
+ // 0xD0: processFdPub for talking to trusted process
+ // 0xD4: cloneFdPub for talking to trusted process
+ // 0xD8: set to non-zero, if in debugging mode
+ // 0xDC: most recent SHM id returned by shmget(IPC_PRIVATE)
+ // 0xE0: cookie assigned to us by the trusted process (TLS_COOKIE)
+ // 0xE8: thread id (TLS_TID)
+ // 0xF0: threadFdPub (TLS_THREAD_FD)
// 0x200-0x1000: securely passed verified file name(s)
// Layout of (untrusted) scratch space:
@@ -89,6 +129,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// 0x48: last system call (not used on x86-64)
// 0x4C: number of consecutive calls to a time fnc (not used on x86-64)
// 0x50: nesting level of system calls (for debugging purposes only)
+ // 0x54: signal mask
// We use the %fs register for accessing the secure read-only page, and
// the untrusted scratch space immediately following it. The segment
@@ -103,7 +144,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// read(threadFd, &scratch, 4)
"1:xor %%rax, %%rax\n" // NR_read
"mov %%r13, %%rdi\n" // fd = threadFd
- "mov %%fs:0x0, %%rsi\n"
+ "mov %%fs:0x0, %%rsi\n" // secure_mem
"add $0x1000, %%rsi\n" // buf = &scratch
"mov $4, %%edx\n" // len = 4
"2:syscall\n"
@@ -123,13 +164,15 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"jnz 5f\n"
"3:cmp %%rbx, %%fs:0x8\n"
"jne 25f\n" // exit process
- "mov %%fs:0x10, %%rax\n"
- "mov %%fs:0x18, %%rdi\n"
- "mov %%fs:0x20, %%rsi\n"
- "mov %%fs:0x28, %%rdx\n"
- "mov %%fs:0x30, %%r10\n"
- "mov %%fs:0x38, %%r8\n"
- "mov %%fs:0x40, %%r9\n"
+ "cmp %%fs:0x10, %%eax\n"
+ "jne 25f\n" // exit process
+ "mov %%fs:0x18, %%rax\n"
+ "mov %%fs:0x20, %%rdi\n"
+ "mov %%fs:0x28, %%rsi\n"
+ "mov %%fs:0x30, %%rdx\n"
+ "mov %%fs:0x38, %%r10\n"
+ "mov %%fs:0x40, %%r8\n"
+ "mov %%fs:0x48, %%r9\n"
"cmp %%rbx, %%fs:0x8\n"
"jne 25f\n" // exit process
"add $2, %%rbx\n"
@@ -153,13 +196,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
"mov $10, %%eax\n" // NR_mprotect
"syscall\n"
- "mov %%r8d, 0xD4(%%rdi)\n" // set most recently returned SysV shm id
+ "mov %%r8d, 0xDC(%%rdi)\n" // set most recently returned SysV shm id
"xor %%rdi, %%rdi\n"
// When debugging messages are enabled, warn about expensive system calls
#ifndef NDEBUG
- "cmpw $0, %%fs:0xD0\n" // debug mode
- "jz 26f\n"
+ "cmpw $0, %%fs:0xD8\n" // debug mode
+ "jz 27f\n"
"mov $1, %%eax\n" // NR_write
"mov $2, %%edi\n" // fd = stderr
"lea 101f(%%rip), %%rsi\n" // "This is an expensive system call"
@@ -168,7 +211,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"xor %%rdi, %%rdi\n"
#endif
- "jmp 26f\n" // exit program, no message
+ "jmp 27f\n" // exit program, no message
"4:syscall\n"
"jmp 15f\n" // return result
@@ -179,10 +222,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"jnz 9f\n"
"cmp %%rbx, %%fs:0x8\n"
"jne 25f\n" // exit process
+ "cmp %%eax, %%fs:0x10\n"
+ "jne 25f\n" // exit process
// When debugging messages are enabled, warn about expensive system calls
#ifndef NDEBUG
- "cmpw $0, %%fs:0xD0\n" // debug mode
+ "cmpw $0, %%fs:0xD8\n" // debug mode
"jz 6f\n"
"mov $1, %%eax\n" // NR_write
"mov $2, %%edi\n" // fd = stderr
@@ -192,13 +237,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"6:"
#endif
- "mov %%fs:0x10, %%rax\n"
- "mov %%fs:0x18, %%rdi\n"
- "mov %%fs:0x20, %%rsi\n"
- "mov %%fs:0x28, %%rdx\n"
- "mov %%fs:0x30, %%r10\n"
- "mov %%fs:0x38, %%r8\n"
- "mov %%fs:0x40, %%r9\n"
+ "mov %%fs:0x18, %%rax\n"
+ "mov %%fs:0x20, %%rdi\n"
+ "mov %%fs:0x28, %%rsi\n"
+ "mov %%fs:0x30, %%rdx\n"
+ "mov %%fs:0x38, %%r10\n"
+ "mov %%fs:0x40, %%r8\n"
+ "mov %%fs:0x48, %%r9\n"
"cmp %%rbx, %%fs:0x8\n"
"jne 25f\n" // exit process
@@ -255,7 +300,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Check in syscallTable whether this system call is unrestricted
"12:mov %%rax, %%r9\n"
#ifndef NDEBUG
- "cmpw $0, %%fs:0xD0\n" // debug mode
+ "cmpw $0, %%fs:0xD8\n" // debug mode
"jnz 13f\n"
#endif
"cmp playground$maxSyscall(%%rip), %%eax\n"
@@ -287,11 +332,11 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov 0x2C(%%r8), %%r9\n"
"mov 0x24(%%r8), %%r8\n"
"cmp $231, %%rax\n" // NR_exit_group
- "jz 26f\n" // exit program, no message
+ "jz 27f\n" // exit program, no message
"syscall\n"
// Return result of system call to sandboxed thread
- "15:mov %%fs:0x0, %%rsi\n"
+ "15:mov %%fs:0x0, %%rsi\n" // secure_mem
"add $0x1034, %%rsi\n" // buf = &scratch + 52
"mov %%rax, (%%rsi)\n"
"mov $8, %%edx\n" // len = 8
@@ -306,8 +351,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// NR_exit:
// Exit trusted thread after cleaning up resources
- "18:mov %%fs:0x0, %%rsi\n"
- "mov 0xE8(%%rsi), %%rdi\n" // fd = threadFdPub
+ "18:mov %%fs:0x0, %%rsi\n" // secure_mem
+ "mov 0xF0(%%rsi), %%rdi\n" // fd = threadFdPub
"mov $3, %%eax\n" // NR_close
"syscall\n"
"mov %%rsi, %%rdi\n" // start = secure_mem
@@ -324,7 +369,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"syscall\n"
"mov %%rax, %%rdi\n"
"test %%rax, %%rax\n"
- "js 26f\n" // exit process
+ "js 27f\n" // exit process
"jne 21f\n" // reap helper, exit thread
"jmp 22f\n" // unlock mutex
@@ -334,12 +379,9 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// resources with the caller (i.e. the previous trusted thread),
// and by extension it shares all resources with the sandbox'd
// threads.
- // N.B. It is possible to make the thread creation code crash before
- // it releases seccomp privileges. This is generally OK, as it just
- // terminates the program. But if we ever support signal handling,
- // we have to be careful that the user cannot install a SIGSEGV
- // handler that gets executed with elevated privileges.
- "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem
+ "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem
+ "mov %%rsi, %%r15\n" // remember child stack
+ "mov $1, %%rsi\n" // stack = 1
"syscall\n" // calls NR_clone
"cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values
"jae 7b\n" // unlock mutex, return result
@@ -349,6 +391,23 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// In nascent thread, now.
"sub $2, %%rbx\n"
+
+ // We want to maintain an invalid %rsp whenever we access untrusted
+ // memory. This ensures that even if an attacker can trick us into
+ // triggering a SIGSEGV, we will never successfully execute a signal
+ // handler.
+ // Signal handlers are inherently dangerous, as an attacker could trick
+ // us into returning to the wrong address by adjusting the signal stack
+ // right before the handler returns.
+ // N.B. While POSIX is curiously silent about this, it appears that on
+ // Linux, alternate signal stacks are a per-thread property. That is
+ // good. It means that this security mechanism works, even if the
+ // sandboxed thread manages to set up an alternate signal stack.
+ //
+ // TODO(markus): We currently do not support emulating calls to
+ // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc
+ // for a discussion on how to fix this, if this ever becomes necessary.
+ "mov %%r15, %%r9\n" // %r9 = child_stack
"xor %%r15, %%r15\n" // Request to return from clone() when done
// Get thread id of nascent thread
@@ -358,19 +417,19 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Nascent thread creates socketpair() for sending requests to
// trusted thread.
- // We can create the filehandles on the stack. Filehandles are
+ // We can create the filehandles on the child's stack. Filehandles are
// always treated as untrusted.
// socketpair(AF_UNIX, SOCK_STREAM, 0, fds)
- "push %%r15\n"
+ "sub $0x10, %%r9\n"
+ "mov %%r15, 8(%%r9)\n" // preserve return address on child stack
"mov $53, %%eax\n" // NR_socketpair
"mov $1, %%edi\n" // domain = AF_UNIX
"mov $1, %%esi\n" // type = SOCK_STREAM
"xor %%rdx, %%rdx\n" // protocol = 0
- "sub $8, %%rsp\n" // sv = %rsp
- "mov %%rsp, %%r10\n"
+ "mov %%r9, %%r10\n" // sv = child_stack
"syscall\n"
"test %%rax, %%rax\n"
- "jz 27f\n"
+ "jz 28f\n"
// If things went wrong, we don't have an (easy) way of signaling
// the parent. For our purposes, it is sufficient to fail with a
@@ -403,12 +462,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"lea 100f(%%rip), %%rsi\n" // "Sandbox violation detected"
"mov $101f-100f, %%edx\n" // len = strlen(msg)
"syscall\n"
- "mov $1, %%edi\n"
- "26:mov $231, %%eax\n" // NR_exit_group
+ "26:mov $1, %%edi\n"
+ "27:mov $231, %%eax\n" // NR_exit_group
"jmp 24b\n"
// The first page is mapped read-only for use as securely shared memory
- "27:mov 0xC0(%%rbp), %%r12\n" // %r12 = secure shared memory
+ "28:mov 0xC8(%%rbp), %%r12\n" // %r12 = secure shared memory
"cmp %%rbx, 8(%%rbp)\n"
"jne 25b\n" // exit process
"mov $10, %%eax\n" // NR_mprotect
@@ -428,12 +487,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
// CLONE_SYSVSEM|CLONE_UNTRACED|CLONE_SETTLS, stack, NULL, NULL,
// tls)
- "mov 4(%%rsp), %%r13d\n" // %r13 = threadFd
+ "mov 4(%%r9), %%r13d\n" // %r13 = threadFd (on child's stack)
"mov $56, %%eax\n" // NR_clone
"mov $0x8D0F00, %%edi\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR|TLS
"mov $1, %%rsi\n" // stack = 1
"mov %%r12, %%r8\n" // tls = new_secure_mem
- "mov 0xC8(%%rbp), %%r15d\n" // %r15 = processFdPub
+ "mov 0xD0(%%rbp), %%r15d\n" // %r15 = processFdPub
"cmp %%rbx, 8(%%rbp)\n"
"jne 25b\n" // exit process
"syscall\n"
@@ -441,13 +500,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"js 25b\n" // exit process
"jz 0b\n" // invoke trustedThreadFnc()
+ // Copy the caller's signal mask
+ "mov 0x1054(%%rbp), %%rax\n"
+ "mov %%rax, 0x1054(%%r12)\n"
+
// Done creating trusted thread. We can now get ready to return to caller
- "mov 0(%%rsp), %%r9d\n" // %r9 = threadFdPub
- "add $8, %%rsp\n"
+ "mov %%r9, %%r8\n" // %r8 = child_stack
+ "mov 0(%%r9), %%r9d\n" // %r9 = threadFdPub
// Set up thread local storage with information on how to talk to
// trusted thread and trusted process.
- "lea 0xD8(%%r12), %%rsi\n" // args = &secure_mem.TLS;
+ "lea 0xE0(%%r12), %%rsi\n" // args = &secure_mem.TLS;
"mov $158, %%eax\n" // NR_arch_prctl
"mov $0x1001, %%edi\n" // option = ARCH_SET_GS
"syscall\n"
@@ -459,73 +522,121 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// the very top of this function, you will find that we push 999(%rip)
// on the stack. That is the signal that we should return on the same
// stack rather than return to where clone was called.
- "pop %%r15\n"
+ "mov 8(%%r8), %%r15\n"
+ "add $0x10, %%r8\n"
"test %%r15, %%r15\n"
- "jne 28f\n"
+ "jne 29f\n"
// Returning from clone() into the newly created thread is special. We
// cannot unroll the stack, as we just set up a new stack for this
// thread. We have to explicitly restore CPU registers to the values
// that they had when the program originally called clone().
- "sub $0x80, %%rsp\n" // redzone compensation
- "mov 0x48(%%rbp), %%rax\n"
- "push %%rax\n"
+ // We patch the register values in the signal stack frame so that we
+ // can ask sigreturn() to restore all registers for us.
+ "sub $0x8, %%r8\n"
"mov 0x50(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x00(%%r8)\n" // return address
+ "xor %%rax, %%rax\n"
+ "mov %%rax, 0x98(%%r8)\n" // %rax = 0
"mov 0x58(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x80(%%r8)\n" // %rbp
"mov 0x60(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x88(%%r8)\n" // %rbx
"mov 0x68(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0xA0(%%r8)\n" // %rcx
"mov 0x70(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x90(%%r8)\n" // %rdx
"mov 0x78(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x78(%%r8)\n" // %rsi
"mov 0x80(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x70(%%r8)\n" // %rdi
"mov 0x88(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x30(%%r8)\n" // %r8
"mov 0x90(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x38(%%r8)\n" // %r9
"mov 0x98(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x40(%%r8)\n" // %r10
"mov 0xA0(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x48(%%r8)\n" // %r11
"mov 0xA8(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x50(%%r8)\n" // %r12
"mov 0xB0(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x58(%%r8)\n" // %r13
"mov 0xB8(%%rbp), %%rax\n"
- "push %%rax\n"
+ "mov %%rax, 0x60(%%r8)\n" // %r14
+ "mov 0xC0(%%rbp), %%rax\n"
+ "mov %%rax, 0x68(%%r8)\n" // %r15
"cmp %%rbx, 8(%%rbp)\n"
"jne 25b\n" // exit process
// Nascent thread launches a helper that doesn't share any of our
// resources, except for pages mapped as MAP_SHARED.
- // clone(0, %rsp)
- "28:mov $56, %%eax\n" // NR_clone
+ // clone(SIGCHLD, stack=1)
+ "29:mov $56, %%eax\n" // NR_clone
"mov $17, %%rdi\n" // flags = SIGCHLD
- "mov %%rsp, %%rsi\n" // stack = %rsp
+ "mov $1, %%rsi\n" // stack = 1
"syscall\n"
"test %%rax, %%rax\n"
"js 25b\n" // exit process
- "jne 29f\n"
+ "jne 31f\n"
// Use sendmsg() to send to the trusted process the file handles for
// communicating with the new trusted thread. We also send the address
// of the secure memory area (for sanity checks) and the thread id.
- "mov 0xCC(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub()
+ "mov 0xD4(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub()
"cmp %%rbx, 8(%%rbp)\n"
"jne 25b\n" // exit process
- "mov %%r9, %%rsi\n" // fd0 = threadFdPub
- "mov %%r13, %%rdx\n" // fd1 = threadFd
- "push %%r14\n" // threadId
- "mov %%esi, 4(%%rsp)\n" // threadFdPub
- "push %%r12\n" // secure_mem
- "mov %%rsp, %%rcx\n" // buf = &data
- "mov $16, %%r8\n" // len = sizeof(void*) + 2*sizeof(int)
- "call playground$sendFd\n"
+
+ // 0x00 msg:
+ // 0x00 msg_name ($0)
+ // 0x08 msg_namelen ($0)
+ // 0x10 msg_iov (%r8 + 0x44)
+ // 0x18 msg_iovlen ($1)
+ // 0x20 msg_control (%r8 + 0x54)
+ // 0x28 msg_controllen ($0x18)
+ // 0x30 data:
+ // 0x30 msg_flags/err ($0)
+ // 0x34 secure_mem (%r12)
+ // 0x3C threadId (%r14d)
+ // 0x40 threadFdPub (%r9d)
+ // 0x44 iov:
+ // 0x44 iov_base (%r8 + 0x30)
+ // 0x4C iov_len ($0x14)
+ // 0x54 cmsg:
+ // 0x54 cmsg_len ($0x18)
+ // 0x5C cmsg_level ($1, SOL_SOCKET)
+ // 0x60 cmsg_type ($1, SCM_RIGHTS)
+ // 0x64 threadFdPub (%r9d)
+ // 0x68 threadFd (%r13d)
+ // 0x6C
+ "sub $0x6C, %%r8\n"
+ "xor %%rdx, %%rdx\n" // flags = 0
+ "mov %%rdx, 0x00(%%r8)\n" // msg_name
+ "mov %%edx, 0x08(%%r8)\n" // msg_namelen
+ "mov %%edx, 0x30(%%r8)\n" // msg_flags
+ "mov $1, %%r11d\n"
+ "mov %%r11, 0x18(%%r8)\n" // msg_iovlen
+ "mov %%r11d, 0x5C(%%r8)\n" // cmsg_level
+ "mov %%r11d, 0x60(%%r8)\n" // cmsg_type
+ "lea 0x30(%%r8), %%r11\n"
+ "mov %%r11, 0x44(%%r8)\n" // iov_base
+ "add $0x14, %%r11\n"
+ "mov %%r11, 0x10(%%r8)\n" // msg_iov
+ "add $0x10, %%r11\n"
+ "mov %%r11, 0x20(%%r8)\n" // msg_control
+ "mov $0x14, %%r11d\n"
+ "mov %%r11, 0x4C(%%r8)\n" // iov_len
+ "add $4, %%r11d\n"
+ "mov %%r11, 0x28(%%r8)\n" // msg_controllen
+ "mov %%r11, 0x54(%%r8)\n" // cmsg_len
+ "mov %%r12, 0x34(%%r8)\n" // secure_mem
+ "mov %%r14d, 0x3C(%%r8)\n" // threadId
+ "mov %%r9d, 0x40(%%r8)\n" // threadFdPub
+ "mov %%r9d, 0x64(%%r8)\n" // threadFdPub
+ "mov %%r13d, 0x68(%%r8)\n" // threadFd
+ "mov $46, %%eax\n" // NR_sendmsg
+ "mov %%r8, %%rsi\n" // msg
+ "syscall\n"
// Release syscall_mutex_. This signals the trusted process that
// it can write into the original thread's secure memory again.
@@ -534,23 +645,29 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $4096, %%esi\n"
"mov $3, %%edx\n" // PROT_READ | PROT_WRITE
"syscall\n"
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
"lock; addl $0x80000000, (%%rdi)\n"
- "jz 26b\n" // exit process (no error message)
+ "jz 30f\n" // exit process (no error message)
"mov $1, %%edx\n"
"mov %%rdx, %%rsi\n" // FUTEX_WAKE
"mov $202, %%eax\n" // NR_futex
"syscall\n"
- "jmp 26b\n" // exit process (no error message)
+ "30:xor %%rdi, %%rdi\n"
+ "jmp 27b\n" // exit process (no error message)
// Reap helper
- "29:mov %%rax, %%rdi\n"
- "30:xor %%rsi, %%rsi\n"
+ "31:mov %%rax, %%rdi\n"
+ "32:lea -4(%%r8), %%rsi\n"
"xor %%rdx, %%rdx\n"
"xor %%r10, %%r10\n"
"mov $61, %%eax\n" // NR_wait4
"syscall\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 30b\n"
+ "jz 32b\n"
+ "mov -4(%%r8), %%eax\n"
+ "test %%rax, %%rax\n"
+ "jnz 26b\n" // exit process (no error message)
// Release privileges by entering seccomp mode.
"mov $157, %%eax\n" // NR_prctl
@@ -560,6 +677,10 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"test %%rax, %%rax\n"
"jnz 25b\n" // exit process
+ // We can finally start using the stack. Signal handlers no longer pose
+ // a threat to us.
+ "mov %%r8, %%rsp\n"
+
// Back in the newly created sandboxed thread, wait for trusted process
// to receive request. It is possible for an attacker to make us
// continue even before the trusted process is done. This is OK. It'll
@@ -569,10 +690,10 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $1, %%edx\n" // len = 1
"mov %%rsp, %%rsi\n" // buf = %rsp
"mov %%r9, %%rdi\n" // fd = threadFdPub
- "31:xor %%rax, %%rax\n" // NR_read
+ "33:xor %%rax, %%rax\n" // NR_read
"syscall\n"
"cmp $-4, %%rax\n" // EINTR
- "jz 31b\n"
+ "jz 33b\n"
"cmp %%rdx, %%rax\n"
"jne 25b\n" // exit process
"pop %%rax\n"
@@ -580,27 +701,16 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Return to caller. We are in the new thread, now.
"xor %%rax, %%rax\n"
"test %%r15, %%r15\n"
-
- // Returning to createTrustedThread()
- "jz 32f\n"
- "jmp *%%r15\n"
-
- // Returning to the place where clone() had been called
- "32:pop %%r15\n"
- "pop %%r14\n"
- "pop %%r13\n"
- "pop %%r12\n"
- "pop %%r11\n"
- "pop %%r10\n"
- "pop %%r9\n"
- "pop %%r8\n"
- "pop %%rdi\n"
- "pop %%rsi\n"
- "pop %%rdx\n"
- "pop %%rcx\n"
- "pop %%rbx\n"
- "pop %%rbp\n"
- "ret\n"
+ "jnz 34f\n" // Returning to createTrustedThread()
+
+ // Returning to the place where clone() had been called. We rely on
+ // using rt_sigreturn() for restoring our registers. The caller already
+ // created a signal stack frame, and we patched the register values
+ // with the ones that were in effect prior to calling sandbox_clone().
+ "pop %%r15\n"
+ "34:mov %%r15, 0xA8(%%rsp)\n" // compute new %rip
+ "mov $15, %%eax\n" // NR_rt_sigreturn
+ "syscall\n"
".pushsection \".rodata\"\n"
"100:.ascii \"Sandbox violation detected, program aborted\\n\"\n"
@@ -638,19 +748,60 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"movd %0, %%mm6\n" // %mm6 = args
"lea 999f, %%ebx\n" // continue in same thread
"movd %%ebx, %%mm3\n"
- "xor %%ebx, %%ebx\n" // initial sequence number
- "movd %%ebx, %%mm2\n"
+ "xor %%edi, %%edi\n" // initial sequence number
+ "movd %%edi, %%mm2\n"
+
+ // Signal handlers are process-wide. This means that for security
+ // reasons, we cannot allow that the trusted thread ever executes any
+ // signal handlers.
+ // We prevent the execution of signal handlers by setting a signal
+ // mask that blocks all signals. In addition, we make sure that the
+ // stack pointer is invalid.
+ // We cannot reset the signal mask until after we have enabled
+ // Seccomp mode. Our sigprocmask() wrapper would normally do this by
+ // raising a signal, modifying the signal mask in the kernel-generated
+ // signal frame, and then calling sigreturn(). This presents a bit of
+ // a Catch-22, as all signals are masked and we can therefore not
+ // raise any signal that would allow us to generate the signal stack
+ // frame.
+ // Instead, we have to create the signal stack frame prior to entering
+ // Seccomp mode. This incidentally also helps us to restore the
+ // signal mask to the same value that it had prior to entering the
+ // sandbox.
+ // The signal wrapper for clone() is the second entry point into this
+ // code (by means of sending an IPC to its trusted thread). It goes
+ // through the same steps of creating a signal stack frame on the
+ // newly created thread's stack prior to cloning. See clone.cc for
+ // details.
+ "mov $120+0xF000, %%eax\n" // __NR_clone + 0xF000
+ "sub $8, %%esp\n"
+ "mov %%esp, %%edx\n" // push a signal stack frame (see clone.cc)
+ "mov %%esp, 0(%%esp)\n"
+ "int $0\n"
+ "mov 0(%%esp), %%ebp\n"
+ "add $8, 0x1C(%%ebp)\n" // pop stack upon call to sigreturn()
+ "mov $2, %%ebx\n" // how = SIG_SETMASK
+ "movl $-1, 0(%%esp)\n"
+ "movl $-1, 4(%%esp)\n"
+ "mov %%esp, %%ecx\n" // set = full mask
+ "xor %%edx, %%edx\n" // old_set = NULL
+ "mov $8, %%esi\n" // mask all 64 signals
+ "mov $175, %%eax\n" // NR_rt_sigprocmask
+ "int $0x80\n"
+ "mov $126, %%eax\n" // NR_sigprocmask
+ "int $0x80\n"
+ "xor %%esp, %%esp\n" // invalidate the stack in all trusted code
"jmp 20f\n" // create trusted thread
// TODO(markus): Coalesce the read() operations by reading into a bigger
// buffer.
// Parameters:
- // %mm5: secure memory region
- // the page following this one contains the scratch space
// %mm0: thread's side of threadFd
// %mm1: processFdPub
// %mm3: return address after creation of new trusted thread
+ // %mm5: secure memory region
+ // the page following this one contains the scratch space
// Local variables:
// %mm2: sequence number for trusted calls
@@ -664,28 +815,29 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Layout of secure shared memory region (c.f. securemem.h):
// 0x00: pointer to the secure shared memory region (i.e. self)
// 0x04: sequence number; must match %mm2
- // 0x08: system call number; passed to syscall in %eax
- // 0x0C: first argument; passed to syscall in %ebx
- // 0x10: second argument; passed to syscall in %ecx
- // 0x14: third argument; passed to syscall in %edx
- // 0x18: fourth argument; passed to syscall in %esi
- // 0x1C: fifth argument; passed to syscall in %edi
- // 0x20: sixth argument; passed to syscall in %ebp
- // 0x24: stored return address for clone() system call
- // 0x28: stored %ebp value for clone() system call
- // 0x2C: stored %edi value for clone() system call
- // 0x30: stored %esi value for clone() system call
- // 0x34: stored %edx value for clone() system call
- // 0x38: stored %ecx value for clone() system call
- // 0x3C: stored %ebx value for clone() system call
- // 0x40: new shared memory for clone()
- // 0x44: processFdPub for talking to trusted process
- // 0x48: cloneFdPub for talking to trusted process
- // 0x4C: set to non-zero, if in debugging mode
- // 0x50: most recent SHM id returned by shmget(IPC_PRIVATE)
- // 0x54: cookie assigned to us by the trusted process (TLS_COOKIE)
- // 0x5C: thread id (TLS_TID)
- // 0x64: threadFdPub (TLS_THREAD_FD)
+ // 0x08: call type; must match %eax, iff %eax == -1 || %eax == -2
+ // 0x0C: system call number; passed to syscall in %eax
+ // 0x10: first argument; passed to syscall in %ebx
+ // 0x14: second argument; passed to syscall in %ecx
+ // 0x18: third argument; passed to syscall in %edx
+ // 0x1C: fourth argument; passed to syscall in %esi
+ // 0x20: fifth argument; passed to syscall in %edi
+ // 0x24: sixth argument; passed to syscall in %ebp
+ // 0x28: stored return address for clone() system call
+ // 0x2C: stored %ebp value for clone() system call
+ // 0x30: stored %edi value for clone() system call
+ // 0x34: stored %esi value for clone() system call
+ // 0x38: stored %edx value for clone() system call
+ // 0x3C: stored %ecx value for clone() system call
+ // 0x40: stored %ebx value for clone() system call
+ // 0x44: new shared memory for clone()
+ // 0x48: processFdPub for talking to trusted process
+ // 0x4C: cloneFdPub for talking to trusted process
+ // 0x50: set to non-zero, if in debugging mode
+ // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE)
+ // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE)
+ // 0x60: thread id (TLS_TID)
+ // 0x68: threadFdPub (TLS_THREAD_FD)
// 0x200-0x1000: securely passed verified file name(s)
// Layout of (untrusted) scratch space:
@@ -703,6 +855,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// 0x2C: last system call (updated in syscall.cc)
// 0x30: number of consecutive calls to a time fnc. (e.g. gettimeofday)
// 0x34: nesting level of system calls (for debugging purposes only)
+ // 0x38: signal mask
"0:xor %%esp, %%esp\n"
"mov $2, %%eax\n" // %mm2 = initial sequence number
@@ -713,7 +866,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// read(threadFd, &scratch, 4)
"1:mov $3, %%eax\n" // NR_read
"movd %%mm0, %%ebx\n" // fd = threadFd
- "movd %%mm5, %%ecx\n"
+ "movd %%mm5, %%ecx\n" // secure_mem
"add $0x1000, %%ecx\n" // buf = &scratch
"mov $4, %%edx\n" // len = 4
"2:int $0x80\n"
@@ -734,13 +887,15 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"3:movd %%mm2, %%ebp\n"
"cmp %%ebp, 0x4-0x1000(%%ecx)\n"
"jne 25f\n" // exit process
- "mov 0x08-0x1000(%%ecx), %%eax\n"
- "mov 0x0C-0x1000(%%ecx), %%ebx\n"
- "mov 0x14-0x1000(%%ecx), %%edx\n"
- "mov 0x18-0x1000(%%ecx), %%esi\n"
- "mov 0x1C-0x1000(%%ecx), %%edi\n"
- "mov 0x20-0x1000(%%ecx), %%ebp\n"
- "mov 0x10-0x1000(%%ecx), %%ecx\n"
+ "cmp 0x08-0x1000(%%ecx), %%eax\n"
+ "jne 25f\n" // exit process
+ "mov 0x0C-0x1000(%%ecx), %%eax\n"
+ "mov 0x10-0x1000(%%ecx), %%ebx\n"
+ "mov 0x18-0x1000(%%ecx), %%edx\n"
+ "mov 0x1C-0x1000(%%ecx), %%esi\n"
+ "mov 0x20-0x1000(%%ecx), %%edi\n"
+ "mov 0x24-0x1000(%%ecx), %%ebp\n"
+ "mov 0x14-0x1000(%%ecx), %%ecx\n"
"movd %%edi, %%mm4\n"
"movd %%ebp, %%mm7\n"
"movd %%mm2, %%ebp\n"
@@ -773,14 +928,14 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
"mov $125, %%eax\n" // NR_mprotect
"int $0x80\n"
- "mov %%ebp, 0x50(%%ebx)\n" // set most recently returned SysV shm id
+ "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id
"xor %%ebx, %%ebx\n"
// When debugging messages are enabled, warn about expensive system calls
#ifndef NDEBUG
"movd %%mm5, %%ecx\n"
- "cmpw $0, 0x4C(%%ecx)\n" // debug mode
- "jz 26f\n"
+ "cmpw $0, 0x50(%%ecx)\n" // debug mode
+ "jz 27f\n"
"mov $4, %%eax\n" // NR_write
"mov $2, %%ebx\n" // fd = stderr
"lea 101f, %%ecx\n" // "This is an expensive system call"
@@ -789,7 +944,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"xor %%ebx, %%ebx\n"
#endif
- "jmp 26f\n" // exit program, no message
+ "jmp 27f\n" // exit program, no message
"4:int $0x80\n"
"jmp 15f\n" // return result
@@ -801,10 +956,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"movd %%mm2, %%ebp\n"
"cmp %%ebp, 0x4-0x1000(%%ecx)\n"
"jne 25f\n" // exit process
+ "cmp %%eax, 0x8-0x1000(%%ecx)\n"
+ "jne 25f\n" // exit process
// When debugging messages are enabled, warn about expensive system calls
#ifndef NDEBUG
- "cmpw $0, 0x4C-0x1000(%%ecx)\n"
+ "cmpw $0, 0x50-0x1000(%%ecx)\n"
"jz 6f\n" // debug mode
"mov %%ecx, %%ebp\n"
"mov $4, %%eax\n" // NR_write
@@ -816,13 +973,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"6:"
#endif
- "mov 0x08-0x1000(%%ecx), %%eax\n"
- "mov 0x0C-0x1000(%%ecx), %%ebx\n"
- "mov 0x14-0x1000(%%ecx), %%edx\n"
- "mov 0x18-0x1000(%%ecx), %%esi\n"
- "mov 0x1C-0x1000(%%ecx), %%edi\n"
- "mov 0x20-0x1000(%%ecx), %%ebp\n"
- "mov 0x10-0x1000(%%ecx), %%ecx\n"
+ "mov 0x0C-0x1000(%%ecx), %%eax\n"
+ "mov 0x10-0x1000(%%ecx), %%ebx\n"
+ "mov 0x18-0x1000(%%ecx), %%edx\n"
+ "mov 0x1C-0x1000(%%ecx), %%esi\n"
+ "mov 0x20-0x1000(%%ecx), %%edi\n"
+ "mov 0x24-0x1000(%%ecx), %%ebp\n"
+ "mov 0x14-0x1000(%%ecx), %%ecx\n"
"movd %%edi, %%mm4\n"
"movd %%ebp, %%mm7\n"
"movd %%mm2, %%ebp\n"
@@ -864,7 +1021,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $7, %%eax\n" // NR_waitpid
"int $0x80\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 6\n"
+ "jz 8b\n"
"mov %%ebp, %%eax\n"
"jmp 15f\n" // return result
@@ -889,7 +1046,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Check in syscallTable whether this system call is unrestricted
"12:mov %%eax, %%ebp\n"
#ifndef NDEBUG
- "cmpw $0, 0x4C-0x1000(%%ecx)\n"
+ "cmpw $0, 0x50-0x1000(%%ecx)\n"
"jnz 13f\n" // debug mode
#endif
"cmp playground$maxSyscall, %%eax\n"
@@ -919,11 +1076,11 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov 0x14(%%ecx), %%ebp\n"
"mov 0x04(%%ecx), %%ecx\n"
"cmp $252, %%eax\n" // NR_exit_group
- "jz 26f\n" // exit program, no message
+ "jz 27f\n" // exit program, no message
"int $0x80\n"
// Return result of system call to sandboxed thread
- "15:movd %%mm5, %%ecx\n"
+ "15:movd %%mm5, %%ecx\n" // secure_mem
"add $0x101C, %%ecx\n" // buf = &scratch + 28
"mov %%eax, (%%ecx)\n"
"mov $4, %%edx\n" // len = 4
@@ -938,8 +1095,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// NR_exit:
// Exit trusted thread after cleaning up resources
- "18:mov %%edi, %%ecx\n"
- "mov 0x64(%%ecx), %%ebx\n" // fd = threadFdPub
+ "18:mov %%edi, %%ecx\n" // secure_mem
+ "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub
"mov $6, %%eax\n" // NR_close
"int $0x80\n"
"mov %%ecx, %%ebx\n" // start = secure_mem
@@ -966,14 +1123,10 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// resources with the caller (i.e. the previous trusted thread),
// and by extension it shares all resources with the sandbox'd
// threads.
- // N.B. It is possible to make the thread creation code crash before
- // it releases seccomp privileges. This is generally OK, as it just
- // terminates the program. But if we ever support signal handling,
- // we have to be careful that the user cannot install a SIGSEGV
- // handler that gets executed with elevated privileges.
- "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem
- "movd %%mm4, %%edi\n"
- "movd %%mm7, %%ebp\n"
+ "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem
+ "movd %%mm4, %%edi\n" // child_tidptr
+ "mov %%ecx, %%ebp\n" // remember child stack
+ "mov $1, %%ecx\n" // stack = 1
"int $0x80\n" // calls NR_clone
"cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values
"jae 7b\n" // unlock mutex, return result
@@ -986,6 +1139,22 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// In nascent thread, now.
"sub $2, %%edi\n"
"movd %%edi, %%mm2\n"
+
+ // We want to maintain an invalid %esp whenever we access untrusted
+ // memory. This ensures that even if an attacker can trick us into
+ // triggering a SIGSEGV, we will never successfully execute a signal
+ // handler.
+ // Signal handlers are inherently dangerous, as an attacker could trick
+ // us into returning to the wrong address by adjusting the signal stack
+ // right before the handler returns.
+ // N.B. While POSIX is curiously silent about this, it appears that on
+ // Linux, alternate signal stacks are a per-thread property. That is
+ // good. It means that this security mechanism works, even if the
+ // sandboxed thread manages to set up an alternate signal stack.
+ //
+ // TODO(markus): We currently do not support emulating calls to
+ // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc
+ // for a discussion on how to fix this, if this ever becomes necessary.
"movd %%eax, %%mm3\n" // Request to return from clone() when done
// Get thread id of nascent thread
@@ -995,23 +1164,20 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Nascent thread creates socketpair() for sending requests to
// trusted thread.
- // We can create the filehandles on the stack. Filehandles are
+ // We can create the filehandles on the child's stack. Filehandles are
// always treated as untrusted.
// socketpair(AF_UNIX, SOCK_STREAM, 0, fds)
"mov $102, %%eax\n" // NR_socketcall
"mov $8, %%ebx\n" // socketpair
- "sub $8, %%esp\n" // sv = %rsp
- "push %%esp\n"
- "xor %%ecx, %%ecx\n" // protocol = 0
- "push %%ecx\n"
- "mov $1, %%ecx\n" // type = SOCK_STREAM
- "push %%ecx\n"
- "push %%ecx\n" // domain = AF_UNIX
- "mov %%esp, %%ecx\n"
+ "sub $8, %%ebp\n" // sv = child_stack
+ "mov %%ebp, -0x04(%%ebp)\n"
+ "movl $0, -0x08(%%ebp)\n" // protocol = 0
+ "movl $1, -0x0C(%%ebp)\n" // type = SOCK_STREAM
+ "movl $1, -0x10(%%ebp)\n" // domain = AF_UNIX
+ "lea -0x10(%%ebp), %%ecx\n"
"int $0x80\n"
- "add $0x10, %%esp\n"
"test %%eax, %%eax\n"
- "jz 27f\n"
+ "jz 28f\n"
// If things went wrong, we don't have an (easy) way of signaling
// the parent. For our purposes, it is sufficient to fail with a
@@ -1043,19 +1209,18 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"lea 100f, %%ecx\n" // "Sandbox violation detected"
"mov $101f-100f, %%edx\n" // len = strlen(msg)
"int $0x80\n"
- "mov $1, %%ebx\n"
- "26:mov $252, %%eax\n" // NR_exit_group
+ "26:mov $1, %%ebx\n"
+ "27:mov $252, %%eax\n" // NR_exit_group
"jmp 24b\n"
// The first page is mapped read-only for use as securely shared memory
- "27:movd %%mm6, %%ebp\n"
- "mov 0x40(%%ebp), %%esi\n"
- "movd %%esi, %%mm5\n" // %mm5 = secure shared memory
- "movd %%mm2, %%edi\n"
- "cmp %%edi, 4(%%ebp)\n"
+ "28:movd %%mm6, %%edi\n" // %edi = old_shared_mem
+ "mov 0x44(%%edi), %%ebx\n" // addr = secure_mem
+ "movd %%ebx, %%mm5\n" // %mm5 = secure_mem
+ "movd %%mm2, %%esi\n"
+ "cmp %%esi, 4(%%edi)\n"
"jne 25b\n" // exit process
"mov $125, %%eax\n" // NR_mprotect
- "mov %%esi, %%ebx\n"
"mov $4096, %%ecx\n" // len = 4096
"mov $1, %%edx\n" // prot = PROT_READ
"int $0x80\n"
@@ -1070,13 +1235,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Call clone() to create new trusted thread().
// clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
// CLONE_SYSVSEM|CLONE_UNTRACED, stack, NULL, NULL, NULL)
- "mov 4(%%esp), %%eax\n"
+ "mov 4(%%ebp), %%eax\n" // threadFd (on child's stack)
"movd %%eax, %%mm0\n" // %mm0 = threadFd
"mov $120, %%eax\n" // NR_clone
"mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR
"mov $1, %%ecx\n" // stack = 1
- "movd 0x44(%%ebp), %%mm1\n" // %mm1 = processFdPub
- "cmp %%edi, 4(%%ebp)\n"
+ "movd 0x48(%%edi), %%mm1\n" // %mm1 = processFdPub
+ "cmp %%esi, 4(%%edi)\n"
"jne 25b\n" // exit process
"int $0x80\n"
"test %%eax, %%eax\n"
@@ -1085,86 +1250,146 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Set up thread local storage
"mov $0x51, %%eax\n" // seg_32bit, limit_in_pages, useable
- "push %%eax\n"
+ "mov %%eax, -0x04(%%ebp)\n"
"mov $0xFFFFF, %%eax\n" // limit
- "push %%eax\n"
- "add $0x54, %%esi\n"
- "push %%esi\n" // base_addr = &secure_mem.TLS
+ "mov %%eax, -0x08(%%ebp)\n"
+ "movd %%mm5, %%eax\n"
+ "add $0x58, %%eax\n"
+ "mov %%eax, -0x0C(%%ebp)\n" // base_addr = &secure_mem.TLS
"mov %%fs, %%eax\n"
"shr $3, %%eax\n"
- "push %%eax\n" // entry_number
+ "mov %%eax, -0x10(%%ebp)\n" // entry_number
"mov $243, %%eax\n" // NR_set_thread_area
- "mov %%esp, %%ebx\n"
+ "lea -0x10(%%ebp), %%ebx\n"
"int $0x80\n"
"test %%eax, %%eax\n"
"jnz 25b\n" // exit process
- "add $16, %%esp\n"
+
+ // Copy the caller's signal mask
+ "movd %%mm5, %%edx\n"
+ "mov 0x1038(%%edi), %%eax\n"
+ "mov %%eax, 0x1038(%%edx)\n"
+ "mov 0x103C(%%edi), %%eax\n"
+ "mov %%eax, 0x103C(%%edx)\n"
// Done creating trusted thread. We can now get ready to return to caller
- "mov 0(%%esp), %%esi\n" // %esi = threadFdPub
- "add $8, %%esp\n"
+ "mov 0(%%ebp), %%esi\n" // %esi = threadFdPub
+ "add $8, %%ebp\n"
// Check whether this is the initial thread, or a newly created one.
// At startup we run the same code as when we create a new thread. At
- // the very top of this function, you will find that we store 999(%rip)
+ // the very top of this function, you will find that we store 999f
// in %%mm3. That is the signal that we should return on the same
// stack rather than return to where clone was called.
"movd %%mm3, %%eax\n"
+ "movd %%mm2, %%edx\n"
"test %%eax, %%eax\n"
- "jne 28f\n"
+ "jne 29f\n"
// Returning from clone() into the newly created thread is special. We
// cannot unroll the stack, as we just set up a new stack for this
// thread. We have to explicitly restore CPU registers to the values
// that they had when the program originally called clone().
- "mov 0x24(%%ebp), %%eax\n"
- "push %%eax\n"
- "mov 0x28(%%ebp), %%eax\n"
- "push %%eax\n"
- "mov 0x2C(%%ebp), %%eax\n"
- "push %%eax\n"
- "mov 0x30(%%ebp), %%eax\n"
- "push %%eax\n"
- "mov 0x34(%%ebp), %%eax\n"
- "push %%eax\n"
- "mov 0x38(%%ebp), %%eax\n"
- "push %%eax\n"
- "mov 0x3C(%%ebp), %%eax\n"
- "push %%eax\n"
- "cmp %%edi, 4(%%ebp)\n"
+ // We patch the register values in the signal stack frame so that we
+ // can ask sigreturn() to restore all registers for us.
+ "sub $0x4, %%ebp\n"
+ "mov 0x28(%%edi), %%eax\n"
+ "mov %%eax, 0x00(%%ebp)\n" // return address
+ "xor %%eax, %%eax\n"
+ "mov %%eax, 0x30(%%ebp)\n" // %eax = 0
+ "mov 0x2C(%%edi), %%eax\n"
+ "mov %%eax, 0x1C(%%ebp)\n" // %ebp
+ "mov 0x30(%%edi), %%eax\n"
+ "mov %%eax, 0x14(%%ebp)\n" // %edi
+ "mov 0x34(%%edi), %%eax\n"
+ "mov %%eax, 0x18(%%ebp)\n" // %esi
+ "mov 0x38(%%edi), %%eax\n"
+ "mov %%eax, 0x28(%%ebp)\n" // %edx
+ "mov 0x3C(%%edi), %%eax\n"
+ "mov %%eax, 0x2C(%%ebp)\n" // %ecx
+ "mov 0x40(%%edi), %%eax\n"
+ "mov %%eax, 0x24(%%ebp)\n" // %ebx
+ "cmp %%edx, 4(%%edi)\n"
"jne 25b\n" // exit process
// Nascent thread launches a helper that doesn't share any of our
// resources, except for pages mapped as MAP_SHARED.
- // clone(0, %esp)
- "28:mov $120, %%eax\n" // NR_clone
+ // clone(SIGCHLD, stack=1)
+ "29:mov $120, %%eax\n" // NR_clone
"mov $17, %%ebx\n" // flags = SIGCHLD
- "mov %%esp, %%ecx\n" // stack = %esp
+ "mov $1, %%ecx\n" // stack = 1
"int $0x80\n"
"test %%eax, %%eax\n"
"js 25b\n" // exit process
- "jne 29f\n"
+ "jne 31f\n"
// Use sendmsg() to send to the trusted process the file handles for
// communicating with the new trusted thread. We also send the address
// of the secure memory area (for sanity checks) and the thread id.
- "push %%esi\n" // threadFdPub
- "movd %%mm4, %%eax\n" // threadId
- "push %%eax\n"
- "movd %%mm5, %%eax\n" // secure_mem
- "push %%eax\n"
- "mov %%esp, %%ebx\n" // buf = &data
- "mov $12, %%eax\n" // len = sizeof(void*) + 2*sizeof(int)
- "push %%eax\n"
- "push %%ebx\n"
- "movd %%mm0, %%eax\n" // fd1 = threadFd
- "push %%eax\n"
- "push %%esi\n" // fd0 = threadFdPub
- "mov 0x48(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub()
- "cmp %%edi, 4(%%ebp)\n"
+ "cmp %%edx, 4(%%edi)\n"
"jne 25b\n" // exit process
- "push %%eax\n"
- "call playground$sendFd\n"
+
+ // 0x00 socketcall:
+ // 0x00 socket (0x4C(%edi))
+ // 0x04 msg (%ecx + 0x0C)
+ // 0x08 flags ($0)
+ // 0x0C msg:
+ // 0x0C msg_name ($0)
+ // 0x10 msg_namelen ($0)
+ // 0x14 msg_iov (%ecx + 0x34)
+ // 0x18 msg_iovlen ($1)
+ // 0x1C msg_control (%ecx + 0x3C)
+ // 0x20 msg_controllen ($0x14)
+ // 0x24 data:
+ // 0x24 msg_flags/err ($0)
+ // 0x28 secure_mem (%mm5)
+ // 0x2C threadId (%mm4)
+ // 0x30 threadFdPub (%esi)
+ // 0x34 iov:
+ // 0x34 iov_base (%ecx + 0x24)
+ // 0x38 iov_len ($0x10)
+ // 0x3C cmsg:
+ // 0x3C cmsg_len ($0x14)
+ // 0x40 cmsg_level ($1, SOL_SOCKET)
+ // 0x44 cmsg_type ($1, SCM_RIGHTS)
+ // 0x48 threadFdPub (%esi)
+ // 0x4C threadFd (%mm0)
+ // 0x50
+ "lea -0x50(%%ebp), %%ecx\n"
+ "xor %%eax, %%eax\n"
+ "mov %%eax, 0x08(%%ecx)\n" // flags
+ "mov %%eax, 0x0C(%%ecx)\n" // msg_name
+ "mov %%eax, 0x10(%%ecx)\n" // msg_namelen
+ "mov %%eax, 0x24(%%ecx)\n" // msg_flags
+ "inc %%eax\n"
+ "mov %%eax, 0x18(%%ecx)\n" // msg_iovlen
+ "mov %%eax, 0x40(%%ecx)\n" // cmsg_level
+ "mov %%eax, 0x44(%%ecx)\n" // cmsg_type
+ "movl $0x10, 0x38(%%ecx)\n" // iov_len
+ "mov $0x14, %%eax\n"
+ "mov %%eax, 0x20(%%ecx)\n" // msg_controllen
+ "mov %%eax, 0x3C(%%ecx)\n" // cmsg_len
+ "mov 0x4C(%%edi), %%eax\n" // cloneFdPub
+ "mov %%eax, 0x00(%%ecx)\n" // socket
+ "lea 0x0C(%%ecx), %%eax\n"
+ "mov %%eax, 0x04(%%ecx)\n" // msg
+ "add $0x18, %%eax\n"
+ "mov %%eax, 0x34(%%ecx)\n" // iov_base
+ "add $0x10, %%eax\n"
+ "mov %%eax, 0x14(%%ecx)\n" // msg_iov
+ "add $8, %%eax\n"
+ "mov %%eax, 0x1C(%%ecx)\n" // msg_control
+ "mov %%esi, 0x30(%%ecx)\n" // threadFdPub
+ "mov %%esi, 0x48(%%ecx)\n" // threadFdPub
+ "movd %%mm5, %%eax\n"
+ "mov %%eax, 0x28(%%ecx)\n" // secure_mem
+ "movd %%mm4, %%eax\n"
+ "mov %%eax, 0x2C(%%ecx)\n" // threadId
+ "movd %%mm0, %%eax\n"
+ "mov %%eax, 0x4C(%%ecx)\n" // threadFd
+ "mov $16, %%ebx\n" // sendmsg()
+ "mov $102, %%eax\n" // NR_socketcall
+ "int $0x80\n"
// Release syscall_mutex_. This signals the trusted process that
// it can write into the original thread's secure memory again.
@@ -1173,31 +1398,42 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $4096, %%ecx\n"
"mov $3, %%edx\n" // PROT_READ | PROT_WRITE
"int $0x80\n"
+ "movd %%mm2, %%edx\n"
+ "cmp %%edx, 0x4(%%edi)\n"
+ "jnz 25b\n" // exit process
"lock; addl $0x80000000, (%%ebx)\n"
- "jz 26b\n" // exit process (no error message)
+ "jz 30f\n" // exit process (no error message)
"mov $1, %%edx\n"
"mov %%edx, %%ecx\n" // FUTEX_WAKE
"mov $240, %%eax\n" // NR_futex
"int $0x80\n"
- "jmp 26b\n" // exit process (no error message)
+ "30:xor %%ebx, %%ebx\n"
+ "jmp 27b\n" // exit process (no error message)
// Reap helper
- "29:mov %%eax, %%ebx\n"
- "30:xor %%ecx, %%ecx\n"
+ "31:mov %%eax, %%ebx\n"
+ "32:lea -4(%%ebp), %%ecx\n"
"xor %%edx, %%edx\n"
"mov $7, %%eax\n" // NR_waitpid
"int $0x80\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 30b\n"
+ "jz 32b\n"
+ "mov -4(%%ebp), %%eax\n"
+ "test %%eax, %%eax\n"
+ "jnz 26b\n" // exit process (no error message)
// Release privileges by entering seccomp mode.
- "mov $172, %%eax\n" // NR_prctl
+ "33:mov $172, %%eax\n" // NR_prctl
"mov $22, %%ebx\n" // PR_SET_SECCOMP
"mov $1, %%ecx\n"
"int $0x80\n"
"test %%eax, %%eax\n"
"jnz 25b\n" // exit process
+ // We can finally start using the stack. Signal handlers no longer pose
+ // a threat to us.
+ "mov %%ebp, %%esp\n"
+
// Back in the newly created sandboxed thread, wait for trusted process
// to receive request. It is possible for an attacker to make us
// continue even before the trusted process is done. This is OK. It'll
@@ -1205,12 +1441,12 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// data is considered untrusted anyway.
"push %%eax\n"
"mov $1, %%edx\n" // len = 1
- "mov %%esp, %%ecx\n" // buf = %rsp
+ "mov %%esp, %%ecx\n" // buf = %esp
"mov %%esi, %%ebx\n" // fd = threadFdPub
- "31:mov $3, %%eax\n" // NR_read
+ "34:mov $3, %%eax\n" // NR_read
"int $0x80\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 31b\n"
+ "jz 34b\n"
"cmp %%edx, %%eax\n"
"jne 25b\n" // exit process
"pop %%eax\n"
@@ -1223,19 +1459,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// operations.
"emms\n"
- // Returning to createTrustedThread()
"test %%ebx, %%ebx\n"
- "jz 32f\n"
- "jmp *%%ebx\n"
-
- // Returning to the place where clone() had been called
- "32:pop %%ebx\n"
- "pop %%ecx\n"
- "pop %%edx\n"
- "pop %%esi\n"
- "pop %%edi\n"
- "pop %%ebp\n"
- "ret\n"
+ "jnz 35f\n" // Returning to createTrustedThread()
+
+ // Returning to the place where clone() had been called. We rely on
+ // using sigreturn() for restoring our registers. The caller already
+ // created a signal stack frame, and we patched the register values
+ // with the ones that were in effect prior to calling sandbox_clone().
+ "pop %%ebx\n"
+ "35:mov %%ebx, 0x38(%%esp)\n" // compute new %eip
+ "mov $119, %%eax\n" // NR_sigreturn
+ "int $0x80\n"
".pushsection \".rodata\"\n"
"100:.ascii \"Sandbox violation detected, program aborted\\n\"\n"
diff --git a/sandbox/sandbox.gyp b/sandbox/sandbox.gyp
index a835089..b73c1e5 100644
--- a/sandbox/sandbox.gyp
+++ b/sandbox/sandbox.gyp
@@ -181,6 +181,7 @@
'linux/seccomp/sandbox_impl.h',
'linux/seccomp/securemem.cc',
'linux/seccomp/securemem.h',
+ 'linux/seccomp/sigprocmask.cc',
'linux/seccomp/socketcall.cc',
'linux/seccomp/stat.cc',
'linux/seccomp/syscall.cc',