diff options
Diffstat (limited to 'sandbox/linux/seccomp')
45 files changed, 14319 insertions, 0 deletions
diff --git a/sandbox/linux/seccomp/Makefile b/sandbox/linux/seccomp/Makefile new file mode 100644 index 0000000..141d8c3 --- /dev/null +++ b/sandbox/linux/seccomp/Makefile @@ -0,0 +1,59 @@ +# Copyright (c) 2010 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# This Makefile temporarily has been checked into the source tree so that +# we can run the tests. It will be replaced with a proper gyp file. + +CFLAGS = -g -O0 -Wall -Werror -Wextra -Wno-missing-field-initializers \ + -Wno-unused-parameter -I. +LDFLAGS = -g +CPPFLAGS = +MODS := allocator library debug maps x86_decode securemem sandbox \ + syscall syscall_table trusted_thread trusted_process \ + access exit clone getpid gettid ioctl ipc madvise mmap mprotect \ + munmap open sigaction sigprocmask socketcall stat +OBJS64 := $(shell echo ${MODS} | xargs -n 1 | sed -e 's/$$/.o64/') +OBJS32 := $(shell echo ${MODS} | xargs -n 1 | sed -e 's/$$/.o32/') +HEADERS:= $(shell for i in ${MODS}; do [ -r "$$i" ] && echo "$$i"; done) + +.SUFFIXES: .o64 .o32 + +all: test + +clean: + -rm -f *.o *.o32 *.o64 tests/*.o32 tests/*.o64 + -rm -f core core.* vgcore vgcore.* strace.log* + -rm -f run_tests_32 run_tests_64 + -rm -f tests/test_syscalls.o64 tests/test_syscalls.o32 + -rm -f tests/test-list.h + +test: run_tests_64 run_tests_32 + ./run_tests_64 + ./run_tests_32 + +# TODO: Track header file dependencies properly +tests/test_syscalls.o64 tests/test_syscalls.o32: tests/test-list.h + +tests/test-list.h: tests/list_tests.py tests/test_syscalls.cc + python tests/list_tests.py tests/test_syscalls.cc > $@ + +# Link with the same compiler that the compile rules use, and honor LDFLAGS. +run_tests_64: $(OBJS64) tests/test_syscalls.o64 tests/test-list.h + ${CXX} ${LDFLAGS} -m64 tests/test_syscalls.o64 $(OBJS64) -lpthread -lutil -o $@ +run_tests_32: $(OBJS32) tests/test_syscalls.o32 tests/test-list.h + ${CXX} ${LDFLAGS} -m32 tests/test_syscalls.o32 $(OBJS32) -lpthread -lutil -o $@ + +.cc.o: ${HEADERS} + ${CXX} ${CFLAGS} ${CPPFLAGS} -c -o $@ $<
+ +.cc.o64: ${HEADERS} + ${CXX} ${CFLAGS} ${CPPFLAGS} -fPIC -c -o $@ $< + +.c.o64: ${HEADERS} + ${CC} ${CFLAGS} ${CPPFLAGS} --std=gnu99 -fPIC -c -o $@ $< + +.cc.o32: ${HEADERS} + ${CXX} ${CFLAGS} ${CPPFLAGS} -m32 -fPIC -c -o $@ $< + +.c.o32: ${HEADERS} + ${CC} ${CFLAGS} ${CPPFLAGS} -m32 --std=gnu99 -fPIC -c -o $@ $< diff --git a/sandbox/linux/seccomp/access.cc b/sandbox/linux/seccomp/access.cc new file mode 100644 index 0000000..fbe7e53 --- /dev/null +++ b/sandbox/linux/seccomp/access.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +long Sandbox::sandbox_access(const char *pathname, int mode) { + long long tm; + Debug::syscall(&tm, __NR_access, "Executing handler"); + size_t len = strlen(pathname); + struct Request { + int sysnum; + long long cookie; + Access access_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_access; + request->cookie = cookie(); + request->access_req.path_length = len; + request->access_req.mode = mode; + memcpy(request->pathname, pathname, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward access() request [sandbox]"); + } + Debug::elapsed(tm, __NR_access); + return rc; +} + +bool Sandbox::process_access(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SysCalls sys; + Access access_req; + if (read(sys, sandboxFd, &access_req, sizeof(access_req)) != + sizeof(access_req)) { + read_parm_failed: + die("Failed to read parameters for access() [process]"); + } + int rc = -ENAMETOOLONG; + if 
(access_req.path_length >= sizeof(mem->pathname)) { + char buf[32]; + while (access_req.path_length > 0) { + size_t len = access_req.path_length > sizeof(buf) ? + sizeof(buf) : access_req.path_length; + ssize_t i = read(sys, sandboxFd, buf, len); + if (i <= 0) { + goto read_parm_failed; + } + access_req.path_length -= i; + } + if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to return data from access() [process]"); + } + return false; + } + + if (!g_policy.allow_file_namespace) { + // After locking the mutex, we can no longer abandon the system call. So, + // perform checks before clobbering the securely shared memory. + char tmp[access_req.path_length]; + if (read(sys, sandboxFd, tmp, access_req.path_length) != + (ssize_t)access_req.path_length) { + goto read_parm_failed; + } + Debug::message(("Denying access to \"" + std::string(tmp) + "\"").c_str()); + SecureMem::abandonSystemCall(threadFd, -EACCES); + return false; + } + + SecureMem::lockSystemCall(parentMapsFd, mem); + if (read(sys, sandboxFd, mem->pathname, access_req.path_length) != + (ssize_t)access_req.path_length) { + goto read_parm_failed; + } + mem->pathname[access_req.path_length] = '\000'; + + // TODO(markus): Implement sandboxing policy + Debug::message(("Allowing access to \"" + std::string(mem->pathname) + + "\"").c_str()); + + // Tell trusted thread to access the file. + SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, __NR_access, + mem->pathname - (char*)mem + (char*)mem->self, + access_req.mode); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/allocator.cc b/sandbox/linux/seccomp/allocator.cc new file mode 100644 index 0000000..6e11a4a --- /dev/null +++ b/sandbox/linux/seccomp/allocator.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// The allocator is very simplistic. 
It requests memory pages directly from +// the system. Each page starts with a header describing the allocation. This +// makes sure that we can return the memory to the system when it is +// deallocated. +// For allocations that are smaller than a single page, we try to squeeze +// multiple of them into the same page. +// We expect to use this allocator for a moderate number of small allocations. +// In most cases, it will only need to ever make a single request to the +// operating system for the lifetime of the STL container object. +// We don't worry about memory fragmentation as the allocator is expected to +// be short-lived. + +#include <stdint.h> +#include <sys/mman.h> + +#include "allocator.h" +#include "linux_syscall_support.h" + +namespace playground { + +class SysCalls { + public: + #define SYS_CPLUSPLUS + #define SYS_ERRNO my_errno + #define SYS_INLINE inline + #define SYS_PREFIX -1 + #undef SYS_LINUX_SYSCALL_SUPPORT_H + #include "linux_syscall_support.h" + SysCalls() : my_errno(0) { } + int my_errno; +}; +#ifdef __NR_mmap2 + #define MMAP mmap2 + #define __NR_MMAP __NR_mmap2 +#else + #define MMAP mmap + #define __NR_MMAP __NR_mmap +#endif + +// We only ever keep track of the very last partial page that was used for +// allocations. This approach simplifies the code a lot. It can theoretically +// lead to more memory fragmentation, but for our use case that is unlikely +// to happen. +struct Header { + // The total amount of memory allocated for this chunk of memory. Typically, + // this would be a single page. + size_t total_len; + + // "used" keeps track of the number of bytes currently allocated in this + // page. Note that as elements are freed from this page, "used" is updated + // allowing us to track when the page is free. However, these holes in the + // page are never re-used, so "tail" is the only way to find out how much + // free space remains and when we need to request another chunk of memory + // from the system. 
+ size_t used; + void *tail; +}; +static Header* last_alloc; + +void* SystemAllocatorHelper::sys_allocate(size_t size) { + // Number of bytes that need to be allocated + if (size + 3 < size) { + return NULL; + } + size_t len = (size + 3) & ~3; + + if (last_alloc) { + // Remaining space in the last chunk of memory allocated from system + size_t remainder = last_alloc->total_len - + (reinterpret_cast<char *>(last_alloc->tail) - + reinterpret_cast<char *>(last_alloc)); + + if (remainder >= len) { + void* ret = last_alloc->tail; + last_alloc->tail = reinterpret_cast<char *>(last_alloc->tail) + len; + last_alloc->used += len; + return ret; + } + } + + SysCalls sys; + if (sizeof(Header) + len + 4095 < len) { + return NULL; + } + size_t total_len = (sizeof(Header) + len + 4095) & ~4095; + Header* mem = reinterpret_cast<Header *>( + sys.MMAP(NULL, total_len, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); + if (mem == MAP_FAILED) { + return NULL; + } + + // If we were only asked to allocate a single page, then we will use any + // remaining space for other small allocations. Multi-page chunks must not + // be reused, as sys_deallocate() locates the header by truncating the + // address to the start of its page. + if (total_len == 4096 && total_len - sizeof(Header) - len >= 4) { + last_alloc = mem; + } + mem->total_len = total_len; + mem->used = len; + char* ret = reinterpret_cast<char *>(mem) + sizeof(Header); + mem->tail = ret + len; + + return ret; +} + +void SystemAllocatorHelper::sys_deallocate(void* p, size_t size) { + // Number of bytes in this allocation + if (size + 3 < size) { + return; + } + size_t len = (size + 3) & ~3; + + // All allocations (small and large) have starting addresses in the + // first page that was allocated from the system. This page starts with + // a header that keeps track of how many bytes are currently used. The + // header can be found by truncating the last few bits of the address.
+ Header* header = reinterpret_cast<Header *>( + reinterpret_cast<uintptr_t>(p) & ~4095); + header->used -= len; + + // After the last allocation has been freed, return the page(s) to the + // system + if (!header->used) { + SysCalls sys; + sys.munmap(header, header->total_len); + if (last_alloc == header) { + last_alloc = NULL; + } + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/allocator.h b/sandbox/linux/seccomp/allocator.h new file mode 100644 index 0000000..29e0065 --- /dev/null +++ b/sandbox/linux/seccomp/allocator.h @@ -0,0 +1,88 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Implement a very basic memory allocator that make direct system calls +// instead of relying on libc. +// This allocator is not thread-safe. + +#ifndef ALLOCATOR_H__ +#define ALLOCATOR_H__ + +#include <cstddef> + +namespace playground { + +class SystemAllocatorHelper { + protected: + static void *sys_allocate(size_t size); + static void sys_deallocate(void* p, size_t size); +}; + +template <class T> +class SystemAllocator : SystemAllocatorHelper { + public: + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + + template <class U> + struct rebind { + typedef SystemAllocator<U> other; + }; + + pointer address(reference value) const { + return &value; + } + + const_pointer address(const_reference value) const { + return &value; + } + + SystemAllocator() throw() { } + SystemAllocator(const SystemAllocator& src) throw() { } + template <class U> SystemAllocator(const SystemAllocator<U>& src) throw() { } + ~SystemAllocator() throw() { } + + size_type max_size() const throw() { + return (1 << 30) / sizeof(T); + } + + pointer allocate(size_type num, const void* = 0) { + if (num > max_size()) { + 
return NULL; + } + return (pointer)sys_allocate(num * sizeof(T)); + } + + void construct(pointer p, const T& value) { + new(reinterpret_cast<void *>(p))T(value); + } + + void destroy(pointer p) { + p->~T(); + } + + void deallocate(pointer p, size_type num) { + sys_deallocate(p, num * sizeof(T)); + } +}; + +template <class T1, class T2> +bool operator== (const SystemAllocator<T1>&, const SystemAllocator<T2>&) + throw() { + return true; +} +template <class T1, class T2> +bool operator!= (const SystemAllocator<T1>&, const SystemAllocator<T2>&) + throw() { + return false; +} + +} // namespace + +#endif // ALLOCATOR_H__ diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc new file mode 100644 index 0000000..0d35181 --- /dev/null +++ b/sandbox/linux/seccomp/clone.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +long Sandbox::sandbox_clone(int flags, char* stack, int* pid, int* ctid, + void* tls, void *wrapper_sp) { + long long tm; + Debug::syscall(&tm, __NR_clone, "Executing handler"); + struct { + int sysnum; + long long cookie; + Clone clone_req; + } __attribute__((packed)) request; + request.sysnum = __NR_clone; + request.cookie = cookie(); + request.clone_req.flags = flags; + request.clone_req.stack = stack; + request.clone_req.pid = pid; + request.clone_req.ctid = ctid; + request.clone_req.tls = tls; + + // TODO(markus): Passing stack == 0 currently does not do the same thing + // that the kernel would do without the sandbox. This is just going to + // cause a crash. We should detect this case, and replace the stack pointer + // with the correct value, instead. + // This is complicated by the fact that we will temporarily be executing + // both threads from the same stack. Some synchronization will be necessary. 
+ // Fortunately, this complication also explains why hardly anybody ever + // does this. + // See trusted_thread.cc for more information. + long rc; + if (stack == 0) { + rc = -EINVAL; + } else { + // Pass along the address on the stack where syscallWrapper() stored the + // original CPU registers. These registers will be restored in the newly + // created thread prior to returning from the wrapped system call. + #if defined(__x86_64__) + memcpy(&request.clone_req.regs64, wrapper_sp, + sizeof(request.clone_req.regs64) + sizeof(void *)); + #elif defined(__i386__) + memcpy(&request.clone_req.regs32, wrapper_sp, + sizeof(request.clone_req.regs32) + sizeof(void *)); + #else + #error Unsupported target platform + #endif + + // In order to unblock the signal mask in the newly created thread and + // after entering Seccomp mode, we have to call sigreturn(). But that + // requires access to a proper stack frame describing a valid signal. + // We trigger a signal now and make sure the stack frame ends up on the + // new stack. Our segv() handler (in sandbox.cc) does that for us. + // See trusted_thread.cc for more details on how threads get created. + // + // In general we rely on the kernel for generating the signal stack + // frame, as the exact binary format has been extended several times over + // the course of the kernel's development. Fortunately, the kernel + // developers treat the initial part of the stack frame as a stable part + // of the ABI. So, we can rely on fixed, well-defined offsets for accessing + // register values and for accessing the signal mask. + #if defined(__x86_64__) + // Red zone compensation. The instrumented system call will remove 128 + // bytes from the thread's stack prior to returning to the original + // call site. 
+ stack -= 128; + request.clone_req.stack = stack; + void *dummy; + asm volatile("mov %%rsp, %%rcx\n" + "mov %3, %%rsp\n" + "int $0\n" + "mov %%rcx, %%rsp\n" + : "=a"(request.clone_req.stack), "=&c"(dummy) + : "a"(__NR_clone + 0xF000), "m"(request.clone_req.stack) + : "memory"); + #elif defined(__i386__) + void *dummy; + asm volatile("mov %%esp, %%ecx\n" + "mov %3, %%esp\n" + "int $0\n" + "mov %%ecx, %%esp\n" + : "=a"(request.clone_req.stack), "=&c"(dummy) + : "a"(__NR_clone + 0xF000), "m"(request.clone_req.stack) + : "memory"); + #else + #error Unsupported target platform + #endif + + // Adjust the signal stack frame so that it contains the correct stack + // pointer upon returning from sigreturn(). + #if defined(__x86_64__) + *(char **)(request.clone_req.stack + 0xA0) = stack; + #elif defined(__i386__) + *(char **)(request.clone_req.stack + 0x1C) = stack; + #else + #error Unsupported target platform + #endif + + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward clone() request [sandbox]"); + } + } + Debug::elapsed(tm, __NR_clone); + return rc; +} + +bool Sandbox::process_clone(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + Clone clone_req; + SysCalls sys; + if (read(sys, sandboxFd, &clone_req, sizeof(clone_req)) !=sizeof(clone_req)){ + die("Failed to read parameters for clone() [process]"); + } + + // TODO(markus): add policy restricting parameters for clone + if ((clone_req.flags & ~CLONE_DETACHED) != (CLONE_VM|CLONE_FS|CLONE_FILES| + CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS| + CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID)) { + SecureMem::abandonSystemCall(threadFd, -EPERM); + return false; + } else { + SecureMem::Args* newMem = getNewSecureMem(); + if (!newMem) { + SecureMem::abandonSystemCall(threadFd, -ENOMEM); + return false; + } else { + // clone() has 
unusual semantics. We don't want to return back into the + // trusted thread, but instead we need to continue execution at the IP + // where we got called initially. + SecureMem::lockSystemCall(parentMapsFd, mem); + mem->ret = clone_req.ret; + #if defined(__x86_64__) + mem->rbp = clone_req.regs64.rbp; + mem->rbx = clone_req.regs64.rbx; + mem->rcx = clone_req.regs64.rcx; + mem->rdx = clone_req.regs64.rdx; + mem->rsi = clone_req.regs64.rsi; + mem->rdi = clone_req.regs64.rdi; + mem->r8 = clone_req.regs64.r8; + mem->r9 = clone_req.regs64.r9; + mem->r10 = clone_req.regs64.r10; + mem->r11 = clone_req.regs64.r11; + mem->r12 = clone_req.regs64.r12; + mem->r13 = clone_req.regs64.r13; + mem->r14 = clone_req.regs64.r14; + mem->r15 = clone_req.regs64.r15; + #elif defined(__i386__) + mem->ebp = clone_req.regs32.ebp; + mem->edi = clone_req.regs32.edi; + mem->esi = clone_req.regs32.esi; + mem->edx = clone_req.regs32.edx; + mem->ecx = clone_req.regs32.ecx; + mem->ebx = clone_req.regs32.ebx; + #else + #error Unsupported target platform + #endif + newMem->sequence = 0; + newMem->shmId = -1; + mem->newSecureMem = newMem; + mem->processFdPub = processFdPub_; + mem->cloneFdPub = cloneFdPub_; + + SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, + __NR_clone, clone_req.flags, clone_req.stack, + clone_req.pid, clone_req.ctid, clone_req.tls); + return true; + } + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/debug.cc b/sandbox/linux/seccomp/debug.cc new file mode 100644 index 0000000..5d6de49 --- /dev/null +++ b/sandbox/linux/seccomp/debug.cc @@ -0,0 +1,363 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef NDEBUG + +#include "debug.h" + +namespace playground { + +bool Debug::enabled_; +int Debug::numSyscallNames_; +const char **Debug::syscallNames_; +std::map<int, std::string> Debug::syscallNamesMap_; + +Debug Debug::debug_; + +Debug::Debug() { + // Logging is disabled by default, but can be turned on by setting an + // appropriate environment variable. Initialize this code from a global + // constructor, so that it runs before the sandbox is turned on. + enabled_ = !!getenv("SECCOMP_SANDBOX_DEBUGGING"); + + // Read names of system calls from header files, if available. Symbolic + // names make debugging so much nicer. + if (enabled_) { + static const char *filenames[] = { + #if __WORDSIZE == 64 + "/usr/include/asm/unistd_64.h", + #elif __WORDSIZE == 32 + "/usr/include/asm/unistd_32.h", + #endif + "/usr/include/asm/unistd.h", + NULL }; + numSyscallNames_ = 0; + for (const char **fn = filenames; *fn; ++fn) { + FILE *fp = fopen(*fn, "r"); + if (fp) { + std::string baseName; + int baseNum = -1; + char buf[80]; + while (fgets(buf, sizeof(buf), fp)) { + // Check if the line starts with "#define" + static const char* whitespace = " \t\r\n"; + char *token, *save; + token = strtok_r(buf, whitespace, &save); + if (token && !strcmp(token, "#define")) { + + // Only parse identifiers that start with "__NR_" + token = strtok_r(NULL, whitespace, &save); + if (token) { + if (strncmp(token, "__NR_", 5)) { + continue; + } + std::string syscallName(token + 5); + + // Parse the value of the symbol. Try to be forgiving in what + // we accept, as the file format might change over time. + token = strtok_r(NULL, "\r\n", &save); + if (token) { + // Some values are defined relative to previous values, we + // detect these examples by finding an earlier symbol name + // followed by a '+' plus character. 
+ bool isRelative = false; + char *base = strstr(token, baseName.c_str()); + if (baseNum >= 0 && base) { + base += baseName.length(); + while (*base == ' ' || *base == '\t') { + ++base; + } + if (*base == '+') { + isRelative = true; + token = base; + } + } + + // Skip any characters that are not part of the syscall number. Stop + // at the end of the string, if the line does not contain a number. + while (*token && (*token < '0' || *token > '9')) { + token++; + } + + // If we now have a valid datum, enter it into our map. + if (*token) { + int sysnum = atoi(token); + + // Deal with symbols that are defined relative to earlier + // ones. + if (isRelative) { + sysnum += baseNum; + } else { + baseNum = sysnum; + baseName = syscallName; + } + + // Keep track of the highest syscall number that we know + // about. + if (sysnum >= numSyscallNames_) { + numSyscallNames_ = sysnum + 1; + } + + syscallNamesMap_[sysnum] = syscallName; + } + } + } + } + } + fclose(fp); + break; + } + } + if (numSyscallNames_) { + // We cannot make system calls at the time, when we are looking up + // the names. So, copy them into a data structure that can be + // accessed without having to allocate memory (i.e. no more STL). + syscallNames_ = reinterpret_cast<const char **>( + calloc(sizeof(char *), numSyscallNames_)); + if (!syscallNames_) { + // Allocation failed. Report all system calls as unnamed. + numSyscallNames_ = 0; + } + for (std::map<int, std::string>::const_iterator iter = + syscallNamesMap_.begin(); + syscallNames_ && iter != syscallNamesMap_.end(); + ++iter) { + syscallNames_[iter->first] = iter->second.c_str(); + } + } + } +} + +bool Debug::enter() { + // Increment the recursion level in TLS storage. This allows us to + // make system calls from within our debugging functions, without triggering + // additional debugging output. + // + // This function can be called from both the sandboxed process and from the + // trusted process. Only the sandboxed process needs to worry about + // recursively calling system calls. The trusted process doesn't intercept + // system calls and thus doesn't have this problem. It also doesn't have + // a TLS.
We explicitly set the segment selector to zero, when in the + // trusted process, so that we can avoid tracking recursion levels. + int level; + #if defined(__x86_64__) + asm volatile("mov %%gs, %0\n" + "test %0, %0\n" + "jz 1f\n" + "movl %%gs:0x1050-0xE0, %0\n" + "incl %%gs:0x1050-0xE0\n" + "1:\n" + : "=r"(level) + : + : "memory"); + #elif defined(__i386__) + asm volatile("mov %%fs, %0\n" + "test %0, %0\n" + "jz 1f\n" + "movl %%fs:0x1034-0x58, %0\n" + "incl %%fs:0x1034-0x58\n" + "1:\n" + : "=r"(level) + : + : "memory"); + #else + #error "Unsupported target platform" + #endif + return !level; +} + +bool Debug::leave() { + // Decrement the recursion level in TLS storage. This allows us to + // make system calls from within our debugging functions, without triggering + // additional debugging output. + // + // This function can be called from both the sandboxed process and from the + // trusted process. Only the sandboxed process needs to worry about + // recursively calling system calls. The trusted process doesn't intercept + // system calls and thus doesn't have this problem. It also doesn't have + // a TLS. We explicitly set the segment selector to zero, when in the + // trusted process, so that we can avoid tracking recursion levels. + int level; + #if defined(__x86_64__) + asm volatile("mov %%gs, %0\n" + "test %0, %0\n" + "jz 1f\n" + "decl %%gs:0x1050-0xE0\n" + "movl %%gs:0x1050-0xE0, %0\n" + "1:\n" + : "=r"(level) + : + : "memory"); + #elif defined(__i386__) + asm volatile("mov %%fs, %0\n" + "test %0, %0\n" + "jz 1f\n" + "decl %%fs:0x1034-0x58\n" + "movl %%fs:0x1034-0x58, %0\n" + "1:\n" + : "=r"(level) + : + : "memory"); + #else + #error Unsupported target platform + #endif + return !level; +} + +void Debug::_message(const char* msg) { + if (enabled_) { + Sandbox::SysCalls sys; + size_t len = strlen(msg); + if (len && msg[len-1] != '\n') { + // Write operations should be atomic, so that we don't interleave + // messages from multiple threads. 
Append a newline, if it is not + // already there. + char copy[len + 1]; + memcpy(copy, msg, len); + copy[len] = '\n'; + Sandbox::write(sys, 2, copy, len + 1); + } else { + Sandbox::write(sys, 2, msg, len); + } + } +} + +void Debug::message(const char* msg) { + if (enabled_) { + if (enter()) { + _message(msg); + } + leave(); + } +} + +void Debug::gettimeofday(long long* tm) { + if (tm) { + struct timeval tv; + #if defined(__i386__) + // Zero out the lastSyscallNum, so that we don't try to coalesce + // calls to gettimeofday(). For debugging purposes, we need the + // exact time. + asm volatile("movl $0, %fs:0x102C-0x58"); + #elif !defined(__x86_64__) + #error Unsupported target platform + #endif + ::gettimeofday(&tv, NULL); + *tm = 1000ULL*1000ULL*static_cast<unsigned>(tv.tv_sec) + + static_cast<unsigned>(tv.tv_usec); + } +} + +void Debug::syscall(long long* tm, int sysnum, const char* msg, int call) { + // This function gets called from the system call wrapper. Avoid calling + // any library functions that themselves need system calls. 
+ if (enabled_) { + if (enter() || !tm) { + gettimeofday(tm); + + const char *sysname = NULL; + if (sysnum >= 0 && sysnum < numSyscallNames_) { + sysname = syscallNames_[sysnum]; + } + static const char kUnnamedMessage[] = "Unnamed syscall #"; + char unnamed[40]; + if (!sysname) { + memcpy(unnamed, kUnnamedMessage, sizeof(kUnnamedMessage) - 1); + itoa(unnamed + sizeof(kUnnamedMessage) - 1, sysnum); + sysname = unnamed; + } + #if defined(__NR_socketcall) || defined(__NR_ipc) + char extra[40]; + *extra = '\000'; + #if defined(__NR_socketcall) + if (sysnum == __NR_socketcall) { + static const char* socketcall_name[] = { + 0, "socket", "bind", "connect", "listen", "accept", "getsockname", + "getpeername", "socketpair", "send", "recv", "sendto","recvfrom", + "shutdown", "setsockopt", "getsockopt", "sendmsg", "recvmsg", + "accept4" + }; + if (call >= 1 && + call < (int)(sizeof(socketcall_name)/sizeof(char *))) { + strcat(strcpy(extra, " "), socketcall_name[call]); + } else { + itoa(strcpy(extra, " #") + 2, call); + } + } + #endif + #if defined(__NR_ipc) + if (sysnum == __NR_ipc) { + static const char* ipc_name[] = { + 0, "semop", "semget", "semctl", "semtimedop", 0, 0, 0, 0, 0, 0, + "msgsnd", "msgrcv", "msgget", "msgctl", 0, 0, 0, 0, 0, 0, + "shmat", "shmdt", "shmget", "shmctl" }; + if (call >= 1 && call < (int)(sizeof(ipc_name)/sizeof(char *)) && + ipc_name[call]) { + strcat(strcpy(extra, " "), ipc_name[call]); + } else { + itoa(strcpy(extra, " #") + 2, call); + } + } + #endif + #else + static const char extra[1] = { 0 }; + #endif + char buf[strlen(sysname) + strlen(extra) + (msg ? strlen(msg) : 0) + 4]; + strcat(strcat(strcat(strcat(strcpy(buf, sysname), extra), ": "), + msg ? 
msg : ""), "\n"); + _message(buf); + } + leave(); + } +} + +char* Debug::itoa(char* s, int n) { + // Remember return value + char *ret = s; + + // Insert sign for negative numbers. Convert the unsigned magnitude, so + // that INT_MIN does not overflow when it is negated. + unsigned int value = n; + if (n < 0) { + *s++ = '-'; + value = 0u - value; + } + + // Convert to decimal (in reverse order) + char *start = s; + do { + *s++ = '0' + (value % 10); + value /= 10; + } while (value); + *s-- = '\000'; + + // Reverse order of digits + while (start < s) { + char ch = *s; + *s-- = *start; + *start++ = ch; + } + + return ret; +} + +void Debug::elapsed(long long tm, int sysnum, int call) { + if (enabled_) { + if (enter()) { + // Compute the time that has passed since the system call started. + long long delta; + gettimeofday(&delta); + delta -= tm; + + // Format "Elapsed time: %d.%03dms" without using sprintf(). + char buf[80]; + itoa(strrchr(strcpy(buf, "Elapsed time: "), '\000'), delta/1000); + delta %= 1000; + strcat(buf, delta < 100 ? delta < 10 ? ".00" : ".0" : "."); + itoa(strrchr(buf, '\000'), delta); + strcat(buf, "ms"); + + // Print system call name and elapsed time. + syscall(NULL, sysnum, buf, call); + } + leave(); + } +} + +} // namespace + +#endif // NDEBUG diff --git a/sandbox/linux/seccomp/debug.h b/sandbox/linux/seccomp/debug.h new file mode 100644 index 0000000..eb5a194 --- /dev/null +++ b/sandbox/linux/seccomp/debug.h @@ -0,0 +1,80 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef DEBUG_H__ +#define DEBUG_H__ + +#include <map> +#include <stdio.h> +#include <stdlib.h> +#include <string> +#include <string.h> + +#include "sandbox_impl.h" + +namespace playground { + +class Debug { + public: + // If debugging is enabled, write a message to stderr.
+ static void message(const char* msg) + #ifndef NDEBUG + asm("playground$debugMessage") + #if defined(__x86_64__) + __attribute__((visibility("internal"))) + #endif + ; + #else + { } + #endif + + // If debugging is enabled, write the name of the syscall and an optional + // message to stderr. + static void syscall(long long* tm, int sysnum, + const char* msg, int call = -1) + #ifndef NDEBUG + ; + #else + { } + #endif + + // Print how much wall-time has elapsed since the last call to syscall() + static void elapsed(long long tm, int sysnum, int call = -1) + #ifndef NDEBUG + ; + #else + { + } + #endif + + // Check whether debugging is enabled. + static bool isEnabled() { + #ifndef NDEBUG + return enabled_; + #else + return false; + #endif + } + + private: + #ifndef NDEBUG + Debug(); + static bool enter(); + static bool leave(); + static void _message(const char* msg); + static void gettimeofday(long long* tm); + static char* itoa(char* s, int n); + + static Debug debug_; + + static bool enabled_; + static int numSyscallNames_; + static const char **syscallNames_; + static std::map<int, std::string> syscallNamesMap_; + #endif +}; + +} // namespace + +#endif // DEBUG_H__ diff --git a/sandbox/linux/seccomp/exit.cc b/sandbox/linux/seccomp/exit.cc new file mode 100644 index 0000000..f4db643 --- /dev/null +++ b/sandbox/linux/seccomp/exit.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +long Sandbox::sandbox_exit(int status) { + long long tm; + Debug::syscall(&tm, __NR_exit, "Executing handler"); + struct { + int sysnum; + long long cookie; + } __attribute__((packed)) request; + request.sysnum = __NR_exit; + request.cookie = cookie(); + + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request)) { + die("Failed to forward exit() request [sandbox]"); + } + for (;;) { + sys._exit(status); + } +} + +bool Sandbox::process_exit(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + SecureMem::lockSystemCall(parentMapsFd, mem); + SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, + __NR_exit, 0); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/getpid.cc b/sandbox/linux/seccomp/getpid.cc new file mode 100644 index 0000000..be5449b --- /dev/null +++ b/sandbox/linux/seccomp/getpid.cc @@ -0,0 +1,17 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +long Sandbox::sandbox_getpid() { + long long tm; + Debug::syscall(&tm, __NR_getpid, "Executing handler"); + Debug::elapsed(tm, __NR_getpid); + return pid_; +} + +} // namespace diff --git a/sandbox/linux/seccomp/gettid.cc b/sandbox/linux/seccomp/gettid.cc new file mode 100644 index 0000000..699774a --- /dev/null +++ b/sandbox/linux/seccomp/gettid.cc @@ -0,0 +1,18 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +long Sandbox::sandbox_gettid() { + long long tm; + Debug::syscall(&tm, __NR_gettid, "Executing handler"); + pid_t t = tid(); + Debug::elapsed(tm, __NR_gettid); + return t; +} + +} // namespace diff --git a/sandbox/linux/seccomp/ioctl.cc b/sandbox/linux/seccomp/ioctl.cc new file mode 100644 index 0000000..4d2b3c5c5 --- /dev/null +++ b/sandbox/linux/seccomp/ioctl.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +long Sandbox::sandbox_ioctl(int d, int req, void *arg) { + long long tm; + Debug::syscall(&tm, __NR_ioctl, "Executing handler"); + struct { + int sysnum; + long long cookie; + IOCtl ioctl_req; + } __attribute__((packed)) request; + request.sysnum = __NR_ioctl; + request.cookie = cookie(); + request.ioctl_req.d = d; + request.ioctl_req.req = req; + request.ioctl_req.arg = arg; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward ioctl() request [sandbox]"); + } + Debug::elapsed(tm, __NR_ioctl); + return rc; +} + +bool Sandbox::process_ioctl(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + IOCtl ioctl_req; + SysCalls sys; + if (read(sys, sandboxFd, &ioctl_req, sizeof(ioctl_req)) !=sizeof(ioctl_req)){ + die("Failed to read parameters for ioctl() [process]"); + } + int rc = -EINVAL; + switch (ioctl_req.req) { + case TCGETS: + case TIOCGWINSZ: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_ioctl, + ioctl_req.d, ioctl_req.req, ioctl_req.arg); + return true; + default: + if (Debug::isEnabled()) { + char buf[80]; + sprintf(buf, "Unsupported ioctl: 
0x%04X\n", ioctl_req.req); + Debug::message(buf); + } + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/ipc.cc b/sandbox/linux/seccomp/ipc.cc new file mode 100644 index 0000000..67a4e34 --- /dev/null +++ b/sandbox/linux/seccomp/ipc.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +#ifndef IPC_PRIVATE +#define IPC_PRIVATE 0 +#endif +#ifndef IPC_RMID +#define IPC_RMID 0 +#endif +#ifndef IPC_64 +#define IPC_64 256 +#endif + +#if defined(__NR_shmget) +void* Sandbox::sandbox_shmat(int shmid, const void* shmaddr, int shmflg) { + long long tm; + Debug::syscall(&tm, __NR_shmat, "Executing handler"); + + struct { + int sysnum; + long long cookie; + ShmAt shmat_req; + } __attribute__((packed)) request; + request.sysnum = __NR_shmat; + request.cookie = cookie(); + request.shmat_req.shmid = shmid; + request.shmat_req.shmaddr = shmaddr; + request.shmat_req.shmflg = shmflg; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward shmat() request [sandbox]"); + } + Debug::elapsed(tm, __NR_shmat); + return reinterpret_cast<void *>(rc); +} + +long Sandbox::sandbox_shmctl(int shmid, int cmd, void* buf) { + long long tm; + Debug::syscall(&tm, __NR_shmctl, "Executing handler"); + + struct { + int sysnum; + long long cookie; + ShmCtl shmctl_req; + } __attribute__((packed)) request; + request.sysnum = __NR_shmctl; + request.cookie = cookie(); + request.shmctl_req.shmid = shmid; + request.shmctl_req.cmd = cmd; + request.shmctl_req.buf = buf; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + 
read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+    die("Failed to forward shmctl() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_shmctl);
+  return rc;
+}
+
+// Handler for shmdt(): forwards the address over processFdPub() and returns
+// the result read from threadFdPub(). Dies if either transfer is short.
+long Sandbox::sandbox_shmdt(const void* shmaddr) {
+  long long tm;
+  Debug::syscall(&tm, __NR_shmdt, "Executing handler");
+
+  struct {
+    int sysnum;
+    long long cookie;
+    ShmDt shmdt_req;
+  } __attribute__((packed)) request;
+  request.sysnum = __NR_shmdt;
+  request.cookie = cookie();
+  request.shmdt_req.shmaddr = shmaddr;
+
+  long rc;
+  SysCalls sys;
+  if (write(sys, processFdPub(), &request, sizeof(request)) !=
+      sizeof(request) ||
+      read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+    die("Failed to forward shmdt() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_shmdt);
+  return rc;
+}
+
+// Handler for shmget(): forwards the arguments over processFdPub() and
+// returns the result read from threadFdPub(). Dies if either transfer is
+// short.
+long Sandbox::sandbox_shmget(int key, size_t size, int shmflg) {
+  long long tm;
+  Debug::syscall(&tm, __NR_shmget, "Executing handler");
+
+  struct {
+    int sysnum;
+    long long cookie;
+    ShmGet shmget_req;
+  } __attribute__((packed)) request;
+  request.sysnum = __NR_shmget;
+  request.cookie = cookie();
+  request.shmget_req.key = key;
+  request.shmget_req.size = size;
+  request.shmget_req.shmflg = shmflg;
+
+  long rc;
+  SysCalls sys;
+  if (write(sys, processFdPub(), &request, sizeof(request)) !=
+      sizeof(request) ||
+      read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+    die("Failed to forward shmget() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_shmget);
+  return rc;
+}
+
+// Policy check for a forwarded shmat() request read from sandboxFd. The
+// request is passed on (returns true) only if it has no address, no flags and
+// names the identifier stored in mem->shmId; otherwise it is abandoned with
+// -EINVAL (returns false). mem->shmId is reset to -1 on both paths.
+bool Sandbox::process_shmat(int parentMapsFd, int sandboxFd, int threadFdPub,
+                            int threadFd, SecureMem::Args* mem) {
+  // Read request
+  ShmAt shmat_req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &shmat_req, sizeof(shmat_req)) !=
+      sizeof(shmat_req)) {
+    die("Failed to read parameters for shmat() [process]");
+  }
+
+  // We only allow attaching to the shm identifier that was returned by
+  // the most recent call to shmget(IPC_PRIVATE)
+  if (shmat_req.shmaddr || shmat_req.shmflg || shmat_req.shmid != mem->shmId) {
+    mem->shmId = -1;
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  mem->shmId = -1;
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                            __NR_shmat, shmat_req.shmid, shmat_req.shmaddr,
+                            shmat_req.shmflg);
+  return true;
+}
+
+// Policy check for a forwarded shmctl() request read from sandboxFd. The
+// request is passed on (returns true) only if cmd has no bits set outside
+// IPC_64 | IPC_RMID and buf is null; otherwise it is abandoned with -EINVAL
+// (returns false). mem->shmId is reset to -1 on both paths.
+bool Sandbox::process_shmctl(int parentMapsFd, int sandboxFd, int threadFdPub,
+                             int threadFd, SecureMem::Args* mem) {
+  // Read request
+  ShmCtl shmctl_req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &shmctl_req, sizeof(shmctl_req)) !=
+      sizeof(shmctl_req)) {
+    die("Failed to read parameters for shmctl() [process]");
+  }
+
+  // The only shmctl() operation that we need to support is removal. This
+  // operation is generally safe.
+  if ((shmctl_req.cmd & ~(IPC_64 | IPC_RMID)) || shmctl_req.buf) {
+    mem->shmId = -1;
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  mem->shmId = -1;
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                            __NR_shmctl, shmctl_req.shmid, shmctl_req.cmd,
+                            shmctl_req.buf);
+  return true;
+}
+
+// Policy check for a forwarded shmdt() request read from sandboxFd. The
+// request is abandoned with -EINVAL (returns false) if the address lies
+// inside an entry of protectedMap_; otherwise it is passed on (returns true).
+// mem->shmId is reset to -1 on both paths.
+bool Sandbox::process_shmdt(int parentMapsFd, int sandboxFd, int threadFdPub,
+                            int threadFd, SecureMem::Args* mem) {
+  // Read request
+  ShmDt shmdt_req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &shmdt_req, sizeof(shmdt_req)) !=
+      sizeof(shmdt_req)) {
+    die("Failed to read parameters for shmdt() [process]");
+  }
+
+  // Detaching shared memory segments is generally safe, but just in case
+  // of a kernel bug, we make sure that the address does not fall into any
+  // of the reserved memory regions.
+  ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+      (void *)shmdt_req.shmaddr);
+  // Step back one entry so that a region starting below the address is
+  // examined as well.
+  if (iter != protectedMap_.begin()) {
+    --iter;
+  }
+  for (; iter != protectedMap_.end() && iter->first <= shmdt_req.shmaddr;
+       ++iter){
+    // Reject if the address lies in [iter->first, iter->first + iter->second).
+    if (shmdt_req.shmaddr < reinterpret_cast<void *>(
+            reinterpret_cast<char *>(iter->first) + iter->second) &&
+        shmdt_req.shmaddr >= iter->first) {
+      mem->shmId = -1;
+      SecureMem::abandonSystemCall(threadFd, -EINVAL);
+      return false;
+    }
+  }
+
+  mem->shmId = -1;
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                            __NR_shmdt, shmdt_req.shmaddr);
+  return true;
+}
+
+bool Sandbox::process_shmget(int parentMapsFd, int sandboxFd, int threadFdPub,
+                             int threadFd, SecureMem::Args* mem) {
+  // Read request
+  ShmGet shmget_req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &shmget_req, sizeof(shmget_req)) !=
+      sizeof(shmget_req)) {
+    die("Failed to read parameters for shmget() [process]");
+  }
+
+  // We do not want to allow the sandboxed application to access arbitrary
+  // shared memory regions. We only allow it to access regions that it
+  // created itself.
+  // Only IPC_PRIVATE keys and flag bits within 0777 are accepted.
+  if (shmget_req.key != IPC_PRIVATE || shmget_req.shmflg & ~0777) {
+    mem->shmId = -1;
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  mem->shmId = -1;
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                            __NR_shmget, shmget_req.key, shmget_req.size,
+                            shmget_req.shmflg);
+  return true;
+}
+#endif
+
+#if defined(__NR_ipc)
+// Fallback definitions of the ipc() sub-call numbers for when the headers in
+// scope do not provide them.
+#ifndef SHMAT
+#define SHMAT 21
+#endif
+#ifndef SHMDT
+#define SHMDT 22
+#endif
+#ifndef SHMGET
+#define SHMGET 23
+#endif
+#ifndef SHMCTL
+#define SHMCTL 24
+#endif
+
+// Handler for the multiplexed ipc() system call: forwards all six arguments
+// over processFdPub() and returns the result read from threadFdPub(). Dies
+// if either transfer is short.
+long Sandbox::sandbox_ipc(unsigned call, int first, int second, int third,
+                          void* ptr, long fifth) {
+  long long tm;
+  Debug::syscall(&tm, __NR_ipc, "Executing handler", call);
+  struct {
+    int sysnum;
+    long long cookie;
+    IPC ipc_req;
+  } __attribute__((packed)) request;
+  request.sysnum = __NR_ipc;
+  request.cookie = cookie();
+  request.ipc_req.call = call;
+  request.ipc_req.first = first;
+  request.ipc_req.second = second;
+  request.ipc_req.third = third;
+  request.ipc_req.ptr = ptr;
+  request.ipc_req.fifth = fifth;
+
+  long rc;
+  SysCalls sys;
+  if (write(sys, processFdPub(), &request, sizeof(request)) !=
+      sizeof(request) ||
+      read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+    die("Failed to forward ipc() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_ipc, call);
+  return rc;
+}
+
+// Policy check for a forwarded ipc() request read from sandboxFd. Each
+// supported sub-call is validated and then jumps to one of two shared exits:
+// "accept" passes the request on (returns true), "deny" abandons it with
+// -EINVAL (returns false). Both exits reset mem->shmId to -1. Unsupported
+// sub-calls fall into "deny".
+bool Sandbox::process_ipc(int parentMapsFd, int sandboxFd, int threadFdPub,
+                          int threadFd, SecureMem::Args* mem) {
+  // Read request
+  IPC ipc_req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &ipc_req, sizeof(ipc_req)) != sizeof(ipc_req)) {
+    die("Failed to read parameters for ipc() [process]");
+  }
+
+  // We do not support all of the SysV IPC calls. In fact, we only support
+  // the minimum feature set necessary for Chrome's renderers to share memory
+  // with the X server.
+  switch (ipc_req.call) {
+    case SHMAT: {
+      // We only allow attaching to the shm identifier that was returned by
+      // the most recent call to shmget(IPC_PRIVATE)
+      if (ipc_req.ptr || ipc_req.second || ipc_req.first != mem->shmId) {
+        goto deny;
+      }
+    // Shared success exit; the other cases jump here.
+    accept:
+      mem->shmId = -1;
+      SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                                __NR_ipc, ipc_req.call, ipc_req.first,
+                                ipc_req.second, ipc_req.third, ipc_req.ptr,
+                                ipc_req.fifth);
+      return true;
+    }
+    case SHMCTL:
+      // The only shmctl() operation that we need to support is removal. This
+      // operation is generally safe.
+      if ((ipc_req.second & ~(IPC_64 | IPC_RMID)) || ipc_req.ptr) {
+        goto deny;
+      } else {
+        goto accept;
+      }
+    case SHMDT: {
+      // Detaching shared memory segments is generally safe, but just in case
+      // of a kernel bug, we make sure that the address does not fall into any
+      // of the reserved memory regions.
+      ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+          (void *)ipc_req.ptr);
+      if (iter != protectedMap_.begin()) {
+        --iter;
+      }
+      for (; iter != protectedMap_.end() && iter->first <=ipc_req.ptr; ++iter){
+        if (ipc_req.ptr < reinterpret_cast<void *>(
+                reinterpret_cast<char *>(iter->first) + iter->second) &&
+            ipc_req.ptr >= iter->first) {
+          goto deny;
+        }
+      }
+      goto accept;
+    }
+    case SHMGET:
+      // We do not want to allow the sandboxed application to access arbitrary
+      // shared memory regions. We only allow it to access regions that it
+      // created itself.
+      if (ipc_req.first != IPC_PRIVATE || ipc_req.third & ~0777) {
+        goto deny;
+      } else {
+        goto accept;
+      }
+    default:
+      // Other than SysV shared memory, we do not actually need to support any
+      // other SysV IPC calls.
+    deny:
+      mem->shmId = -1;
+      SecureMem::abandonSystemCall(threadFd, -EINVAL);
+      return false;
+  }
+}
+#endif
+
+} // namespace
diff --git a/sandbox/linux/seccomp/library.cc b/sandbox/linux/seccomp/library.cc new file mode 100644 index 0000000..8dd9b93 --- /dev/null +++ b/sandbox/linux/seccomp/library.cc @@ -0,0 +1,1208 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#define XOPEN_SOURCE 500
+#include <algorithm>
+#include <elf.h>
+#include <errno.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+#include <set>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "allocator.h"
+#include "debug.h"
+#include "library.h"
+#include "sandbox_impl.h"
+#include "syscall.h"
+#include "syscall_table.h"
+#include "x86_decode.h"
+
+// Word-size neutral aliases for the ELF types and macros of the target.
+#if defined(__x86_64__)
+typedef Elf64_Phdr Elf_Phdr;
+typedef Elf64_Rela Elf_Rel;
+
+typedef Elf64_Half Elf_Half;
+typedef Elf64_Word Elf_Word;
+typedef Elf64_Sword Elf_Sword;
+typedef Elf64_Xword Elf_Xword;
+typedef Elf64_Sxword Elf_Sxword;
+typedef Elf64_Off Elf_Off;
+typedef Elf64_Section Elf_Section;
+typedef Elf64_Versym Elf_Versym;
+
+#define ELF_ST_BIND ELF64_ST_BIND
+#define ELF_ST_TYPE ELF64_ST_TYPE
+#define ELF_ST_INFO ELF64_ST_INFO
+#define ELF_R_SYM ELF64_R_SYM
+#define ELF_R_TYPE ELF64_R_TYPE
+#define ELF_R_INFO ELF64_R_INFO
+
+#define ELF_REL_PLT ".rela.plt"
+#define ELF_JUMP_SLOT R_X86_64_JUMP_SLOT
+#elif defined(__i386__)
+typedef Elf32_Phdr Elf_Phdr;
+typedef Elf32_Rel Elf_Rel;
+
+typedef Elf32_Half Elf_Half;
+typedef Elf32_Word Elf_Word;
+typedef Elf32_Sword Elf_Sword;
+typedef Elf32_Xword Elf_Xword;
+typedef Elf32_Sxword Elf_Sxword;
+typedef Elf32_Off Elf_Off;
+typedef Elf32_Section Elf_Section;
+typedef Elf32_Versym Elf_Versym;
+
+#define ELF_ST_BIND ELF32_ST_BIND
+#define ELF_ST_TYPE ELF32_ST_TYPE
+#define ELF_ST_INFO ELF32_ST_INFO
+#define ELF_R_SYM ELF32_R_SYM
+#define ELF_R_TYPE ELF32_R_TYPE
+#define ELF_R_INFO ELF32_R_INFO
+
+#define ELF_REL_PLT ".rel.plt"
+#define ELF_JUMP_SLOT R_386_JMP_SLOT
+#else
+#error Unsupported target platform
+#endif
+
+namespace playground {
+
+char* Library::__kernel_vsyscall;
+char* Library::__kernel_sigreturn;
+char* Library::__kernel_rt_sigreturn;
+
+// If an extended image mapping was set up (image_size_ is non-zero), shrinks
+// it back to a single page and moves it to the start address of the last
+// entry in memory_ranges_, after carrying over the current contents of that
+// page if they differ from the image.
+Library::~Library() {
+  if (image_size_) {
+    // We no longer need access to a full mapping of the underlying library
+    // file. Move the temporarily extended mapping back to where we originally
+    // found. Make sure to preserve any changes that we might have made since.
+    Sandbox::SysCalls sys;
+    sys.mprotect(image_, 4096, PROT_READ | PROT_WRITE | PROT_EXEC);
+    if (memcmp(image_, memory_ranges_.rbegin()->second.start, 4096)) {
+      // Only copy data, if we made any changes in this data. Otherwise there
+      // is no need to create another modified COW mapping.
+      memcpy(image_, memory_ranges_.rbegin()->second.start, 4096);
+    }
+    sys.mprotect(image_, 4096, PROT_READ | PROT_EXEC);
+    sys.mremap(image_, image_size_, 4096, MREMAP_MAYMOVE | MREMAP_FIXED,
+               memory_ranges_.rbegin()->second.start);
+  }
+}
+
+char* Library::getBytes(char* dst, const char* src, ssize_t len) {
+  // Some kernels don't allow accessing the VDSO from write()
+  if (isVDSO_ &&
+      src >= memory_ranges_.begin()->second.start &&
+      src <= memory_ranges_.begin()->second.stop) {
+    // Clamp the copy so that it does not run past the end of the range.
+    ssize_t max =
+        reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) - src;
+    if (len > max) {
+      len = max;
+    }
+    memcpy(dst, src, len);
+    return dst;
+  }
+
+  // Read up to "len" bytes from "src" and copy them to "dst". Short
+  // copies are possible, if we are at the end of a mapping. Returns
+  // NULL, if the operation failed completely.
+ static int helper_socket[2]; + Sandbox::SysCalls sys; + if (!helper_socket[0] && !helper_socket[1]) { + // Copy data through a socketpair, as this allows us to access it + // without incurring a segmentation fault. + sys.socketpair(AF_UNIX, SOCK_STREAM, 0, helper_socket); + } + char* ptr = dst; + int inc = 4096; + while (len > 0) { + ssize_t l = inc == 1 ? inc : 4096 - (reinterpret_cast<long>(src) & 0xFFF); + if (l > len) { + l = len; + } + l = NOINTR_SYS(sys.write(helper_socket[0], src, l)); + if (l == -1) { + if (sys.my_errno == EFAULT) { + if (inc == 1) { + if (ptr == dst) { + return NULL; + } + break; + } + inc = 1; + continue; + } else { + return NULL; + } + } + l = sys.read(helper_socket[1], ptr, l); + if (l <= 0) { + return NULL; + } + ptr += l; + src += l; + len -= l; + } + return dst; +} + +char *Library::get(Elf_Addr offset, char *buf, size_t len) { + if (!valid_) { + memset(buf, 0, len); + return NULL; + } + RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset); + if (iter == memory_ranges_.end()) { + memset(buf, 0, len); + return NULL; + } + offset -= iter->first; + long size = reinterpret_cast<char *>(iter->second.stop) - + reinterpret_cast<char *>(iter->second.start); + if (offset > size - len) { + memset(buf, 0, len); + return NULL; + } + char *src = reinterpret_cast<char *>(iter->second.start) + offset; + memset(buf, 0, len); + if (!getBytes(buf, src, len)) { + return NULL; + } + return buf; +} + +Library::string Library::get(Elf_Addr offset) { + if (!valid_) { + return ""; + } + RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset); + if (iter == memory_ranges_.end()) { + return ""; + } + offset -= iter->first; + const char *start = reinterpret_cast<char *>(iter->second.start) + offset; + const char *stop = reinterpret_cast<char *>(iter->second.stop) + offset; + char buf[4096] = { 0 }; + getBytes(buf, start, stop - start >= (int)sizeof(buf) ? 
+ sizeof(buf) - 1 : stop - start); + start = buf; + stop = buf; + while (*stop) { + ++stop; + } + string s = stop > start ? string(start, stop - start) : ""; + return s; +} + +char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) { + if (!valid_) { + memset(buf, 0, len); + return NULL; + } + Sandbox::SysCalls sys; + if (!image_ && !isVDSO_ && !memory_ranges_.empty() && + memory_ranges_.rbegin()->first == 0) { + // Extend the mapping of the very first page of the underlying library + // file. This way, we can read the original file contents of the entire + // library. + // We have to be careful, because doing so temporarily removes the first + // 4096 bytes of the library from memory. And we don't want to accidentally + // unmap code that we are executing. So, only use functions that can be + // inlined. + void* start = memory_ranges_.rbegin()->second.start; + image_size_ = memory_ranges_.begin()->first + + (reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) - + reinterpret_cast<char *>(memory_ranges_.begin()->second.start)); + if (image_size_ < 8192) { + // It is possible to create a library that is only a single page in + // size. In that case, we have to make sure that we artificially map + // one extra page past the end of it, as our code relies on mremap() + // actually moving the mapping. + image_size_ = 8192; + } + image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_, + MREMAP_MAYMOVE)); + if (image_size_ == 8192 && image_ == start) { + // We really mean it, when we say we want the memory to be moved. 
+ image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_, + MREMAP_MAYMOVE)); + sys.munmap(reinterpret_cast<char *>(start) + 4096, 4096); + } + if (image_ == MAP_FAILED) { + image_ = NULL; + } else { + sys.MMAP(start, 4096, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + for (int i = 4096 / sizeof(long); --i; + reinterpret_cast<long *>(start)[i] = + reinterpret_cast<long *>(image_)[i]); + } + } + + if (image_) { + if (offset + len > image_size_) { + // It is quite likely that we initially did not map the entire file as + // we did not know how large it is. So, if necessary, try to extend the + // mapping. + size_t new_size = (offset + len + 4095) & ~4095; + char* tmp = + reinterpret_cast<char *>(sys.mremap(image_, image_size_, new_size, + MREMAP_MAYMOVE)); + if (tmp != MAP_FAILED) { + image_ = tmp; + image_size_ = new_size; + } + } + if (buf && offset + len <= image_size_) { + return reinterpret_cast<char *>(memcpy(buf, image_ + offset, len)); + } + return NULL; + } + return buf ? get(offset, buf, len) : NULL; +} + +Library::string Library::getOriginal(Elf_Addr offset) { + if (!valid_) { + return ""; + } + // Make sure we actually have a mapping that we can access. If the string + // is located at the end of the image, we might not yet have extended the + // mapping sufficiently. 
+ if (!image_ || image_size_ <= offset) { + getOriginal(offset, NULL, 1); + } + + if (image_) { + if (offset < image_size_) { + char* start = image_ + offset; + char* stop = start; + while (stop < image_ + image_size_ && *stop) { + ++stop; + if (stop >= image_ + image_size_) { + getOriginal(stop - image_, NULL, 1); + } + } + return string(start, stop - start); + } + return ""; + } + return get(offset); +} + +const Elf_Ehdr* Library::getEhdr() { + if (!valid_) { + return NULL; + } + return &ehdr_; +} + +const Elf_Shdr* Library::getSection(const string& section) { + if (!valid_) { + return NULL; + } + SectionTable::const_iterator iter = section_table_.find(section); + if (iter == section_table_.end()) { + return NULL; + } + return &iter->second.second; +} + +int Library::getSectionIndex(const string& section) { + if (!valid_) { + return -1; + } + SectionTable::const_iterator iter = section_table_.find(section); + if (iter == section_table_.end()) { + return -1; + } + return iter->second.first; +} + +void Library::makeWritable(bool state) const { + for (RangeMap::const_iterator iter = memory_ranges_.begin(); + iter != memory_ranges_.end(); ++iter) { + const Range& range = iter->second; + long length = reinterpret_cast<char *>(range.stop) - + reinterpret_cast<char *>(range.start); + Sandbox::SysCalls sys; + sys.mprotect(range.start, length, + range.prot | (state ? PROT_WRITE : 0)); + } +} + +bool Library::isSafeInsn(unsigned short insn) { + // Check if the instruction has no unexpected side-effects. If so, it can + // be safely relocated from the function that we are patching into the + // out-of-line scratch space that we are setting up. This is often necessary + // to make room for the JMP into the scratch space. 
+ return ((insn & 0x7) < 0x6 && (insn & 0xF0) < 0x40 + /* ADD, OR, ADC, SBB, AND, SUB, XOR, CMP */) || + #if defined(__x86_64__) + insn == 0x63 /* MOVSXD */ || + #endif + (insn >= 0x80 && insn <= 0x8E /* ADD, OR, ADC, + SBB, AND, SUB, XOR, CMP, TEST, XCHG, MOV, LEA */) || + (insn == 0x90) || /* NOP */ + (insn >= 0xA0 && insn <= 0xA9) /* MOV, TEST */ || + (insn >= 0xB0 && insn <= 0xBF /* MOV */) || + (insn >= 0xC0 && insn <= 0xC1) || /* Bit Shift */ + (insn >= 0xD0 && insn <= 0xD3) || /* Bit Shift */ + (insn >= 0xC6 && insn <= 0xC7 /* MOV */) || + (insn == 0xF7) /* TEST, NOT, NEG, MUL, IMUL, DIV, IDIV */; +} + +char* Library::getScratchSpace(const Maps* maps, char* near, int needed, + char** extraSpace, int* extraLength) { + if (needed > *extraLength || + labs(*extraSpace - reinterpret_cast<char *>(near)) > (1536 << 20)) { + if (*extraSpace) { + // Start a new scratch page and mark any previous page as write-protected + Sandbox::SysCalls sys; + sys.mprotect(*extraSpace, 4096, PROT_READ|PROT_EXEC); + } + // Our new scratch space is initially executable and writable. 
+ *extraLength = 4096; + *extraSpace = maps->allocNearAddr(near, *extraLength, + PROT_READ|PROT_WRITE|PROT_EXEC); + } + if (*extraSpace) { + *extraLength -= needed; + return *extraSpace + *extraLength; + } + Sandbox::die("Insufficient space to intercept system call"); +} + +void Library::patchSystemCallsInFunction(const Maps* maps, char *start, + char *end, char** extraSpace, + int* extraLength) { + std::set<char *, std::less<char *>, SystemAllocator<char *> > branch_targets; + for (char *ptr = start; ptr < end; ) { + unsigned short insn = next_inst((const char **)&ptr, __WORDSIZE == 64); + char *target; + if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ || insn == 0xEB /* JMP */) { + target = ptr + (reinterpret_cast<signed char *>(ptr))[-1]; + } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ || + (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) { + target = ptr + (reinterpret_cast<int *>(ptr))[-1]; + } else { + continue; + } + branch_targets.insert(target); + } + struct Code { + char* addr; + int len; + unsigned short insn; + bool is_ip_relative; + } code[5] = { { 0 } }; + int codeIdx = 0; + char* ptr = start; + while (ptr < end) { + // Keep a ring-buffer of the last few instruction in order to find the + // correct place to patch the code. + char *mod_rm; + code[codeIdx].addr = ptr; + code[codeIdx].insn = next_inst((const char **)&ptr, __WORDSIZE == 64, + 0, 0, &mod_rm, 0, 0); + code[codeIdx].len = ptr - code[codeIdx].addr; + code[codeIdx].is_ip_relative = + #if defined(__x86_64__) + mod_rm && (*mod_rm & 0xC7) == 0x5; + #else + false; + #endif + + // Whenever we find a system call, we patch it with a jump to out-of-line + // code that redirects to our system call wrapper. + bool is_syscall = true; + #if defined(__x86_64__) + bool is_indirect_call = false; + if (code[codeIdx].insn == 0x0F05 /* SYSCALL */ || + // In addition, on x86-64, we need to redirect all CALLs between the + // VDSO and the VSyscalls page. 
We want these to jump to our own + // modified copy of the VSyscalls. As we know that the VSyscalls are + // always more than 2GB away from the VDSO, the compiler has to + // generate some form of indirect jumps. We can find all indirect + // CALLs and redirect them to a separate scratch area, where we can + // inspect the destination address. If it indeed points to the + // VSyscall area, we then adjust the destination address accordingly. + (is_indirect_call = + (isVDSO_ && vsys_offset_ && code[codeIdx].insn == 0xFF && + !code[codeIdx].is_ip_relative && + mod_rm && (*mod_rm & 0x38) == 0x10 /* CALL (indirect) */))) { + is_syscall = !is_indirect_call; + #elif defined(__i386__) + bool is_gs_call = false; + if (code[codeIdx].len == 7 && + code[codeIdx].insn == 0xFF && + code[codeIdx].addr[2] == '\x15' /* CALL (indirect) */ && + code[codeIdx].addr[0] == '\x65' /* %gs prefix */) { + char* target; + asm volatile("mov %%gs:(%1), %0\n" + : "=a"(target) + : "c"(*reinterpret_cast<int *>(code[codeIdx].addr+3))); + if (target == __kernel_vsyscall) { + is_gs_call = true; + // TODO(markus): also handle the other vsyscalls + } + } + if (is_gs_call || + (code[codeIdx].insn == 0xCD && + code[codeIdx].addr[1] == '\x80' /* INT $0x80 */)) { + #else + #error Unsupported target platform + #endif + // Found a system call. Search backwards to figure out how to redirect + // the code. We will need to overwrite a couple of instructions and, + // of course, move these instructions somewhere else. + int startIdx = codeIdx; + int endIdx = codeIdx; + int length = code[codeIdx].len; + for (int idx = codeIdx; + (idx = (idx + (sizeof(code) / sizeof(struct Code)) - 1) % + (sizeof(code) / sizeof(struct Code))) != codeIdx; ) { + std::set<char *>::const_iterator iter = + std::upper_bound(branch_targets.begin(), branch_targets.end(), + code[idx].addr); + if (iter != branch_targets.end() && *iter < ptr) { + // Found a branch pointing to somewhere past our instruction. 
This + // instruction cannot be moved safely. Leave it in place. + break; + } + if (code[idx].addr && !code[idx].is_ip_relative && + isSafeInsn(code[idx].insn)) { + // These are all benign instructions with no side-effects and no + // dependency on the program counter. We should be able to safely + // relocate them. + startIdx = idx; + length = ptr - code[startIdx].addr; + } else { + break; + } + } + // Search forward past the system call, too. Sometimes, we can only + // find relocatable instructions following the system call. + #if defined(__i386__) + findEndIdx: + #endif + char *next = ptr; + for (int i = codeIdx; + next < end && + (i = (i + 1) % (sizeof(code) / sizeof(struct Code))) != startIdx; + ) { + std::set<char *>::const_iterator iter = + std::lower_bound(branch_targets.begin(), branch_targets.end(), + next); + if (iter != branch_targets.end() && *iter == next) { + // Found branch target pointing to our instruction + break; + } + char *tmp_rm; + code[i].addr = next; + code[i].insn = next_inst((const char **)&next, __WORDSIZE == 64, + 0, 0, &tmp_rm, 0, 0); + code[i].len = next - code[i].addr; + code[i].is_ip_relative = tmp_rm && (*tmp_rm & 0xC7) == 0x5; + if (!code[i].is_ip_relative && isSafeInsn(code[i].insn)) { + endIdx = i; + length = next - code[startIdx].addr; + } else { + break; + } + } + // We now know, how many instructions neighboring the system call we + // can safely overwrite. On x86-32 we need six bytes, and on x86-64 + // We need five bytes to insert a JMPQ and a 32bit address. We then + // jump to a code fragment that safely forwards to our system call + // wrapper. + // On x86-64, this is complicated by the fact that the API allows up + // to 128 bytes of red-zones below the current stack pointer. So, we + // cannot write to the stack until we have adjusted the stack + // pointer. + // On both x86-32 and x86-64 we take care to leave the stack unchanged + // while we are executing the preamble and postamble. 
This allows us + // to treat instructions that reference %esp/%rsp as safe for + // relocation. + // In particular, this means that on x86-32 we cannot use CALL, but + // have to use a PUSH/RET combination to change the instruction pointer. + // On x86-64, we can instead use a 32bit JMPQ. + // + // .. .. .. .. ; any leading instructions copied from original code + // 48 81 EC 80 00 00 00 SUB $0x80, %rsp + // 50 PUSH %rax + // 48 8D 05 .. .. .. .. LEA ...(%rip), %rax + // 50 PUSH %rax + // 48 B8 .. .. .. .. MOV $syscallWrapper, %rax + // .. .. .. .. + // 50 PUSH %rax + // 48 8D 05 06 00 00 00 LEA 6(%rip), %rax + // 48 87 44 24 10 XCHG %rax, 16(%rsp) + // C3 RETQ + // 48 81 C4 80 00 00 00 ADD $0x80, %rsp + // .. .. .. .. ; any trailing instructions copied from original code + // E9 .. .. .. .. JMPQ ... + // + // Total: 52 bytes + any bytes that were copied + // + // On x86-32, the stack is available and we can do: + // + // TODO(markus): Try to maintain frame pointers on x86-32 + // + // .. .. .. .. ; any leading instructions copied from original code + // 68 .. .. .. .. PUSH return_addr + // 68 .. .. .. .. PUSH $syscallWrapper + // C3 RET + // .. .. .. .. ; any trailing instructions copied from original code + // 68 .. .. .. .. PUSH return_addr + // C3 RET + // + // Total: 17 bytes + any bytes that were copied + // + // For indirect jumps from the VDSO to the VSyscall page, we instead + // replace the following code (this is only necessary on x86-64). This + // time, we don't have to worry about red zones: + // + // .. .. .. .. ; any leading instructions copied from original code + // E8 00 00 00 00 CALL . + // 48 83 04 24 .. ADDQ $.., (%rsp) + // FF .. .. .. .. .. PUSH .. ; from original CALL instruction + // 48 81 3C 24 00 00 00 FF CMPQ $0xFFFFFFFFFF000000, 0(%rsp) + // 72 10 JB . + 16 + // 81 2C 24 .. .. .. .. 
SUBL ..., 0(%rsp) + // C7 44 24 04 00 00 00 00 MOVL $0, 4(%rsp) + // C3 RETQ + // 48 87 04 24 XCHG %rax,(%rsp) + // 48 89 44 24 08 MOV %rax,0x8(%rsp) + // 58 POP %rax + // C3 RETQ + // .. .. .. .. ; any trailing instructions copied from original code + // E9 .. .. .. .. JMPQ ... + // + // Total: 52 bytes + any bytes that were copied + + if (length < (__WORDSIZE == 32 ? 6 : 5)) { + // There are a very small number of instruction sequences that we + // cannot easily intercept, and that have been observed in real world + // examples. Handle them here: + #if defined(__i386__) + int diff; + if (!memcmp(code[codeIdx].addr, "\xCD\x80\xEB", 3) && + (diff = *reinterpret_cast<signed char *>( + code[codeIdx].addr + 3)) < 0 && diff >= -6) { + // We have seen... + // for (;;) { + // _exit(0); + // } + // ..get compiled to: + // B8 01 00 00 00 MOV $__NR_exit, %eax + // 66 90 XCHG %ax, %ax + // 31 DB 0:XOR %ebx, %ebx + // CD 80 INT $0x80 + // EB FA JMP 0b + // The JMP is really superfluous as the system call never returns. + // And there are in fact no returning system calls that need to be + // unconditionally repeated in an infinite loop. + // If we replace the JMP with NOPs, the system call can successfully + // be intercepted. + *reinterpret_cast<unsigned short *>(code[codeIdx].addr + 2) = 0x9090; + goto findEndIdx; + } + #elif defined(__x86_64__) + std::set<char *>::const_iterator iter; + #endif + // If we cannot figure out any other way to intercept this system call, + // we replace it with a call to INT0. This causes a SEGV which we then + // handle in the signal handler. That's a lot slower than rewriting the + // instruction with a jump, but it should only happen very rarely. 
+ if (is_syscall) { + memcpy(code[codeIdx].addr, "\xCD", 2); + if (code[codeIdx].len > 2) { + memset(code[codeIdx].addr + 2, 0x90, code[codeIdx].len - 2); + } + goto replaced; + } + #if defined(__x86_64__) + // On x86-64, we occasionally see code like this in the VDSO: + // 48 8B 05 CF FE FF FF MOV -0x131(%rip),%rax + // FF 50 20 CALLQ *0x20(%rax) + // By default, we would not replace the MOV instruction, as it is + // IP relative. But if the following instruction is also IP relative, + // we are left with only three bytes which is not enough to insert a + // jump. + // We recognize this particular situation, and as long as the CALLQ + // is not a branch target, we decide to still relocate the entire + // sequence. We just have to make sure that we then patch up the + // IP relative addressing. + else if (is_indirect_call && startIdx == codeIdx && + code[startIdx = (startIdx + (sizeof(code) / + sizeof(struct Code)) - 1) % + (sizeof(code) / sizeof(struct Code))].addr && + ptr - code[startIdx].addr >= 5 && + code[startIdx].is_ip_relative && + isSafeInsn(code[startIdx].insn) && + ((iter = std::upper_bound(branch_targets.begin(), + branch_targets.end(), + code[startIdx].addr)) == + branch_targets.end() || *iter >= ptr)) { + // We changed startIdx to include the IP relative instruction. + // When copying this preamble, we make sure to patch up the + // offset. + } + #endif + else { + Sandbox::die("Cannot intercept system call"); + } + } + int needed = (__WORDSIZE == 32 ? 
6 : 5) - code[codeIdx].len; + int first = codeIdx; + while (needed > 0 && first != startIdx) { + first = (first + (sizeof(code) / sizeof(struct Code)) - 1) % + (sizeof(code) / sizeof(struct Code)); + needed -= code[first].len; + } + int second = codeIdx; + while (needed > 0) { + second = (second + 1) % (sizeof(code) / sizeof(struct Code)); + needed -= code[second].len; + } + int preamble = code[codeIdx].addr - code[first].addr; + int postamble = code[second].addr + code[second].len - + code[codeIdx].addr - code[codeIdx].len; + + // The following is all the code that construct the various bits of + // assembly code. + #if defined(__x86_64__) + if (is_indirect_call) { + needed = 52 + preamble + code[codeIdx].len + postamble; + } else { + needed = 52 + preamble + postamble; + } + #elif defined(__i386__) + needed = 17 + preamble + postamble; + #else + #error Unsupported target platform + #endif + + // Allocate scratch space and copy the preamble of code that was moved + // from the function that we are patching. + char* dest = getScratchSpace(maps, code[first].addr, needed, + extraSpace, extraLength); + memcpy(dest, code[first].addr, preamble); + + // For jumps from the VDSO to the VSyscalls we sometimes allow exactly + // one IP relative instruction in the preamble. + if (code[first].is_ip_relative) { + *reinterpret_cast<int *>(dest + (code[codeIdx].addr - + code[first].addr) - 4) + -= dest - code[first].addr; + } + + // For indirect calls, we need to copy the actual CALL instruction and + // turn it into a PUSH instruction. + #if defined(__x86_64__) + if (is_indirect_call) { + memcpy(dest + preamble, "\xE8\x00\x00\x00\x00\x48\x83\x04\x24", 9); + dest[preamble + 9] = code[codeIdx].len + 42; + memcpy(dest + preamble + 10, code[codeIdx].addr, code[codeIdx].len); + + // Convert CALL -> PUSH + dest[preamble + 10 + (mod_rm - code[codeIdx].addr)] |= 0x20; + preamble += 10 + code[codeIdx].len; + } + #endif + + // Copy the static body of the assembly code. 
+ memcpy(dest + preamble, + #if defined(__x86_64__) + is_indirect_call ? + "\x48\x81\x3C\x24\x00\x00\x00\xFF\x72\x10\x81\x2C\x24\x00\x00\x00" + "\x00\xC7\x44\x24\x04\x00\x00\x00\x00\xC3\x48\x87\x04\x24\x48\x89" + "\x44\x24\x08\x58\xC3" : + "\x48\x81\xEC\x80\x00\x00\x00\x50\x48\x8D\x05\x00\x00\x00\x00\x50" + "\x48\xB8\x00\x00\x00\x00\x00\x00\x00\x00\x50\x48\x8D\x05\x06\x00" + "\x00\x00\x48\x87\x44\x24\x10\xC3\x48\x81\xC4\x80\x00\x00", + is_indirect_call ? 37 : 47 + #elif defined(__i386__) + "\x68\x00\x00\x00\x00\x68\x00\x00\x00\x00\xC3", 11 + #else + #error Unsupported target platform + #endif + ); + + // Copy the postamble that was moved from the function that we are + // patching. + memcpy(dest + preamble + + #if defined(__x86_64__) + (is_indirect_call ? 37 : 47), + #elif defined(__i386__) + 11, + #else + #error Unsupported target platform + #endif + code[codeIdx].addr + code[codeIdx].len, + postamble); + + // Patch up the various computed values + #if defined(__x86_64__) + int post = preamble + (is_indirect_call ? 
37 : 47) + postamble; + dest[post] = '\xE9'; + *reinterpret_cast<int *>(dest + post + 1) = + (code[second].addr + code[second].len) - (dest + post + 5); + if (is_indirect_call) { + *reinterpret_cast<int *>(dest + preamble + 13) = vsys_offset_; + } else { + *reinterpret_cast<int *>(dest + preamble + 11) = + (code[second].addr + code[second].len) - (dest + preamble + 15); + *reinterpret_cast<void **>(dest + preamble + 18) = + reinterpret_cast<void *>(&syscallWrapper); + } + #elif defined(__i386__) + *(dest + preamble + 11 + postamble) = '\x68'; // PUSH + *reinterpret_cast<char **>(dest + preamble + 12 + postamble) = + code[second].addr + code[second].len; + *(dest + preamble + 16 + postamble) = '\xC3'; // RET + *reinterpret_cast<char **>(dest + preamble + 1) = + dest + preamble + 11; + *reinterpret_cast<void (**)()>(dest + preamble + 6) = syscallWrapper; + #else + #error Unsupported target platform + #endif + + // Pad unused space in the original function with NOPs + memset(code[first].addr, 0x90 /* NOP */, + code[second].addr + code[second].len - code[first].addr); + + // Replace the system call with an unconditional jump to our new code. + #if defined(__x86_64__) + *code[first].addr = '\xE9'; // JMPQ + *reinterpret_cast<int *>(code[first].addr + 1) = + dest - (code[first].addr + 5); + #elif defined(__i386__) + code[first].addr[0] = '\x68'; // PUSH + *reinterpret_cast<char **>(code[first].addr + 1) = dest; + code[first].addr[5] = '\xC3'; // RET + #else + #error Unsupported target platform + #endif + } + replaced: + codeIdx = (codeIdx + 1) % (sizeof(code) / sizeof(struct Code)); + } +} + +void Library::patchVDSO(char** extraSpace, int* extraLength){ + #if defined(__i386__) + Sandbox::SysCalls sys; + if (!__kernel_vsyscall || + sys.mprotect(reinterpret_cast<void *>( + reinterpret_cast<long>(__kernel_vsyscall) & ~0xFFF), + 4096, PROT_READ|PROT_WRITE|PROT_EXEC)) { + return; + } + + // x86-32 has a small number of well-defined functions in the VDSO library. 
+ // These functions do not easily lend themselves to be rewritten by the + // automatic code. Instead, we explicitly find new definitions for them. + // + // We don't bother with optimizing the syscall instruction instead always + // use INT $0x80, no matter whether the hardware supports more modern + // calling conventions. + // + // TODO(markus): Investigate whether it is worthwhile to optimize this + // code path and use the platform-specific entry code. + if (__kernel_vsyscall) { + // Replace the kernel entry point with: + // + // E9 .. .. .. .. JMP syscallWrapper + *__kernel_vsyscall = '\xE9'; + *reinterpret_cast<long *>(__kernel_vsyscall + 1) = + reinterpret_cast<char *>(&syscallWrapper) - + reinterpret_cast<char *>(__kernel_vsyscall + 5); + } + if (__kernel_sigreturn) { + // Replace the sigreturn() system call with a jump to code that does: + // + // 58 POP %eax + // B8 77 00 00 00 MOV $0x77, %eax + // E8 .. .. .. .. CALL syscallWrapper + char* dest = getScratchSpace(maps_, __kernel_sigreturn, 11, extraSpace, + extraLength); + memcpy(dest, "\x58\xB8\x77\x00\x00\x00\xE8", 7); + *reinterpret_cast<long *>(dest + 7) = + reinterpret_cast<char *>(&syscallWrapper) - dest - 11;; + *__kernel_sigreturn = '\xE9'; + *reinterpret_cast<long *>(__kernel_sigreturn + 1) = + dest - reinterpret_cast<char *>(__kernel_sigreturn) - 5; + } + if (__kernel_rt_sigreturn) { + // Replace the rt_sigreturn() system call with a jump to code that does: + // + // B8 AD 00 00 00 MOV $0xAD, %eax + // E8 .. .. .. .. 
CALL syscallWrapper + char* dest = getScratchSpace(maps_, __kernel_rt_sigreturn, 10, extraSpace, + extraLength); + memcpy(dest, "\xB8\xAD\x00\x00\x00\xE8", 6); + *reinterpret_cast<long *>(dest + 6) = + reinterpret_cast<char *>(&syscallWrapper) - dest - 10; + *__kernel_rt_sigreturn = '\xE9'; + *reinterpret_cast<long *>(__kernel_rt_sigreturn + 1) = + dest - reinterpret_cast<char *>(__kernel_rt_sigreturn) - 5; + } + #endif +} + +int Library::patchVSystemCalls() { + #if defined(__x86_64__) + // VSyscalls live in a shared 4kB page at the top of the address space. This + // page cannot be unmapped nor remapped. We have to create a copy within + // 2GB of the page, and rewrite all IP-relative accesses to shared variables. + // As the top of the address space is not accessible by mmap(), this means + // that we need to wrap around addresses to the bottom 2GB of the address + // space. + // Only x86-64 has VSyscalls. + if (maps_->vsyscall()) { + char* copy = maps_->allocNearAddr(maps_->vsyscall(), 0x1000, + PROT_READ|PROT_WRITE|PROT_EXEC); + char* extraSpace = copy; + int extraLength = 0x1000; + memcpy(copy, maps_->vsyscall(), 0x1000); + long adjust = (long)maps_->vsyscall() - (long)copy; + for (int vsys = 0; vsys < 0x1000; vsys += 0x400) { + char* start = copy + vsys; + char* end = start + 0x400; + + // There can only be up to four VSyscalls starting at an offset of + // n*0x400, each (the 0x1000 byte page is walked in 0x400 byte slots). + // VSyscalls are invoked by functions in the VDSO + // and provide fast implementations of a time source. We don't exactly + // know where the code and where the data is in the VSyscalls page. + // So, we disassemble the code for each function and find all branch + // targets within the function in order to find the last address of + // the function. 
+ for (char *last = start, *vars = end, *ptr = start; ptr < end; ) { + new_function: + char* mod_rm; + unsigned short insn = next_inst((const char **)&ptr, true, 0, 0, + &mod_rm, 0, 0); + if (mod_rm && (*mod_rm & 0xC7) == 0x5) { + // Instruction has IP relative addressing mode. Adjust to reference + // the variables in the original VSyscall segment. + long offset = *reinterpret_cast<int *>(mod_rm + 1); + char* var = ptr + offset; + if (var >= ptr && var < vars) { + // Variables are stored somewhere past all the functions. Remember + // the first variable in the VSyscall slot, so that we stop + // scanning for instructions once we reach that address. + vars = var; + } + offset += adjust; + if ((offset >> 32) && (offset >> 32) != -1) { + Sandbox::die("Cannot patch [vsystemcall]"); + } + *reinterpret_cast<int *>(mod_rm + 1) = offset; + } + + // Check for jump targets to higher addresses (but within our own + // VSyscall slot). They extend the possible end-address of this + // function. + char *target = 0; + if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ || + insn == 0xEB /* JMP */) { + target = ptr + (reinterpret_cast<signed char *>(ptr))[-1]; + } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ || + (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) { + target = ptr + (reinterpret_cast<int *>(ptr))[-1]; + } + + // The function end is found, once the loop reaches the last valid + // address in the VSyscall slot, or once it finds a RET instruction + // that is not followed by any jump targets. Unconditional jumps that + // point backwards are treated the same as a RET instruction. + if (insn == 0xC3 /* RET */ || + (target < ptr && + (insn == 0xEB /* JMP */ || insn == 0xE9 /* JMP */))) { + if (last >= ptr) { + continue; + } else { + // The function can optionally be followed by more functions in + // the same VSyscall slot. Allow for alignment to a 16 byte + // boundary. 
If we then find more non-zero bytes, and if this is + // not the known start of the variables, assume a new function + // started. + for (; ptr < vars; ++ptr) { + if ((long)ptr & 0xF) { + if (*ptr && *ptr != '\x90' /* NOP */) { + goto new_function; + } + *ptr = '\x90'; // NOP + } else { + if (*ptr && *ptr != '\x90' /* NOP */) { + goto new_function; + } + break; + } + } + + // Translate all SYSCALLs to jumps into our system call handler. + patchSystemCallsInFunction(NULL, start, ptr, + &extraSpace, &extraLength); + break; + } + } + + // Adjust assumed end address for this function, if a valid jump + // target has been found that originates from the current instruction. + if (target > last && target < start + 0x100) { + last = target; + } + } + } + + // We are done. Write-protect our code and make it executable. + Sandbox::SysCalls sys; + sys.mprotect(copy, 0x1000, PROT_READ|PROT_EXEC); + return maps_->vsyscall() - copy; + } + #endif + return 0; +} + +void Library::patchSystemCalls() { + if (!valid_) { + return; + } + int extraLength = 0; + char* extraSpace = NULL; + if (isVDSO_) { + // patchVDSO() calls patchSystemCallsInFunction() which needs vsys_offset_ + // iff processing the VDSO library. So, make sure we call + // patchVSystemCalls() first. 
+ vsys_offset_ = patchVSystemCalls(); + #if defined(__i386__) + patchVDSO(&extraSpace, &extraLength); + return; + #endif + } + SectionTable::const_iterator iter; + if ((iter = section_table_.find(".text")) == section_table_.end()) { + return; + } + const Elf_Shdr& shdr = iter->second.second; + char* start = reinterpret_cast<char *>(shdr.sh_addr + asr_offset_); + char* stop = start + shdr.sh_size; + char* func = start; + int nopcount = 0; + bool has_syscall = false; + for (char *ptr = start; ptr < stop; ptr++) { + #if defined(__x86_64__) + if ((*ptr == '\x0F' && ptr[1] == '\x05' /* SYSCALL */) || + (isVDSO_ && *ptr == '\xFF')) { + #elif defined(__i386__) + if ((*ptr == '\xCD' && ptr[1] == '\x80' /* INT $0x80 */) || + (*ptr == '\x65' && ptr[1] == '\xFF' && + ptr[2] == '\x15' /* CALL %gs:.. */)) { + #else + #error Unsupported target platform + #endif + ptr++; + has_syscall = true; + nopcount = 0; + } else if (*ptr == '\x90' /* NOP */) { + nopcount++; + } else if (!(reinterpret_cast<long>(ptr) & 0xF)) { + if (nopcount > 2) { + // This is very likely the beginning of a new function. Functions + // are aligned on 16 byte boundaries and the preceding function is + // padded out with NOPs. + // + // For performance reasons, we quickly scan the entire text segment + // for potential SYSCALLs, and then patch the code in increments of + // individual functions. + if (has_syscall) { + has_syscall = false; + // Our quick scan of the function found a potential system call. + // Do a more thorough scan, now. + patchSystemCallsInFunction(maps_, func, ptr, &extraSpace, + &extraLength); + } + func = ptr; + } + nopcount = 0; + } else { + nopcount = 0; + } + } + if (has_syscall) { + // Patch any remaining system calls that were in the last function before + // the loop terminated. + patchSystemCallsInFunction(maps_, func, stop, &extraSpace, &extraLength); + } + + // Mark our scratch space as write-protected and executable. 
+ if (extraSpace) { + Sandbox::SysCalls sys; + sys.mprotect(extraSpace, 4096, PROT_READ|PROT_EXEC); + } +} + +bool Library::parseElf() { + valid_ = true; + + // Verify ELF header + Elf_Shdr str_shdr; + if (!getOriginal(0, &ehdr_) || + ehdr_.e_ehsize < sizeof(Elf_Ehdr) || + ehdr_.e_phentsize < sizeof(Elf_Phdr) || + ehdr_.e_shentsize < sizeof(Elf_Shdr) || + !getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize, + &str_shdr)) { + // Not all memory mappings are necessarily ELF files. Skip memory + // mappings that we cannot identify. + error: + valid_ = false; + return false; + } + + // Parse section table and find all sections in this ELF file + for (int i = 0; i < ehdr_.e_shnum; i++) { + Elf_Shdr shdr; + if (!getOriginal(ehdr_.e_shoff + i*ehdr_.e_shentsize, &shdr)) { + continue; + } + section_table_.insert( + std::make_pair(getOriginal(str_shdr.sh_offset + shdr.sh_name), + std::make_pair(i, shdr))); + } + + // Compute the offset of entries in the .text segment + const Elf_Shdr* text = getSection(".text"); + if (text == NULL) { + // On x86-32, the VDSO is unusual in as much as it does not have a single + // ".text" section. Instead, it has one section per function. Each + // section name starts with ".text". We just need to pick an arbitrary + // one in order to find the asr_offset_ -- which would typically be zero + // for the VDSO. + for (SectionTable::const_iterator iter = section_table_.begin(); + iter != section_table_.end(); ++iter) { + if (!strncmp(iter->first.c_str(), ".text", 5)) { + text = &iter->second.second; + break; + } + } + } + + // Now that we know where the .text segment is located, we can compute the + // asr_offset_. 
+ if (text) { + RangeMap::const_iterator iter = + memory_ranges_.lower_bound(text->sh_offset); + if (iter != memory_ranges_.end()) { + asr_offset_ = reinterpret_cast<char *>(iter->second.start) - + (text->sh_addr - (text->sh_offset - iter->first)); + } else { + goto error; + } + } else { + goto error; + } + + return !isVDSO_ || parseSymbols(); +} + +bool Library::parseSymbols() { + if (!valid_) { + return false; + } + + Elf_Shdr str_shdr; + getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize, &str_shdr); + + // Find PLT and symbol tables + const Elf_Shdr* plt = getSection(ELF_REL_PLT); + const Elf_Shdr* symtab = getSection(".dynsym"); + Elf_Shdr strtab = { 0 }; + if (symtab) { + if (symtab->sh_link >= ehdr_.e_shnum || + !getOriginal(ehdr_.e_shoff + symtab->sh_link * ehdr_.e_shentsize, + &strtab)) { + Debug::message("Cannot find valid symbol table\n"); + valid_ = false; + return false; + } + } + + if (plt && symtab) { + // Parse PLT table and add its entries + for (int i = plt->sh_size/sizeof(Elf_Rel); --i >= 0; ) { + Elf_Rel rel; + if (!getOriginal(plt->sh_offset + i * sizeof(Elf_Rel), &rel) || + ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym) >= symtab->sh_size) { + Debug::message("Encountered invalid plt entry\n"); + valid_ = false; + return false; + } + + if (ELF_R_TYPE(rel.r_info) != ELF_JUMP_SLOT) { + continue; + } + Elf_Sym sym; + if (!getOriginal(symtab->sh_offset + + ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym), &sym) || + sym.st_shndx >= ehdr_.e_shnum) { + Debug::message("Encountered invalid symbol for plt entry\n"); + valid_ = false; + return false; + } + string name = getOriginal(strtab.sh_offset + sym.st_name); + if (name.empty()) { + continue; + } + plt_entries_.insert(std::make_pair(name, rel.r_offset)); + } + } + + if (symtab) { + // Parse symbol table and add its entries + for (Elf_Addr addr = 0; addr < symtab->sh_size; addr += sizeof(Elf_Sym)) { + Elf_Sym sym; + if (!getOriginal(symtab->sh_offset + addr, &sym) || + (sym.st_shndx >= ehdr_.e_shnum && + 
sym.st_shndx < SHN_LORESERVE)) { + Debug::message("Encountered invalid symbol\n"); + valid_ = false; + return false; + } + string name = getOriginal(strtab.sh_offset + sym.st_name); + if (name.empty()) { + continue; + } + symbols_.insert(std::make_pair(name, sym)); + } + } + + SymbolTable::const_iterator iter = symbols_.find("__kernel_vsyscall"); + if (iter != symbols_.end() && iter->second.st_value) { + __kernel_vsyscall = asr_offset_ + iter->second.st_value; + } + iter = symbols_.find("__kernel_sigreturn"); + if (iter != symbols_.end() && iter->second.st_value) { + __kernel_sigreturn = asr_offset_ + iter->second.st_value; + } + iter = symbols_.find("__kernel_rt_sigreturn"); + if (iter != symbols_.end() && iter->second.st_value) { + __kernel_rt_sigreturn = asr_offset_ + iter->second.st_value; + } + + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/library.h b/sandbox/linux/seccomp/library.h new file mode 100644 index 0000000..e27bfde --- /dev/null +++ b/sandbox/linux/seccomp/library.h @@ -0,0 +1,199 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef LIBRARY_H__ +#define LIBRARY_H__ + +#include <elf.h> +#include <functional> +#include <map> +#include <set> +#include <string> +#include <string.h> +#include <sys/mman.h> + +#include "maps.h" + +#if defined(__x86_64__) +typedef Elf64_Ehdr Elf_Ehdr; +typedef Elf64_Shdr Elf_Shdr; +typedef Elf64_Sym Elf_Sym; +typedef Elf64_Addr Elf_Addr; +#elif defined(__i386__) +typedef Elf32_Ehdr Elf_Ehdr; +typedef Elf32_Shdr Elf_Shdr; +typedef Elf32_Sym Elf_Sym; +typedef Elf32_Addr Elf_Addr; +#else +#error Unsupported target platform +#endif + +struct SyscallTable; +namespace playground { + +class Library { + friend class Maps; + public: + typedef Maps::string string; + + Library() : + valid_(false), + isVDSO_(false), + asr_offset_(0), + vsys_offset_(0), + maps_(0), + image_(0), + image_size_(0) { + } + + ~Library(); + + void setLibraryInfo(Maps* maps) { + if (!maps_) { + maps_ = maps; + } + } + + void addMemoryRange(void* start, void* stop, Elf_Addr offset, + int prot, int isVDSO) { + isVDSO_ = isVDSO; + RangeMap::const_iterator iter = memory_ranges_.find(offset); + if (iter != memory_ranges_.end()) { + // It is possible to have overlapping mappings. This is particularly + // likely to happen with very small programs or libraries. If it does + // happen, we really only care about the text segment. Look for a + // mapping that is mapped executable. 
+ if ((prot & PROT_EXEC) == 0) { + return; + } + // std::map::insert() keeps an already existing element with the same + // key. Drop a previously recorded non-executable mapping, so that the + // executable one registered below actually takes its place. + if ((iter->second.prot & PROT_EXEC) == 0) { + memory_ranges_.erase(offset); + } + } + memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot))); + } + + char *get(Elf_Addr offset, char *buf, size_t len); + string get(Elf_Addr offset); + char *getOriginal(Elf_Addr offset, char *buf, size_t len); + string getOriginal(Elf_Addr offset); + + template<class T>T* get(Elf_Addr offset, T* t) { + if (!valid_) { + memset(t, 0, sizeof(T)); + return NULL; + } + return reinterpret_cast<T *>(get(offset, reinterpret_cast<char *>(t), + sizeof(T))); + } + + template<class T>T* getOriginal(Elf_Addr offset, T* t) { + if (!valid_) { + memset(t, 0, sizeof(T)); + return NULL; + } + return reinterpret_cast<T *>(getOriginal(offset, + reinterpret_cast<char *>(t), + sizeof(T))); + } + + template<class T>bool set(void *addr, T* value) { + if (!valid_) { + return false; + } + *reinterpret_cast<T *>(addr) = *value; + return true; + } + + template<class T>bool set(Elf_Addr offset, T* value) { + if (!valid_) { + return false; + } + RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset); + if (iter == memory_ranges_.end()) { + return false; + } + offset -= iter->first; + // Reject the write unless all sizeof(T) bytes fit inside the mapping. + // The size is checked first, so that the unsigned subtraction cannot + // wrap around for a mapping that is smaller than sizeof(T). + const size_t range_size = + reinterpret_cast<char *>(iter->second.stop) - + reinterpret_cast<char *>(iter->second.start); + if (range_size < sizeof(T) || offset > range_size - sizeof(T)) { + return false; + } + *reinterpret_cast<T *>( + reinterpret_cast<char *>(iter->second.start) + offset) = *value; + return true; + } + + bool parseElf(); + const Elf_Ehdr* getEhdr(); + const Elf_Shdr* getSection(const string& section); + int getSectionIndex(const string& section); + void makeWritable(bool state) const; + void patchSystemCalls(); + bool isVDSO() const { return isVDSO_; } + + protected: + bool parseSymbols(); + + private: + class GreaterThan : public std::binary_function<Elf_Addr, Elf_Addr, bool> { + // We create the RangeMap with a GreaterThan rather than the default + // comparator, as that allows us to use lower_bound() to find memory + // mappings. 
+ public: + bool operator() (Elf_Addr s1, Elf_Addr s2) const { + return s1 > s2; + } + }; + + struct Range { + Range(void* start, void* stop, int prot) : + start(start), stop(stop), prot(prot) { } + void* start; + void* stop; + int prot; + }; + + typedef std::map<Elf_Addr, Range, GreaterThan, + SystemAllocator<std::pair<const Elf_Addr, + Range> > > RangeMap; + typedef std::map<string, std::pair<int, Elf_Shdr>, std::less<string>, + SystemAllocator<std::pair<const string, + std::pair<int, Elf_Shdr> > > > + SectionTable; + typedef std::map<string, Elf_Sym, std::less<string>, + SystemAllocator<std::pair<const string, + Elf_Sym> > > SymbolTable; + typedef std::map<string, Elf_Addr, std::less<string>, + SystemAllocator<std::pair<const string, + Elf_Addr> > > PltTable; + + char* getBytes(char* dst, const char* src, ssize_t len); + static bool isSafeInsn(unsigned short insn); + static int isSimpleSystemCall(char *start, char *end); + static char* getScratchSpace(const Maps* maps, char* near, int needed, + char** extraSpace, int* extraLength); + void patchSystemCallsInFunction(const Maps* maps, char *start, char *end, + char** extraSpace, int* extraLength); + int patchVSystemCalls(); + void patchVDSO(char** extraSpace, int* extraLength); + + RangeMap memory_ranges_; + bool valid_; + bool isVDSO_; + char* asr_offset_; + int vsys_offset_; + Maps* maps_; + Elf_Ehdr ehdr_; + SectionTable section_table_; + SymbolTable symbols_; + PltTable plt_entries_; + char* image_; + size_t image_size_; + static char* __kernel_vsyscall; + static char* __kernel_sigreturn; + static char* __kernel_rt_sigreturn; +}; + +} // namespace + +#endif // LIBRARY_H__ diff --git a/sandbox/linux/seccomp/linux_syscall_support.h b/sandbox/linux/seccomp/linux_syscall_support.h new file mode 100644 index 0000000..2ee0426 --- /dev/null +++ b/sandbox/linux/seccomp/linux_syscall_support.h @@ -0,0 +1,3208 @@ +/* Copyright (c) 2005-2010, Google Inc. + * Author: Markus Gutschke + * + * All rights reserved. 
+ * Use of this source code is governed by a BSD-style license that can be + * found in the Chromium LICENSE file. + */ + +/* This file includes Linux-specific support functions common to the + * coredumper and the thread lister; primarily, this is a collection + * of direct system calls, and a couple of symbols missing from + * standard header files. + * There are a few options that the including file can set to control + * the behavior of this file: + * + * SYS_CPLUSPLUS: + * The entire header file will normally be wrapped in 'extern "C" { }', + * making it suitable for compilation as both C and C++ source. If you + * do not want to do this, you can set the SYS_CPLUSPLUS macro to inhibit + * the wrapping. N.B. doing so will suppress inclusion of all prerequisite + * system header files, too. It is the caller's responsibility to provide + * the necessary definitions. + * + * SYS_ERRNO: + * All system calls will update "errno" unless overridden by setting the + * SYS_ERRNO macro prior to including this file. SYS_ERRNO should be + * an l-value. + * + * SYS_INLINE: + * New symbols will be defined "static inline", unless overridden by + * the SYS_INLINE macro. + * + * SYS_LINUX_SYSCALL_SUPPORT_H + * This macro is used to avoid multiple inclusions of this header file. + * If you need to include this file more than once, make sure to + * unset SYS_LINUX_SYSCALL_SUPPORT_H before each inclusion. + * + * SYS_PREFIX: + * New system calls will have a prefix of "sys_" unless overridden by + * the SYS_PREFIX macro. Valid values for this macro are [0..9] which + * results in prefixes "sys[0..9]_". It is also possible to set this + * macro to -1, which avoids all prefixes. + * + * This file defines a few internal symbols that all start with "LSS_". + * Do not access these symbols from outside this file. They are not part + * of the supported API. 
+ */ +#ifndef SYS_LINUX_SYSCALL_SUPPORT_H +#define SYS_LINUX_SYSCALL_SUPPORT_H + +/* We currently only support x86-32, x86-64, ARM, MIPS, and PPC on Linux. + * Porting to other related platforms should not be difficult. + */ +#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__) || \ + defined(__mips__) || defined(__PPC__)) && defined(__linux) + +#ifndef SYS_CPLUSPLUS +#ifdef __cplusplus +/* Some system header files in older versions of gcc neglect to properly + * handle being included from C++. As it appears to be harmless to have + * multiple nested 'extern "C"' blocks, just add another one here. + */ +extern "C" { +#endif + +#include <errno.h> +#include <signal.h> +#include <stdarg.h> +#include <stddef.h> +#include <string.h> +#include <sys/ptrace.h> +#include <sys/resource.h> +#include <sys/time.h> +#include <sys/types.h> +#include <syscall.h> +#include <unistd.h> +#include <linux/unistd.h> +#include <endian.h> + +#ifdef __mips__ +/* Include definitions of the ABI currently in use. */ +#include <sgidefs.h> +#endif + +#endif + +/* As glibc often provides subtly incompatible data structures (and implicit + * wrapper functions that convert them), we provide our own kernel data + * structures for use by the system calls. + * These structures have been developed by using Linux 2.6.23 headers for + * reference. Note though, we do not care about exact API compatibility + * with the kernel, and in fact the kernel often does not have a single + * API that works across architectures. Instead, we try to mimic the glibc + * API where reasonable, and only guarantee ABI compatibility with the + * kernel headers. + * Most notably, here are a few changes that were made to the structures + * defined by kernel headers: + * + * - we only define structures, but not symbolic names for kernel data + * types. For the latter, we directly use the native C datatype + * (i.e. "unsigned" instead of "mode_t"). 
+ * - in a few cases, it is possible to define identical structures for + * both 32bit (e.g. i386) and 64bit (e.g. x86-64) platforms by + * standardizing on the 64bit version of the data types. In particular, + * this means that we use "unsigned" where the 32bit headers say + * "unsigned long". + * - overall, we try to minimize the number of cases where we need to + * conditionally define different structures. + * - the "struct kernel_sigaction" class of structures have been + * modified to more closely mimic glibc's API by introducing an + * anonymous union for the function pointer. + * - a small number of field names had to have an underscore appended to + * them, because glibc defines a global macro by the same name. + */ + +/* include/linux/dirent.h */ +struct kernel_dirent64 { + unsigned long long d_ino; + long long d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[256]; +}; + +/* include/linux/dirent.h */ +struct kernel_dirent { + long d_ino; + long d_off; + unsigned short d_reclen; + char d_name[256]; +}; + +/* include/linux/uio.h */ +struct kernel_iovec { + void *iov_base; + unsigned long iov_len; +}; + +/* include/linux/socket.h */ +struct kernel_msghdr { + void *msg_name; + int msg_namelen; + struct kernel_iovec*msg_iov; + unsigned long msg_iovlen; + void *msg_control; + unsigned long msg_controllen; + unsigned msg_flags; +}; + +/* include/asm-generic/poll.h */ +struct kernel_pollfd { + int fd; + short events; + short revents; +}; + +/* include/linux/resource.h */ +struct kernel_rlimit { + unsigned long rlim_cur; + unsigned long rlim_max; +}; + +/* include/linux/time.h */ +struct kernel_timespec { + long tv_sec; + long tv_nsec; +}; + +/* include/linux/time.h */ +struct kernel_timeval { + long tv_sec; + long tv_usec; +}; + +/* include/linux/resource.h */ +struct kernel_rusage { + struct kernel_timeval ru_utime; + struct kernel_timeval ru_stime; + long ru_maxrss; + long ru_ixrss; + long ru_idrss; + long ru_isrss; + long ru_minflt; + 
long ru_majflt; + long ru_nswap; + long ru_inblock; + long ru_oublock; + long ru_msgsnd; + long ru_msgrcv; + long ru_nsignals; + long ru_nvcsw; + long ru_nivcsw; +}; + +struct siginfo; +#if defined(__i386__) || defined(__ARM_ARCH_3__) || defined(__PPC__) + +/* include/asm-{arm,i386,mips,ppc}/signal.h */ +struct kernel_old_sigaction { + union { + void (*sa_handler_)(int); + void (*sa_sigaction_)(int, struct siginfo *, void *); + }; + unsigned long sa_mask; + unsigned long sa_flags; + void (*sa_restorer)(void); +} __attribute__((packed,aligned(4))); +#elif (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) + #define kernel_old_sigaction kernel_sigaction +#endif + +/* Some kernel functions (e.g. sigaction() in 2.6.23) require that the + * exactly match the size of the signal set, even though the API was + * intended to be extensible. We define our own KERNEL_NSIG to deal with + * this. + * Please note that glibc provides signals [1.._NSIG-1], whereas the + * kernel (and this header) provides the range [1..KERNEL_NSIG]. The + * actual number of signals is obviously the same, but the constants + * differ by one. 
+ */ +#ifdef __mips__ +#define KERNEL_NSIG 128 +#else +#define KERNEL_NSIG 64 +#endif + +/* include/asm-{arm,i386,mips,x86_64}/signal.h */ +struct kernel_sigset_t { + unsigned long sig[(KERNEL_NSIG + 8*sizeof(unsigned long) - 1)/ + (8*sizeof(unsigned long))]; +}; + +/* include/asm-{arm,i386,mips,x86_64,ppc}/signal.h */ +struct kernel_sigaction { +#ifdef __mips__ + unsigned long sa_flags; + union { + void (*sa_handler_)(int); + void (*sa_sigaction_)(int, struct siginfo *, void *); + }; + struct kernel_sigset_t sa_mask; +#else + union { + void (*sa_handler_)(int); + void (*sa_sigaction_)(int, struct siginfo *, void *); + }; + unsigned long sa_flags; + void (*sa_restorer)(void); + struct kernel_sigset_t sa_mask; +#endif +}; + +/* include/linux/socket.h */ +struct kernel_sockaddr { + unsigned short sa_family; + char sa_data[14]; +}; + +/* include/asm-{arm,i386,mips,ppc}/stat.h */ +#ifdef __mips__ +#if _MIPS_SIM == _MIPS_SIM_ABI64 +struct kernel_stat { +#else +struct kernel_stat64 { +#endif + unsigned st_dev; + unsigned __pad0[3]; + unsigned long long st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned st_rdev; + unsigned __pad1[3]; + long long st_size; + unsigned st_atime_; + unsigned st_atime_nsec_; + unsigned st_mtime_; + unsigned st_mtime_nsec_; + unsigned st_ctime_; + unsigned st_ctime_nsec_; + unsigned st_blksize; + unsigned __pad2; + unsigned long long st_blocks; +}; +#elif defined __PPC__ +struct kernel_stat64 { + unsigned long long st_dev; + unsigned long long st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned long long st_rdev; + unsigned short int __pad2; + long long st_size; + long st_blksize; + long long st_blocks; + long st_atime_; + unsigned long st_atime_nsec_; + long st_mtime_; + unsigned long st_mtime_nsec_; + long st_ctime_; + unsigned long st_ctime_nsec_; + unsigned long __unused4; + unsigned long __unused5; +}; +#else +struct kernel_stat64 { + unsigned long 
long st_dev; + unsigned char __pad0[4]; + unsigned __st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned long long st_rdev; + unsigned char __pad3[4]; + long long st_size; + unsigned st_blksize; + unsigned long long st_blocks; + unsigned st_atime_; + unsigned st_atime_nsec_; + unsigned st_mtime_; + unsigned st_mtime_nsec_; + unsigned st_ctime_; + unsigned st_ctime_nsec_; + unsigned long long st_ino; +}; +#endif + +/* include/asm-{arm,i386,mips,x86_64,ppc}/stat.h */ +#if defined(__i386__) || defined(__ARM_ARCH_3__) +struct kernel_stat { + /* The kernel headers suggest that st_dev and st_rdev should be 32bit + * quantities encoding 12bit major and 20bit minor numbers in an interleaved + * format. In reality, we do not see useful data in the top bits. So, + * we'll leave the padding in here, until we find a better solution. + */ + unsigned short st_dev; + short pad1; + unsigned st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + unsigned short st_rdev; + short pad2; + unsigned st_size; + unsigned st_blksize; + unsigned st_blocks; + unsigned st_atime_; + unsigned st_atime_nsec_; + unsigned st_mtime_; + unsigned st_mtime_nsec_; + unsigned st_ctime_; + unsigned st_ctime_nsec_; + unsigned __unused4; + unsigned __unused5; +}; +#elif defined(__x86_64__) +struct kernel_stat { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned st_mode; + unsigned st_uid; + unsigned st_gid; + unsigned __pad0; + unsigned long st_rdev; + long st_size; + long st_blksize; + long st_blocks; + unsigned long st_atime_; + unsigned long st_atime_nsec_; + unsigned long st_mtime_; + unsigned long st_mtime_nsec_; + unsigned long st_ctime_; + unsigned long st_ctime_nsec_; + long __unused[3]; +}; +#elif defined(__PPC__) +struct kernel_stat { + unsigned st_dev; + unsigned long st_ino; // ino_t + unsigned long st_mode; // mode_t + unsigned short st_nlink; // nlink_t + 
unsigned st_uid; // uid_t + unsigned st_gid; // gid_t + unsigned st_rdev; + long st_size; // off_t + unsigned long st_blksize; + unsigned long st_blocks; + unsigned long st_atime_; + unsigned long st_atime_nsec_; + unsigned long st_mtime_; + unsigned long st_mtime_nsec_; + unsigned long st_ctime_; + unsigned long st_ctime_nsec_; + unsigned long __unused4; + unsigned long __unused5; +}; +#elif (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64) +struct kernel_stat { + unsigned st_dev; + int st_pad1[3]; + unsigned st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned st_rdev; + int st_pad2[2]; + long st_size; + int st_pad3; + long st_atime_; + long st_atime_nsec_; + long st_mtime_; + long st_mtime_nsec_; + long st_ctime_; + long st_ctime_nsec_; + int st_blksize; + int st_blocks; + int st_pad4[14]; +}; +#endif + +/* include/asm-{arm,i386,mips,x86_64,ppc}/statfs.h */ +#ifdef __mips__ +#if _MIPS_SIM != _MIPS_SIM_ABI64 +struct kernel_statfs64 { + unsigned long f_type; + unsigned long f_bsize; + unsigned long f_frsize; + unsigned long __pad; + unsigned long long f_blocks; + unsigned long long f_bfree; + unsigned long long f_files; + unsigned long long f_ffree; + unsigned long long f_bavail; + struct { int val[2]; } f_fsid; + unsigned long f_namelen; + unsigned long f_spare[6]; +}; +#endif +#elif !defined(__x86_64__) +struct kernel_statfs64 { + unsigned long f_type; + unsigned long f_bsize; + unsigned long long f_blocks; + unsigned long long f_bfree; + unsigned long long f_bavail; + unsigned long long f_files; + unsigned long long f_ffree; + struct { int val[2]; } f_fsid; + unsigned long f_namelen; + unsigned long f_frsize; + unsigned long f_spare[5]; +}; +#endif + +/* include/asm-{arm,i386,mips,x86_64,ppc,generic}/statfs.h */ +#ifdef __mips__ +struct kernel_statfs { + long f_type; + long f_bsize; + long f_frsize; + long f_blocks; + long f_bfree; + long f_files; + long f_ffree; + long f_bavail; + struct { int val[2]; } f_fsid; + 
long f_namelen; + long f_spare[6]; +}; +#else +struct kernel_statfs { + /* x86_64 actually defines all these fields as signed, whereas all other */ + /* platforms define them as unsigned. Leaving them at unsigned should not */ + /* cause any problems. */ + unsigned long f_type; + unsigned long f_bsize; + unsigned long f_blocks; + unsigned long f_bfree; + unsigned long f_bavail; + unsigned long f_files; + unsigned long f_ffree; + struct { int val[2]; } f_fsid; + unsigned long f_namelen; + unsigned long f_frsize; + unsigned long f_spare[5]; +}; +#endif + + +/* Definitions missing from the standard header files */ +#ifndef O_DIRECTORY +#if defined(__ARM_ARCH_3__) +#define O_DIRECTORY 0040000 +#else +#define O_DIRECTORY 0200000 +#endif +#endif +#ifndef NT_PRXFPREG +#define NT_PRXFPREG 0x46e62b7f +#endif +#ifndef PTRACE_GETFPXREGS +#define PTRACE_GETFPXREGS ((enum __ptrace_request)18) +#endif +#ifndef PR_GET_DUMPABLE +#define PR_GET_DUMPABLE 3 +#endif +#ifndef PR_SET_DUMPABLE +#define PR_SET_DUMPABLE 4 +#endif +#ifndef PR_GET_SECCOMP +#define PR_GET_SECCOMP 21 +#endif +#ifndef PR_SET_SECCOMP +#define PR_SET_SECCOMP 22 +#endif +#ifndef AT_FDCWD +#define AT_FDCWD (-100) +#endif +#ifndef AT_SYMLINK_NOFOLLOW +#define AT_SYMLINK_NOFOLLOW 0x100 +#endif +#ifndef AT_REMOVEDIR +#define AT_REMOVEDIR 0x200 +#endif +#ifndef MREMAP_FIXED +#define MREMAP_FIXED 2 +#endif +#ifndef SA_RESTORER +#define SA_RESTORER 0x04000000 +#endif +#ifndef CPUCLOCK_PROF +#define CPUCLOCK_PROF 0 +#endif +#ifndef CPUCLOCK_VIRT +#define CPUCLOCK_VIRT 1 +#endif +#ifndef CPUCLOCK_SCHED +#define CPUCLOCK_SCHED 2 +#endif +#ifndef CPUCLOCK_PERTHREAD_MASK +#define CPUCLOCK_PERTHREAD_MASK 4 +#endif +#ifndef MAKE_PROCESS_CPUCLOCK +#define MAKE_PROCESS_CPUCLOCK(pid, clock) \ + ((~(int)(pid) << 3) | (int)(clock)) +#endif +#ifndef MAKE_THREAD_CPUCLOCK +#define MAKE_THREAD_CPUCLOCK(tid, clock) \ + ((~(int)(tid) << 3) | (int)((clock) | CPUCLOCK_PERTHREAD_MASK)) +#endif + +#ifndef FUTEX_WAIT +#define FUTEX_WAIT 0 
+#endif +#ifndef FUTEX_WAKE +#define FUTEX_WAKE 1 +#endif +#ifndef FUTEX_FD +#define FUTEX_FD 2 +#endif +#ifndef FUTEX_REQUEUE +#define FUTEX_REQUEUE 3 +#endif +#ifndef FUTEX_CMP_REQUEUE +#define FUTEX_CMP_REQUEUE 4 +#endif +#ifndef FUTEX_WAKE_OP +#define FUTEX_WAKE_OP 5 +#endif +#ifndef FUTEX_LOCK_PI +#define FUTEX_LOCK_PI 6 +#endif +#ifndef FUTEX_UNLOCK_PI +#define FUTEX_UNLOCK_PI 7 +#endif +#ifndef FUTEX_TRYLOCK_PI +#define FUTEX_TRYLOCK_PI 8 +#endif +#ifndef FUTEX_PRIVATE_FLAG +#define FUTEX_PRIVATE_FLAG 128 +#endif +#ifndef FUTEX_CMD_MASK +#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG +#endif +#ifndef FUTEX_WAIT_PRIVATE +#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_WAKE_PRIVATE +#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_REQUEUE_PRIVATE +#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_CMP_REQUEUE_PRIVATE +#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_WAKE_OP_PRIVATE +#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_LOCK_PI_PRIVATE +#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_UNLOCK_PI_PRIVATE +#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) +#endif +#ifndef FUTEX_TRYLOCK_PI_PRIVATE +#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) +#endif + + +#if defined(__x86_64__) +#ifndef ARCH_SET_GS +#define ARCH_SET_GS 0x1001 +#endif +#ifndef ARCH_GET_GS +#define ARCH_GET_GS 0x1004 +#endif +#endif + +#if defined(__i386__) +#ifndef __NR_quotactl +#define __NR_quotactl 131 +#endif +#ifndef __NR_setresuid +#define __NR_setresuid 164 +#define __NR_getresuid 165 +#define __NR_setresgid 170 +#define __NR_getresgid 171 +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn 173 +#define __NR_rt_sigaction 174 +#define __NR_rt_sigprocmask 175 +#define __NR_rt_sigpending 176 
+#define __NR_rt_sigsuspend 179 +#endif +#ifndef __NR_pread64 +#define __NR_pread64 180 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 181 +#endif +#ifndef __NR_ugetrlimit +#define __NR_ugetrlimit 191 +#endif +#ifndef __NR_stat64 +#define __NR_stat64 195 +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 197 +#endif +#ifndef __NR_setresuid32 +#define __NR_setresuid32 208 +#define __NR_getresuid32 209 +#define __NR_setresgid32 210 +#define __NR_getresgid32 211 +#endif +#ifndef __NR_setfsuid32 +#define __NR_setfsuid32 215 +#define __NR_setfsgid32 216 +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 220 +#endif +#ifndef __NR_gettid +#define __NR_gettid 224 +#endif +#ifndef __NR_readahead +#define __NR_readahead 225 +#endif +#ifndef __NR_setxattr +#define __NR_setxattr 226 +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr 227 +#endif +#ifndef __NR_getxattr +#define __NR_getxattr 229 +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr 230 +#endif +#ifndef __NR_listxattr +#define __NR_listxattr 232 +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr 233 +#endif +#ifndef __NR_tkill +#define __NR_tkill 238 +#endif +#ifndef __NR_futex +#define __NR_futex 240 +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 241 +#define __NR_sched_getaffinity 242 +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address 258 +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime 265 +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres 266 +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 268 +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 269 +#endif +#ifndef __NR_fadvise64_64 +#define __NR_fadvise64_64 272 +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 289 +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get 290 +#endif +#ifndef __NR_openat +#define __NR_openat 295 +#endif +#ifndef __NR_fstatat64 +#define __NR_fstatat64 300 +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat 301 
+#endif +#ifndef __NR_move_pages +#define __NR_move_pages 317 +#endif +#ifndef __NR_getcpu +#define __NR_getcpu 318 +#endif +#ifndef __NR_fallocate +#define __NR_fallocate 324 +#endif +/* End of i386 definitions */ +#elif defined(__ARM_ARCH_3__) +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_SYSCALL_BASE + 164) +#define __NR_getresuid (__NR_SYSCALL_BASE + 165) +#define __NR_setresgid (__NR_SYSCALL_BASE + 170) +#define __NR_getresgid (__NR_SYSCALL_BASE + 171) +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn (__NR_SYSCALL_BASE + 173) +#define __NR_rt_sigaction (__NR_SYSCALL_BASE + 174) +#define __NR_rt_sigprocmask (__NR_SYSCALL_BASE + 175) +#define __NR_rt_sigpending (__NR_SYSCALL_BASE + 176) +#define __NR_rt_sigsuspend (__NR_SYSCALL_BASE + 179) +#endif +#ifndef __NR_pread64 +#define __NR_pread64 (__NR_SYSCALL_BASE + 180) +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 (__NR_SYSCALL_BASE + 181) +#endif +#ifndef __NR_ugetrlimit +#define __NR_ugetrlimit (__NR_SYSCALL_BASE + 191) +#endif +#ifndef __NR_stat64 +#define __NR_stat64 (__NR_SYSCALL_BASE + 195) +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 (__NR_SYSCALL_BASE + 197) +#endif +#ifndef __NR_setresuid32 +#define __NR_setresuid32 (__NR_SYSCALL_BASE + 208) +#define __NR_getresuid32 (__NR_SYSCALL_BASE + 209) +#define __NR_setresgid32 (__NR_SYSCALL_BASE + 210) +#define __NR_getresgid32 (__NR_SYSCALL_BASE + 211) +#endif +#ifndef __NR_setfsuid32 +#define __NR_setfsuid32 (__NR_SYSCALL_BASE + 215) +#define __NR_setfsgid32 (__NR_SYSCALL_BASE + 216) +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 (__NR_SYSCALL_BASE + 217) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_SYSCALL_BASE + 224) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_SYSCALL_BASE + 225) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_SYSCALL_BASE + 226) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_SYSCALL_BASE + 227) +#endif +#ifndef __NR_getxattr +#define 
__NR_getxattr (__NR_SYSCALL_BASE + 229) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_SYSCALL_BASE + 230) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_SYSCALL_BASE + 232) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_SYSCALL_BASE + 233) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_SYSCALL_BASE + 238) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_SYSCALL_BASE + 240) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_SYSCALL_BASE + 241) +#define __NR_sched_getaffinity (__NR_SYSCALL_BASE + 242) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_SYSCALL_BASE + 256) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_SYSCALL_BASE + 263) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_SYSCALL_BASE + 264) +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 (__NR_SYSCALL_BASE + 266) +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 (__NR_SYSCALL_BASE + 267) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_SYSCALL_BASE + 314) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_SYSCALL_BASE + 315) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_SYSCALL_BASE + 344) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_SYSCALL_BASE + 345) +#endif +/* End of ARM 3 definitions */ +#elif defined(__x86_64__) +#ifndef __NR_pread64 +#define __NR_pread64 17 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 18 +#endif +#ifndef __NR_setresuid +#define __NR_setresuid 117 +#define __NR_getresuid 118 +#define __NR_setresgid 119 +#define __NR_getresgid 120 +#endif +#ifndef __NR_quotactl +#define __NR_quotactl 179 +#endif +#ifndef __NR_gettid +#define __NR_gettid 186 +#endif +#ifndef __NR_readahead +#define __NR_readahead 187 +#endif +#ifndef __NR_setxattr +#define __NR_setxattr 188 +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr 189 +#endif +#ifndef __NR_getxattr +#define 
__NR_getxattr 191 +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr 192 +#endif +#ifndef __NR_listxattr +#define __NR_listxattr 194 +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr 195 +#endif +#ifndef __NR_tkill +#define __NR_tkill 200 +#endif +#ifndef __NR_futex +#define __NR_futex 202 +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 203 +#define __NR_sched_getaffinity 204 +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 217 +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address 218 +#endif +#ifndef __NR_fadvise64 +#define __NR_fadvise64 221 +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime 228 +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres 229 +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 251 +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get 252 +#endif +#ifndef __NR_openat +#define __NR_openat 257 +#endif +#ifndef __NR_newfstatat +#define __NR_newfstatat 262 +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat 263 +#endif +#ifndef __NR_move_pages +#define __NR_move_pages 279 +#endif +#ifndef __NR_fallocate +#define __NR_fallocate 285 +#endif +/* End of x86-64 definitions */ +#elif defined(__mips__) +#if _MIPS_SIM == _MIPS_SIM_ABI32 +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_Linux + 185) +#define __NR_getresuid (__NR_Linux + 186) +#define __NR_setresgid (__NR_Linux + 190) +#define __NR_getresgid (__NR_Linux + 191) +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn (__NR_Linux + 193) +#define __NR_rt_sigaction (__NR_Linux + 194) +#define __NR_rt_sigprocmask (__NR_Linux + 195) +#define __NR_rt_sigpending (__NR_Linux + 196) +#define __NR_rt_sigsuspend (__NR_Linux + 199) +#endif +#ifndef __NR_pread64 +#define __NR_pread64 (__NR_Linux + 200) +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 (__NR_Linux + 201) +#endif +#ifndef __NR_stat64 +#define __NR_stat64 (__NR_Linux + 213) +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 
(__NR_Linux + 215) +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 (__NR_Linux + 219) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_Linux + 222) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_Linux + 223) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_Linux + 224) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_Linux + 225) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_Linux + 227) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_Linux + 228) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_Linux + 230) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_Linux + 231) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_Linux + 236) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_Linux + 238) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_Linux + 239) +#define __NR_sched_getaffinity (__NR_Linux + 240) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_Linux + 252) +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 (__NR_Linux + 255) +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 (__NR_Linux + 256) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_Linux + 263) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_Linux + 264) +#endif +#ifndef __NR_openat +#define __NR_openat (__NR_Linux + 288) +#endif +#ifndef __NR_fstatat +#define __NR_fstatat (__NR_Linux + 293) +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat (__NR_Linux + 294) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_Linux + 308) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_Linux + 312) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_Linux + 314) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_Linux + 315) +#endif +/* End of MIPS (old 32bit API) definitions */ +#elif _MIPS_SIM == _MIPS_SIM_ABI64 +#ifndef __NR_pread64 +#define 
__NR_pread64 (__NR_Linux + 16) +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 (__NR_Linux + 17) +#endif +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_Linux + 115) +#define __NR_getresuid (__NR_Linux + 116) +#define __NR_setresgid (__NR_Linux + 117) +#define __NR_getresgid (__NR_Linux + 118) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_Linux + 178) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_Linux + 179) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_Linux + 180) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_Linux + 181) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_Linux + 183) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_Linux + 184) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_Linux + 186) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_Linux + 187) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_Linux + 192) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_Linux + 194) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_Linux + 195) +#define __NR_sched_getaffinity (__NR_Linux + 196) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_Linux + 212) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_Linux + 222) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_Linux + 223) +#endif +#ifndef __NR_openat +#define __NR_openat (__NR_Linux + 247) +#endif +#ifndef __NR_fstatat +#define __NR_fstatat (__NR_Linux + 252) +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat (__NR_Linux + 253) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_Linux + 267) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_Linux + 271) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_Linux + 273) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_Linux + 274) +#endif +/* End of MIPS (64bit API) definitions */ +#else 
+#ifndef __NR_setresuid +#define __NR_setresuid (__NR_Linux + 115) +#define __NR_getresuid (__NR_Linux + 116) +#define __NR_setresgid (__NR_Linux + 117) +#define __NR_getresgid (__NR_Linux + 118) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_Linux + 178) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_Linux + 179) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_Linux + 180) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_Linux + 181) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_Linux + 183) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_Linux + 184) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_Linux + 186) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_Linux + 187) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_Linux + 192) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_Linux + 194) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_Linux + 195) +#define __NR_sched_getaffinity (__NR_Linux + 196) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_Linux + 213) +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 (__NR_Linux + 217) +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 (__NR_Linux + 218) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_Linux + 226) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_Linux + 227) +#endif +#ifndef __NR_openat +#define __NR_openat (__NR_Linux + 251) +#endif +#ifndef __NR_fstatat +#define __NR_fstatat (__NR_Linux + 256) +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat (__NR_Linux + 257) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_Linux + 271) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_Linux + 275) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_Linux + 277) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_Linux + 278) +#endif +/* End of MIPS 
(new 32bit API) definitions */ +#endif +/* End of MIPS definitions */ +#elif defined(__PPC__) +#ifndef __NR_setfsuid +#define __NR_setfsuid 138 +#define __NR_setfsgid 139 +#endif +#ifndef __NR_setresuid +#define __NR_setresuid 164 +#define __NR_getresuid 165 +#define __NR_setresgid 169 +#define __NR_getresgid 170 +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn 172 +#define __NR_rt_sigaction 173 +#define __NR_rt_sigprocmask 174 +#define __NR_rt_sigpending 175 +#define __NR_rt_sigsuspend 178 +#endif +#ifndef __NR_pread64 +#define __NR_pread64 179 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 180 +#endif +#ifndef __NR_ugetrlimit +#define __NR_ugetrlimit 190 +#endif +#ifndef __NR_readahead +#define __NR_readahead 191 +#endif +#ifndef __NR_stat64 +#define __NR_stat64 195 +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 197 +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 202 +#endif +#ifndef __NR_gettid +#define __NR_gettid 207 +#endif +#ifndef __NR_tkill +#define __NR_tkill 208 +#endif +#ifndef __NR_setxattr +#define __NR_setxattr 209 +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr 210 +#endif +#ifndef __NR_getxattr +#define __NR_getxattr 212 +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr 213 +#endif +#ifndef __NR_listxattr +#define __NR_listxattr 215 +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr 216 +#endif +#ifndef __NR_futex +#define __NR_futex 221 +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 222 +#define __NR_sched_getaffinity 223 +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address 232 +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime 246 +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres 247 +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 252 +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 253 +#endif +#ifndef __NR_fadvise64_64 +#define __NR_fadvise64_64 254 +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 273 
+#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get 274 +#endif +#ifndef __NR_openat +#define __NR_openat 286 +#endif +#ifndef __NR_fstatat64 +#define __NR_fstatat64 291 +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat 292 +#endif +#ifndef __NR_move_pages +#define __NR_move_pages 301 +#endif +#ifndef __NR_getcpu +#define __NR_getcpu 302 +#endif +/* End of powerpc definitions */ +#endif + + +/* After forking, we must make sure to only call system calls. */ +#if __BOUNDED_POINTERS__ + #error "Need to port invocations of syscalls for bounded ptrs" +#else + /* The core dumper and the thread lister get executed after threads + * have been suspended. As a consequence, we cannot call any functions + * that acquire locks. Unfortunately, libc wraps most system calls + * (e.g. in order to implement pthread_atfork, and to make calls + * cancellable), which means we cannot call these functions. Instead, + * we have to call syscall() directly. + */ + #undef LSS_ERRNO + #ifdef SYS_ERRNO + /* Allow the including file to override the location of errno. This can + * be useful when using clone() with the CLONE_VM option. + */ + #define LSS_ERRNO SYS_ERRNO + #else + #define LSS_ERRNO errno + #endif + + #undef LSS_INLINE + #ifdef SYS_INLINE + #define LSS_INLINE SYS_INLINE + #else + #define LSS_INLINE static inline + #endif + + /* Allow the including file to override the prefix used for all new + * system calls. By default, it will be set to "sys_". 
+ */ + #undef LSS_NAME + #ifndef SYS_PREFIX + #define LSS_NAME(name) sys_##name + #elif SYS_PREFIX < 0 + #define LSS_NAME(name) name + #elif SYS_PREFIX == 0 + #define LSS_NAME(name) sys0_##name + #elif SYS_PREFIX == 1 + #define LSS_NAME(name) sys1_##name + #elif SYS_PREFIX == 2 + #define LSS_NAME(name) sys2_##name + #elif SYS_PREFIX == 3 + #define LSS_NAME(name) sys3_##name + #elif SYS_PREFIX == 4 + #define LSS_NAME(name) sys4_##name + #elif SYS_PREFIX == 5 + #define LSS_NAME(name) sys5_##name + #elif SYS_PREFIX == 6 + #define LSS_NAME(name) sys6_##name + #elif SYS_PREFIX == 7 + #define LSS_NAME(name) sys7_##name + #elif SYS_PREFIX == 8 + #define LSS_NAME(name) sys8_##name + #elif SYS_PREFIX == 9 + #define LSS_NAME(name) sys9_##name + #endif + + #undef LSS_RETURN + #if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__)) + /* Failing system calls return a negative result in the range of + * -1..-4095. These are "errno" values with the sign inverted. + */ + #define LSS_RETURN(type, res) \ + do { \ + if ((unsigned long)(res) >= (unsigned long)(-4095)) { \ + LSS_ERRNO = -(res); \ + res = -1; \ + } \ + return (type) (res); \ + } while (0) + #elif defined(__mips__) + /* On MIPS, failing system calls return -1, and set errno in a + * separate CPU register. + */ + #define LSS_RETURN(type, res, err) \ + do { \ + if (err) { \ + LSS_ERRNO = (res); \ + res = -1; \ + } \ + return (type) (res); \ + } while (0) + #elif defined(__PPC__) + /* On PPC, failing system calls return -1, and set errno in a + * separate CPU register. See linux/unistd.h. + */ + #define LSS_RETURN(type, res, err) \ + do { \ + if (err & 0x10000000 ) { \ + LSS_ERRNO = (res); \ + res = -1; \ + } \ + return (type) (res); \ + } while (0) + #endif + #if defined(__i386__) + /* In PIC mode (e.g. when building shared libraries), gcc for i386 + * reserves ebx. Unfortunately, most distributions ship with implementations + * of _syscallX() which clobber ebx. 
+ * Also, most definitions of _syscallX() neglect to mark "memory" as being + * clobbered. This causes problems with compilers, that do a better job + * at optimizing across __asm__ calls. + * So, we just have to redefine all of the _syscallX() macros. + */ + #undef LSS_BODY + #define LSS_BODY(type,args...) \ + long __res; \ + __asm__ __volatile__("push %%ebx\n" \ + "movl %2,%%ebx\n" \ + "int $0x80\n" \ + "pop %%ebx" \ + args \ + : "esp", "memory"); \ + LSS_RETURN(type,__res) + #undef _syscall0 + #define _syscall0(type,name) \ + type LSS_NAME(name)(void) { \ + long __res; \ + __asm__ volatile("int $0x80" \ + : "=a" (__res) \ + : "0" (__NR_##name) \ + : "memory"); \ + LSS_RETURN(type,__res); \ + } + #undef _syscall1 + #define _syscall1(type,name,type1,arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name), "ri" ((long)(arg1))); \ + } + #undef _syscall2 + #define _syscall2(type,name,type1,arg1,type2,arg2) \ + type LSS_NAME(name)(type1 arg1,type2 arg2) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name),"ri" ((long)(arg1)), "c" ((long)(arg2))); \ + } + #undef _syscall3 + #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \ + type LSS_NAME(name)(type1 arg1,type2 arg2,type3 arg3) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \ + "d" ((long)(arg3))); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \ + "d" ((long)(arg3)),"S" ((long)(arg4))); \ + } + #undef _syscall5 + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + long __res; \ + __asm__ __volatile__("push %%ebx\n" \ + "movl 
%2,%%ebx\n" \ + "movl %1,%%eax\n" \ + "int $0x80\n" \ + "pop %%ebx" \ + : "=a" (__res) \ + : "i" (__NR_##name), "ri" ((long)(arg1)), \ + "c" ((long)(arg2)), "d" ((long)(arg3)), \ + "S" ((long)(arg4)), "D" ((long)(arg5)) \ + : "esp", "memory"); \ + LSS_RETURN(type,__res); \ + } + #undef _syscall6 + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + long __res; \ + struct { long __a1; long __a6; } __s = { (long)arg1, (long) arg6 }; \ + __asm__ __volatile__("push %%ebp\n" \ + "push %%ebx\n" \ + "movl 4(%2),%%ebp\n" \ + "movl 0(%2), %%ebx\n" \ + "movl %1,%%eax\n" \ + "int $0x80\n" \ + "pop %%ebx\n" \ + "pop %%ebp" \ + : "=a" (__res) \ + : "i" (__NR_##name), "0" ((long)(&__s)), \ + "c" ((long)(arg2)), "d" ((long)(arg3)), \ + "S" ((long)(arg4)), "D" ((long)(arg5)) \ + : "esp", "memory"); \ + LSS_RETURN(type,__res); \ + } + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __res; + __asm__ __volatile__(/* if (fn == NULL) + * return -EINVAL; + */ + "movl %3,%%ecx\n" + "jecxz 1f\n" + + /* if (child_stack == NULL) + * return -EINVAL; + */ + "movl %4,%%ecx\n" + "jecxz 1f\n" + + /* Set up alignment of the child stack: + * child_stack = (child_stack & ~0xF) - 20; + */ + "andl $-16,%%ecx\n" + "subl $20,%%ecx\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. + */ + "movl %6,%%eax\n" + "movl %%eax,4(%%ecx)\n" + "movl %3,%%eax\n" + "movl %%eax,(%%ecx)\n" + + /* %eax = syscall(%eax = __NR_clone, + * %ebx = flags, + * %ecx = child_stack, + * %edx = parent_tidptr, + * %esi = newtls, + * %edi = child_tidptr) + * Also, make sure that %ebx gets preserved as it is + * used in PIC mode. 
+ */ + "movl %8,%%esi\n" + "movl %7,%%edx\n" + "movl %5,%%eax\n" + "movl %9,%%edi\n" + "pushl %%ebx\n" + "movl %%eax,%%ebx\n" + "movl %2,%%eax\n" + "int $0x80\n" + + /* In the parent: restore %ebx + * In the child: move "fn" into %ebx + */ + "popl %%ebx\n" + + /* if (%eax != 0) + * return %eax; + */ + "test %%eax,%%eax\n" + "jnz 1f\n" + + /* In the child, now. Terminate frame pointer chain. + */ + "movl $0,%%ebp\n" + + /* Call "fn". "arg" is already on the stack. + */ + "call *%%ebx\n" + + /* Call _exit(%ebx). Unfortunately older versions + * of gcc restrict the number of arguments that can + * be passed to asm(). So, we need to hard-code the + * system call number. + */ + "movl %%eax,%%ebx\n" + "movl $1,%%eax\n" + "int $0x80\n" + + /* Return to parent. + */ + "1:\n" + : "=a" (__res) + : "0"(-EINVAL), "i"(__NR_clone), + "m"(fn), "m"(child_stack), "m"(flags), "m"(arg), + "m"(parent_tidptr), "m"(newtls), "m"(child_tidptr) + : "esp", "memory", "ecx", "edx", "esi", "edi"); + LSS_RETURN(int, __res); + } + + #define __NR__fadvise64_64 __NR_fadvise64_64 + LSS_INLINE _syscall6(int, _fadvise64_64, int, fd, + unsigned, offset_lo, unsigned, offset_hi, + unsigned, len_lo, unsigned, len_hi, + int, advice) + + LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset, + loff_t len, int advice) { + return LSS_NAME(_fadvise64_64)(fd, + (unsigned)offset, (unsigned)(offset >>32), + (unsigned)len, (unsigned)(len >> 32), + advice); + } + + #define __NR__fallocate __NR_fallocate + LSS_INLINE _syscall6(int, _fallocate, int, fd, + int, mode, + unsigned, offset_lo, unsigned, offset_hi, + unsigned, len_lo, unsigned, len_hi) + + LSS_INLINE int LSS_NAME(fallocate)(int fd, int mode, + loff_t offset, loff_t len) { + union { loff_t off; unsigned w[2]; } o = { offset }, l = { len }; + return LSS_NAME(_fallocate)(fd, mode, o.w[0], o.w[1], l.w[0], l.w[1]); + } + + LSS_INLINE _syscall1(int, set_thread_area, void *, u) + LSS_INLINE _syscall1(int, get_thread_area, void *, u) + + LSS_INLINE void 
(*LSS_NAME(restore_rt)(void))(void) { + /* On i386, the kernel does not know how to return from a signal + * handler. Instead, it relies on user space to provide a + * restorer function that calls the {rt_,}sigreturn() system call. + * Unfortunately, we cannot just reference the glibc version of this + * function, as glibc goes out of its way to make it inaccessible. + */ + void (*res)(void); + __asm__ __volatile__("call 2f\n" + "0:.align 16\n" + "1:movl %1,%%eax\n" + "int $0x80\n" + "2:popl %0\n" + "addl $(1b-0b),%0\n" + : "=a" (res) + : "i" (__NR_rt_sigreturn)); + return res; + } + LSS_INLINE void (*LSS_NAME(restore)(void))(void) { + /* On i386, the kernel does not know how to return from a signal + * handler. Instead, it relies on user space to provide a + * restorer function that calls the {rt_,}sigreturn() system call. + * Unfortunately, we cannot just reference the glibc version of this + * function, as glibc goes out of its way to make it inaccessible. + */ + void (*res)(void); + __asm__ __volatile__("call 2f\n" + "0:.align 16\n" + "1:pop %%eax\n" + "movl %1,%%eax\n" + "int $0x80\n" + "2:popl %0\n" + "addl $(1b-0b),%0\n" + : "=a" (res) + : "i" (__NR_sigreturn)); + return res; + } + #elif defined(__x86_64__) + /* There are no known problems with any of the _syscallX() macros + * currently shipping for x86_64, but we still need to be able to define + * our own version so that we can override the location of the errno + * location (e.g. when using the clone() system call with the CLONE_VM + * option). + */ + #undef LSS_BODY + #define LSS_BODY(type,name, ...) 
/* _syscall4(): defines LSS_NAME(name)() as a four-argument system call
 * wrapper for x86_64.
 *
 * Asm operands, in order:
 *   %0  __res          (output, tied to %rax via "=a")
 *   %1  __NR_##name    (input, shares %rax with %0 via the "0" constraint)
 *   %2  arg1           ("D" = %rdi)
 *   %3  arg2           ("S" = %rsi)
 *   %4  arg3           ("d" = %rdx)
 *   %5  arg4           (any general register; copied to %r10 by the asm)
 *
 * The fourth argument has no dedicated constraint letter, so it is moved
 * into %r10 by hand immediately before the "syscall" instruction; %r10
 * is therefore listed as clobbered alongside %r11, %rcx and memory.
 * The raw result is converted to the wrapper's return value by
 * LSS_RETURN().
 */
#undef _syscall4
#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
    type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
        long __res; \
        __asm__ __volatile__("movq %5,%%r10; syscall" : \
          "=a" (__res) : "0" (__NR_##name), \
          "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \
          "r" ((long)(arg4)) : "r10", "r11", "rcx", "memory"); \
        LSS_RETURN(type, __res); \
    }
/* clone() for x86_64.
 *
 * Issues the clone system call and, in the child, runs fn(arg) on
 * child_stack and then exits with fn's return value.
 *
 *   fn            - entry point for the child; must not be NULL
 *   child_stack   - stack pointer for the child; must not be NULL
 *   flags, parent_tidptr, newtls, child_tidptr
 *                 - passed to the system call in the registers listed in
 *                   the comment inside the asm below
 *   arg           - argument handed to fn in the child
 *
 * In the parent, the raw system call result is passed through
 * LSS_RETURN(). If fn or child_stack is NULL the asm is skipped with
 * %rax still holding its initial value of -EINVAL.
 *
 * Asm operands, in order:
 *   %0 __res (%rax)   %1 -EINVAL (preloaded into %rax)
 *   %2 __NR_clone     %3 __NR_exit
 *   %4 fn             %5 child_stack (%rsi)   %6 flags (%rdi)
 *   %7 arg            %8 parent_tidptr (%rdx)
 *   %9 __tls (%r8)    %10 __ctid (%r10)
 */
LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
                               int flags, void *arg, int *parent_tidptr,
                               void *newtls, int *child_tidptr) {
  long __res;
  {
    /* Pin the two arguments that have no constraint letter of their own
     * to the registers named in the system call comment below.
     */
    register void *__tls __asm__("r8") = newtls;
    register int *__ctid __asm__("r10") = child_tidptr;
    __asm__ __volatile__(/* if (fn == NULL)
                          *   return -EINVAL;
                          */
                         "testq %4,%4\n"
                         "jz 1f\n"

                         /* if (child_stack == NULL)
                          *   return -EINVAL;
                          */
                         "testq %5,%5\n"
                         "jz 1f\n"

                         /* childstack -= 2*sizeof(void *);
                          */
                         "subq $16,%5\n"

                         /* Store "arg" (at offset 8) and "fn" (at offset 0)
                          * on the stack that will be used by the child, so
                          * that the child can pop them in that order.
                          */
                         "movq %7,8(%5)\n"
                         "movq %4,0(%5)\n"

                         /* %rax = syscall(%rax = __NR_clone,
                          *                %rdi = flags,
                          *                %rsi = child_stack,
                          *                %rdx = parent_tidptr,
                          *                %r8  = new_tls,
                          *                %r10 = child_tidptr)
                          */
                         "movq %2,%%rax\n"
                         "syscall\n"

                         /* if (%rax != 0)
                          *   return;
                          */
                         "testq %%rax,%%rax\n"
                         "jnz 1f\n"

                         /* In the child. Terminate frame pointer chain.
                          */
                         "xorq %%rbp,%%rbp\n"

                         /* Call "fn(arg)": pop "fn" into %rax and "arg"
                          * into %rdi, then call through %rax.
                          */
                         "popq %%rax\n"
                         "popq %%rdi\n"
                         "call *%%rax\n"

                         /* Call _exit(%rax): the value returned by "fn" is
                          * moved into %rdi and becomes the exit status.
                          */
                         "movq %%rax,%%rdi\n"
                         "movq %3,%%rax\n"
                         "syscall\n"

                         /* Return to parent.
                          */
                         "1:\n"
                         : "=a" (__res)
                         : "0"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit),
                           "r"(fn), "S"(child_stack), "D"(flags), "r"(arg),
                           "d"(parent_tidptr), "r"(__tls), "r"(__ctid)
                         : "rsp", "memory", "r11", "rcx");
  }
  LSS_RETURN(int, __res);
}
\ + register long __res_r0 __asm__("r0"); \ + long __res; \ + __asm__ __volatile__ (__syscall(name) \ + : "=r"(__res_r0) : args : "lr", "memory"); \ + __res = __res_r0; \ + LSS_RETURN(type, __res) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)() { \ + LSS_BODY(type, name); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_REG(0, arg1); LSS_BODY(type, name, "r"(__r0)); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1)); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2)); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \ + LSS_REG(3, arg4); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3)); \ + } + #undef _syscall5 + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \ + LSS_REG(3, arg4); LSS_REG(4, arg5); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \ + "r"(__r4)); \ + } + #undef _syscall6 + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \ + LSS_REG(3, arg4); LSS_REG(4, 
arg5); LSS_REG(5, arg6); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \ + "r"(__r4), "r"(__r5)); \ + } + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __res; + { + register int __flags __asm__("r0") = flags; + register void *__stack __asm__("r1") = child_stack; + register void *__ptid __asm__("r2") = parent_tidptr; + register void *__tls __asm__("r3") = newtls; + register int *__ctid __asm__("r4") = child_tidptr; + __asm__ __volatile__(/* if (fn == NULL || child_stack == NULL) + * return -EINVAL; + */ + "cmp %2,#0\n" + "cmpne %3,#0\n" + "moveq %0,%1\n" + "beq 1f\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. + */ + "str %5,[%3,#-4]!\n" + "str %2,[%3,#-4]!\n" + + /* %r0 = syscall(%r0 = flags, + * %r1 = child_stack, + * %r2 = parent_tidptr, + * %r3 = newtls, + * %r4 = child_tidptr) + */ + __syscall(clone)"\n" + + /* if (%r0 != 0) + * return %r0; + */ + "movs %0,r0\n" + "bne 1f\n" + + /* In the child, now. Call "fn(arg)". + */ + "ldr r0,[sp, #4]\n" + "mov lr,pc\n" + "ldr pc,[sp]\n" + + /* Call _exit(%r0). + */ + __syscall(exit)"\n" + "1:\n" + : "=r" (__res) + : "i"(-EINVAL), + "r"(fn), "r"(__stack), "r"(__flags), "r"(arg), + "r"(__ptid), "r"(__tls), "r"(__ctid) + : "lr", "memory"); + } + LSS_RETURN(int, __res); + } + #elif defined(__mips__) + #undef LSS_REG + #define LSS_REG(r,a) register unsigned long __r##r __asm__("$"#r) = \ + (unsigned long)(a) + #undef LSS_BODY + #define LSS_BODY(type,name,r7,...) 
\ + register unsigned long __v0 __asm__("$2") = __NR_##name; \ + __asm__ __volatile__ ("syscall\n" \ + : "=&r"(__v0), r7 (__r7) \ + : "0"(__v0), ##__VA_ARGS__ \ + : "$8", "$9", "$10", "$11", "$12", \ + "$13", "$14", "$15", "$24", "memory"); \ + LSS_RETURN(type, __v0, __r7) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)() { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_BODY(type, name, "=r"); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_REG(4, arg1); LSS_BODY(type, name, "=r", "r"(__r4)); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_REG(4, arg1); LSS_REG(5, arg2); \ + LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5)); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5), "r"(__r6)); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); \ + LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6)); \ + } + #undef _syscall5 + #if _MIPS_SIM == _MIPS_SIM_ABI32 + /* The old 32bit MIPS system call API passes the fifth and sixth argument + * on the stack, whereas the new APIs use registers "r8" and "r9". 
+ */ + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); \ + register unsigned long __v0 __asm__("$2"); \ + __asm__ __volatile__ (".set noreorder\n" \ + "lw $2, %6\n" \ + "subu $29, 32\n" \ + "sw $2, 16($29)\n" \ + "li $2, %2\n" \ + "syscall\n" \ + "addiu $29, 32\n" \ + ".set reorder\n" \ + : "=&r"(__v0), "+r" (__r7) \ + : "i" (__NR_##name), "r"(__r4), "r"(__r5), \ + "r"(__r6), "m" ((unsigned long)arg5) \ + : "$8", "$9", "$10", "$11", "$12", \ + "$13", "$14", "$15", "$24", "memory"); \ + LSS_RETURN(type, __v0, __r7); \ + } + #else + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); LSS_REG(8, arg5); \ + LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \ + "r"(__r8)); \ + } + #endif + #undef _syscall6 + #if _MIPS_SIM == _MIPS_SIM_ABI32 + /* The old 32bit MIPS system call API passes the fifth and sixth argument + * on the stack, whereas the new APIs use registers "r8" and "r9". 
+ */ + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); \ + register unsigned long __v0 __asm__("$2"); \ + __asm__ __volatile__ (".set noreorder\n" \ + "lw $2, %6\n" \ + "lw $8, %7\n" \ + "subu $29, 32\n" \ + "sw $2, 16($29)\n" \ + "sw $8, 20($29)\n" \ + "li $2, %2\n" \ + "syscall\n" \ + "addiu $29, 32\n" \ + ".set reorder\n" \ + : "=&r"(__v0), "+r" (__r7) \ + : "i" (__NR_##name), "r"(__r4), "r"(__r5), \ + "r"(__r6), "r" ((unsigned long)arg5), \ + "r" ((unsigned long)arg6) \ + : "$8", "$9", "$10", "$11", "$12", \ + "$13", "$14", "$15", "$24", "memory"); \ + LSS_RETURN(type, __v0, __r7); \ + } + #else + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5,type6 arg6) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); LSS_REG(8, arg5); LSS_REG(9, arg6); \ + LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \ + "r"(__r8), "r"(__r9)); \ + } + #endif + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + register unsigned long __v0 __asm__("$2"); + register unsigned long __r7 __asm__("$7") = (unsigned long)newtls; + { + register int __flags __asm__("$4") = flags; + register void *__stack __asm__("$5") = child_stack; + register void *__ptid __asm__("$6") = parent_tidptr; + register int *__ctid __asm__("$8") = child_tidptr; + __asm__ __volatile__( + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "subu $29,24\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "sub $29,16\n" + #else + "dsubu $29,16\n" + #endif + + /* if (fn == NULL || child_stack == NULL) + * return -EINVAL; 
+ */ + "li %0,%2\n" + "beqz %5,1f\n" + "beqz %6,1f\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. + */ + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "subu %6,32\n" + "sw %5,0(%6)\n" + "sw %8,4(%6)\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "sub %6,32\n" + "sw %5,0(%6)\n" + "sw %8,8(%6)\n" + #else + "dsubu %6,32\n" + "sd %5,0(%6)\n" + "sd %8,8(%6)\n" + #endif + + /* $7 = syscall($4 = flags, + * $5 = child_stack, + * $6 = parent_tidptr, + * $7 = newtls, + * $8 = child_tidptr) + */ + "li $2,%3\n" + "syscall\n" + + /* if ($7 != 0) + * return $2; + */ + "bnez $7,1f\n" + "bnez $2,1f\n" + + /* In the child, now. Call "fn(arg)". + */ + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "lw $25,0($29)\n" + "lw $4,4($29)\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "lw $25,0($29)\n" + "lw $4,8($29)\n" + #else + "ld $25,0($29)\n" + "ld $4,8($29)\n" + #endif + "jalr $25\n" + + /* Call _exit($2) + */ + "move $4,$2\n" + "li $2,%4\n" + "syscall\n" + + "1:\n" + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "addu $29, 24\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "add $29, 16\n" + #else + "daddu $29,16\n" + #endif + : "=&r" (__v0), "=r" (__r7) + : "i"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit), + "r"(fn), "r"(__stack), "r"(__flags), "r"(arg), + "r"(__ptid), "r"(__r7), "r"(__ctid) + : "$9", "$10", "$11", "$12", "$13", "$14", "$15", + "$24", "memory"); + } + LSS_RETURN(int, __v0, __r7); + } + #elif defined (__PPC__) + #undef LSS_LOADARGS_0 + #define LSS_LOADARGS_0(name, dummy...) 
\ + __sc_0 = __NR_##name + #undef LSS_LOADARGS_1 + #define LSS_LOADARGS_1(name, arg1) \ + LSS_LOADARGS_0(name); \ + __sc_3 = (unsigned long) (arg1) + #undef LSS_LOADARGS_2 + #define LSS_LOADARGS_2(name, arg1, arg2) \ + LSS_LOADARGS_1(name, arg1); \ + __sc_4 = (unsigned long) (arg2) + #undef LSS_LOADARGS_3 + #define LSS_LOADARGS_3(name, arg1, arg2, arg3) \ + LSS_LOADARGS_2(name, arg1, arg2); \ + __sc_5 = (unsigned long) (arg3) + #undef LSS_LOADARGS_4 + #define LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4) \ + LSS_LOADARGS_3(name, arg1, arg2, arg3); \ + __sc_6 = (unsigned long) (arg4) + #undef LSS_LOADARGS_5 + #define LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5) \ + LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4); \ + __sc_7 = (unsigned long) (arg5) + #undef LSS_LOADARGS_6 + #define LSS_LOADARGS_6(name, arg1, arg2, arg3, arg4, arg5, arg6) \ + LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5); \ + __sc_8 = (unsigned long) (arg6) + #undef LSS_ASMINPUT_0 + #define LSS_ASMINPUT_0 "0" (__sc_0) + #undef LSS_ASMINPUT_1 + #define LSS_ASMINPUT_1 LSS_ASMINPUT_0, "1" (__sc_3) + #undef LSS_ASMINPUT_2 + #define LSS_ASMINPUT_2 LSS_ASMINPUT_1, "2" (__sc_4) + #undef LSS_ASMINPUT_3 + #define LSS_ASMINPUT_3 LSS_ASMINPUT_2, "3" (__sc_5) + #undef LSS_ASMINPUT_4 + #define LSS_ASMINPUT_4 LSS_ASMINPUT_3, "4" (__sc_6) + #undef LSS_ASMINPUT_5 + #define LSS_ASMINPUT_5 LSS_ASMINPUT_4, "5" (__sc_7) + #undef LSS_ASMINPUT_6 + #define LSS_ASMINPUT_6 LSS_ASMINPUT_5, "6" (__sc_8) + #undef LSS_BODY + #define LSS_BODY(nr, type, name, args...) 
\ + long __sc_ret, __sc_err; \ + { \ + register unsigned long __sc_0 __asm__ ("r0"); \ + register unsigned long __sc_3 __asm__ ("r3"); \ + register unsigned long __sc_4 __asm__ ("r4"); \ + register unsigned long __sc_5 __asm__ ("r5"); \ + register unsigned long __sc_6 __asm__ ("r6"); \ + register unsigned long __sc_7 __asm__ ("r7"); \ + register unsigned long __sc_8 __asm__ ("r8"); \ + \ + LSS_LOADARGS_##nr(name, args); \ + __asm__ __volatile__ \ + ("sc\n\t" \ + "mfcr %0" \ + : "=&r" (__sc_0), \ + "=&r" (__sc_3), "=&r" (__sc_4), \ + "=&r" (__sc_5), "=&r" (__sc_6), \ + "=&r" (__sc_7), "=&r" (__sc_8) \ + : LSS_ASMINPUT_##nr \ + : "cr0", "ctr", "memory", \ + "r9", "r10", "r11", "r12"); \ + __sc_ret = __sc_3; \ + __sc_err = __sc_0; \ + } \ + LSS_RETURN(type, __sc_ret, __sc_err) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)(void) { \ + LSS_BODY(0, type, name); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_BODY(1, type, name, arg1); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + LSS_BODY(2, type, name, arg1, arg2); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + LSS_BODY(3, type, name, arg1, arg2, arg3); \ + } + #undef _syscall4 + #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_BODY(4, type, name, arg1, arg2, arg3, arg4); \ + } + #undef _syscall5 + #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_BODY(5, type, name, arg1, arg2, arg3, arg4, arg5); \ + } + #undef _syscall6 + #define _syscall6(type, name, type1, 
/* clone function adapted from glibc 2.3.6 clone.S */
/* TODO(csilvers): consider wrapping some args up in a struct, like we
 * do for i386's _syscall6, so we can compile successfully on gcc 2.95
 */
/* clone() for PowerPC.
 *
 * Issues the clone system call; in the child, calls fn(arg) and then
 * exits with fn's return value. The result pair (__ret, __err) is
 * handed to LSS_RETURN().
 *
 * Asm operands, in order:
 *   %0 __ret      %1 __err
 *   %2 -1 (initial value of %0)   %3 EINVAL (initial value of %1)
 *   %4 __NR_clone %5 __NR_exit
 *   %6 __fn (r8)  %7 __cstack (r4)  %8 __flags (r3)  %9 __arg (r9)
 *   %10 __ptidptr (r5)  %11 __newtls (r6)  %12 __ctidptr (r7)
 */
LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
                               int flags, void *arg, int *parent_tidptr,
                               void *newtls, int *child_tidptr) {
  long __ret, __err;
  {
    /* Pin every argument to the register the asm below expects it in. */
    register int (*__fn)(void *) __asm__ ("r8") = fn;
    register void *__cstack __asm__ ("r4") = child_stack;
    register int __flags __asm__ ("r3") = flags;
    register void * __arg __asm__ ("r9") = arg;
    register int * __ptidptr __asm__ ("r5") = parent_tidptr;
    register void * __newtls __asm__ ("r6") = newtls;
    register int * __ctidptr __asm__ ("r7") = child_tidptr;
    __asm__ __volatile__(
        /* check for fn == NULL
         * and child_stack == NULL
         */
        "cmpwi cr0, %6, 0\n\t"
        "cmpwi cr1, %7, 0\n\t"
        "cror cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t"
        "beq- cr0, 1f\n\t"

        /* set up stack frame for child */
        "clrrwi %7, %7, 4\n\t"
        "li 0, 0\n\t"
        "stwu 0, -16(%7)\n\t"

        /* fn, child_stack and arg are saved across the syscall in
         * r28, r29 and r27 respectively (all three are listed as
         * clobbered below).
         */
        "mr 28, %6\n\t"
        "mr 29, %7\n\t"
        "mr 27, %9\n\t"

        /* syscall */
        "li 0, %4\n\t"
        /* flags already in r3
         * child_stack already in r4
         * ptidptr already in r5
         * newtls already in r6
         * ctidptr already in r7
         */
        "sc\n\t"

        /* Test if syscall was successful */
        "cmpwi cr1, 3, 0\n\t"
        "crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t"
        "bne- cr1, 1f\n\t"

        /* Do the function call: fn (saved in r28) goes to ctr, arg
         * (saved in r27) goes to r3.
         */
        "mtctr 28\n\t"
        "mr 3, 27\n\t"
        "bctrl\n\t"

        /* Call _exit(r3) */
        "li 0, %5\n\t"
        "sc\n\t"

        /* Return to parent: capture the condition register in __err
         * and r3 in __ret.
         */
        "1:\n"
        "mfcr %1\n\t"
        "mr %0, 3\n\t"
          : "=r" (__ret), "=r" (__err)
          : "0" (-1), "1" (EINVAL),
            "i" (__NR_clone), "i" (__NR_exit),
            "r" (__fn), "r" (__cstack), "r" (__flags),
            "r" (__arg), "r" (__ptidptr), "r" (__newtls),
            "r" (__ctidptr)
          : "cr0", "cr1", "memory", "ctr",
            "r0", "r29", "r27", "r28");
  }
  LSS_RETURN(int, __ret, __err);
}
_syscall1(pid_t, getsid, pid_t, p) + LSS_INLINE _syscall0(pid_t, _gettid) + LSS_INLINE _syscall2(int, gettimeofday, struct timeval *, v, + struct timezone *, z) + LSS_INLINE _syscall5(int, setxattr, const char *,p, + const char *, n, const void *,v, + size_t, s, int, f) + LSS_INLINE _syscall5(int, lsetxattr, const char *,p, + const char *, n, const void *,v, + size_t, s, int, f) + LSS_INLINE _syscall4(ssize_t, getxattr, const char *,p, + const char *, n, void *, v, size_t, s) + LSS_INLINE _syscall4(ssize_t, lgetxattr, const char *,p, + const char *, n, void *, v, size_t, s) + LSS_INLINE _syscall3(ssize_t, listxattr, const char *,p, + char *, l, size_t, s) + LSS_INLINE _syscall3(ssize_t, llistxattr, const char *,p, + char *, l, size_t, s) + LSS_INLINE _syscall3(int, ioctl, int, d, + int, r, void *, a) + LSS_INLINE _syscall2(int, ioprio_get, int, which, + int, who) + LSS_INLINE _syscall3(int, ioprio_set, int, which, + int, who, int, ioprio) + LSS_INLINE _syscall2(int, kill, pid_t, p, + int, s) + LSS_INLINE _syscall3(off_t, lseek, int, f, + off_t, o, int, w) + LSS_INLINE _syscall2(int, munmap, void*, s, + size_t, l) + LSS_INLINE _syscall6(long, move_pages, pid_t, p, + unsigned long, n, void **,g, int *, d, + int *, s, int, f) + LSS_INLINE _syscall3(int, mprotect, const void *,a, + size_t, l, int, p) + LSS_INLINE _syscall5(void*, _mremap, void*, o, + size_t, os, size_t, ns, + unsigned long, f, void *, a) + LSS_INLINE _syscall3(int, open, const char*, p, + int, f, int, m) + LSS_INLINE _syscall3(int, poll, struct kernel_pollfd*, u, + unsigned int, n, int, t) + LSS_INLINE _syscall2(int, prctl, int, o, + long, a) + LSS_INLINE _syscall4(long, ptrace, int, r, + pid_t, p, void *, a, void *, d) + #if defined(__NR_quotactl) + // Defined on x86_64 / i386 only + LSS_INLINE _syscall4(int, quotactl, int, cmd, const char *, special, + int, id, caddr_t, addr) + #endif + LSS_INLINE _syscall3(ssize_t, read, int, f, + void *, b, size_t, c) + LSS_INLINE _syscall3(int, readlink, const 
char*, p, + char*, b, size_t, s) + LSS_INLINE _syscall4(int, rt_sigaction, int, s, + const struct kernel_sigaction*, a, + struct kernel_sigaction*, o, size_t, c) + LSS_INLINE _syscall2(int, rt_sigpending, struct kernel_sigset_t *, s, + size_t, c) + LSS_INLINE _syscall4(int, rt_sigprocmask, int, h, + const struct kernel_sigset_t*, s, + struct kernel_sigset_t*, o, size_t, c); + LSS_INLINE _syscall1(int, rt_sigreturn, unsigned long, u); + LSS_INLINE _syscall2(int, rt_sigsuspend, + const struct kernel_sigset_t*, s, size_t, c); + LSS_INLINE _syscall3(int, sched_getaffinity,pid_t, p, + unsigned int, l, unsigned long *, m) + LSS_INLINE _syscall3(int, sched_setaffinity,pid_t, p, + unsigned int, l, unsigned long *, m) + LSS_INLINE _syscall0(int, sched_yield) + LSS_INLINE _syscall1(long, set_tid_address, int *, t) + LSS_INLINE _syscall1(int, setfsgid, gid_t, g) + LSS_INLINE _syscall1(int, setfsuid, uid_t, u) + LSS_INLINE _syscall1(int, setuid, uid_t, u) + LSS_INLINE _syscall1(int, setgid, gid_t, g) + LSS_INLINE _syscall2(int, setpgid, pid_t, p, + pid_t, g) + LSS_INLINE _syscall3(int, setpriority, int, a, + int, b, int, p) + LSS_INLINE _syscall3(int, setresgid, gid_t, r, + gid_t, e, gid_t, s) + LSS_INLINE _syscall3(int, setresuid, uid_t, r, + uid_t, e, uid_t, s) + LSS_INLINE _syscall2(int, setrlimit, int, r, + const struct kernel_rlimit*, l) + LSS_INLINE _syscall0(pid_t, setsid) + LSS_INLINE _syscall2(int, sigaltstack, const stack_t*, s, + const stack_t*, o) + #if defined(__NR_sigreturn) + LSS_INLINE _syscall1(int, sigreturn, unsigned long, u); + #endif + LSS_INLINE _syscall2(int, stat, const char*, f, + struct kernel_stat*, b) + LSS_INLINE _syscall2(int, statfs, const char*, f, + struct kernel_statfs*, b) + LSS_INLINE _syscall3(int, tgkill, pid_t, p, + pid_t, t, int, s) + LSS_INLINE _syscall2(int, tkill, pid_t, p, + int, s) + LSS_INLINE _syscall3(ssize_t, write, int, f, + const void *, b, size_t, c) + LSS_INLINE _syscall3(ssize_t, writev, int, f, + const struct 
kernel_iovec*, v, size_t, c) + LSS_INLINE _syscall1(int, unlink, const char*, f) + #if defined(__NR_getcpu) + LSS_INLINE _syscall3(long, getcpu, unsigned *, cpu, + unsigned *, node, void *, unused); + #endif + #if defined(__x86_64__) || \ + (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32) + LSS_INLINE _syscall3(int, recvmsg, int, s, + struct kernel_msghdr*, m, int, f) + LSS_INLINE _syscall3(int, sendmsg, int, s, + const struct kernel_msghdr*, m, int, f) + LSS_INLINE _syscall6(int, sendto, int, s, + const void*, m, size_t, l, + int, f, + const struct kernel_sockaddr*, a, int, t) + LSS_INLINE _syscall2(int, shutdown, int, s, + int, h) + LSS_INLINE _syscall3(int, socket, int, d, + int, t, int, p) + LSS_INLINE _syscall4(int, socketpair, int, d, + int, t, int, p, int*, s) + #endif + #if defined(__x86_64__) + LSS_INLINE _syscall4(int, fallocate, int, fd, int, mode, + loff_t, offset, loff_t, len) + + LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid, + gid_t *egid, + gid_t *sgid) { + return LSS_NAME(getresgid)(rgid, egid, sgid); + } + + LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid, + uid_t *euid, + uid_t *suid) { + return LSS_NAME(getresuid)(ruid, euid, suid); + } + + LSS_INLINE _syscall6(void*, mmap, void*, s, + size_t, l, int, p, + int, f, int, d, + __off64_t, o) + + LSS_INLINE _syscall4(int, newfstatat, int, d, + const char *, p, + struct kernel_stat*, b, int, f) + + LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) { + return LSS_NAME(setfsgid)(gid); + } + + LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) { + return LSS_NAME(setfsuid)(uid); + } + + LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) { + return LSS_NAME(setresgid)(rgid, egid, sgid); + } + + LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) { + return LSS_NAME(setresuid)(ruid, euid, suid); + } + + LSS_INLINE int LSS_NAME(sigaction)(int signum, + const struct kernel_sigaction *act, + struct kernel_sigaction *oldact) { + /* On x86_64, the kernel requires us to 
always set our own + * SA_RESTORER in order to be able to return from a signal handler. + * This function must have a "magic" signature that the "gdb" + * (and maybe the kernel?) can recognize. + */ + if (act != NULL && !(act->sa_flags & SA_RESTORER)) { + struct kernel_sigaction a = *act; + a.sa_flags |= SA_RESTORER; + a.sa_restorer = LSS_NAME(restore_rt)(); + return LSS_NAME(rt_sigaction)(signum, &a, oldact, + (KERNEL_NSIG+7)/8); + } else { + return LSS_NAME(rt_sigaction)(signum, act, oldact, + (KERNEL_NSIG+7)/8); + } + } + + LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) { + return LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8); + } + + LSS_INLINE int LSS_NAME(sigprocmask)(int how, + const struct kernel_sigset_t *set, + struct kernel_sigset_t *oldset) { + return LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8); + } + + LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) { + return LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8); + } + #endif + #if defined(__x86_64__) || defined(__ARM_ARCH_3__) || \ + (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32) + LSS_INLINE _syscall4(pid_t, wait4, pid_t, p, + int*, s, int, o, + struct kernel_rusage*, r) + + LSS_INLINE pid_t LSS_NAME(waitpid)(pid_t pid, int *status, int options){ + return LSS_NAME(wait4)(pid, status, options, 0); + } + #endif + #if defined(__i386__) || defined(__x86_64__) + LSS_INLINE _syscall4(int, openat, int, d, const char *, p, int, f, int, m) + LSS_INLINE _syscall3(int, unlinkat, int, d, const char *, p, int, f) + #endif + #if defined(__i386__) || defined(__ARM_ARCH_3__) + #define __NR__getresgid32 __NR_getresgid32 + #define __NR__getresuid32 __NR_getresuid32 + #define __NR__setfsgid32 __NR_setfsgid32 + #define __NR__setfsuid32 __NR_setfsuid32 + #define __NR__setresgid32 __NR_setresgid32 + #define __NR__setresuid32 __NR_setresuid32 + LSS_INLINE _syscall2(int, ugetrlimit, int, r, + struct kernel_rlimit*, l) + LSS_INLINE _syscall3(int, _getresgid32, 
gid_t *, r, + gid_t *, e, gid_t *, s) + LSS_INLINE _syscall3(int, _getresuid32, uid_t *, r, + uid_t *, e, uid_t *, s) + LSS_INLINE _syscall1(int, _setfsgid32, gid_t, f) + LSS_INLINE _syscall1(int, _setfsuid32, uid_t, f) + LSS_INLINE _syscall3(int, _setresgid32, gid_t, r, + gid_t, e, gid_t, s) + LSS_INLINE _syscall3(int, _setresuid32, uid_t, r, + uid_t, e, uid_t, s) + + LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid, + gid_t *egid, + gid_t *sgid) { + int rc; + if ((rc = LSS_NAME(_getresgid32)(rgid, egid, sgid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((rgid == NULL) || (egid == NULL) || (sgid == NULL)) { + return EFAULT; + } + // Clear the high bits first, since getresgid only sets 16 bits + *rgid = *egid = *sgid = 0; + rc = LSS_NAME(getresgid)(rgid, egid, sgid); + } + return rc; + } + + LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid, + uid_t *euid, + uid_t *suid) { + int rc; + if ((rc = LSS_NAME(_getresuid32)(ruid, euid, suid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((ruid == NULL) || (euid == NULL) || (suid == NULL)) { + return EFAULT; + } + // Clear the high bits first, since getresuid only sets 16 bits + *ruid = *euid = *suid = 0; + rc = LSS_NAME(getresuid)(ruid, euid, suid); + } + return rc; + } + + LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) { + int rc; + if ((rc = LSS_NAME(_setfsgid32)(gid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)gid & ~0xFFFFu) { + rc = EINVAL; + } else { + rc = LSS_NAME(setfsgid)(gid); + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) { + int rc; + if ((rc = LSS_NAME(_setfsuid32)(uid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)uid & ~0xFFFFu) { + rc = EINVAL; + } else { + rc = LSS_NAME(setfsuid)(uid); + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) { + int rc; + if ((rc = LSS_NAME(_setresgid32)(rgid, egid, sgid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)rgid & ~0xFFFFu || + (unsigned int)egid & ~0xFFFFu || + (unsigned int)sgid 
& ~0xFFFFu) { + rc = EINVAL; + } else { + rc = LSS_NAME(setresgid)(rgid, egid, sgid); + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) { + int rc; + if ((rc = LSS_NAME(_setresuid32)(ruid, euid, suid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)ruid & ~0xFFFFu || + (unsigned int)euid & ~0xFFFFu || + (unsigned int)suid & ~0xFFFFu) { + rc = EINVAL; + } else { + rc = LSS_NAME(setresuid)(ruid, euid, suid); + } + } + return rc; + } + #endif + LSS_INLINE int LSS_NAME(sigemptyset)(struct kernel_sigset_t *set) { + memset(&set->sig, 0, sizeof(set->sig)); + return 0; + } + + LSS_INLINE int LSS_NAME(sigfillset)(struct kernel_sigset_t *set) { + memset(&set->sig, -1, sizeof(set->sig)); + return 0; + } + + LSS_INLINE int LSS_NAME(sigaddset)(struct kernel_sigset_t *set, + int signum) { + if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { + LSS_ERRNO = EINVAL; + return -1; + } else { + set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] + |= 1UL << ((signum - 1) % (8*sizeof(set->sig[0]))); + return 0; + } + } + + LSS_INLINE int LSS_NAME(sigdelset)(struct kernel_sigset_t *set, + int signum) { + if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { + LSS_ERRNO = EINVAL; + return -1; + } else { + set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] + &= ~(1UL << ((signum - 1) % (8*sizeof(set->sig[0])))); + return 0; + } + } + + LSS_INLINE int LSS_NAME(sigismember)(struct kernel_sigset_t *set, + int signum) { + if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { + LSS_ERRNO = EINVAL; + return -1; + } else { + return !!(set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] & + (1UL << ((signum - 1) % (8*sizeof(set->sig[0]))))); + } + } + #if defined(__i386__) || defined(__ARM_ARCH_3__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || defined(__PPC__) + #define __NR__sigaction __NR_sigaction + #define __NR__sigpending __NR_sigpending + #define __NR__sigprocmask __NR_sigprocmask + #define __NR__sigsuspend __NR_sigsuspend 
/* NOTE(review): this span sits inside a conditional block opened before it
 * (i386 / ARMv3 / MIPS o32 / PPC); the matching #endif follows sigsuspend().
 */
#define __NR__socketcall  __NR_socketcall
/* Raw system call stubs. Presumably each _syscallN(ret, name, type, arg, ...)
 * line expands (via macros defined outside this excerpt) into an inline
 * wrapper called LSS_NAME(name); the wrappers below are used that way.
 */
LSS_INLINE _syscall2(int, fstat64,             int, f,
                     struct kernel_stat64 *, b)
LSS_INLINE _syscall5(int, _llseek,     uint, fd, ulong, hi, ulong, lo,
                     loff_t *, res, uint, wh)
LSS_INLINE _syscall1(void*, mmap,              void*, a)
LSS_INLINE _syscall6(void*, mmap2,             void*, s,
                     size_t,                   l, int,               p,
                     int,                      f, int,               d,
                     __off64_t,                o)
LSS_INLINE _syscall3(int,   _sigaction,        int,   s,
                     const struct kernel_old_sigaction*,  a,
                     struct kernel_old_sigaction*,        o)
LSS_INLINE _syscall1(int,   _sigpending,       unsigned long*, s)
LSS_INLINE _syscall3(int,   _sigprocmask,      int,   h,
                     const unsigned long*,     s,
                     unsigned long*,           o)
/* The legacy sigsuspend stub takes one argument on PPC and three elsewhere. */
#ifdef __PPC__
LSS_INLINE _syscall1(int, _sigsuspend,         unsigned long, s)
#else
LSS_INLINE _syscall3(int, _sigsuspend,         const void*, a,
                     int,                      b,
                     unsigned long,            s)
#endif
LSS_INLINE _syscall2(int, stat64,              const char *, p,
                     struct kernel_stat64 *, b)

/* Installs and/or queries a signal handler.
 *
 * Tries rt_sigaction() first. If that fails with ENOSYS, errno is restored
 * to its value on entry and the legacy _sigaction() stub is used instead,
 * converting between kernel_sigaction and kernel_old_sigaction.
 *
 *   signum  signal number, passed through unchanged
 *   act     new action, or NULL to leave the current action in place
 *   oldact  receives the previous action, or NULL if the caller does not care
 *
 * Returns the result of whichever system call was made last.
 */
LSS_INLINE int LSS_NAME(sigaction)(int signum,
                                   const struct kernel_sigaction *act,
                                   struct kernel_sigaction *oldact) {
  int old_errno = LSS_ERRNO;
  int rc;
  struct kernel_sigaction a;
  if (act != NULL) {
    /* Work on a private copy so the caller's structure is never modified. */
    a             = *act;
    #ifdef __i386__
    /* On i386, the kernel requires us to always set our own
     * SA_RESTORER when using realtime signals. Otherwise, it does not
     * know how to return from a signal handler. This function must have
     * a "magic" signature that the "gdb" (and maybe the kernel?) can
     * recognize.
     * Apparently, a SA_RESTORER is implicitly set by the kernel, when
     * using non-realtime signals.
     *
     * TODO: Test whether ARM needs a restorer
     */
    if (!(a.sa_flags & SA_RESTORER)) {
      a.sa_flags   |= SA_RESTORER;
      a.sa_restorer = (a.sa_flags & SA_SIGINFO)
                      ? LSS_NAME(restore_rt)() : LSS_NAME(restore)();
    }
    #endif
  }
  /* The last argument is the signal-set size in bytes. */
  rc = LSS_NAME(rt_sigaction)(signum, act ? &a : act, oldact,
                              (KERNEL_NSIG+7)/8);
  if (rc < 0 && LSS_ERRNO == ENOSYS) {
    struct kernel_old_sigaction oa, ooa, *ptr_a = &oa, *ptr_oa = &ooa;
    if (!act) {
      ptr_a            = NULL;
    } else {
      /* NOTE(review): the fallback copies from the caller's *act, not from
       * the local copy `a`, so the SA_RESTORER fix-up made above is not
       * carried over to the legacy call -- confirm this is intended.
       */
      oa.sa_handler_   = act->sa_handler_;
      memcpy(&oa.sa_mask, &act->sa_mask, sizeof(oa.sa_mask));
      #ifndef __mips__
      oa.sa_restorer   = act->sa_restorer;
      #endif
      oa.sa_flags      = act->sa_flags;
    }
    if (!oldact) {
      ptr_oa           = NULL;
    }
    /* Discard the ENOSYS left behind by the failed rt_sigaction() attempt. */
    LSS_ERRNO = old_errno;
    rc = LSS_NAME(_sigaction)(signum, ptr_a, ptr_oa);
    if (rc == 0 && oldact) {
      /* Pre-fill *oldact (from *act, or with zeros) before overwriting the
       * fields that the legacy structure actually provides.
       */
      if (act) {
        memcpy(oldact, act, sizeof(*act));
      } else {
        memset(oldact, 0, sizeof(*oldact));
      }
      oldact->sa_handler_    = ptr_oa->sa_handler_;
      oldact->sa_flags       = ptr_oa->sa_flags;
      memcpy(&oldact->sa_mask, &ptr_oa->sa_mask, sizeof(ptr_oa->sa_mask));
      #ifndef __mips__
      oldact->sa_restorer    = ptr_oa->sa_restorer;
      #endif
    }
  }
  return rc;
}

/* Retrieves the set of pending signals into *set.
 * Uses rt_sigpending(); on ENOSYS it restores errno, clears *set and lets the
 * legacy _sigpending() stub fill in only the first word of the set.
 */
LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) {
  int old_errno = LSS_ERRNO;
  int rc = LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8);
  if (rc < 0 && LSS_ERRNO == ENOSYS) {
    LSS_ERRNO = old_errno;
    LSS_NAME(sigemptyset)(set);
    rc = LSS_NAME(_sigpending)(&set->sig[0]);
  }
  return rc;
}

/* Changes and/or queries the signal mask.
 * Uses rt_sigprocmask(); on ENOSYS it restores errno and falls back to the
 * legacy _sigprocmask() stub, which only sees the first word of each set.
 * `set` and `oldset` may each be NULL.
 */
LSS_INLINE int LSS_NAME(sigprocmask)(int how,
                                     const struct kernel_sigset_t *set,
                                     struct kernel_sigset_t *oldset) {
  int olderrno = LSS_ERRNO;
  int rc = LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8);
  if (rc < 0 && LSS_ERRNO == ENOSYS) {
    LSS_ERRNO = olderrno;
    if (oldset) {
      /* Clear the words that the legacy call will not write. */
      LSS_NAME(sigemptyset)(oldset);
    }
    rc = LSS_NAME(_sigprocmask)(how,
                                set ? &set->sig[0] : NULL,
                                oldset ? &oldset->sig[0] : NULL);
  }
  return rc;
}

/* Suspends the caller with the signal mask temporarily replaced by *set.
 * Uses rt_sigsuspend(); on ENOSYS it restores errno and falls back to the
 * legacy _sigsuspend() stub with the first word of the set. The fallback
 * dereferences `set` unconditionally.
 */
LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) {
  int olderrno = LSS_ERRNO;
  int rc = LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8);
  if (rc < 0 && LSS_ERRNO == ENOSYS) {
    LSS_ERRNO = olderrno;
    rc = LSS_NAME(_sigsuspend)(
    #ifndef __PPC__
                               set, 0,
    #endif
                               set->sig[0]);
  }
  return rc;
}
#endif
#if defined(__PPC__)
/* PPC socket calls: the LSS_SC_LOADARGS_n() macros place up to five
 * arguments into the variables bound to r4..r8 by LSS_SC_BODY() below.
 */
#undef LSS_SC_LOADARGS_0
#define LSS_SC_LOADARGS_0(dummy...)
#undef LSS_SC_LOADARGS_1
#define LSS_SC_LOADARGS_1(arg1)                                             \
    __sc_4  = (unsigned long) (arg1)
#undef LSS_SC_LOADARGS_2
#define LSS_SC_LOADARGS_2(arg1, arg2)                                       \
    LSS_SC_LOADARGS_1(arg1);                                                \
    __sc_5  = (unsigned long) (arg2)
#undef LSS_SC_LOADARGS_3
#define LSS_SC_LOADARGS_3(arg1, arg2, arg3)                                 \
    LSS_SC_LOADARGS_2(arg1, arg2);                                          \
    __sc_6  = (unsigned long) (arg3)
#undef LSS_SC_LOADARGS_4
#define LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4)                           \
    LSS_SC_LOADARGS_3(arg1, arg2, arg3);                                    \
    __sc_7  = (unsigned long) (arg4)
#undef LSS_SC_LOADARGS_5
#define LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5)                     \
    LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4);                              \
    __sc_8  = (unsigned long) (arg5)
/* LSS_SC_BODY(nr, type, opt, args...) issues __NR_socketcall with sub-call
 * number `opt`. The assembly opens a 48-byte stack frame, stores r4..r8 into
 * it at offsets 20..36, points r4 at that array and executes `sc`. r3 is
 * taken as the result and the value read with `mfcr` as the error indicator;
 * both are handed to LSS_RETURN().
 * NOTE(review): no instruction in this sequence restores r1 after the
 * `stwu` -- confirm against the upstream header.
 */
#undef LSS_SC_BODY
#define LSS_SC_BODY(nr, type, opt, args...)                                 \
    long __sc_ret, __sc_err;                                                \
    {                                                                       \
      register unsigned long __sc_0 __asm__ ("r0") = __NR_socketcall;       \
      register unsigned long __sc_3 __asm__ ("r3") = opt;                   \
      register unsigned long __sc_4 __asm__ ("r4");                         \
      register unsigned long __sc_5 __asm__ ("r5");                         \
      register unsigned long __sc_6 __asm__ ("r6");                         \
      register unsigned long __sc_7 __asm__ ("r7");                         \
      register unsigned long __sc_8 __asm__ ("r8");                         \
      LSS_SC_LOADARGS_##nr(args);                                           \
      __asm__ __volatile__                                                  \
          ("stwu 1, -48(1)\n\t"                                             \
           "stw 4, 20(1)\n\t"                                               \
           "stw 5, 24(1)\n\t"                                               \
           "stw 6, 28(1)\n\t"                                               \
           "stw 7, 32(1)\n\t"                                               \
           "stw 8, 36(1)\n\t"                                               \
           "addi 4, 1, 20\n\t"                                              \
           "sc\n\t"                                                         \
           "mfcr %0"                                                        \
             : "=&r" (__sc_0),                                              \
               "=&r" (__sc_3), "=&r" (__sc_4),                              \
               "=&r" (__sc_5), "=&r" (__sc_6),                              \
               "=&r" (__sc_7), "=&r" (__sc_8)                               \
             : LSS_ASMINPUT_##nr                                            \
             : "cr0", "ctr", "memory");                                     \
      __sc_ret = __sc_3;                                                    \
      __sc_err = __sc_0;                                                    \
    }                                                                       \
    LSS_RETURN(type, __sc_ret, __sc_err)

/* recvmsg() via socketcall sub-call 17. */
LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg,
                                     int flags){
  LSS_SC_BODY(3, ssize_t, 17, s, msg, flags);
}

/* sendmsg() via socketcall sub-call 16. */
LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s,
                                     const struct kernel_msghdr *msg,
                                     int flags) {
  LSS_SC_BODY(3, ssize_t, 16, s, msg, flags);
}

// TODO(csilvers): why is this ifdef'ed out?
#if 0
/* Disabled PPC sendto(). Note that it uses LSS_BODY rather than the
 * LSS_SC_BODY macro used by the other PPC socket wrappers, and passes six
 * arguments where only LSS_SC_LOADARGS_0..5 are defined in this header span.
 */
LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len,
                                    int flags,
                                    const struct kernel_sockaddr *to,
                                    unsigned int tolen) {
  LSS_BODY(6, ssize_t, 11, s, buf, len, flags, to, tolen);
}
#endif

/* shutdown() via socketcall sub-call 13 (PPC). */
LSS_INLINE int LSS_NAME(shutdown)(int s, int how) {
  LSS_SC_BODY(2, int, 13, s, how);
}

/* socket() via socketcall sub-call 1 (PPC). */
LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) {
  LSS_SC_BODY(3, int, 1, domain, type, protocol);
}

/* socketpair() via socketcall sub-call 8 (PPC). */
LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol,
                                    int sv[2]) {
  LSS_SC_BODY(4, int, 8, d, type, protocol, sv);
}
#endif
#if defined(__i386__) || defined(__ARM_ARCH_3__) ||                         \
    (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
#define __NR__socketcall  __NR_socketcall
LSS_INLINE _syscall2(int,      _socketcall,    int,   c,
                     va_list,                  a)

/* Multiplexed socket call: forwards `op` together with the caller's variadic
 * argument list (passed as a va_list) to the _socketcall() stub.
 * NOTE(review): this relies on va_list being usable as a pointer to the
 * packed arguments on these ABIs -- confirm before porting elsewhere.
 */
LSS_INLINE int LSS_NAME(socketcall)(int op, ...) {
  int rc;
  va_list ap;
  va_start(ap, op);
  rc = LSS_NAME(_socketcall)(op, ap);
  va_end(ap);
  return rc;
}

/* The wrappers below select the operation by its socketcall sub-call number
 * (17 recvmsg, 16 sendmsg, 11 sendto, 13 shutdown, 1 socket, 8 socketpair).
 */
LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg,
                                     int flags){
  return (ssize_t)LSS_NAME(socketcall)(17, s, msg, flags);
}

LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s,
                                     const struct kernel_msghdr *msg,
                                     int flags) {
  return (ssize_t)LSS_NAME(socketcall)(16, s, msg, flags);
}

LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len,
                                    int flags,
                                    const struct kernel_sockaddr *to,
                                    unsigned int tolen) {
  return (ssize_t)LSS_NAME(socketcall)(11, s, buf, len, flags, to, tolen);
}

LSS_INLINE int LSS_NAME(shutdown)(int s, int how) {
  return LSS_NAME(socketcall)(13, s, how);
}

LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) {
  return LSS_NAME(socketcall)(1, domain, type, protocol);
}

LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol,
                                    int sv[2]) {
  return LSS_NAME(socketcall)(8, d, type, protocol, sv);
}
#endif
#if defined(__i386__) || defined(__PPC__)
LSS_INLINE _syscall4(int,   fstatat64,        int,   d,
                     const char *,      p,
                     struct kernel_stat64 *,   b,    int,   f)
#endif
#if defined(__i386__) || defined(__PPC__) ||                                \
   (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
LSS_INLINE _syscall3(pid_t, waitpid,          pid_t, p,
                     int*,              s,    int,   o)
#endif
#if defined(__mips__)
/* sys_pipe() on MIPS has non-standard calling conventions, as it returns
 * both file handles through CPU registers.
 */
/* On success stores the two descriptors from $2/$3 into p[0]/p[1] and
 * returns 0; if $7 is non-zero, sets errno from $2 and returns -1.
 */
LSS_INLINE int LSS_NAME(pipe)(int *p) {
  register unsigned long __v0 __asm__("$2") = __NR_pipe;
  register unsigned long __v1 __asm__("$3");
  register unsigned long __r7 __asm__("$7");
  __asm__ __volatile__ ("syscall\n"
                        : "=&r"(__v0), "=&r"(__v1), "+r" (__r7)
                        : "0"(__v0)
                        : "$8", "$9", "$10", "$11", "$12",
                          "$13", "$14", "$15", "$24", "memory");
  if (__r7) {
    LSS_ERRNO = __v0;
    return -1;
  } else {
    p[0] = __v0;
    p[1] = __v1;
    return 0;
  }
}
#else
LSS_INLINE _syscall1(int,     pipe,           int *, p)
#endif
/* TODO(csilvers): see if ppc can/should support this as well              */
#if defined(__i386__) || defined(__ARM_ARCH_3__) ||                         \
   (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64)
#define __NR__statfs64  __NR_statfs64
#define __NR__fstatfs64 __NR_fstatfs64
LSS_INLINE _syscall3(int, _statfs64,     const char*, p,
                     size_t, s,struct kernel_statfs64*, b)
LSS_INLINE _syscall3(int, _fstatfs64,          int,   f,
                     size_t, s,struct kernel_statfs64*, b)
/* The raw statfs64/fstatfs64 stubs take the buffer size as an explicit
 * argument; these wrappers supply sizeof(*b).
 */
LSS_INLINE int LSS_NAME(statfs64)(const char *p,
                                  struct kernel_statfs64 *b) {
  return LSS_NAME(_statfs64)(p, sizeof(*b), b);
}
LSS_INLINE int LSS_NAME(fstatfs64)(int f,struct kernel_statfs64 *b) {
  return LSS_NAME(_fstatfs64)(f, sizeof(*b), b);
}
#endif

/* execve() with the calling process's current `environ`. */
LSS_INLINE int LSS_NAME(execv)(const char *path, const char *const argv[]) {
  extern char **environ;
  return LSS_NAME(execve)(path, argv, (const char *const *)environ);
}

/* Returns the result of _gettid(), or getpid() when _gettid() returns -1. */
LSS_INLINE pid_t LSS_NAME(gettid)() {
  pid_t tid = LSS_NAME(_gettid)();
  if (tid != -1) {
    return tid;
  }
  return LSS_NAME(getpid)();
}

/* Variadic front end for _mremap(): one extra pointer argument is always
 * fetched with va_arg and forwarded as the new address.
 * NOTE(review): the va_arg read happens regardless of `flags`, i.e. also
 * when the caller supplied no fifth argument -- confirm this is acceptable
 * on all supported ABIs.
 */
LSS_INLINE void *LSS_NAME(mremap)(void *old_address, size_t old_size,
                                  size_t new_size, int flags, ...) {
  va_list ap;
  void *new_address, *rc;
  va_start(ap, flags);
  new_address = va_arg(ap, void *);
  rc = LSS_NAME(_mremap)(old_address, old_size, new_size,
                         flags, new_address);
  va_end(ap);
  return rc;
}

/* Detaches from a traced thread and then sends it SIGCONT.
 * Returns the result of the PTRACE_DETACH call; errno on return is the value
 * left by that call, not by the subsequent tkill()/kill().
 */
LSS_INLINE int LSS_NAME(ptrace_detach)(pid_t pid) {
  /* PTRACE_DETACH can sometimes forget to wake up the tracee and it
   * then sends job control signals to the real parent, rather than to
   * the tracer. We reduce the risk of this happening by starting a
   * whole new time slice, and then quickly sending a SIGCONT signal
   * right after detaching from the tracee.
   *
   * We use tkill to ensure that we only issue a wakeup for the thread being
   * detached.  Large multi threaded apps can take a long time in the kernel
   * processing SIGCONT.
   */
  int rc, err;
  LSS_NAME(sched_yield)();
  rc = LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0);
  err = LSS_ERRNO;
  LSS_NAME(tkill)(pid, SIGCONT);
  /* Old systems don't have tkill */
  if (LSS_ERRNO == ENOSYS)
    LSS_NAME(kill)(pid, SIGCONT);
  LSS_ERRNO = err;
  return rc;
}

/* Sends `sig` to the calling process via kill(getpid(), sig). */
LSS_INLINE int LSS_NAME(raise)(int sig) {
  return LSS_NAME(kill)(LSS_NAME(getpid)(), sig);
}

/* Equivalent to setpgid(0, 0). */
LSS_INLINE int LSS_NAME(setpgrp)() {
  return LSS_NAME(setpgid)(0, 0);
}

/* Minimal sysconf(): supports only _SC_OPEN_MAX and _SC_PAGESIZE.
 * _SC_OPEN_MAX yields the RLIMIT_NOFILE soft limit, or 8192 if getrlimit()
 * fails. Any other name sets errno to ENOSYS and returns -1.
 */
LSS_INLINE int LSS_NAME(sysconf)(int name) {
  extern int __getpagesize(void);
  switch (name) {
    case _SC_OPEN_MAX: {
      struct kernel_rlimit limit;
      return LSS_NAME(getrlimit)(RLIMIT_NOFILE, &limit) < 0
          ? 8192 : limit.rlim_cur;
    }
    case _SC_PAGESIZE:
      return __getpagesize();
    default:
      LSS_ERRNO = ENOSYS;
      return -1;
  }
}
#if defined(__x86_64__) ||                                                  \
   (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64)
/* 64-bit targets: the file offset fits in a single argument. */
LSS_INLINE _syscall4(ssize_t, pread64,        int,         f,
                     void *,         b, size_t,   c,
                     loff_t,         o)
LSS_INLINE _syscall4(ssize_t, pwrite64,       int,         f,
                     const void *,   b, size_t,   c,
                     loff_t,         o)
LSS_INLINE _syscall3(int,     readahead,      int,         f,
                     loff_t,         o, unsigned, c)
#else
#define __NR__pread64   __NR_pread64
#define __NR__pwrite64  __NR_pwrite64
#define __NR__readahead __NR_readahead
/* NOTE(review): _pwrite64 declares its second offset half as `long` while
 * _pread64 uses `unsigned` -- confirm whether the difference is deliberate.
 */
LSS_INLINE _syscall5(ssize_t, _pread64,       int,         f,
                     void *,         b, size_t, c, unsigned, o1,
                     unsigned, o2)
LSS_INLINE _syscall5(ssize_t, _pwrite64,      int,         f,
                     const void *,   b, size_t, c, unsigned, o1,
                     long, o2)
LSS_INLINE _syscall4(int, _readahead,         int,         f,
                     unsigned,       o1, unsigned, o2, size_t, c);
/* We force 64bit-wide parameters onto the stack, then access each
 * 32-bit component individually. This guarantees that we build the
 * correct parameters independent of the native byte-order of the
 * underlying architecture.
 */
LSS_INLINE ssize_t LSS_NAME(pread64)(int fd, void *buf, size_t count,
                                     loff_t off) {
  union { loff_t off; unsigned arg[2]; } o = { off };
  return LSS_NAME(_pread64)(fd, buf, count, o.arg[0], o.arg[1]);
}
LSS_INLINE ssize_t LSS_NAME(pwrite64)(int fd, const void *buf,
                                      size_t count, loff_t off) {
  union { loff_t off; unsigned arg[2]; } o = { off };
  return LSS_NAME(_pwrite64)(fd, buf, count, o.arg[0], o.arg[1]);
}
LSS_INLINE int LSS_NAME(readahead)(int fd, loff_t off, int len) {
  union { loff_t off; unsigned arg[2]; } o = { off };
  return LSS_NAME(_readahead)(fd, o.arg[0], o.arg[1], len);
}
#endif
#endif

#if defined(__cplusplus) && !defined(SYS_CPLUSPLUS)
}
#endif

#endif
#endif
diff --git a/sandbox/linux/seccomp/madvise.cc b/sandbox/linux/seccomp/madvise.cc new file mode 100644 index 0000000..70c594f --- /dev/null +++ b/sandbox/linux/seccomp/madvise.cc @@ -0,0 +1,81 @@
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +long Sandbox::sandbox_madvise(void* start, size_t length, int advice) { + long long tm; + Debug::syscall(&tm, __NR_madvise, "Executing handler"); + struct { + int sysnum; + long long cookie; + MAdvise madvise_req; + } __attribute__((packed)) request; + request.sysnum = __NR_madvise; + request.cookie = cookie(); + request.madvise_req.start = start; + request.madvise_req.len = length; + request.madvise_req.advice = advice; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward madvise() request [sandbox]"); + } + Debug::elapsed(tm, __NR_madvise); + return rc; +} + +bool Sandbox::process_madvise(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + MAdvise madvise_req; + SysCalls sys; + if (read(sys, sandboxFd, &madvise_req, sizeof(madvise_req)) != + sizeof(madvise_req)) { + die("Failed to read parameters for madvise() [process]"); + } + int rc = -EINVAL; + switch (madvise_req.advice) { + case MADV_NORMAL: + case MADV_RANDOM: + case MADV_SEQUENTIAL: + case MADV_WILLNEED: + ok: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_madvise, + madvise_req.start, madvise_req.len, + madvise_req.advice); + return true; + default: + // All other flags to madvise() are potential dangerous (as opposed to + // merely affecting overall performance). Do not allow them on memory + // ranges that were part of the original mappings. 
+ void *stop = reinterpret_cast<void *>( + (char *)madvise_req.start + madvise_req.len); + ProtectedMap::const_iterator iter = protectedMap_.lower_bound( + (void *)madvise_req.start); + if (iter != protectedMap_.begin()) { + --iter; + } + for (; iter != protectedMap_.end() && iter->first < stop; ++iter) { + if (madvise_req.start < reinterpret_cast<void *>( + reinterpret_cast<char *>(iter->first) + iter->second) && + stop > iter->first) { + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } + } + + // Changing attributes on memory regions that were newly mapped inside of + // the sandbox is OK. + goto ok; + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/maps.cc b/sandbox/linux/seccomp/maps.cc new file mode 100644 index 0000000..8ae218d --- /dev/null +++ b/sandbox/linux/seccomp/maps.cc @@ -0,0 +1,267 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include <errno.h> +#include <fcntl.h> +#include <linux/unistd.h> +#include <signal.h> +#include <stdarg.h> +#include <stdlib.h> +#include <sys/ptrace.h> +#include <sys/types.h> +#include <sys/wait.h> + +#include "library.h" +#include "maps.h" +#include "sandbox_impl.h" + +namespace playground { + +Maps::Maps(int proc_self_maps) : + proc_self_maps_(proc_self_maps), + begin_iter_(this, true, false), + end_iter_(this, false, true), + vsyscall_(0) { + Sandbox::SysCalls sys; + if (proc_self_maps_ >= 0 && + !sys.lseek(proc_self_maps_, 0, SEEK_SET)) { + char buf[256] = { 0 }; + int len = 0, rc = 1; + bool long_line = false; + do { + if (rc > 0) { + rc = Sandbox::read(sys, proc_self_maps_, buf + len, + sizeof(buf) - len - 1); + if (rc > 0) { + len += rc; + } + } + char *ptr = buf; + if (!long_line) { + long_line = true; + unsigned long start = strtoul(ptr, &ptr, 16); + unsigned long stop = strtoul(ptr + 1, &ptr, 16); + while (*ptr == ' ' || *ptr == '\t') ++ptr; + char *perm_ptr = ptr; + while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; + string perm(perm_ptr, ptr - perm_ptr); + unsigned long offset = strtoul(ptr, &ptr, 16); + while (*ptr == ' ' || *ptr == '\t') ++ptr; + char *id_ptr = ptr; + while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; + while (*ptr == ' ' || *ptr == '\t') ++ptr; + while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; + string id(id_ptr, ptr - id_ptr); + while (*ptr == ' ' || *ptr == '\t') ++ptr; + char *library_ptr = ptr; + while (*ptr && *ptr != ' ' && *ptr != '\t' && *ptr != '\n') ++ptr; + string library(library_ptr, ptr - library_ptr); + bool isVDSO = false; + if (library == "[vdso]") { + // /proc/self/maps has a misleading file offset in the [vdso] entry. + // Override it with a sane value. 
+ offset = 0; + isVDSO = true; + } else if (library == "[vsyscall]") { + vsyscall_ = reinterpret_cast<char *>(start); + } else if (library.empty() || library[0] == '[') { + goto skip_entry; + } + int prot = 0; + if (perm.find('r') != string::npos) { + prot |= PROT_READ; + } + if (perm.find('w') != string::npos) { + prot |= PROT_WRITE; + } + if (perm.find('x') != string::npos) { + prot |= PROT_EXEC; + } + if ((prot & (PROT_EXEC | PROT_READ)) == 0) { + goto skip_entry; + } + Library* lib = &libs_[id + ' ' + library]; + lib->setLibraryInfo(this); + lib->addMemoryRange(reinterpret_cast<void *>(start), + reinterpret_cast<void *>(stop), + Elf_Addr(offset), + prot, isVDSO); + } + skip_entry: + for (;;) { + if (!*ptr || *ptr++ == '\n') { + long_line = false; + memmove(buf, ptr, len - (ptr - buf)); + memset(buf + len - (ptr - buf), 0, ptr - buf); + len -= (ptr - buf); + break; + } + } + } while (len || long_line); + } +} + +Maps::Iterator::Iterator(Maps* maps, bool at_beginning, bool at_end) + : maps_(maps), + at_beginning_(at_beginning), + at_end_(at_end) { +} + +Maps::LibraryMap::iterator& Maps::Iterator::getIterator() const { + if (at_beginning_) { + iter_ = maps_->libs_.begin(); + } else if (at_end_) { + iter_ = maps_->libs_.end(); + } + return iter_; +} + +Maps::Iterator Maps::Iterator::begin() { + return maps_->begin_iter_; +} + +Maps::Iterator Maps::Iterator::end() { + return maps_->end_iter_; +} + +Maps::Iterator& Maps::Iterator::operator++() { + getIterator().operator++(); + at_beginning_ = false; + return *this; +} + +Maps::Iterator Maps::Iterator::operator++(int i) { + getIterator().operator++(i); + at_beginning_ = false; + return *this; +} + +Library* Maps::Iterator::operator*() const { + return &getIterator().operator*().second; +} + +bool Maps::Iterator::operator==(const Maps::Iterator& iter) const { + return getIterator().operator==(iter.getIterator()); +} + +bool Maps::Iterator::operator!=(const Maps::Iterator& iter) const { + return !operator==(iter); +} + 
+Maps::string Maps::Iterator::name() const { + return getIterator()->first; +} + +// Test whether a line ends with "[stack]"; used for identifying the +// stack entry of /proc/self/maps. +static bool isStackLine(char* buf, char* end) { + char* ptr = buf; + for ( ; *ptr != '\n' && ptr < end; ++ptr) + ; + if (ptr < end && ptr - 7 > buf) { + return (memcmp(ptr - 7, "[stack]", 7) == 0); + } + return false; +} + +char* Maps::allocNearAddr(char* addr_target, size_t size, int prot) const { + // We try to allocate memory within 1.5GB of a target address. This means, + // we will be able to perform relative 32bit jumps from the target address. + const unsigned long kMaxDistance = 1536 << 20; + // In most of the code below, we just care about the numeric value of + // the address. + const long addr = reinterpret_cast<long>(addr_target); + size = (size + 4095) & ~4095; + Sandbox::SysCalls sys; + if (sys.lseek(proc_self_maps_, 0, SEEK_SET)) { + return NULL; + } + + // Iterate through lines of /proc/self/maps to consider each mapped + // region one at a time, looking for a gap between regions to allocate. + char buf[256] = { 0 }; + int len = 0, rc = 1; + bool long_line = false; + unsigned long gap_start = 0x10000; + void* new_addr; + do { + if (rc > 0) { + do { + rc = Sandbox::read(sys, proc_self_maps_, buf + len, + sizeof(buf) - len - 1); + if (rc > 0) { + len += rc; + } + } while (rc > 0 && len < (int)sizeof(buf) - 1); + } + char *ptr = buf; + if (!long_line) { + long_line = true; + // Maps lines have the form "<start address>-<end address> ... <name>". + unsigned long gap_end = strtoul(ptr, &ptr, 16); + unsigned long map_end = strtoul(ptr + 1, &ptr, 16); + + // gap_start to gap_end now covers the region of empty space before + // the current line. Now we try to see if there's a place within the + // gap we can use. + + if (gap_end - gap_start >= size) { + // Is the gap before our target address? 
+ if (addr - static_cast<long>(gap_end) >= 0) { + if (addr - (gap_end - size) < kMaxDistance) { + unsigned long position; + if (isStackLine(ptr, buf + len)) { + // If we're adjacent to the stack, try to stay away from + // the GROWS_DOWN region. Pick the farthest away region that + // is still within the gap. + + if (static_cast<unsigned long>(addr) < kMaxDistance || // Underflow protection. + static_cast<unsigned long>(addr) - kMaxDistance < gap_start) { + position = gap_start; + } else { + position = (addr - kMaxDistance) & ~4095; + if (position < gap_start) { + position = gap_start; + } + } + } else { + // Otherwise, take the end of the region. + position = gap_end - size; + } + new_addr = reinterpret_cast<char *>(sys.MMAP + (reinterpret_cast<void *>(position), size, prot, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0)); + if (new_addr != MAP_FAILED) { + goto done; + } + } + } else if (gap_start + size - addr < kMaxDistance) { + // Gap is after the address. Above checks that we can wrap around + // through 0 to a space we'd use. + new_addr = reinterpret_cast<char *>(sys.MMAP + (reinterpret_cast<void *>(gap_start), size, prot, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1 ,0)); + if (new_addr != MAP_FAILED) { + goto done; + } + } + } + gap_start = map_end; + } + for (;;) { + if (!*ptr || *ptr++ == '\n') { + long_line = false; + memmove(buf, ptr, len - (ptr - buf)); + memset(buf + len - (ptr - buf), 0, ptr - buf); + len -= (ptr - buf); + break; + } + } + } while (len || long_line); + new_addr = NULL; +done: + return reinterpret_cast<char*>(new_addr); +} + +} // namespace diff --git a/sandbox/linux/seccomp/maps.h b/sandbox/linux/seccomp/maps.h new file mode 100644 index 0000000..fbcc7672 --- /dev/null +++ b/sandbox/linux/seccomp/maps.h @@ -0,0 +1,94 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 

#ifndef MAPS_H__
#define MAPS_H__

#include <elf.h>
#include <functional>
#include <map>
#include <string>

#include "allocator.h"

// Elf_Addr is the ELF address type matching the target's word size.
#if defined(__x86_64__)
typedef Elf64_Addr Elf_Addr;
#elif defined(__i386__)
typedef Elf32_Addr Elf_Addr;
#else
#error Undefined target platform
#endif

namespace playground {

class Library;

// In-memory view of the file-backed mappings listed in /proc/self/maps,
// grouped per library. Also offers allocNearAddr() for mapping fresh memory
// close to a given address.
class Maps {
  friend class Library;
 public:
  // String type that allocates through SystemAllocator rather than the
  // default allocator.
  typedef std::basic_string<char, std::char_traits<char>,
                            SystemAllocator<char> > string;

  // proc_self_maps is a descriptor for /proc/self/maps. It is kept (not
  // duplicated) and read again by allocNearAddr().
  Maps(int proc_self_maps);
  ~Maps() { }

 protected:
  // A map with all the libraries currently loaded into the application.
  // The key is a unique combination of device number, inode number, and
  // file name. It should be treated as opaque.
  typedef std::map<string, Library, std::less<string>,
                   SystemAllocator<std::pair<const string,
                                             Library> > > LibraryMap;
  friend class Iterator;

  // Iterator over the libraries in a Maps object. Dereferencing yields a
  // Library*; name() yields the map key.
  class Iterator {
    friend class Maps;

   protected:
    // NOTE(review): no definition of this single-argument constructor is
    // visible alongside the other Iterator members in maps.cc -- confirm it
    // is defined or unused.
    explicit Iterator(Maps* maps);
    Iterator(Maps* maps, bool at_beginning, bool at_end);
    // Returns iter_, re-binding it to begin()/end() while the matching flag
    // is set (hence `mutable iter_` and the const qualifier).
    Maps::LibraryMap::iterator& getIterator() const;

   public:
    Iterator begin();
    Iterator end();
    Iterator& operator++();
    Iterator operator++(int i);
    Library* operator*() const;
    bool operator==(const Iterator& iter) const;
    bool operator!=(const Iterator& iter) const;
    string name() const;

   protected:
    mutable LibraryMap::iterator iter_;
    Maps *maps_;
    bool at_beginning_;   // iter_ is to be resolved to libs_.begin()
    bool at_end_;         // iter_ is to be resolved to libs_.end()
  };

 public:
  typedef class Iterator const_iterator;

  const_iterator begin() {
    return begin_iter_;
  }

  const_iterator end() {
    return end_iter_;
  }

  // Maps `size` bytes of anonymous memory with protection `prot` near
  // `addr`; returns NULL on failure.
  char* allocNearAddr(char *addr, size_t size, int prot) const;

  // Start address of the [vsyscall] mapping, or 0 if none was seen.
  char* vsyscall() const { return vsyscall_; }

 protected:
  const int      proc_self_maps_;   // descriptor for /proc/self/maps
  const Iterator begin_iter_;       // prototype returned by begin()
  const Iterator end_iter_;         // prototype returned by end()

  LibraryMap     libs_;
  char*          vsyscall_;
};

} // namespace

#endif // MAPS_H__
diff --git a/sandbox/linux/seccomp/mmap.cc b/sandbox/linux/seccomp/mmap.cc new file mode
100644 index 0000000..700da91 --- /dev/null +++ b/sandbox/linux/seccomp/mmap.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +void* Sandbox::sandbox_mmap(void *start, size_t length, int prot, int flags, + int fd, off_t offset) { + long long tm; + Debug::syscall(&tm, __NR_mmap, "Executing handler"); + struct { + int sysnum; + long long cookie; + MMap mmap_req; + } __attribute__((packed)) request; + request.sysnum = __NR_MMAP; + request.cookie = cookie(); + request.mmap_req.start = start; + request.mmap_req.length = length; + request.mmap_req.prot = prot; + request.mmap_req.flags = flags; + request.mmap_req.fd = fd; + request.mmap_req.offset = offset; + + void* rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward mmap() request [sandbox]"); + } + Debug::elapsed(tm, __NR_mmap); + return rc; +} + +bool Sandbox::process_mmap(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SysCalls sys; + MMap mmap_req; + if (read(sys, sandboxFd, &mmap_req, sizeof(mmap_req)) != sizeof(mmap_req)) { + die("Failed to read parameters for mmap() [process]"); + } + + if (mmap_req.flags & MAP_FIXED) { + // Cannot map a memory area that was part of the original memory mappings. 
+ void *stop = reinterpret_cast<void *>( + (char *)mmap_req.start + mmap_req.length); + ProtectedMap::const_iterator iter = protectedMap_.lower_bound( + (void *)mmap_req.start); + if (iter != protectedMap_.begin()) { + --iter; + } + for (; iter != protectedMap_.end() && iter->first < stop; ++iter) { + if (mmap_req.start < reinterpret_cast<void *>( + reinterpret_cast<char *>(iter->first) + iter->second) && + stop > iter->first) { + int rc = -EINVAL; + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } + } + } + + // All other mmap() requests are OK + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_MMAP, + mmap_req.start, mmap_req.length, mmap_req.prot, + mmap_req.flags, mmap_req.fd, mmap_req.offset); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/mprotect.cc b/sandbox/linux/seccomp/mprotect.cc new file mode 100644 index 0000000..548199d --- /dev/null +++ b/sandbox/linux/seccomp/mprotect.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 

#include "debug.h"
#include "sandbox_impl.h"

namespace playground {

// Sandboxed-side handler for mprotect().
//
// Packs the system call number, the cookie() value and the three mprotect()
// arguments into a single packed request, writes it to processFdPub(), and
// then reads a "long" result back from threadFdPub(). That result is returned
// to the caller unchanged. A short write or short read is fatal (die()).
long Sandbox::sandbox_mprotect(const void *addr, size_t len, int prot) {
  long long tm;
  Debug::syscall(&tm, __NR_mprotect, "Executing handler");
  // The request is packed so that the bytes on the wire are exactly
  // sysnum, cookie, and the MProtect payload with no padding in between.
  struct {
    int sysnum;
    long long cookie;
    MProtect mprotect_req;
  } __attribute__((packed)) request;
  request.sysnum = __NR_mprotect;
  request.cookie = cookie();
  request.mprotect_req.addr = addr;
  request.mprotect_req.len = len;
  request.mprotect_req.prot = prot;

  long rc;
  SysCalls sys;
  if (write(sys, processFdPub(), &request, sizeof(request)) !=
      sizeof(request) ||
      read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
    die("Failed to forward mprotect() request [sandbox]");
  }
  Debug::elapsed(tm, __NR_mprotect);
  return rc;
}

// Process-side handler for an mprotect() request.
//
// Reads the MProtect payload from sandboxFd and checks the requested range
// [addr, addr + len) against every entry of protectedMap_. If the range
// overlaps any entry, the call is abandoned with -EINVAL and false is
// returned. Otherwise the call is passed on via SecureMem::sendSystemCall()
// and true is returned.
//
// parentMapsFd is not used by this handler.
bool Sandbox::process_mprotect(int parentMapsFd, int sandboxFd,
                               int threadFdPub, int threadFd,
                               SecureMem::Args* mem) {
  // Read request
  SysCalls sys;
  MProtect mprotect_req;
  if (read(sys, sandboxFd, &mprotect_req, sizeof(mprotect_req)) !=
      sizeof(mprotect_req)) {
    die("Failed to read parameters for mprotect() [process]");
  }

  // Cannot change permissions on any memory region that was part of the
  // original memory mappings.
  int rc = -EINVAL;
  // One past the last byte of the requested range.
  void *stop = reinterpret_cast<void *>(
      (char *)mprotect_req.addr + mprotect_req.len);
  // lower_bound() yields the first entry whose key is not below addr. Step
  // back by one so that an entry starting below addr is examined as well.
  ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
      (void *)mprotect_req.addr);
  if (iter != protectedMap_.begin()) {
    --iter;
  }
  for (; iter != protectedMap_.end() && iter->first < stop; ++iter) {
    // Each entry is used here as (start address, length in bytes); the
    // request overlaps it if it begins before the entry's end and ends after
    // the entry's start.
    if (mprotect_req.addr < reinterpret_cast<void *>(
            reinterpret_cast<char *>(iter->first) + iter->second) &&
        stop > iter->first) {
      SecureMem::abandonSystemCall(threadFd, rc);
      return false;
    }
  }

  // Changing permissions on memory regions that were newly mapped inside of
  // the sandbox is OK.
  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_mprotect,
                            mprotect_req.addr, mprotect_req.len,
                            mprotect_req.prot);
  return true;
}

} // namespace
diff --git a/sandbox/linux/seccomp/munmap.cc b/sandbox/linux/seccomp/munmap.cc new file mode 100644 index 0000000..dde7c7a --- /dev/null +++ b/sandbox/linux/seccomp/munmap.cc @@ -0,0 +1,70 @@
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "debug.h"
#include "sandbox_impl.h"

namespace playground {

// Sandboxed-side handler for munmap().
//
// Packs the system call number, the cookie() value and the two munmap()
// arguments into a single packed request, writes it to processFdPub(), and
// then reads a "long" result back from threadFdPub(). That result is returned
// to the caller unchanged. A short write or short read is fatal (die()).
long Sandbox::sandbox_munmap(void* start, size_t length) {
  long long tm;
  Debug::syscall(&tm, __NR_munmap, "Executing handler");
  // Packed, so the wire format is sysnum, cookie, payload without padding.
  struct {
    int sysnum;
    long long cookie;
    MUnmap munmap_req;
  } __attribute__((packed)) request;
  request.sysnum = __NR_munmap;
  request.cookie = cookie();
  request.munmap_req.start = start;
  request.munmap_req.length = length;

  long rc;
  SysCalls sys;
  if (write(sys, processFdPub(), &request, sizeof(request)) !=
      sizeof(request) ||
      read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
    die("Failed to forward munmap() request [sandbox]");
  }
  Debug::elapsed(tm, __NR_munmap);
  return rc;
}

// Process-side handler for a munmap() request.
//
// Reads the MUnmap payload from sandboxFd and checks the requested range
// [start, start + length) against every entry of protectedMap_. If the range
// overlaps any entry, the call is abandoned with -EINVAL and false is
// returned. Otherwise the call is passed on via SecureMem::sendSystemCall()
// and true is returned.
//
// parentMapsFd is not used by this handler.
bool Sandbox::process_munmap(int parentMapsFd, int sandboxFd, int threadFdPub,
                             int threadFd, SecureMem::Args* mem) {
  // Read request
  SysCalls sys;
  MUnmap munmap_req;
  if (read(sys, sandboxFd, &munmap_req, sizeof(munmap_req)) !=
      sizeof(munmap_req)) {
    die("Failed to read parameters for munmap() [process]");
  }

  // Cannot unmap any memory region that was part of the original memory
  // mappings.
  int rc = -EINVAL;
  // One past the last byte of the requested range.
  void *stop = reinterpret_cast<void *>(
      reinterpret_cast<char *>(munmap_req.start) + munmap_req.length);
  // lower_bound() yields the first entry whose key is not below start. Step
  // back by one so that an entry starting below start is examined as well.
  ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
      munmap_req.start);
  if (iter != protectedMap_.begin()) {
    --iter;
  }
  for (; iter != protectedMap_.end() && iter->first < stop; ++iter) {
    // Each entry is used here as (start address, length in bytes); the
    // request overlaps it if it begins before the entry's end and ends after
    // the entry's start.
    if (munmap_req.start < reinterpret_cast<void *>(
            reinterpret_cast<char *>(iter->first) + iter->second) &&
        stop > iter->first) {
      SecureMem::abandonSystemCall(threadFd, rc);
      return false;
    }
  }

  // Unmapping memory regions that were newly mapped inside of the sandbox
  // is OK.
  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_munmap,
                            munmap_req.start, munmap_req.length);
  return true;
}

} // namespace
diff --git a/sandbox/linux/seccomp/mutex.h b/sandbox/linux/seccomp/mutex.h new file mode 100644 index 0000000..d7e1c5d --- /dev/null +++ b/sandbox/linux/seccomp/mutex.h @@ -0,0 +1,153 @@
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef MUTEX_H__
#define MUTEX_H__

#include "sandbox_impl.h"

namespace playground {

// Futex-backed mutex operating on a caller-supplied 32-bit word.
//
// Layout of the word, as used by the code below:
//   - bit 31 is the "locked" flag (set with "btsl $31", and the word is
//     treated as locked whenever it reads as a negative int);
//   - the remaining bits are incremented on entry to lockMutex() /
//     waitForUnlock() and decremented on exit, i.e. they count the callers
//     that are currently inside one of those two functions.
// All functions are static; the class carries no state of its own.
class Mutex {
 public:
  typedef int mutex_t;

  enum { kInitValue = 0 };

  // Puts the word into the unlocked, no-waiters state. This is a plain
  // store, not an atomic operation.
  static void initMutex(mutex_t* mutex) {
    // Mutex is unlocked, and nobody is waiting for it
    *mutex = kInitValue;
  }

  // Releases the mutex and, if the word did not become zero, wakes one
  // waiter with FUTEX_WAKE.
  static void unlockMutex(mutex_t* mutex) {
    char status;
    #if defined(__x86_64__) || defined(__i386__)
    // Atomically adds 0x80000000 to the word. With bit 31 set beforehand,
    // the addition wraps and clears that bit while leaving the low bits
    // untouched. "setz" records whether the resulting word is zero.
    // NOTE(review): this presumes the caller actually holds the lock; if
    // bit 31 was clear the addition would set it instead -- confirm callers.
    asm volatile(
        "lock; addl %2, %0\n"
        "setz %1"
        : "=m"(*mutex), "=qm"(status)
        : "ir"(0x80000000), "m"(*mutex));
    #else
    #error Unsupported target platform
    #endif
    if (status) {
      // Mutex is zero now. No other waiters. So, we can return.
      return;
    }
    // We unlocked the mutex, but still need to wake up other waiters.
    Sandbox::SysCalls sys;
    sys.futex(mutex, FUTEX_WAKE, 1, NULL);
  }

  // Acquires the mutex.
  //
  // timeout is converted to a timespec as milliseconds (seconds =
  // timeout / 1000, nanoseconds = remainder * 1,000,000) and passed to
  // FUTEX_WAIT. Returns true once bit 31 has been set by this caller, or
  // false if the futex wait reported ETIMEDOUT.
  //
  // NOTE(review): when timeout is 0 (the default) a zeroed timespec is still
  // handed to the futex call rather than a NULL pointer. Confirm whether
  // "0" is meant as "do not block" or as "wait forever".
  static bool lockMutex(mutex_t* mutex, int timeout = 0) {
    bool rc = true;
    // Increment mutex to add ourselves to the list of waiters
    #if defined(__x86_64__) || defined(__i386__)
    asm volatile(
        "lock; incl %0\n"
        : "=m"(*mutex)
        : "m"(*mutex));
    #else
    #error Unsupported target platform
    #endif
    for (;;) {
      // Atomically check whether the mutex is available and if so, acquire it
      char status;
      #if defined(__x86_64__) || defined(__i386__)
      // "btsl" copies the previous value of bit 31 into the carry flag and
      // then sets the bit; "setc" stores that previous value in status.
      // status == 0 therefore means the bit was clear and is now ours.
      asm volatile(
          "lock; btsl %3, %1\n"
          "setc %0"
          : "=q"(status), "=m"(*mutex)
          : "m"(*mutex), "ir"(31));
      #else
      #error Unsupported target platform
      #endif
      if (!status) {
      // Also reached through "goto done" on timeout, with rc == false.
      done:
        // If the mutex was available, remove ourselves from list of waiters
        #if defined(__x86_64__) || defined(__i386__)
        asm volatile(
            "lock; decl %0\n"
            : "=m"(*mutex)
            : "m"(*mutex));
        #else
        #error Unsupported target platform
        #endif
        return rc;
      }
      // A non-negative value means bit 31 is clear again, so retry the
      // test-and-set without entering the kernel.
      int value = *mutex;
      if (value >= 0) {
        // Mutex has just become available, no need to call kernel
        continue;
      }
      Sandbox::SysCalls sys;
      Sandbox::SysCalls::kernel_timespec tm;
      if (timeout) {
        tm.tv_sec = timeout / 1000;
        tm.tv_nsec = (timeout % 1000) * 1000 * 1000;
      } else {
        tm.tv_sec = 0;
        tm.tv_nsec = 0;
      }
      // FUTEX_WAIT only sleeps if the word still equals the value sampled
      // above. Any failure other than ETIMEDOUT simply loops and retries.
      if (NOINTR_SYS(sys.futex(mutex, FUTEX_WAIT, value, &tm)) &&
          sys.my_errno == ETIMEDOUT) {
        rc = false;
        goto done;
      }
    }
  }

  // Waits until the mutex is observed unlocked, without acquiring it.
  //
  // timeout is interpreted exactly as in lockMutex(). Returns true if the
  // word was seen with bit 31 clear, or false if the futex wait reported
  // ETIMEDOUT. In both cases the waiter count is decremented again and one
  // FUTEX_WAKE is issued before returning.
  //
  // NOTE(review): as in lockMutex(), timeout == 0 passes a zeroed timespec
  // to the futex call -- confirm the intended meaning.
  static bool waitForUnlock(mutex_t* mutex, int timeout = 0) {
    bool rc = true;
    // Increment mutex to add ourselves to the list of waiters
    #if defined(__x86_64__) || defined(__i386__)
    asm volatile(
        "lock; incl %0\n"
        : "=m"(*mutex)
        : "m"(*mutex));
    #else
    #error Unsupported target platform
    #endif
    Sandbox::SysCalls sys;
    for (;;) {
      mutex_t value = *mutex;
      if (value >= 0) {
      // Also reached through "goto done" on timeout, with rc == false.
      done:
        // Mutex was not locked. Remove ourselves from list of waiters, notify
        // any other waiters (if any), and return.
        #if defined(__x86_64__) || defined(__i386__)
        asm volatile(
            "lock; decl %0\n"
            : "=m"(*mutex)
            : "m"(*mutex));
        #else
        #error Unsupported target platform
        #endif
        NOINTR_SYS(sys.futex(mutex, FUTEX_WAKE, 1, 0));
        return rc;
      }

      // Wait for mutex to become unlocked
      Sandbox::SysCalls::kernel_timespec tm;
      if (timeout) {
        tm.tv_sec = timeout / 1000;
        tm.tv_nsec = (timeout % 1000) * 1000 * 1000;
      } else {
        tm.tv_sec = 0;
        tm.tv_nsec = 0;
      }

      // Sleeps only while the word still equals the sampled value; any
      // failure other than ETIMEDOUT loops and re-samples the word.
      if (NOINTR_SYS(sys.futex(mutex, FUTEX_WAIT, value, &tm)) &&
          sys.my_errno == ETIMEDOUT) {
        rc = false;
        goto done;
      }
    }
  }

};

} // namespace

#endif // MUTEX_H__
diff --git a/sandbox/linux/seccomp/open.cc b/sandbox/linux/seccomp/open.cc new file mode 100644 index 0000000..8a9093c --- /dev/null +++ b/sandbox/linux/seccomp/open.cc @@ -0,0 +1,99 @@
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +long Sandbox::sandbox_open(const char *pathname, int flags, mode_t mode) { + long long tm; + Debug::syscall(&tm, __NR_open, "Executing handler"); + size_t len = strlen(pathname); + struct Request { + int sysnum; + long long cookie; + Open open_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_open; + request->cookie = cookie(); + request->open_req.path_length = len; + request->open_req.flags = flags; + request->open_req.mode = mode; + memcpy(request->pathname, pathname, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward open() request [sandbox]"); + } + Debug::elapsed(tm, __NR_open); + return rc; +} + +bool Sandbox::process_open(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SysCalls sys; + Open open_req; + if (read(sys, sandboxFd, &open_req, sizeof(open_req)) != sizeof(open_req)) { + read_parm_failed: + die("Failed to read parameters for open() [process]"); + } + int rc = -ENAMETOOLONG; + if (open_req.path_length >= sizeof(mem->pathname)) { + char buf[32]; + while (open_req.path_length > 0) { + size_t len = open_req.path_length > sizeof(buf) ? + sizeof(buf) : open_req.path_length; + ssize_t i = read(sys, sandboxFd, buf, len); + if (i <= 0) { + goto read_parm_failed; + } + open_req.path_length -= i; + } + if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to return data from open() [process]"); + } + return false; + } + + if ((open_req.flags & O_ACCMODE) != O_RDONLY || + !g_policy.allow_file_namespace) { + // After locking the mutex, we can no longer abandon the system call. 
So, + // perform checks before clobbering the securely shared memory. + char tmp[open_req.path_length]; + if (read(sys, sandboxFd, tmp, open_req.path_length) != + (ssize_t)open_req.path_length) { + goto read_parm_failed; + } + Debug::message(("Denying access to \"" + std::string(tmp) + "\"").c_str()); + SecureMem::abandonSystemCall(threadFd, -EACCES); + return false; + } + + SecureMem::lockSystemCall(parentMapsFd, mem); + if (read(sys, sandboxFd, mem->pathname, open_req.path_length) != + (ssize_t)open_req.path_length) { + goto read_parm_failed; + } + mem->pathname[open_req.path_length] = '\000'; + + // TODO(markus): Implement sandboxing policy. For now, we allow read + // access to everything. That's probably not correct. + Debug::message(("Allowing access to \"" + std::string(mem->pathname) + + "\"").c_str()); + + // Tell trusted thread to open the file. + SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, __NR_open, + mem->pathname - (char*)mem + (char*)mem->self, + open_req.flags, open_req.mode); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc new file mode 100644 index 0000000..0b09457 --- /dev/null +++ b/sandbox/linux/seccomp/sandbox.cc @@ -0,0 +1,838 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "library.h" +#include "sandbox_impl.h" +#include "syscall_table.h" + +namespace playground { + +// Global variables +int Sandbox::proc_self_maps_ = -1; +enum Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; +int Sandbox::pid_; +int Sandbox::processFdPub_; +int Sandbox::cloneFdPub_; +Sandbox::SysCalls::kernel_sigaction Sandbox::sa_segv_; +Sandbox::ProtectedMap Sandbox::protectedMap_; +std::vector<SecureMem::Args*> Sandbox::secureMemPool_; + +bool Sandbox::sendFd(int transport, int fd0, int fd1, const void* buf, + size_t len) { + int fds[2], count = 0; + if (fd0 >= 0) { fds[count++] = fd0; } + if (fd1 >= 0) { fds[count++] = fd1; } + if (!count) { + return false; + } + char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; + memset(cmsg_buf, 0, sizeof(cmsg_buf)); + struct SysCalls::kernel_iovec iov[2] = { { 0 } }; + struct SysCalls::kernel_msghdr msg = { 0 }; + int dummy = 0; + iov[0].iov_base = &dummy; + iov[0].iov_len = sizeof(dummy); + if (buf && len > 0) { + iov[1].iov_base = const_cast<void *>(buf); + iov[1].iov_len = len; + } + msg.msg_iov = iov; + msg.msg_iovlen = (buf && len > 0) ? 2 : 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = CMSG_LEN(count*sizeof(int)); + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(count*sizeof(int)); + memcpy(CMSG_DATA(cmsg), fds, count*sizeof(int)); + SysCalls sys; + return NOINTR_SYS(sys.sendmsg(transport, &msg, 0)) == + (ssize_t)(sizeof(dummy) + ((buf && len > 0) ? 
len : 0)); +} + +bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) { + int count = 0; + int *err = NULL; + if (fd0) { + count++; + err = fd0; + *fd0 = -1; + } + if (fd1) { + if (!count++) { + err = fd1; + } + *fd1 = -1; + } + if (!count) { + return false; + } + char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; + memset(cmsg_buf, 0, sizeof(cmsg_buf)); + struct SysCalls::kernel_iovec iov[2] = { { 0 } }; + struct SysCalls::kernel_msghdr msg = { 0 }; + iov[0].iov_base = err; + iov[0].iov_len = sizeof(int); + if (buf && len && *len > 0) { + iov[1].iov_base = buf; + iov[1].iov_len = *len; + } + msg.msg_iov = iov; + msg.msg_iovlen = (buf && len && *len > 0) ? 2 : 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = CMSG_LEN(count*sizeof(int)); + SysCalls sys; + ssize_t bytes = NOINTR_SYS(sys.recvmsg(transport, &msg, 0)); + if (len) { + *len = bytes > (int)sizeof(int) ? + bytes - sizeof(int) : 0; + } + if (bytes != (ssize_t)(sizeof(int) + ((buf && len && *len > 0) ? *len : 0))){ + *err = bytes >= 0 ? 0 : -EBADF; + return false; + } + if (*err) { + // "err" is the first four bytes of the payload. If these are non-zero, + // the sender on the other side of the socketpair sent us an errno value. + // We don't expect to get any file handles in this case. 
+ return false; + } + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if ((msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) || + !cmsg || + cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS || + cmsg->cmsg_len != CMSG_LEN(count*sizeof(int))) { + *err = -EBADF; + return false; + } + if (fd1) { *fd1 = ((int *)CMSG_DATA(cmsg))[--count]; } + if (fd0) { *fd0 = ((int *)CMSG_DATA(cmsg))[--count]; } + return true; +} + +void Sandbox::setupSignalHandlers() { + // Set SIGCHLD to SIG_DFL so that waitpid() can work + SysCalls sys; + struct SysCalls::kernel_sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler_ = SIG_DFL; + sys.sigaction(SIGCHLD, &sa, NULL); + + // Set up SEGV handler for dealing with RDTSC instructions, system calls + // that have been rewritten to use INT0, for sigprocmask() emulation, for + // the creation of threads, and for user-provided SEGV handlers. + sa.sa_sigaction_ = segv(); + sa.sa_flags = SA_SIGINFO | SA_NODEFER; + sys.sigaction(SIGSEGV, &sa, &sa_segv_); + + // Unblock SIGSEGV and SIGCHLD + SysCalls::kernel_sigset_t mask; + memset(&mask, 0x00, sizeof(mask)); + mask.sig[0] |= (1 << (SIGSEGV - 1)) | (1 << (SIGCHLD - 1)); + sys.sigprocmask(SIG_UNBLOCK, &mask, 0); +} + +void (*Sandbox::segv())(int signo, SysCalls::siginfo *context, void *unused) { + void (*fnc)(int signo, SysCalls::siginfo *context, void *unused); + asm volatile( + "call 999f\n" +#if defined(__x86_64__) + // Inspect instruction at the point where the segmentation fault + // happened. If it is RDTSC, forward the request to the trusted + // thread. 
+ "mov $-3, %%r14\n" // request for RDTSC + "mov 0xB0(%%rsp), %%r15\n" // %rip at time of segmentation fault + "cmpw $0x310F, (%%r15)\n" // RDTSC + "jz 0f\n" + "cmpw $0x010F, (%%r15)\n" // RDTSCP + "jnz 8f\n" + "cmpb $0xF9, 2(%%r15)\n" + "jnz 8f\n" + "mov $-4, %%r14\n" // request for RDTSCP + "0:" +#ifndef NDEBUG + "lea 100f(%%rip), %%rdi\n" + "call playground$debugMessage\n" +#endif + "sub $4, %%rsp\n" + "push %%r14\n" + "mov %%gs:16, %%edi\n" // fd = threadFdPub + "mov %%rsp, %%rsi\n" // buf = %rsp + "mov $4, %%edx\n" // len = sizeof(int) + "1:mov $1, %%eax\n" // NR_write + "syscall\n" + "cmp %%rax, %%rdx\n" + "jz 5f\n" + "cmp $-4, %%eax\n" // EINTR + "jz 1b\n" + "2:add $12, %%rsp\n" + "movq $0, 0x98(%%rsp)\n" // %rax at time of segmentation fault + "movq $0, 0x90(%%rsp)\n" // %rdx at time of segmentation fault + "cmpw $0x310F, (%%r15)\n" // RDTSC + "jz 3f\n" + "movq $0, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault + "3:addq $2, 0xB0(%%rsp)\n" // %rip at time of segmentation fault + "cmpw $0x010F, (%%r15)\n" // RDTSC + "jnz 4f\n" + "addq $1, 0xB0(%%rsp)\n" // %rip at time of segmentation fault + "4:ret\n" + "5:mov $12, %%edx\n" // len = 3*sizeof(int) + "6:mov $0, %%eax\n" // NR_read + "syscall\n" + "cmp $-4, %%eax\n" // EINTR + "jz 6b\n" + "cmp %%rax, %%rdx\n" + "jnz 2b\n" + "mov 0(%%rsp), %%eax\n" + "mov 4(%%rsp), %%edx\n" + "mov 8(%%rsp), %%ecx\n" + "add $12, %%rsp\n" + "mov %%rdx, 0x90(%%rsp)\n" // %rdx at time of segmentation fault + "cmpw $0x310F, (%%r15)\n" // RDTSC + "jz 7f\n" + "mov %%rcx, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault + "7:mov %%rax, 0x98(%%rsp)\n" // %rax at time of segmentation fault + "jmp 3b\n" + + // If the instruction is INT 0, then this was probably the result + // of playground::Library being unable to find a way to safely + // rewrite the system call instruction. Retrieve the CPU register + // at the time of the segmentation fault and invoke syscallWrapper(). 
+ "8:cmpw $0x00CD, (%%r15)\n" // INT $0x0 + "jnz 16f\n" +#ifndef NDEBUG + "lea 200f(%%rip), %%rdi\n" + "call playground$debugMessage\n" +#endif + "mov 0x98(%%rsp), %%rax\n" // %rax at time of segmentation fault + "mov 0x70(%%rsp), %%rdi\n" // %rdi at time of segmentation fault + "mov 0x78(%%rsp), %%rsi\n" // %rsi at time of segmentation fault + "mov 0x90(%%rsp), %%rdx\n" // %rdx at time of segmentation fault + "mov 0x40(%%rsp), %%r10\n" // %r10 at time of segmentation fault + "mov 0x30(%%rsp), %%r8\n" // %r8 at time of segmentation fault + "mov 0x38(%%rsp), %%r9\n" // %r9 at time of segmentation fault + + // Handle rt_sigprocmask() + "cmp $14, %%rax\n" // NR_rt_sigprocmask + "jnz 12f\n" + "mov $-22, %%rax\n" // -EINVAL + "cmp $8, %%r10\n" // %r10 = sigsetsize (8 bytes = 64 signals) + "jl 7b\n" + "mov 0x130(%%rsp), %%r10\n" // signal mask at time of segmentation fault + "test %%rsi, %%rsi\n" // only set mask, if set is non-NULL + "jz 11f\n" + "mov 0(%%rsi), %%rsi\n" + "cmp $0, %%rdi\n" // %rdi = how (SIG_BLOCK) + "jnz 9f\n" + "or %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault + "jmp 11f\n" + "9:cmp $1, %%rdi\n" // %rdi = how (SIG_UNBLOCK) + "jnz 10f\n" + "xor $-1, %%rsi\n" + "and %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault + "jmp 11f\n" + "10:cmp $2, %%rdi\n" // %rdi = how (SIG_SETMASK) + "jnz 7b\n" + "mov %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault + "11:xor %%rax, %%rax\n" + "test %%rdx, %%rdx\n" // only return old mask, if set is non-NULL + "jz 7b\n" + "mov %%r10, 0(%%rdx)\n" // old_set + "jmp 7b\n" + + // Handle rt_sigreturn() + "12:cmp $15, %%rax\n" // NR_rt_sigreturn + "jnz 14f\n" + "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault + "13:syscall\n" // rt_sigreturn() is unrestricted + "mov $66, %%edi\n" // rt_sigreturn() should never return + "mov $231, %%eax\n" // NR_exit_group + "jmp 13b\n" + + // Copy signal frame onto new stack. 
See clone.cc for details + "14:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000 + "jnz 15f\n" + "lea 8(%%rsp), %%rax\n" // retain stack frame upon returning + "mov %%rax, 0xA8(%%rsp)\n" // %rsp at time of segmentation fault + "jmp 7b\n" + + // Forward system call to syscallWrapper() + "15:lea 7b(%%rip), %%rcx\n" + "push %%rcx\n" + "push 0xB8(%%rsp)\n" // %rip at time of segmentation fault + "lea playground$syscallWrapper(%%rip), %%rcx\n" + "jmp *%%rcx\n" + + // In order to implement SA_NODEFER, we have to keep track of recursive + // calls to SIGSEGV handlers. This means we have to increment a counter + // before calling the user's signal handler, and decrement it on + // leaving the user's signal handler. + // Some signal handlers look at the return address of the signal + // stack, and more importantly "gdb" uses the call to rt_sigreturn() + // as a magic signature when doing stacktraces. So, we have to use + // a little more unusual code to regain control after the user's + // signal handler is done. We adjust the return address to point to + // non-executable memory. And when we trigger another SEGV we pop the + // extraneous signal frame and then call rt_sigreturn(). + // N.B. We currently do not correctly adjust the SEGV counter, if the + // user's signal handler exits in way other than by returning (e.g. by + // directly calling rt_sigreturn(), or by calling siglongjmp()). + "16:lea 22f(%%rip), %%r14\n" + "cmp %%r14, %%r15\n" + "jnz 17f\n" // check if returning from user's handler + "decl %%gs:0x105C-0xE0\n" // decrement SEGV recursion counter + "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault + "mov $0xF, %%eax\n" // NR_rt_sigreturn + "syscall\n" + + // This was a genuine segmentation fault. Check Sandbox::sa_segv_ for + // what we are supposed to do. 
+ "17:mov playground$sa_segv@GOTPCREL(%%rip), %%rax\n" + "cmp $0, 0(%%rax)\n" // SIG_DFL + "jz 18f\n" + "cmp $1, 0(%%rax)\n" // SIG_IGN + "jnz 19f\n" // can't really ignore synchronous signals + + // Trigger the kernel's default signal disposition. The only way we can + // do this from seccomp mode is by blocking the signal and retriggering + // it. + "18:orb $4, 0x131(%%rsp)\n" // signal mask at time of segmentation fault + "ret\n" + + // Check sa_flags: + // - We can ignore SA_NOCLDSTOP, SA_NOCLDWAIT, and SA_RESTART as they + // do not have any effect for SIGSEGV. + // - On x86-64, we can also ignore SA_SIGINFO, as the calling + // conventions for sa_handler() are a subset of the conventions for + // sa_sigaction(). + // - We have to always register our signal handler with SA_NODEFER so + // that the user's signal handler can make system calls which might + // require additional help from our SEGV handler. + // - If the user's signal handler wasn't supposed to be SA_NODEFER, then + // we emulate this behavior by keeping track of a recursion counter. + // + // TODO(markus): If/when we add support for sigaltstack(), we have to + // handle SA_ONSTACK. + "19:cmpl $0, %%gs:0x105C-0xE0\n"// check if we failed inside of SEGV handler + "jnz 18b\n" // if so, then terminate program + "mov 0(%%rax), %%rbx\n" // sa_segv_.sa_sigaction + "mov 8(%%rax), %%rcx\n" // sa_segv_.sa_flags + "btl $31, %%ecx\n" // SA_RESETHAND + "jnc 20f\n" + "movq $0, 0(%%rax)\n" // set handler to SIG_DFL + "20:btl $30, %%ecx\n" // SA_NODEFER + "jc 21f\n" + "mov %%r14, 0(%%rsp)\n" // trigger a SEGV on return, so that we can + "incl %%gs:0x105C-0xE0\n" // clean up state; incr. recursion counter + "21:jmp *%%rbx\n" // call user's signal handler + + + // Non-executable version of the restorer function. We use this to + // trigger a SEGV upon returning from the user's signal handler, giving + // us an ability to clean up prior to returning from the SEGV handler. 
+ ".pushsection .data\n" // move code into non-executable section + "22:mov $0xF, %%rax\n" // gdb looks for this signature when doing + "syscall\n" // backtraces + ".popsection\n" +#elif defined(__i386__) + // Inspect instruction at the point where the segmentation fault + // happened. If it is RDTSC, forward the request to the trusted + // thread. + "mov $-3, %%ebx\n" // request for RDTSC + "mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault + "cmpw $0x310F, (%%ebp)\n" // RDTSC + "jz 0f\n" + "cmpw $0x010F, (%%ebp)\n" // RDTSCP + "jnz 9f\n" + "cmpb $0xF9, 2(%%ebp)\n" + "jnz 9f\n" + "mov $-4, %%ebx\n" // request for RDTSCP + "0:" +#ifndef NDEBUG + "lea 100f, %%eax\n" + "push %%eax\n" + "call playground$debugMessage\n" + "sub $4, %%esp\n" +#else + "sub $8, %%esp\n" // allocate buffer for receiving timestamp +#endif + "push %%ebx\n" + "mov %%fs:16, %%ebx\n" // fd = threadFdPub + "mov %%esp, %%ecx\n" // buf = %esp + "mov $4, %%edx\n" // len = sizeof(int) + "1:mov %%edx, %%eax\n" // NR_write + "int $0x80\n" + "cmp %%eax, %%edx\n" + "jz 7f\n" + "cmp $-4, %%eax\n" // EINTR + "jz 1b\n" + "2:add $12, %%esp\n" // remove temporary buffer from stack + "xor %%eax, %%eax\n" + "movl $0, 0xC8(%%esp)\n" // %edx at time of segmentation fault + "cmpw $0x310F, (%%ebp)\n" // RDTSC + "jz 3f\n" + "movl $0, 0xCC(%%esp)\n" // %ecx at time of segmentation fault + "3:mov %%eax, 0xD0(%%esp)\n" // %eax at time of segmentation fault + "4:mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault + "addl $2, 0xDC(%%esp)\n" // %eip at time of segmentation fault + "cmpw $0x010F, (%%ebp)\n" // RDTSCP + "jnz 5f\n" + "addl $1, 0xDC(%%esp)\n" // %eip at time of segmentation fault + "5:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger + "mov 0x1CC(%%esp), %%eax\n" // push signal number + "push %%eax\n" + "lea 0x270(%%esp), %%esi\n" // copy siginfo register values + "lea 0x4(%%esp), %%edi\n" // into new location + "mov $22, %%ecx\n" + "cld\n" + "rep movsl\n" + "mov 
0x2C8(%%esp), %%ebx\n" // copy first half of signal mask + "mov %%ebx, 0x54(%%esp)\n" + "lea 6f, %%esi\n" // copy "magic" restorer function + "push %%esi\n" // push restorer function + "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers + "movb $2, %%cl\n" + "rep movsl\n" + "ret\n" // return to restorer function + + // The restorer function is sometimes used by gdb as a magic marker to + // recognize signal stack frames. Don't change any of the next three + // instructions. + "6:pop %%eax\n" // remove dummy argument (signo) + "mov $119, %%eax\n" // NR_sigreturn + "int $0x80\n" + "7:mov $12, %%edx\n" // len = 3*sizeof(int) + "8:mov $3, %%eax\n" // NR_read + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 8b\n" + "cmp %%eax, %%edx\n" + "jnz 2b\n" + "pop %%eax\n" + "pop %%edx\n" + "pop %%ecx\n" + "mov %%edx, 0xC8(%%esp)\n" // %edx at time of segmentation fault + "cmpw $0x310F, (%%ebp)\n" // RDTSC + "jz 3b\n" + "mov %%ecx, 0xCC(%%esp)\n" // %ecx at time of segmentation fault + "jmp 3b\n" + + // If the instruction is INT 0, then this was probably the result + // of playground::Library being unable to find a way to safely + // rewrite the system call instruction. Retrieve the CPU register + // at the time of the segmentation fault and invoke syscallWrapper(). 
+ "9:cmpw $0x00CD, (%%ebp)\n" // INT $0x0 + "jnz 20f\n" +#ifndef NDEBUG + "lea 200f, %%eax\n" + "push %%eax\n" + "call playground$debugMessage\n" + "add $0x4, %%esp\n" +#endif + "mov 0xD0(%%esp), %%eax\n" // %eax at time of segmentation fault + "mov 0xC4(%%esp), %%ebx\n" // %ebx at time of segmentation fault + "mov 0xCC(%%esp), %%ecx\n" // %ecx at time of segmentation fault + "mov 0xC8(%%esp), %%edx\n" // %edx at time of segmentation fault + "mov 0xB8(%%esp), %%esi\n" // %esi at time of segmentation fault + "mov 0xB4(%%esp), %%edi\n" // %edi at time of segmentation fault + "mov 0xB2(%%esp), %%ebp\n" // %ebp at time of segmentation fault + + // Handle sigprocmask() and rt_sigprocmask() + "cmp $175, %%eax\n" // NR_rt_sigprocmask + "jnz 10f\n" + "mov $-22, %%eax\n" // -EINVAL + "cmp $8, %%esi\n" // %esi = sigsetsize (8 bytes = 64 signals) + "jl 3b\n" + "jmp 11f\n" + "10:cmp $126, %%eax\n" // NR_sigprocmask + "jnz 15f\n" + "mov $-22, %%eax\n" + "11:mov 0xFC(%%esp), %%edi\n" // signal mask at time of segmentation fault + "mov 0x100(%%esp), %%ebp\n" + "test %%ecx, %%ecx\n" // only set mask, if set is non-NULL + "jz 14f\n" + "mov 0(%%ecx), %%esi\n" + "mov 4(%%ecx), %%ecx\n" + "cmp $0, %%ebx\n" // %ebx = how (SIG_BLOCK) + "jnz 12f\n" + "or %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault + "or %%ecx, 0x100(%%esp)\n" + "jmp 14f\n" + "12:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK) + "jnz 13f\n" + "xor $-1, %%esi\n" + "xor $-1, %%ecx\n" + "and %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault + "and %%ecx, 0x100(%%esp)\n" + "jmp 14f\n" + "13:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK) + "jnz 3b\n" + "mov %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault + "mov %%ecx, 0x100(%%esp)\n" + "14:xor %%eax, %%eax\n" + "test %%edx, %%edx\n" // only return old mask, if set is non-NULL + "jz 3b\n" + "mov %%edi, 0(%%edx)\n" // old_set + "mov %%ebp, 4(%%edx)\n" + "jmp 3b\n" + + // Handle sigreturn() and rt_sigreturn() + // See 
syscall.cc for a discussion on how we can emulate rt_sigreturn() + // by calling sigreturn() with a suitably adjusted stack. + "15:cmp $119, %%eax\n" // NR_sigreturn + "jnz 17f\n" + "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault + "16:int $0x80\n" // sigreturn() is unrestricted + "17:cmp $173, %%eax\n" // NR_rt_sigreturn + "jnz 18f\n" + "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault + "sub $4, %%esp\n" // add fake return address + "jmp 4b\n" + + // Copy signal frame onto new stack. In the process, we have to convert + // it from an RT signal frame to a legacy signal frame. + // See clone.cc for details + "18:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000 + "jnz 19f\n" + "lea -0x1C8(%%esp), %%eax\n"// retain stack frame upon returning + "mov %%eax, 0xC0(%%esp)\n" // %esp at time of segmentation fault + "jmp 3b\n" + + // Forward system call to syscallWrapper() + "19:call playground$syscallWrapper\n" + "jmp 3b\n" + + // In order to implement SA_NODEFER, we have to keep track of recursive + // calls to SIGSEGV handlers. This means we have to increment a counter + // before calling the user's signal handler, and decrement it on + // leaving the user's signal handler. + // Some signal handlers look at the return address of the signal + // stack, and more importantly "gdb" uses the call to {,rt_}sigreturn() + // as a magic signature when doing stacktraces. So, we have to use + // a little more unusual code to regain control after the user's + // signal handler is done. We adjust the return address to point to + // non-executable memory. And when we trigger another SEGV we pop the + // extraneous signal frame and then call sigreturn(). + // N.B. We currently do not correctly adjust the SEGV counter, if the + // user's signal handler exits in way other than by returning (e.g. by + // directly calling {,rt_}sigreturn(), or by calling siglongjmp()). 
+ "20:lea 30f, %%edi\n" // rt-style restorer function + "lea 31f, %%esi\n" // legacy restorer function + "cmp %%ebp, %%edi\n" // check if returning from user's handler + "jnz 21f\n" + "decl %%fs:0x1040-0x58\n" // decrement SEGV recursion counter + "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault + "jmp 29f\n" + "21:cmp %%ebp, %%esi\n" // check if returning from user's handler + "jnz 22f\n" + "decl %%fs:0x1040-0x58\n" // decrement SEGV recursion counter + "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault + "jmp 6b\n" + + // This was a genuine segmentation fault. Check Sandbox::sa_segv_ for + // what we are supposed to do. + "22:lea playground$sa_segv, %%eax\n" + "cmp $0, 0(%%eax)\n" // SIG_DFL + "jz 23f\n" + "cmp $1, 0(%%eax)\n" // SIG_IGN + "jnz 24f\n" // can't really ignore synchronous signals + + // Trigger the kernel's default signal disposition. The only way we can + // do this from seccomp mode is by blocking the signal and retriggering + // it. + "23:orb $4, 0xFD(%%esp)\n" // signal mask at time of segmentation fault + "jmp 5b\n" + + // Check sa_flags: + // - We can ignore SA_NOCLDSTOP, SA_NOCLDWAIT, and SA_RESTART as they + // do not have any effect for SIGSEGV. + // - We have to always register our signal handler with SA_NODEFER so + // that the user's signal handler can make system calls which might + // require additional help from our SEGV handler. + // - If the user's signal handler wasn't supposed to be SA_NODEFER, then + // we emulate this behavior by keeping track of a recursion counter. + // + // TODO(markus): If/when we add support for sigaltstack(), we have to + // handle SA_ONSTACK. 
+ "24:cmpl $0, %%fs:0x1040-0x58\n"// check if we failed inside of SEGV handler + "jnz 23b\n" // if so, then terminate program + "mov 0(%%eax), %%ebx\n" // sa_segv_.sa_sigaction + "mov 4(%%eax), %%ecx\n" // sa_segv_.sa_flags + "btl $31, %%ecx\n" // SA_RESETHAND + "jnc 25f\n" + "movl $0, 0(%%eax)\n" // set handler to SIG_DFL + "25:btl $30, %%ecx\n" // SA_NODEFER + "jc 28f\n" + "btl $2, %%ecx\n" // SA_SIGINFO + "jnc 26f\n" + "mov %%edi, 0(%%esp)\n" // trigger a SEGV on return + "incl %%fs:0x1040-0x58\n" // increment recursion counter + "jmp *%%ebx\n" // call user's signal handler + "26:mov %%esi, 0(%%esp)\n" + "incl %%fs:0x1040-0x58\n" // increment recursion counter + + // We always register the signal handler to give us rt-style signal + // frames. But if the user asked for legacy signal frames, we must + // convert the signal frame prior to calling the user's signal handler. + "27:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger + "mov 0x1CC(%%esp), %%eax\n" // push signal number + "push %%eax\n" + "mov 0x1CC(%%esp), %%eax\n" // push restorer function + "push %%eax\n" + "lea 0x274(%%esp), %%esi\n" // copy siginfo register values + "lea 0x8(%%esp), %%edi\n" // into new location + "mov $22, %%ecx\n" + "cld\n" + "rep movsl\n" + "mov 0x2CC(%%esp), %%eax\n" // copy first half of signal mask + "mov %%eax, 0x58(%%esp)\n" + "lea 31f, %%esi\n" + "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers + "movb $2, %%cl\n" + "rep movsl\n" + "jmp *%%ebx\n" // call user's signal handler + "28:lea 6b, %%eax\n" // set appropriate restorer function + "mov %%eax, 0(%%esp)\n" + "btl $2, %%ecx\n" // SA_SIGINFO + "jnc 27b\n" + "lea 29f, %%eax\n" + "mov %%eax, 0(%%esp)\n" // set appropriate restorer function + "jmp *%%ebx\n" // call user's signal handler + "29:pushl $30f\n" // emulate rt_sigreturn() + "jmp 5b\n" + + // Non-executable versions of the restorer function. 
We use these to + // trigger a SEGV upon returning from the user's signal handler, giving + // us an ability to clean up prior to returning from the SEGV handler. + ".pushsection .data\n" // move code into non-executable section + "30:mov $173, %%eax\n" // NR_rt_sigreturn + "int $0x80\n" // gdb looks for this signature when doing + ".byte 0\n" // backtraces + "31:pop %%eax\n" + "mov $119, %%eax\n" // NR_sigreturn + "int $0x80\n" + ".popsection\n" +#else +#error Unsupported target platform +#endif + ".pushsection \".rodata\"\n" +#ifndef NDEBUG + "100:.asciz \"RDTSC(P): Executing handler\\n\"\n" + "200:.asciz \"INT $0x0: Executing handler\\n\"\n" +#endif + ".popsection\n" + "999:pop %0\n" + : "=g"(fnc) + : + : "memory" +#if defined(__x86_64__) + , "rsp" +#elif defined(__i386__) + , "esp" +#endif + ); + return fnc; +} + +SecureMem::Args* Sandbox::getSecureMem() { + // Check trusted_thread.cc for the magic offset that gets us from the TLS + // to the beginning of the secure memory area. + SecureMem::Args* ret; +#if defined(__x86_64__) + asm volatile( + "movq %%gs:-0xE0, %0\n" + : "=q"(ret)); +#elif defined(__i386__) + asm volatile( + "movl %%fs:-0x58, %0\n" + : "=r"(ret)); +#else +#error Unsupported target platform +#endif + return ret; +} + +void Sandbox::snapshotMemoryMappings(int processFd, int proc_self_maps) { + SysCalls sys; + if (sys.lseek(proc_self_maps, 0, SEEK_SET) || + !sendFd(processFd, proc_self_maps, -1, NULL, 0)) { + failure: + die("Cannot access /proc/self/maps"); + } + int dummy; + if (read(sys, processFd, &dummy, sizeof(dummy)) != sizeof(dummy)) { + goto failure; + } +} + +int Sandbox::supportsSeccompSandbox(int proc_fd) { + if (status_ != STATUS_UNKNOWN) { + return status_ != STATUS_UNSUPPORTED; + } + int fds[2]; + SysCalls sys; + if (sys.pipe(fds)) { + status_ = STATUS_UNSUPPORTED; + return 0; + } + pid_t pid; + switch ((pid = sys.fork())) { + case -1: + status_ = STATUS_UNSUPPORTED; + return 0; + case 0: { + int devnull = sys.open("/dev/null", 
O_RDWR, 0); + if (devnull >= 0) { + sys.dup2(devnull, 0); + sys.dup2(devnull, 1); + sys.dup2(devnull, 2); + sys.close(devnull); + } + if (proc_fd >= 0) { + setProcSelfMaps(sys.openat(proc_fd, "self/maps", O_RDONLY, 0)); + } + startSandbox(); + write(sys, fds[1], "", 1); + + // Try to tell the trusted thread to shut down the entire process in an + // orderly fashion + defaultSystemCallHandler(__NR_exit_group, 0, 0, 0, 0, 0, 0); + + // If that did not work (e.g. because the kernel does not know about the + // exit_group() system call), make a direct _exit() system call instead. + // This system call is unrestricted in seccomp mode, so it will always + // succeed. Normally, we don't like it, because unlike exit_group() it + // does not terminate any other thread. But since we know that + // exit_group() exists in all kernels which support kernel-level threads, + // this is OK we only get here for old kernels where _exit() is OK. + sys._exit(0); + } + default: + NOINTR_SYS(sys.close(fds[1])); + char ch; + if (read(sys, fds[0], &ch, 1) != 1) { + status_ = STATUS_UNSUPPORTED; + } else { + status_ = STATUS_AVAILABLE; + } + int rc; + NOINTR_SYS(sys.waitpid(pid, &rc, 0)); + NOINTR_SYS(sys.close(fds[0])); + return status_ != STATUS_UNSUPPORTED; + } +} + +void Sandbox::setProcSelfMaps(int proc_self_maps) { + proc_self_maps_ = proc_self_maps; +} + +void Sandbox::startSandbox() { + if (status_ == STATUS_UNSUPPORTED) { + die("The seccomp sandbox is not supported on this computer"); + } else if (status_ == STATUS_ENABLED) { + return; + } + + SysCalls sys; + if (proc_self_maps_ < 0) { + proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0); + if (proc_self_maps_ < 0) { + die("Cannot access \"/proc/self/maps\""); + } + } + + // The pid is unchanged for the entire program, so we can retrieve it once + // and store it in a global variable. 
+ pid_ = sys.getpid(); + + // Block all signals, except for the RDTSC handler + setupSignalHandlers(); + + // Get socketpairs for talking to the trusted process + int pair[4]; + if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair) || + sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) { + die("Failed to create trusted thread"); + } + processFdPub_ = pair[0]; + cloneFdPub_ = pair[2]; + SecureMemArgs* secureMem = createTrustedProcess(pair[0], pair[1], + pair[2], pair[3]); + + // We find all libraries that have system calls and redirect the system + // calls to the sandbox. If we miss any system calls, the application will be + // terminated by the kernel's seccomp code. So, from a security point of + // view, if this code fails to identify system calls, we are still behaving + // correctly. + { + Maps maps(proc_self_maps_); + const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL }; + + // Intercept system calls in the VDSO segment (if any). This has to happen + // before intercepting system calls in any of the other libraries, as + // the main kernel entry point might be inside of the VDSO and we need to + // determine its address before we can compare it to jumps from inside + // other libraries. + for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){ + Library* library = *iter; + if (library->isVDSO() && library->parseElf()) { + library->makeWritable(true); + library->patchSystemCalls(); + library->makeWritable(false); + break; + } + } + + // Intercept system calls in libraries that are known to have them. + for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){ + Library* library = *iter; + const char* mapping = iter.name().c_str(); + + // Find the actual base name of the mapped library by skipping past any + // SPC and forward-slashes. We don't want to accidentally find matches, + // because the directory name included part of our well-known lib names. 
+ // + // Typically, prior to pruning, entries would look something like this: + // 08:01 2289011 /lib/libc-2.7.so + for (const char *delim = " /"; *delim; ++delim) { + const char* skip = strrchr(mapping, *delim); + if (skip) { + mapping = skip + 1; + } + } + + for (const char **ptr = libs; *ptr; ptr++) { + const char *name = strstr(mapping, *ptr); + if (name == mapping) { + char ch = name[strlen(*ptr)]; + if (ch < 'A' || (ch > 'Z' && ch < 'a') || ch > 'z') { + if (library->parseElf()) { + library->makeWritable(true); + library->patchSystemCalls(); + library->makeWritable(false); + break; + } + } + } + } + } + } + + // Take a snapshot of the current memory mappings. These mappings will be + // off-limits to all future mmap(), munmap(), mremap(), and mprotect() calls. + snapshotMemoryMappings(processFdPub_, proc_self_maps_); + NOINTR_SYS(sys.close(proc_self_maps_)); + proc_self_maps_ = -1; + + // Creating the trusted thread enables sandboxing + createTrustedThread(processFdPub_, cloneFdPub_, secureMem); + + // We can no longer check for sandboxing support at this point, but we also + // know for a fact that it is available (as we just turned it on). So update + // the status to reflect this information. + status_ = STATUS_ENABLED; +} + +} // namespace diff --git a/sandbox/linux/seccomp/sandbox.h b/sandbox/linux/seccomp/sandbox.h new file mode 100644 index 0000000..8f49575 --- /dev/null +++ b/sandbox/linux/seccomp/sandbox.h @@ -0,0 +1,12 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef SANDBOX_H__ +#define SANDBOX_H__ + +extern "C" int SupportsSeccompSandbox(int proc_fd); +extern "C" void SeccompSandboxSetProcSelfMaps(int proc_self_maps); +extern "C" void StartSeccompSandbox(); + +#endif // SANDBOX_H__ diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h new file mode 100644 index 0000000..3e99a5510 --- /dev/null +++ b/sandbox/linux/seccomp/sandbox_impl.h @@ -0,0 +1,715 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef SANDBOX_IMPL_H__ +#define SANDBOX_IMPL_H__ + +#include <asm/ldt.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/prctl.h> +#include <linux/unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <sched.h> +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/ptrace.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> + +#define NOINTR_SYS(x) \ + ({ typeof(x) i__; while ((i__ = (x)) < 0 && sys.my_errno == EINTR); i__;}) + +#ifdef __cplusplus +#include <map> +#include <vector> +#include "sandbox.h" +#include "securemem.h" +#include "tls.h" + +namespace playground { + +class Sandbox { + // TODO(markus): restrict access to our private file handles + public: + enum { kMaxThreads = 100 }; + + + // There are a lot of reasons why the Seccomp sandbox might not be available. + // This could be because the kernel does not support Seccomp mode, or it + // could be because we fail to successfully rewrite all system call entry + // points. + // "proc_fd" should be a file descriptor for "/proc", or -1 if not provided + // by the caller. 
+ static int supportsSeccompSandbox(int proc_fd) + asm("SupportsSeccompSandbox"); + + // The sandbox needs to be able to access "/proc/self/maps". If this file + // is not accessible when "startSandbox()" gets called, the caller can + // provide an already opened file descriptor by calling "setProcSelfMaps()". + // The sandbox becomes the new owner of this file descriptor and will + // eventually close it when "startSandbox()" executes. + static void setProcSelfMaps(int proc_self_maps) + asm("SeccompSandboxSetProcSelfMaps"); + + // This is the main public entry point. It finds all system calls that + // need rewriting, sets up the resources needed by the sandbox, and + // enters Seccomp mode. + static void startSandbox() asm("StartSeccompSandbox"); + + private: +// syscall_table.c has to be implemented in C, as C++ does not support +// designated initializers for arrays. The only other alternative would be +// to have a source code generator for this table. +// +// We would still like the C source file to include our header file. This +// requires some define statements to transform C++ specific constructs to +// something that is palatable to a C compiler. +#define STATIC static +#define SecureMemArgs SecureMem::Args + // Clone() is special as it has a wrapper in syscall_table.c. The wrapper + // adds one extra argument (the pointer to the saved registers) and then + // calls playground$sandbox__clone(). + static long sandbox_clone(int flags, char* stack, int* pid, int* ctid, + void* tls, void* wrapper_sp) + asm("playground$sandbox__clone") + #if defined(__x86_64__) + __attribute__((visibility("internal"))) +#endif + ; +#else +#define STATIC +#define bool int +#define SecureMemArgs void + // This is the wrapper entry point that is found in the syscall_table.
+ long sandbox_clone(int flags, char* stack, int* pid, int* ctid, void* tls) + asm("playground$sandbox_clone"); +#endif + + // Entry points for sandboxed code that is attempting to make system calls + STATIC long sandbox_access(const char*, int) + asm("playground$sandbox_access"); + STATIC long sandbox_exit(int status) asm("playground$sandbox_exit"); + STATIC long sandbox_getpid() asm("playground$sandbox_getpid"); + #if defined(__NR_getsockopt) + STATIC long sandbox_getsockopt(int, int, int, void*, socklen_t*) + asm("playground$sandbox_getsockopt"); + #endif + STATIC long sandbox_gettid() asm("playground$sandbox_gettid"); + STATIC long sandbox_ioctl(int d, int req, void* arg) + asm("playground$sandbox_ioctl"); + #if defined(__NR_ipc) + STATIC long sandbox_ipc(unsigned, int, int, int, void*, long) + asm("playground$sandbox_ipc"); + #endif + STATIC long sandbox_lstat(const char* path, void* buf) + asm("playground$sandbox_lstat"); + #if defined(__NR_lstat64) + STATIC long sandbox_lstat64(const char *path, void* b) + asm("playground$sandbox_lstat64"); + #endif + STATIC long sandbox_madvise(void*, size_t, int) + asm("playground$sandbox_madvise"); + STATIC void *sandbox_mmap(void* start, size_t length, int prot, int flags, + int fd, off_t offset) + asm("playground$sandbox_mmap"); + STATIC long sandbox_mprotect(const void*, size_t, int) + asm("playground$sandbox_mprotect"); + STATIC long sandbox_munmap(void* start, size_t length) + asm("playground$sandbox_munmap"); + STATIC long sandbox_open(const char*, int, mode_t) + asm("playground$sandbox_open"); + #if defined(__NR_recvfrom) + STATIC ssize_t sandbox_recvfrom(int, void*, size_t, int, void*, socklen_t*) + asm("playground$sandbox_recvfrom"); + STATIC ssize_t sandbox_recvmsg(int, struct msghdr*, int) + asm("playground$sandbox_recvmsg"); + #endif + #if defined(__NR_rt_sigaction) + STATIC long sandbox_rt_sigaction(int, const void*, void*, size_t) + asm("playground$sandbox_rt_sigaction"); + #endif + #if 
defined(__NR_rt_sigprocmask) + STATIC long sandbox_rt_sigprocmask(int how, const void*, void*, size_t) + asm("playground$sandbox_rt_sigprocmask"); + #endif + #if defined(__NR_sendmsg) + STATIC size_t sandbox_sendmsg(int, const struct msghdr*, int) + asm("playground$sandbox_sendmsg"); + STATIC ssize_t sandbox_sendto(int, const void*, size_t, int, const void*, + socklen_t)asm("playground$sandbox_sendto"); + #endif + #if defined(__NR_shmat) + STATIC void* sandbox_shmat(int, const void*, int) + asm("playground$sandbox_shmat"); + STATIC long sandbox_shmctl(int, int, void*) + asm("playground$sandbox_shmctl"); + STATIC long sandbox_shmdt(const void*) asm("playground$sandbox_shmdt"); + STATIC long sandbox_shmget(int, size_t, int) + asm("playground$sandbox_shmget"); + #endif + #if defined(__NR_setsockopt) + STATIC long sandbox_setsockopt(int, int, int, const void*, socklen_t) + asm("playground$sandbox_setsockopt"); + #endif + #if defined(__NR_sigaction) + STATIC long sandbox_sigaction(int, const void*, void*) + asm("playground$sandbox_sigaction"); + #endif + #if defined(__NR_signal) + STATIC void* sandbox_signal(int, const void*) + asm("playground$sandbox_signal"); + #endif + #if defined(__NR_sigprocmask) + STATIC long sandbox_sigprocmask(int how, const void*, void*) + asm("playground$sandbox_sigprocmask"); + #endif + #if defined(__NR_socketcall) + STATIC long sandbox_socketcall(int call, void* args) + asm("playground$sandbox_socketcall"); + #endif + STATIC long sandbox_stat(const char* path, void* buf) + asm("playground$sandbox_stat"); + #if defined(__NR_stat64) + STATIC long sandbox_stat64(const char *path, void* b) + asm("playground$sandbox_stat64"); + #endif + + // Functions for system calls that need to be handled in the trusted process + STATIC bool process_access(int, int, int, int, SecureMemArgs*) + asm("playground$process_access"); + STATIC bool process_clone(int, int, int, int, SecureMemArgs*) + asm("playground$process_clone"); + STATIC bool process_exit(int, int, 
int, int, SecureMemArgs*) + asm("playground$process_exit"); + #if defined(__NR_getsockopt) + STATIC bool process_getsockopt(int, int, int, int, SecureMemArgs*) + asm("playground$process_getsockopt"); + #endif + STATIC bool process_ioctl(int, int, int, int, SecureMemArgs*) + asm("playground$process_ioctl"); + #if defined(__NR_ipc) + STATIC bool process_ipc(int, int, int, int, SecureMemArgs*) + asm("playground$process_ipc"); + #endif + STATIC bool process_madvise(int, int, int, int, SecureMemArgs*) + asm("playground$process_madvise"); + STATIC bool process_mmap(int, int, int, int, SecureMemArgs*) + asm("playground$process_mmap"); + STATIC bool process_mprotect(int, int, int, int, SecureMemArgs*) + asm("playground$process_mprotect"); + STATIC bool process_munmap(int, int, int, int, SecureMemArgs*) + asm("playground$process_munmap"); + STATIC bool process_open(int, int, int, int, SecureMemArgs*) + asm("playground$process_open"); + #if defined(__NR_recvfrom) + STATIC bool process_recvfrom(int, int, int, int, SecureMemArgs*) + asm("playground$process_recvfrom"); + STATIC bool process_recvmsg(int, int, int, int, SecureMemArgs*) + asm("playground$process_recvmsg"); + STATIC bool process_sendmsg(int, int, int, int, SecureMemArgs*) + asm("playground$process_sendmsg"); + STATIC bool process_sendto(int, int, int, int, SecureMemArgs*) + asm("playground$process_sendto"); + STATIC bool process_setsockopt(int, int, int, int, SecureMemArgs*) + asm("playground$process_setsockopt"); + #endif + #if defined(__NR_shmat) + STATIC bool process_shmat(int, int, int, int, SecureMemArgs*) + asm("playground$process_shmat"); + STATIC bool process_shmctl(int, int, int, int, SecureMemArgs*) + asm("playground$process_shmctl"); + STATIC bool process_shmdt(int, int, int, int, SecureMemArgs*) + asm("playground$process_shmdt"); + STATIC bool process_shmget(int, int, int, int, SecureMemArgs*) + asm("playground$process_shmget"); + #endif + STATIC bool process_sigaction(int, int, int, int, 
SecureMemArgs*) + asm("playground$process_sigaction"); + #if defined(__NR_socketcall) + STATIC bool process_socketcall(int, int, int, int, SecureMemArgs*) + asm("playground$process_socketcall"); + #endif + STATIC bool process_stat(int, int, int, int, SecureMemArgs*) + asm("playground$process_stat"); + +#ifdef __cplusplus + friend class Debug; + friend class Library; + friend class Maps; + friend class Mutex; + friend class SecureMem; + friend class TLS; + + // Define our own inline system calls. These calls will not be rewritten + // to point to the sandboxed wrapper functions. They thus allow us to + // make actual system calls (e.g. in the sandbox initialization code, and + // in the trusted process) + class SysCalls { + public: + #define SYS_CPLUSPLUS + #define SYS_ERRNO my_errno + #define SYS_INLINE inline + #define SYS_PREFIX -1 + #undef SYS_LINUX_SYSCALL_SUPPORT_H + #include "linux_syscall_support.h" + SysCalls() : my_errno(0) { } + int my_errno; + }; + #ifdef __NR_mmap2 + #define MMAP mmap2 + #define __NR_MMAP __NR_mmap2 + #else + #define MMAP mmap + #define __NR_MMAP __NR_mmap + #endif + + // Print an error message and terminate the program. Used for fatal errors. + static void die(const char *msg = 0) __attribute__((noreturn)) { + SysCalls sys; + if (msg) { + sys.write(2, msg, strlen(msg)); + sys.write(2, "\n", 1); + } + for (;;) { + sys.exit_group(1); + sys._exit(1); + } + } + + // Wrapper around "read()" that can deal with partial and interrupted reads + // and that does not modify the global errno variable. 
+ static ssize_t read(SysCalls& sys, int fd, void* buf, size_t len) { + if (static_cast<ssize_t>(len) < 0) { + sys.my_errno = EINVAL; + return -1; + } + size_t offset = 0; + while (offset < len) { + ssize_t partial = + NOINTR_SYS(sys.read(fd, reinterpret_cast<char*>(buf) + offset, + len - offset)); + if (partial < 0) { + return partial; + } else if (!partial) { + break; + } + offset += partial; + } + return offset; + } + + // Wrapper around "write()" that can deal with interrupted writes and that + // does not modify the global errno variable. + // A short write is returned to the caller as-is; it is not retried. + static ssize_t write(SysCalls& sys, int fd, const void* buf, size_t len){ + return NOINTR_SYS(sys.write(fd, buf, len)); + } + + // Sends a file handle to another process. + // N.B. trusted_thread.cc has an assembly version of this function that + // is safe to use without a call stack. If the wire-format is changed, + // make sure to update the assembly code. + static bool sendFd(int transport, int fd0, int fd1, const void* buf, + size_t len); + + // If getFd() fails, it will set the first valid fd slot (e.g. fd0) to + // -errno. + static bool getFd(int transport, int* fd0, int* fd1, void* buf, + size_t* len); + + // Data structures used to forward system calls to the trusted process.
+ struct Accept { + int sockfd; + void* addr; + socklen_t* addrlen; + } __attribute__((packed)); + + struct Accept4 { + int sockfd; + void* addr; + socklen_t* addrlen; + int flags; + } __attribute__((packed)); + + struct Access { + size_t path_length; + int mode; + } __attribute__((packed)); + + struct Bind { + int sockfd; + void* addr; + socklen_t addrlen; + } __attribute__((packed)); + + struct Clone { + int flags; + char* stack; + int* pid; + int* ctid; + void* tls; + #if defined(__x86_64__) + struct { + void* r15; + void* r14; + void* r13; + void* r12; + void* r11; + void* r10; + void* r9; + void* r8; + void* rdi; + void* rsi; + void* rdx; + void* rcx; + void* rbx; + void* rbp; + void* fake_ret; + } regs64 __attribute__((packed)); + #elif defined(__i386__) + struct { + void* ebp; + void* edi; + void* esi; + void* edx; + void* ecx; + void* ebx; + } regs32 __attribute__((packed)); + #else + #error Unsupported target platform + #endif + void* ret; + } __attribute__((packed)); + + struct Connect { + int sockfd; + void* addr; + socklen_t addrlen; + } __attribute__((packed)); + + struct GetSockName { + int sockfd; + void* name; + socklen_t* namelen; + } __attribute__((packed)); + + struct GetPeerName { + int sockfd; + void* name; + socklen_t* namelen; + } __attribute__((packed)); + + struct GetSockOpt { + int sockfd; + int level; + int optname; + void* optval; + socklen_t* optlen; + } __attribute__((packed)); + + struct IOCtl { + int d; + int req; + void *arg; + } __attribute__((packed)); + + #if defined(__NR_ipc) + struct IPC { + unsigned call; + int first; + int second; + int third; + void* ptr; + long fifth; + } __attribute__((packed)); + #endif + + struct Listen { + int sockfd; + int backlog; + } __attribute__((packed)); + + struct MAdvise { + const void* start; + size_t len; + int advice; + } __attribute__((packed)); + + struct MMap { + void* start; + size_t length; + int prot; + int flags; + int fd; + off_t offset; + } __attribute__((packed)); + + struct 
MProtect { + // NOTE(review): unlike the other request structs declared here, this one + // is not marked __attribute__((packed)) -- confirm whether that is + // intentional before relying on its layout. + const void* addr; + size_t len; + int prot; + }; + + struct MUnmap { + void* start; + size_t length; + } __attribute__((packed)); + + struct Open { + size_t path_length; + int flags; + mode_t mode; + } __attribute__((packed)); + + struct Recv { + int sockfd; + void* buf; + size_t len; + int flags; + } __attribute__((packed)); + + struct RecvFrom { + int sockfd; + void* buf; + size_t len; + int flags; + void* from; + socklen_t *fromlen; + } __attribute__((packed)); + + struct RecvMsg { + int sockfd; + struct msghdr* msg; + int flags; + } __attribute__((packed)); + + struct Send { + int sockfd; + const void* buf; + size_t len; + int flags; + } __attribute__((packed)); + + struct SendMsg { + int sockfd; + const struct msghdr* msg; + int flags; + } __attribute__((packed)); + + struct SendTo { + int sockfd; + const void* buf; + size_t len; + int flags; + const void* to; + socklen_t tolen; + } __attribute__((packed)); + + struct SetSockOpt { + int sockfd; + int level; + int optname; + const void* optval; + socklen_t optlen; + } __attribute__((packed)); + + #if defined(__NR_shmat) + struct ShmAt { + int shmid; + const void* shmaddr; + int shmflg; + } __attribute__((packed)); + + struct ShmCtl { + int shmid; + int cmd; + void *buf; + } __attribute__((packed)); + + struct ShmDt { + const void *shmaddr; + } __attribute__((packed)); + + struct ShmGet { + int key; + size_t size; + int shmflg; + } __attribute__((packed)); + #endif + + struct ShutDown { + int sockfd; + int how; + } __attribute__((packed)); + + struct SigAction { + int sysnum; + int signum; + const SysCalls::kernel_sigaction* action; + const SysCalls::kernel_sigaction* old_action; + size_t sigsetsize; + } __attribute__((packed)); + + struct Socket { + int domain; + int type; + int protocol; + } __attribute__((packed)); + + struct SocketPair { + int domain; + int type; + int protocol; + int* pair; + } __attribute__((packed)); + + #if defined(__NR_socketcall) + struct SocketCall { + int call; + void*
arg_ptr; + union { + Socket socket; + Bind bind; + Connect connect; + Listen listen; + Accept accept; + GetSockName getsockname; + GetPeerName getpeername; + SocketPair socketpair; + Send send; + Recv recv; + SendTo sendto; + RecvFrom recvfrom; + ShutDown shutdown; + SetSockOpt setsockopt; + GetSockOpt getsockopt; + SendMsg sendmsg; + RecvMsg recvmsg; + Accept4 accept4; + } args; + } __attribute__((packed)); + #endif + + struct Stat { + int sysnum; + size_t path_length; + void* buf; + } __attribute__((packed)); + + // Thread local data available from each sandboxed thread. + enum { TLS_COOKIE, TLS_TID, TLS_THREAD_FD }; + static long long cookie() { return TLS::getTLSValue<long long>(TLS_COOKIE); } + static int tid() { return TLS::getTLSValue<int>(TLS_TID); } + static int threadFdPub() { return TLS::getTLSValue<int>(TLS_THREAD_FD); } + static int processFdPub() { return processFdPub_; } + static kernel_sigset_t* signalMask() { return &getSecureMem()->signalMask; } + + // The SEGV handler knows how to handle RDTSC instructions + static void setupSignalHandlers(); + static void (*segv())(int signo, SysCalls::siginfo *context, void *unused); + + // If no specific handler has been registered for a system call, call this + // function which asks the trusted thread to perform the call. This is used + // for system calls that are not restricted. + static void* defaultSystemCallHandler(int syscallNum, void* arg0, + void* arg1, void* arg2, void* arg3, + void* arg4, void* arg5) + asm("playground$defaultSystemCallHandler") + #if defined(__x86_64__) + __attribute__((visibility("internal"))) + #endif + ; + + // Return the current secure memory structure for this thread. + static SecureMem::Args* getSecureMem(); + + // Return a secure memory structure that can be used by a newly created + // thread. 
+ static SecureMem::Args* getNewSecureMem(); + + // This function runs in the trusted process at startup and finds all the + // memory mappings that existed when the sandbox was first enabled. Going + // forward, all these mappings are off-limits for operations such as + // mmap(), munmap(), and mprotect(). + static int initializeProtectedMap(int fd); + + // Helper function that allows the trusted process to get access to + // "/proc/self/maps" in the sandbox. + static void snapshotMemoryMappings(int processFd, int proc_self_maps); + + // Main loop for the trusted process. + static void trustedProcess(int parentMapsFd, int processFdPub, + int sandboxFd, int cloneFd, + SecureMem::Args* secureArena) + __attribute__((noreturn)); + + // Fork()s off the trusted process. + static SecureMem::Args* createTrustedProcess(int processFdPub, int sandboxFd, + int cloneFdPub, int cloneFd); + + // Creates the trusted thread for the initial thread, then enables + // Seccomp mode. + static void createTrustedThread(int processFdPub, int cloneFdPub, + SecureMem::Args* secureMem); + + static int proc_self_maps_; + static enum SandboxStatus { + STATUS_UNKNOWN, STATUS_UNSUPPORTED, STATUS_AVAILABLE, STATUS_ENABLED + } status_; + static int pid_; + static int processFdPub_; + static int cloneFdPub_; + + #ifdef __i386__ + struct SocketCallArgInfo; + static const struct SocketCallArgInfo socketCallArgInfo[]; + #endif + + // We always have to intercept SIGSEGV. If the application wants to set its + // own SEGV handler, we forward to it whenever necessary. + static SysCalls::kernel_sigaction sa_segv_ asm("playground$sa_segv"); + + // The syscall_mutex_ can only be directly accessed by the trusted process. + // It can be accessed by the trusted thread after fork()ing and calling + // mprotect(PROT_READ|PROT_WRITE). The mutex is used for system calls that + // require passing additional data, and that require the trusted process to + // wait until the trusted thread is done processing (e.g.
exit(), clone(), + // open(), stat()) + static int syscall_mutex_ asm("playground$syscall_mutex"); + + // Available in trusted process, only + typedef std::map<void *, long> ProtectedMap; + static ProtectedMap protectedMap_; + static std::vector<SecureMem::Args*> secureMemPool_; +}; + +// If this struct is extended to contain parameters that are read by +// the trusted thread, we will have to mprotect() it to be read-only when +// starting the sandbox. However, currently it is read only by the +// trusted process, and the sandboxed process cannot change the values +// that the fork()'d trusted process sees. +struct SandboxPolicy { + bool allow_file_namespace; // Allow filename-based system calls. +}; + +extern struct SandboxPolicy g_policy; + +} // namespace + +using playground::Sandbox; +#endif // __cplusplus + +#endif // SANDBOX_IMPL_H__ diff --git a/sandbox/linux/seccomp/seccomp.gyp b/sandbox/linux/seccomp/seccomp.gyp new file mode 100644 index 0000000..596be21 --- /dev/null +++ b/sandbox/linux/seccomp/seccomp.gyp @@ -0,0 +1,93 @@ +# Copyright (c) 2010 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ +{ + 'variables': { + 'chromium_code': 1, + 'seccomp_intermediate_dir': '<(INTERMEDIATE_DIR)/seccomp-sandbox', + }, + 'targets': [ + { + 'target_name': 'seccomp_sandbox', + 'type': 'static_library', + 'sources': [ + 'access.cc', + 'allocator.cc', + 'allocator.h', + 'clone.cc', + 'exit.cc', + 'debug.cc', + 'getpid.cc', + 'gettid.cc', + 'ioctl.cc', + 'ipc.cc', + 'library.cc', + 'library.h', + 'linux_syscall_support.h', + 'madvise.cc', + 'maps.cc', + 'maps.h', + 'mmap.cc', + 'mprotect.cc', + 'munmap.cc', + 'mutex.h', + 'open.cc', + 'sandbox.cc', + 'sandbox.h', + 'sandbox_impl.h', + 'securemem.cc', + 'securemem.h', + 'sigaction.cc', + 'sigprocmask.cc', + 'socketcall.cc', + 'stat.cc', + 'syscall.cc', + 'syscall.h', + 'syscall_table.c', + 'syscall_table.h', + 'tls.h', + 'trusted_process.cc', + 'trusted_thread.cc', + 'x86_decode.cc', + 'x86_decode.h', + ], + }, + { + 'target_name': 'seccomp_tests', + 'type': 'executable', + 'sources': [ + 'tests/test_syscalls.cc', + ], + 'include_dirs': [ + '.', + '<(seccomp_intermediate_dir)', + ], + 'dependencies': [ + 'seccomp_sandbox', + ], + 'libraries': [ + '-lpthread', + '-lutil', # For openpty() + ], + 'actions': [ + { + 'action_name': 'make_test_list', + 'inputs': [ + 'tests/list_tests.py', + 'tests/test_syscalls.cc', + ], + 'outputs': ['<(seccomp_intermediate_dir)/test-list.h'], + 'action': ['sh', '-c', 'python <(_inputs) > <(_outputs)'], + }, + ], + }, + { + 'target_name': 'timestats', + 'type': 'executable', + 'sources': [ + 'timestats.cc', + ], + }, + ], +} diff --git a/sandbox/linux/seccomp/securemem.cc b/sandbox/linux/seccomp/securemem.cc new file mode 100644 index 0000000..5f07bbe --- /dev/null +++ b/sandbox/linux/seccomp/securemem.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "debug.h" +#include "mutex.h" +#include "sandbox_impl.h" +#include "securemem.h" + +namespace playground { + +void SecureMem::abandonSystemCall(int fd, int err) { + void* rc = reinterpret_cast<void *>(err); + if (err) { + Debug::message("System call failed\n"); + } + Sandbox::SysCalls sys; + if (Sandbox::write(sys, fd, &rc, sizeof(rc)) != sizeof(rc)) { + Sandbox::die("Failed to send system call"); + } +} + +void SecureMem::dieIfParentDied(int parentMapsFd) { + // The syscall_mutex_ should not be contended. If it is, we are either + // experiencing a very unusual load of system calls that the sandbox is not + // optimized for; or, more likely, the sandboxed process terminated while the + // trusted process was in the middle of waiting for the mutex. We detect + // this situation and terminate the trusted process. + int alive = !lseek(parentMapsFd, 0, SEEK_SET); + if (alive) { + char buf; + do { + alive = read(parentMapsFd, &buf, 1); + } while (alive < 0 && errno == EINTR); + } + if (!alive) { + Sandbox::die(); + } +} + +void SecureMem::lockSystemCall(int parentMapsFd, Args* mem) { + while (!Mutex::lockMutex(&Sandbox::syscall_mutex_, 500)) { + dieIfParentDied(parentMapsFd); + } + asm volatile( + #if defined(__x86_64__) + "lock; incq (%0)\n" + #elif defined(__i386__) + "lock; incl (%0)\n" + #else + #error Unsupported target platform + #endif + : + : "q"(&mem->sequence) + : "memory"); +} + +void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentMapsFd, + Args* mem, int syscallNum, void* arg1, + void* arg2, void* arg3, void* arg4, + void* arg5, void* arg6) { + if (!locked) { + asm volatile( + #if defined(__x86_64__) + "lock; incq (%0)\n" + #elif defined(__i386__) + "lock; incl (%0)\n" + #else + #error Unsupported target platform + #endif + : + : "q"(&mem->sequence) + : "memory"); + } + mem->callType = locked ? 
-2 : -1; + mem->syscallNum = syscallNum; + mem->arg1 = arg1; + mem->arg2 = arg2; + mem->arg3 = arg3; + mem->arg4 = arg4; + mem->arg5 = arg5; + mem->arg6 = arg6; + asm volatile( + #if defined(__x86_64__) + "lock; incq (%0)\n" + #elif defined(__i386__) + "lock; incl (%0)\n" + #else + #error Unsupported target platform + #endif + : + : "q"(&mem->sequence) + : "memory"); + Sandbox::SysCalls sys; + if (Sandbox::write(sys, fd, &mem->callType, sizeof(int)) != sizeof(int)) { + Sandbox::die("Failed to send system call"); + } + if (parentMapsFd >= 0) { + while (!Mutex::waitForUnlock(&Sandbox::syscall_mutex_, 500)) { + dieIfParentDied(parentMapsFd); + } + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h new file mode 100644 index 0000000..91283db --- /dev/null +++ b/sandbox/linux/seccomp/securemem.h @@ -0,0 +1,205 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef SECURE_MEM_H__ +#define SECURE_MEM_H__ + +#include <stdlib.h> +#include "linux_syscall_support.h" + +namespace playground { + +class SecureMem { + public: + // Each thread is associated with two memory pages (i.e. 8192 bytes). This + // memory is fully accessible by the trusted process, but in the trusted + // thread and the sandboxed thread, the first page is only mapped PROT_READ, + // and the second one is PROT_READ|PROT_WRITE. + // + // The first page can be modified by the trusted process and this is the + // main mechanism how it communicates with the trusted thread. After each + // update, it updates the "sequence" number. The trusted process must + // check the "sequence" number has the expected value, and only then can + // it trust the data in this page. 
+ typedef struct Args { + union { + struct { + union { + struct { + struct Args* self; + long sequence; + long callType; + long syscallNum; + void* arg1; + void* arg2; + void* arg3; + void* arg4; + void* arg5; + void* arg6; + + // Used by clone() to allow return from the syscall wrapper. + void* ret; + #if defined(__x86_64__) + void* rbp; + void* rbx; + void* rcx; + void* rdx; + void* rsi; + void* rdi; + void* r8; + void* r9; + void* r10; + void* r11; + void* r12; + void* r13; + void* r14; + void* r15; + #elif defined(__i386__) + void* ebp; + void* edi; + void* esi; + void* edx; + void* ecx; + void* ebx; + #else + #error Unsupported target platform + #endif + + // Used by clone() to set up data for the new thread. + struct Args* newSecureMem; + int processFdPub; + int cloneFdPub; + + // Set to non-zero, if in debugging mode + int allowAllSystemCalls; + + // The most recent SysV SHM identifier returned by + // shmget(IPC_PRIVATE) + int shmId; + + // The following entries make up the sandboxed thread's TLS + long long cookie; + long long threadId; + long long threadFdPub; + } __attribute__((packed)); + char header[512]; + }; + // Used for calls such as open() and stat(). + char pathname[4096 - 512]; + } __attribute__((packed)); + char securePage[4096]; + }; + union { + struct { + // This scratch space is used by the trusted thread to read parameters + // for unrestricted system calls. + int tmpSyscallNum; + void* tmpArg1; + void* tmpArg2; + void* tmpArg3; + void* tmpArg4; + void* tmpArg5; + void* tmpArg6; + void* tmpReturnValue; + + // Scratch space used to return the result of a rdtsc instruction + int rdtscpEax; + int rdtscpEdx; + int rdtscpEcx; + + // We often have long sequences of calls to gettimeofday(). This is + // needlessly expensive. Coalesce them into a single call. + int lastSyscallNum; + int gettimeofdayCounter; + + // For debugging purposes, we want to be able to log messages. This can + // result in additional system calls. 
Make sure that we don't trigger + // logging of those recursive calls. + int recursionLevel; + + // Computing the signal mask is expensive. Keep a cached copy. + kernel_sigset_t signalMask; + + // Keep track of whether we are in a SEGV handler + int inSegvHandler; + } __attribute__((packed)); + char scratchPage[4096]; + }; + } __attribute__((packed)) Args; + + // Allows the trusted process to check whether the parent process still + // exists. If it doesn't, kill the trusted process. + static void dieIfParentDied(int parentProc); + + // The trusted process received a system call that it intends to deny. + static void abandonSystemCall(int fd, int err); + + // Acquires the syscall_mutex_ prior to making changes to the parameters in + // the secure memory page. Used by calls such as exit(), clone(), open(), + // socketcall(), and stat(). + // After locking the mutex, it is no longer valid to abandon the system + // call! + static void lockSystemCall(int parentProc, Args* mem); + + // Sends a system call to the trusted thread. If "locked" is true, the + // caller must first call lockSystemCall() and must also provide + // "parentProc". In locked mode, sendSystemCall() won't return until the + // trusted thread has completed processing. + // Use sparingly as it serializes the operation of the trusted process. 
+ static void sendSystemCall(int fd, bool locked, int parentProc, Args* mem, + int syscallNum) { + sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum); + } + template<class T1> static + void sendSystemCall(int fd, bool locked, int parentProc, Args* mem, + int syscallNum, T1 arg1) { + sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum, + (void*)arg1); + } + template<class T1, class T2> static + void sendSystemCall(int fd, bool locked, int parentProc, Args* mem, + int syscallNum, T1 arg1, T2 arg2) { + sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum, + (void*)arg1, (void*)arg2); + } + template<class T1, class T2, class T3> static + void sendSystemCall(int fd, bool locked, int parentProc, Args* mem, + int syscallNum, T1 arg1, T2 arg2, T3 arg3) { + sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum, + (void*)arg1, (void*)arg2, (void*)arg3); + } + template<class T1, class T2, class T3, class T4> static + void sendSystemCall(int fd, bool locked, int parentProc, Args* mem, + int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4) { + sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum, + (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4); + } + template<class T1, class T2, class T3, class T4, class T5> static + void sendSystemCall(int fd, bool locked, int parentProc, Args* mem, + int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4, + T5 arg5) { + sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum, + (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4, + (void*)arg5); + } + template<class T1, class T2, class T3, class T4, class T5, class T6> static + void sendSystemCall(int fd, bool locked, int parentProc, Args* mem, + int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4, + T5 arg5, T6 arg6) { + sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum, + (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4, + (void*)arg5, (void*)arg6); + } + + private: + static void sendSystemCallInternal(int fd, bool 
locked, int parentProc, + Args* mem, int syscallNum, void* arg1 = 0, + void* arg2 = 0, void* arg3 = 0, + void* arg4 = 0, void* arg5 = 0, + void* arg6 = 0); +}; + +} // namespace + +#endif // SECURE_MEM_H__ diff --git a/sandbox/linux/seccomp/sigaction.cc b/sandbox/linux/seccomp/sigaction.cc new file mode 100644 index 0000000..162416d --- /dev/null +++ b/sandbox/linux/seccomp/sigaction.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// TODO(markus): We currently instrument the restorer functions with calls to +// the syscallWrapper(). This prevents gdb from properly +// creating backtraces of code that is running in signal +// handlers. We might instead want to always override the +// restorer with a function that contains the "magic" signature +// but that is not executable. The SEGV handler can detect this +// and then invoke the appropriate restorer. 
+ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +#if defined(__NR_sigaction) +long Sandbox::sandbox_sigaction(int signum, const void* a_, void* oa_) { + const SysCalls::kernel_old_sigaction* action = + reinterpret_cast<const SysCalls::kernel_old_sigaction*>(a_); + SysCalls::kernel_old_sigaction* old_action = + reinterpret_cast<SysCalls::kernel_old_sigaction*>(oa_); + + long rc = 0; + long long tm; + Debug::syscall(&tm, __NR_sigaction, "Executing handler"); + if (signum == SIGSEGV) { + if (old_action) { + old_action->sa_handler_ = sa_segv_.sa_handler_; + old_action->sa_mask = sa_segv_.sa_mask.sig[0]; + old_action->sa_flags = sa_segv_.sa_flags; + old_action->sa_restorer = sa_segv_.sa_restorer; + } + if (action) { + sa_segv_.sa_handler_ = action->sa_handler_; + sa_segv_.sa_mask.sig[0] = action->sa_mask; + sa_segv_.sa_flags = action->sa_flags; + sa_segv_.sa_restorer = action->sa_restorer; + } + } else { + struct { + int sysnum; + long long cookie; + SigAction sigaction_req; + } __attribute__((packed)) request; + request.sysnum = __NR_sigaction; + request.cookie = cookie(); + request.sigaction_req.sysnum = __NR_sigaction; + request.sigaction_req.signum = signum; + request.sigaction_req.action = + reinterpret_cast<const SysCalls::kernel_sigaction *>(action); + request.sigaction_req.old_action = + reinterpret_cast<const SysCalls::kernel_sigaction *>(old_action); + request.sigaction_req.sigsetsize = 8; + + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward sigaction() request [sandbox]"); + } + } + Debug::elapsed(tm, __NR_sigaction); + return rc; +} +#endif + +#if defined(__NR_rt_sigaction) +#define min(a,b) ({ typeof(a) a_=(a); typeof(b) b_=(b); a_ < b_ ? a_ : b_; }) +#define max(a,b) ({ typeof(a) a_=(a); typeof(b) b_=(b); a_ > b_ ? 
a_ : b_; }) + +long Sandbox::sandbox_rt_sigaction(int signum, const void* a_, void* oa_, + size_t sigsetsize) { + const SysCalls::kernel_sigaction* action = + reinterpret_cast<const SysCalls::kernel_sigaction*>(a_); + SysCalls::kernel_sigaction* old_action = + reinterpret_cast<SysCalls::kernel_sigaction*>(oa_); + + long rc = 0; + long long tm; + Debug::syscall(&tm, __NR_rt_sigaction, "Executing handler"); + if (signum == SIGSEGV) { + size_t theirSize = offsetof(SysCalls::kernel_sigaction, sa_mask) + + sigsetsize; + if (old_action) { + memcpy(old_action, &sa_segv_, min(sizeof(sa_segv_), theirSize)); + memset(old_action + 1, 0, max(0u, theirSize - sizeof(sa_segv_))); + } + if (action) { + memcpy(&sa_segv_, action, min(sizeof(sa_segv_), theirSize)); + memset(&sa_segv_.sa_mask, 0, max(0u, 8 - sigsetsize)); + } + } else { + struct { + int sysnum; + long long cookie; + SigAction sigaction_req; + } __attribute__((packed)) request; + request.sysnum = __NR_rt_sigaction; + request.cookie = cookie(); + request.sigaction_req.sysnum = __NR_rt_sigaction; + request.sigaction_req.signum = signum; + request.sigaction_req.action = action; + request.sigaction_req.old_action = old_action; + request.sigaction_req.sigsetsize = sigsetsize; + + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward rt_sigaction() request [sandbox]"); + } + } + Debug::elapsed(tm, __NR_rt_sigaction); + return rc; +} +#endif + +#if defined(__NR_signal) +void* Sandbox::sandbox_signal(int signum, const void* handler) { + struct kernel_old_sigaction sa, osa; + sa.sa_handler_ = reinterpret_cast<void (*)(int)>(handler); + sa.sa_flags = SA_NODEFER | SA_RESETHAND | SA_RESTORER; + sa.sa_mask = 0; + asm volatile( + "lea 0f, %0\n" + "jmp 1f\n" + "0:pop %%eax\n" + "mov $119, %%eax\n" // __NR_sigreturn + "int $0x80\n" + "1:\n" + : "=r"(sa.sa_restorer)); + long rc = sandbox_sigaction(signum, 
&sa, &osa); + if (rc < 0) { + return (void *)rc; + } + return reinterpret_cast<void *>(osa.sa_handler_); +} +#endif + +bool Sandbox::process_sigaction(int parentMapsFd, int sandboxFd, + int threadFdPub, int threadFd, + SecureMem::Args* mem) { + // We need to intercept sigaction() in order to properly rewrite calls to + // sigaction(SEGV). While there is no security implication if we didn't do + // so, it would end up preventing the program from running correctly as the + // the sandbox's SEGV handler could accidentally get removed. All of this is + // done in sandbox_{,rt_}sigaction(). But we still bounce through the + // trusted process as that is the only way we can instrument system calls. + // This is somewhat needlessly complicated. But as sigaction() is not a + // performance critical system call, it is easier to do this way than to + // extend the format of the syscall_table so that it could deal with this + // special case. + + // Read request + SigAction sigaction_req; + SysCalls sys; + if (read(sys, sandboxFd, &sigaction_req, sizeof(sigaction_req)) != + sizeof(sigaction_req)) { + die("Failed to read parameters for sigaction() [process]"); + } + if (sigaction_req.signum == SIGSEGV) { + // This should never happen. Something went wrong when intercepting the + // system call. This is not a security problem, but it clearly doesn't + // make sense to let the system call pass. + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, sigaction_req.sysnum, + sigaction_req.signum, sigaction_req.action, + sigaction_req.old_action, + sigaction_req.sigsetsize); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/sigprocmask.cc b/sandbox/linux/seccomp/sigprocmask.cc new file mode 100644 index 0000000..9ff2922 --- /dev/null +++ b/sandbox/linux/seccomp/sigprocmask.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +// If the sandboxed process tries to mask SIGSEGV, there is a good chance +// the process will eventually get terminated. If this is really ever a +// problem, we can hide the fact that SIGSEGV is unmasked. But I don't think +// we really need this. Masking of synchronous signals is rarely necessary. + +#if defined(__NR_sigprocmask) +long Sandbox::sandbox_sigprocmask(int how, const void* set, void* old_set) { + long long tm; + Debug::syscall(&tm, __NR_sigprocmask, "Executing handler"); + + // Access the signal mask by triggering a SEGV and modifying the signal state + // prior to calling rt_sigreturn(). + long res = -ENOSYS; + #if defined(__x86_64__) + #error x86-64 does not support sigprocmask(); use rt_sigprocmask() instead + #elif defined(__i386__) + asm volatile( + "push %%ebx\n" + "movl %2, %%ebx\n" + "int $0\n" + "pop %%ebx\n" + : "=a"(res) + : "0"(__NR_sigprocmask), "ri"((long)how), + "c"((long)set), "d"((long)old_set) + : "esp", "memory"); + #else + #error Unsupported target platform + #endif + + // Update our shadow signal mask, so that we can copy it upon creation of + // new threads. 
+ if (res == 0 && set != NULL) { + SecureMem::Args* args = getSecureMem(); + switch (how) { + case SIG_BLOCK: + *(unsigned long long *)&args->signalMask |= *(unsigned long long *)set; + break; + case SIG_UNBLOCK: + *(unsigned long long *)&args->signalMask &= ~*(unsigned long long *)set; + break; + case SIG_SETMASK: + *(unsigned long long *)&args->signalMask = *(unsigned long long *)set; + break; + default: + break; + } + } + + Debug::elapsed(tm, __NR_sigprocmask); + + return res; +} +#endif + +#if defined(__NR_rt_sigprocmask) +long Sandbox::sandbox_rt_sigprocmask(int how, const void* set, void* old_set, + size_t bytes) { + long long tm; + Debug::syscall(&tm, __NR_rt_sigprocmask, "Executing handler"); + + // Access the signal mask by triggering a SEGV and modifying the signal state + // prior to calling rt_sigreturn(). + long res = -ENOSYS; + #if defined(__x86_64__) + asm volatile( + "movq %5, %%r10\n" + "int $0\n" + : "=a"(res) + : "0"(__NR_rt_sigprocmask), "D"((long)how), + "S"((long)set), "d"((long)old_set), "r"((long)bytes) + : "r10", "r11", "rcx", "memory"); + #elif defined(__i386__) + asm volatile( + "push %%ebx\n" + "movl %2, %%ebx\n" + "int $0\n" + "pop %%ebx\n" + : "=a"(res) + : "0"(__NR_rt_sigprocmask), "ri"((long)how), + "c"((long)set), "d"((long)old_set), "S"((long)bytes) + : "esp", "memory"); + #else + #error Unsupported target platform + #endif + + // Update our shadow signal mask, so that we can copy it upon creation of + // new threads. 
+ if (res == 0 && set != NULL && bytes >= 8) { + SecureMem::Args* args = getSecureMem(); + switch (how) { + case SIG_BLOCK: + *(unsigned long long *)&args->signalMask |= *(unsigned long long *)set; + break; + case SIG_UNBLOCK: + *(unsigned long long *)&args->signalMask &= ~*(unsigned long long *)set; + break; + case SIG_SETMASK: + *(unsigned long long *)&args->signalMask = *(unsigned long long *)set; + break; + default: + break; + } + } + + Debug::elapsed(tm, __NR_rt_sigprocmask); + + return res; +} +#endif + +} // namespace diff --git a/sandbox/linux/seccomp/socketcall.cc b/sandbox/linux/seccomp/socketcall.cc new file mode 100644 index 0000000..c7b2015 --- /dev/null +++ b/sandbox/linux/seccomp/socketcall.cc @@ -0,0 +1,1039 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +#if defined(__NR_socket) + +ssize_t Sandbox::sandbox_recvfrom(int sockfd, void* buf, size_t len, int flags, + void* from, socklen_t* fromlen) { + long long tm; + Debug::syscall(&tm, __NR_recvfrom, "Executing handler"); + + SysCalls sys; + if (!from && !flags) { + // recv() with a NULL sender and no flags is the same as read(), which + // is unrestricted in seccomp mode. 
+ Debug::message("Replaced recv() with call to read()"); + ssize_t rc = sys.read(sockfd, buf, len); + if (rc < 0) { + Debug::elapsed(tm, __NR_recvfrom); + return -sys.my_errno; + } else { + Debug::elapsed(tm, __NR_recvfrom); + return rc; + } + } + + struct { + int sysnum; + long long cookie; + RecvFrom recvfrom_req; + } __attribute__((packed)) request; + request.sysnum = __NR_recvfrom; + request.cookie = cookie(); + request.recvfrom_req.sockfd = sockfd; + request.recvfrom_req.buf = buf; + request.recvfrom_req.len = len; + request.recvfrom_req.flags = flags; + request.recvfrom_req.from = from; + request.recvfrom_req.fromlen = fromlen; + + long rc; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward recvfrom() request [sandbox]"); + } + Debug::elapsed(tm, __NR_recvfrom); + return static_cast<ssize_t>(rc); +} + +ssize_t Sandbox::sandbox_recvmsg(int sockfd, struct msghdr* msg, int flags) { + long long tm; + Debug::syscall(&tm, __NR_recvmsg, "Executing handler"); + + // We cannot simplify recvmsg() to recvfrom(), recv() or read(), as we do + // not know whether the caller needs us to set msg->msg_flags. 
+ struct { + int sysnum; + long long cookie; + RecvMsg recvmsg_req; + } __attribute__((packed)) request; + request.sysnum = __NR_recvmsg; + request.cookie = cookie(); + request.recvmsg_req.sockfd = sockfd; + request.recvmsg_req.msg = msg; + request.recvmsg_req.flags = flags; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward recvmsg() request [sandbox]"); + } + Debug::elapsed(tm, __NR_recvmsg); + return static_cast<ssize_t>(rc); +} + +size_t Sandbox::sandbox_sendmsg(int sockfd, const struct msghdr* msg, + int flags) { + long long tm; + Debug::syscall(&tm, __NR_sendmsg, "Executing handler"); + + if (msg->msg_iovlen == 1 && msg->msg_controllen == 0) { + // sendmsg() can sometimes be simplified as sendto() + return sandbox_sendto(sockfd, msg->msg_iov, msg->msg_iovlen, + flags, msg->msg_name, msg->msg_namelen); + } + + struct Request { + int sysnum; + long long cookie; + SendMsg sendmsg_req; + struct msghdr msg; + } __attribute__((packed)); + char data[sizeof(struct Request) + msg->msg_namelen + msg->msg_controllen]; + struct Request *request = reinterpret_cast<struct Request *>(data); + request->sysnum = __NR_sendmsg; + request->cookie = cookie(); + request->sendmsg_req.sockfd = sockfd; + request->sendmsg_req.msg = msg; + request->sendmsg_req.flags = flags; + request->msg = *msg; + memcpy(reinterpret_cast<char *>( + memcpy(request + 1, msg->msg_name, msg->msg_namelen)) + + msg->msg_namelen, + msg->msg_control, msg->msg_controllen); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &data, sizeof(data)) != + (ssize_t)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward sendmsg() request [sandbox]"); + } + Debug::elapsed(tm, __NR_sendmsg); + return static_cast<ssize_t>(rc); +} + +ssize_t Sandbox::sandbox_sendto(int sockfd, const void* buf, size_t len, + int flags, 
const void* to, socklen_t tolen) { + long long tm; + Debug::syscall(&tm, __NR_sendto, "Executing handler"); + + SysCalls sys; + if (!to && !flags) { + // sendto() with a NULL recipient and no flags is the same as write(), + // which is unrestricted in seccomp mode. + Debug::message("Replaced sendto() with call to write()"); + ssize_t rc = sys.write(sockfd, buf, len); + if (rc < 0) { + Debug::elapsed(tm, __NR_sendto); + return -sys.my_errno; + } else { + Debug::elapsed(tm, __NR_sendto); + return rc; + } + } + + struct { + int sysnum; + long long cookie; + SendTo sendto_req; + } __attribute__((packed)) request; + request.sysnum = __NR_sendto; + request.cookie = cookie(); + request.sendto_req.sockfd = sockfd; + request.sendto_req.buf = buf; + request.sendto_req.len = len; + request.sendto_req.flags = flags; + request.sendto_req.to = to; + request.sendto_req.tolen = tolen; + + long rc; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward sendto() request [sandbox]"); + } + Debug::elapsed(tm, __NR_sendto); + return static_cast<ssize_t>(rc); +} + +long Sandbox::sandbox_setsockopt(int sockfd, int level, int optname, + const void* optval, socklen_t optlen) { + long long tm; + Debug::syscall(&tm, __NR_setsockopt, "Executing handler"); + + struct { + int sysnum; + long long cookie; + SetSockOpt setsockopt_req; + } __attribute__((packed)) request; + request.sysnum = __NR_setsockopt; + request.cookie = cookie(); + request.setsockopt_req.sockfd = sockfd; + request.setsockopt_req.level = level; + request.setsockopt_req.optname = optname; + request.setsockopt_req.optval = optval; + request.setsockopt_req.optlen = optlen; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward setsockopt() request [sandbox]"); + } + 
Debug::elapsed(tm, __NR_setsockopt); + return rc; +} + +long Sandbox::sandbox_getsockopt(int sockfd, int level, int optname, + void* optval, socklen_t* optlen) { + long long tm; + Debug::syscall(&tm, __NR_getsockopt, "Executing handler"); + + struct { + int sysnum; + long long cookie; + GetSockOpt getsockopt_req; + } __attribute__((packed)) request; + request.sysnum = __NR_getsockopt; + request.cookie = cookie(); + request.getsockopt_req.sockfd = sockfd; + request.getsockopt_req.level = level; + request.getsockopt_req.optname = optname; + request.getsockopt_req.optval = optval; + request.getsockopt_req.optlen = optlen; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward getsockopt() request [sandbox]"); + } + Debug::elapsed(tm, __NR_getsockopt); + return rc; +} + +bool Sandbox::process_recvfrom(int parentMapsFd, int sandboxFd, + int threadFdPub, int threadFd, + SecureMem::Args* mem) { + // Read request + RecvFrom recvfrom_req; + SysCalls sys; + if (read(sys, sandboxFd, &recvfrom_req, sizeof(recvfrom_req)) != + sizeof(recvfrom_req)) { + die("Failed to read parameters for recvfrom() [process]"); + } + + // Unsupported flag encountered. Deny the call. + if (recvfrom_req.flags & + ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) { + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // While we do not anticipate any particular need to receive data on + // unconnected sockets, there is no particular risk in doing so. 
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_recvfrom, recvfrom_req.sockfd, + recvfrom_req.buf, recvfrom_req.len, + recvfrom_req.flags, recvfrom_req.from, + recvfrom_req.fromlen); + return true; +} + +bool Sandbox::process_recvmsg(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + RecvMsg recvmsg_req; + SysCalls sys; + if (read(sys, sandboxFd, &recvmsg_req, sizeof(recvmsg_req)) != + sizeof(recvmsg_req)) { + die("Failed to read parameters for recvmsg() [process]"); + } + + // Unsupported flag encountered. Deny the call. + if (recvmsg_req.flags & + ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) { + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // Receiving messages is general not security critical. + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_recvmsg, recvmsg_req.sockfd, + recvmsg_req.msg, recvmsg_req.flags); + return true; +} + +bool Sandbox::process_sendmsg(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + struct { + SendMsg sendmsg_req; + struct msghdr msg; + } __attribute__((packed)) data; + SysCalls sys; + if (read(sys, sandboxFd, &data, sizeof(data)) != sizeof(data)) { + die("Failed to read parameters for sendmsg() [process]"); + } + + if (data.msg.msg_namelen > 4096 || data.msg.msg_controllen > 4096) { + die("Unexpected size for socketcall() payload [process]"); + } + char extra[data.msg.msg_namelen + data.msg.msg_controllen]; + if (read(sys, sandboxFd, &extra, sizeof(extra)) != (ssize_t)sizeof(extra)) { + die("Failed to read parameters for sendmsg() [process]"); + } + if (sizeof(struct msghdr) + sizeof(extra) > sizeof(mem->pathname)) { + goto deny; + } + + if (data.msg.msg_namelen || + (data.sendmsg_req.flags & + ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB))) { + deny: + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // The 
trusted process receives file handles when a new untrusted thread + // gets created. We have security checks in place that prevent any + // critical information from being tampered with during thread creation. + // But if we disallowed passing of file handles, this would add an extra + // hurdle for an attacker. + // Unfortunately, for now, this is not possible as Chrome's + // base::SendRecvMsg() needs the ability to pass file handles. + if (data.msg.msg_controllen) { + data.msg.msg_control = extra + data.msg.msg_namelen; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&data.msg); + do { + if (cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) { + goto deny; + } + } while ((cmsg = CMSG_NXTHDR(&data.msg, cmsg)) != NULL); + } + + // This must be a locked system call, because we have to ensure that the + // untrusted code does not tamper with the msghdr after we have examined it. + SecureMem::lockSystemCall(parentMapsFd, mem); + if (sizeof(extra) > 0) { + if (data.msg.msg_namelen > 0) { + data.msg.msg_name = mem->pathname + sizeof(struct msghdr); + } + if (data.msg.msg_controllen > 0) { + data.msg.msg_control = mem->pathname + sizeof(struct msghdr) + + data.msg.msg_namelen; + } + memcpy(mem->pathname + sizeof(struct msghdr), extra, sizeof(extra)); + } + memcpy(mem->pathname, &data.msg, sizeof(struct msghdr)); + SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, + __NR_sendmsg, data.sendmsg_req.sockfd, + mem->pathname - (char*)mem + (char*)mem->self, + data.sendmsg_req.flags); + return true; +} + +bool Sandbox::process_sendto(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SendTo sendto_req; + SysCalls sys; + if (read(sys, sandboxFd, &sendto_req, sizeof(sendto_req)) != + sizeof(sendto_req)) { + die("Failed to read parameters for sendto() [process]"); + } + + // The sandbox does not allow sending to arbitrary addresses. 
+ if (sendto_req.to) { + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // Unsupported flag encountered. Deny the call. + if (sendto_req.flags & + ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB)) { + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // Sending data on a connected socket is similar to calling write(). + // Allow it. + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_sendto, sendto_req.sockfd, + sendto_req.buf, sendto_req.len, + sendto_req.flags, sendto_req.to, + sendto_req.tolen); + return true; +} + +bool Sandbox::process_setsockopt(int parentMapsFd, int sandboxFd, + int threadFdPub, int threadFd, + SecureMem::Args* mem) { + // Read request + SetSockOpt setsockopt_req; + SysCalls sys; + if (read(sys, sandboxFd, &setsockopt_req, sizeof(setsockopt_req)) != + sizeof(setsockopt_req)) { + die("Failed to read parameters for setsockopt() [process]"); + } + + switch (setsockopt_req.level) { + case SOL_SOCKET: + switch (setsockopt_req.optname) { + case SO_KEEPALIVE: + case SO_LINGER: + case SO_OOBINLINE: + case SO_RCVBUF: + case SO_RCVLOWAT: + case SO_SNDLOWAT: + case SO_RCVTIMEO: + case SO_SNDTIMEO: + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_TIMESTAMP: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_setsockopt, setsockopt_req.sockfd, + setsockopt_req.level, setsockopt_req.optname, + setsockopt_req.optval, setsockopt_req.optlen); + return true; + default: + break; + } + break; + case IPPROTO_TCP: + switch (setsockopt_req.optname) { + case TCP_CORK: + case TCP_DEFER_ACCEPT: + case TCP_INFO: + case TCP_KEEPCNT: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_LINGER2: + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_QUICKACK: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_setsockopt, setsockopt_req.sockfd, + setsockopt_req.level, setsockopt_req.optname, + setsockopt_req.optval, 
setsockopt_req.optlen); + return true; + default: + break; + } + break; + default: + break; + } + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; +} + +bool Sandbox::process_getsockopt(int parentMapsFd, int sandboxFd, + int threadFdPub, int threadFd, + SecureMem::Args* mem) { + // Read request + GetSockOpt getsockopt_req; + SysCalls sys; + if (read(sys, sandboxFd, &getsockopt_req, sizeof(getsockopt_req)) != + sizeof(getsockopt_req)) { + die("Failed to read parameters for getsockopt() [process]"); + } + + switch (getsockopt_req.level) { + case SOL_SOCKET: + switch (getsockopt_req.optname) { + case SO_ACCEPTCONN: + case SO_ERROR: + case SO_KEEPALIVE: + case SO_LINGER: + case SO_OOBINLINE: + case SO_RCVBUF: + case SO_RCVLOWAT: + case SO_SNDLOWAT: + case SO_RCVTIMEO: + case SO_SNDTIMEO: + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_TIMESTAMP: + case SO_TYPE: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_getsockopt, getsockopt_req.sockfd, + getsockopt_req.level, getsockopt_req.optname, + getsockopt_req.optval, getsockopt_req.optlen); + return true; + default: + break; + } + break; + case IPPROTO_TCP: + switch (getsockopt_req.optname) { + case TCP_CORK: + case TCP_DEFER_ACCEPT: + case TCP_INFO: + case TCP_KEEPCNT: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_LINGER2: + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_QUICKACK: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_getsockopt, getsockopt_req.sockfd, + getsockopt_req.level, getsockopt_req.optname, + getsockopt_req.optval, getsockopt_req.optlen); + return true; + default: + break; + } + break; + default: + break; + } + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; +} + +#endif +#if defined(__NR_socketcall) + +enum { + SYS_SOCKET = 1, + SYS_BIND = 2, + SYS_CONNECT = 3, + SYS_LISTEN = 4, + SYS_ACCEPT = 5, + SYS_GETSOCKNAME = 6, + SYS_GETPEERNAME = 7, + SYS_SOCKETPAIR = 8, + SYS_SEND = 9, + 
SYS_RECV = 10, + SYS_SENDTO = 11, + SYS_RECVFROM = 12, + SYS_SHUTDOWN = 13, + SYS_SETSOCKOPT = 14, + SYS_GETSOCKOPT = 15, + SYS_SENDMSG = 16, + SYS_RECVMSG = 17, + SYS_ACCEPT4 = 18 +}; + +struct Sandbox::SocketCallArgInfo { + size_t len; + off_t addrOff; + off_t lengthOff; +}; +const struct Sandbox::SocketCallArgInfo Sandbox::socketCallArgInfo[] = { + #define STRUCT(s) reinterpret_cast<SocketCall *>(0)->args.s + #define SIZE(s) sizeof(STRUCT(s)) + #define OFF(s, f) offsetof(typeof STRUCT(s), f) + { 0 }, + { SIZE(socket) }, + { SIZE(bind), OFF(bind, addr), OFF(bind, addrlen) }, + { SIZE(connect), OFF(connect, addr), OFF(connect, addrlen) }, + { SIZE(listen) }, + { SIZE(accept) }, + { SIZE(getsockname) }, + { SIZE(getpeername) }, + { SIZE(socketpair) }, + { SIZE(send) }, + { SIZE(recv) }, + { SIZE(sendto), OFF(sendto, to), OFF(sendto, tolen) }, + { SIZE(recvfrom) }, + { SIZE(shutdown) }, + { SIZE(setsockopt), OFF(setsockopt, optval), OFF(setsockopt, optlen) }, + { SIZE(getsockopt) }, + { SIZE(sendmsg) }, + { SIZE(recvmsg) }, + { SIZE(accept4) } + #undef STRUCT + #undef SIZE + #undef OFF +}; + +long Sandbox::sandbox_socketcall(int call, void* args) { + long long tm; + Debug::syscall(&tm, __NR_socketcall, "Executing handler", call); + + // When demultiplexing socketcall(), only accept calls that have a valid + // "call" opcode. + if (call < SYS_SOCKET || call > SYS_ACCEPT4) { + Debug::elapsed(tm, __NR_socketcall, call); + return -ENOSYS; + } + + // Some type of calls include a pointer to an address or name, which cannot + // be accessed by the trusted process, as it lives in a separate address + // space. For these calls, append the extra data to the serialized request. + // This requires some copying of data, as we have to make sure there is + // only a single atomic call to write(). 
+ socklen_t numExtraData = 0; + const void* extraDataAddr = NULL; + if (socketCallArgInfo[call].lengthOff) { + memcpy(&numExtraData, + reinterpret_cast<char *>(args) + socketCallArgInfo[call].lengthOff, + sizeof(socklen_t)); + extraDataAddr = reinterpret_cast<char *>(args) + + socketCallArgInfo[call].addrOff; + } + + // sendmsg() and recvmsg() have more complicated requirements for computing + // the amount of extra data that needs to be sent to the trusted process. + if (call == SYS_SENDMSG) { + SendMsg *sendmsg_args = reinterpret_cast<SendMsg *>(args); + if (sendmsg_args->msg->msg_iovlen == 1 && + !sendmsg_args->msg->msg_control) { + // Further down in the code, this sendmsg() call will be simplified to + // a sendto() call. Make sure we already compute the correct value for + // numExtraData, as it is needed when we allocate "data[]" on the stack. + numExtraData = sendmsg_args->msg->msg_namelen; + extraDataAddr = sendmsg_args->msg->msg_name; + } else { + // sendmsg() needs to include some of the extra data so that we can + // inspect it in process_socketcall() + numExtraData = sizeof(*sendmsg_args->msg) + + sendmsg_args->msg->msg_namelen + + sendmsg_args->msg->msg_controllen; + extraDataAddr = NULL; + } + } + if (call == SYS_RECVMSG) { + RecvMsg *recvmsg_args = reinterpret_cast<RecvMsg *>(args); + numExtraData = sizeof(*recvmsg_args->msg); + extraDataAddr = recvmsg_args->msg; + } + + // Set up storage for the request header and copy the data from "args" + // into it. + struct Request { + int sysnum; + long long cookie; + SocketCall socketcall_req; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + numExtraData]; + request = reinterpret_cast<struct Request *>(data); + memcpy(&request->socketcall_req.args, args, socketCallArgInfo[call].len); + + // Simplify send(), sendto() and sendmsg(), if there are simpler equivalent + // calls. 
This allows us to occasionally replace them with calls to write(), + // which don't have to be forwarded to the trusted process. + SysCalls sys; + if (call == SYS_SENDMSG && + request->socketcall_req.args.sendmsg.msg->msg_iovlen == 1 && + !request->socketcall_req.args.sendmsg.msg->msg_control) { + // Ordering of these assignments is important, as we are reshuffling + // fields inside of a union. + call = SYS_SENDTO; + request->socketcall_req.args.sendto.flags = + request->socketcall_req.args.sendmsg.flags; + request->socketcall_req.args.sendto.to = + request->socketcall_req.args.sendmsg.msg->msg_name; + request->socketcall_req.args.sendto.tolen = + request->socketcall_req.args.sendmsg.msg->msg_namelen; + request->socketcall_req.args.sendto.len = + request->socketcall_req.args.sendmsg.msg->msg_iov->iov_len; + request->socketcall_req.args.sendto.buf = + request->socketcall_req.args.sendmsg.msg->msg_iov->iov_base; + } + if (call == SYS_SENDTO && !request->socketcall_req.args.sendto.to) { + // sendto() with a NULL address is the same as send() + call = SYS_SEND; + numExtraData = 0; + } + if (call == SYS_SEND && !request->socketcall_req.args.send.flags) { + // send() with no flags is the same as write(), which is unrestricted + // in seccomp mode. + Debug::message("Replaced socketcall() with call to write()"); + ssize_t rc = sys.write(request->socketcall_req.args.send.sockfd, + request->socketcall_req.args.send.buf, + request->socketcall_req.args.send.len); + if (rc < 0) { + Debug::elapsed(tm, __NR_socketcall, call); + return -sys.my_errno; + } else { + Debug::elapsed(tm, __NR_socketcall, call); + return rc; + } + } + + // Simplify recv(), and recvfrom(), if there are simpler equivalent calls. + // This allows us to occasionally replace them with calls to read(), which + // don't have to be forwarded to the trusted process. + // We cannot simplify recvmsg() to recvfrom(), recv() or read(), as we do + // not know whether the caller needs us to set msg->msg_flags. 
+ if (call == SYS_RECVFROM && !request->socketcall_req.args.recvfrom.from) { + // recvfrom() with a NULL address buffer is the same as recv() + call = SYS_RECV; + } + if (call == SYS_RECV && !request->socketcall_req.args.recv.flags) { + // recv() with no flags is the same as read(), which is unrestricted + // in seccomp mode. + Debug::message("Replaced socketcall() with call to read()"); + ssize_t rc = sys.read(request->socketcall_req.args.recv.sockfd, + request->socketcall_req.args.recv.buf, + request->socketcall_req.args.recv.len); + if (rc < 0) { + Debug::elapsed(tm, __NR_socketcall, call); + return -sys.my_errno; + } else { + Debug::elapsed(tm, __NR_socketcall, call); + return rc; + } + } + + // Fill in the rest of the request header. + request->sysnum = __NR_socketcall; + request->cookie = cookie(); + request->socketcall_req.call = call; + request->socketcall_req.arg_ptr = args; + int padding = sizeof(request->socketcall_req.args) - + socketCallArgInfo[call].len; + if (padding > 0) { + memset((char *)(&request->socketcall_req.args + 1) - padding, 0, padding); + } + if (call == SYS_SENDMSG) { + // for sendmsg() we include the (optional) destination address, and the + // (optional) control data in the payload. + SendMsg *sendmsg_args = reinterpret_cast<SendMsg *>(args); + memcpy(reinterpret_cast<char *>( + memcpy(reinterpret_cast<char *>( + memcpy(request + 1, sendmsg_args->msg, sizeof(*sendmsg_args->msg))) + + sizeof(*sendmsg_args->msg), + sendmsg_args->msg->msg_name, sendmsg_args->msg->msg_namelen)) + + sendmsg_args->msg->msg_namelen, + sendmsg_args->msg->msg_control, sendmsg_args->msg->msg_controllen); + } else if (extraDataAddr) { + memcpy(request + 1, extraDataAddr, numExtraData); + } + + // Send request to trusted process and collect response from trusted thread. 
+ long rc; + ssize_t len = sizeof(struct Request) + numExtraData; + if (write(sys, processFdPub(), data, len) != len || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward socketcall() request [sandbox]"); + } + Debug::elapsed(tm, __NR_socketcall, call); + return rc; +} + +bool Sandbox::process_socketcall(int parentMapsFd, int sandboxFd, + int threadFdPub, int threadFd, + SecureMem::Args* mem) { + // Read request + SocketCall socketcall_req; + SysCalls sys; + if (read(sys, sandboxFd, &socketcall_req, sizeof(socketcall_req)) != + sizeof(socketcall_req)) { + die("Failed to read parameters for socketcall() [process]"); + } + + // sandbox_socketcall() should never send us an unexpected "call" opcode. + // If it did, something went very wrong and we better terminate the process. + if (socketcall_req.call < SYS_SOCKET || socketcall_req.call > SYS_ACCEPT4) { + die("Unexpected socketcall() [process]"); + } + + // Check if this particular operation carries an extra payload. + socklen_t numExtraData = 0; + if (socketCallArgInfo[socketcall_req.call].lengthOff) { + memcpy(&numExtraData, + reinterpret_cast<char *>(&socketcall_req) + + socketCallArgInfo[socketcall_req.call].lengthOff, + sizeof(socklen_t)); + } else if (socketcall_req.call == SYS_SENDMSG) { + numExtraData = sizeof(*socketcall_req.args.sendmsg.msg); + } else if (socketcall_req.call == SYS_RECVMSG) { + numExtraData = sizeof(*socketcall_req.args.recvmsg.msg); + } + + // Verify that the length for the payload is reasonable. We don't want to + // blow up our stack, and excessive (or negative) buffer sizes are almost + // certainly a bug. + if (numExtraData > 4096) { + die("Unexpected size for socketcall() payload [process]"); + } + + // Read the extra payload, if any. 
+ char extra[numExtraData]; + if (numExtraData) { + if (read(sys, sandboxFd, extra, numExtraData) != (ssize_t)numExtraData) { + die("Failed to read socketcall() payload [process]"); + } + } + + // sendmsg() has another level of indirection and can carry even more payload + ssize_t numSendmsgExtra = 0; + if (socketcall_req.call == SYS_SENDMSG) { + struct msghdr* msg = reinterpret_cast<struct msghdr*>(extra); + if (msg->msg_namelen > 4096 || msg->msg_controllen > 4096) { + die("Unexpected size for socketcall() payload [process]"); + } + numSendmsgExtra = msg->msg_namelen + msg->msg_controllen; + } + char sendmsgExtra[numSendmsgExtra]; + if (numSendmsgExtra) { + if (read(sys, sandboxFd, sendmsgExtra, numSendmsgExtra) != + numSendmsgExtra) { + die("Failed to read socketcall() payload [process]"); + } + } + + int rc = -EINVAL; + switch (socketcall_req.call) { + case SYS_SOCKET: + // The sandbox does not allow creation of any new sockets. + goto deny; + case SYS_BIND: + // The sandbox does not allow binding an address to a socket. + goto deny; + case SYS_CONNECT: + // The sandbox does not allow connecting a socket. + goto deny; + case SYS_LISTEN: + // The sandbox does not allow a socket to enter listening state. + goto deny; + case SYS_ACCEPT4: + case SYS_ACCEPT: + // If the sandbox obtained a socket that is already in the listening + // state (e.g. because somebody sent it a suitable file descriptor), it + // is permissible to call accept(). + + accept_simple: + // None of the parameters need to be checked, so it is OK to refer + // to the parameter block created by the untrusted code. + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_socketcall, + socketcall_req.call, socketcall_req.arg_ptr); + return true; + case SYS_GETSOCKNAME: + case SYS_GETPEERNAME: + // Querying the local and the remote name is not considered security + // sensitive for the purposes of the sandbox. 
+ goto accept_simple; + case SYS_SOCKETPAIR: + // Socket pairs are connected to each other and not considered + // security sensitive. + goto accept_simple; + case SYS_SENDTO: + if (socketcall_req.args.sendto.to) { + // The sandbox does not allow sending to arbitrary addresses. + goto deny; + } + // Fall through + case SYS_SEND: + if (socketcall_req.args.send.flags & + ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB)) { + // Unsupported flag encountered. Deny the call. + goto deny; + } + // Sending data on a connected socket is similar to calling write(). + // Allow it. + + accept_complex: + // The parameter block contains potentially security critical information + // that should not be tampered with after it has been inspected. Copy it + // into the write-protected securely shared memory before telling the + // trusted thread to execute the socket call. + SecureMem::lockSystemCall(parentMapsFd, mem); + memcpy(mem->pathname, &socketcall_req.args, sizeof(socketcall_req.args)); + SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, + __NR_socketcall, socketcall_req.call, + mem->pathname - (char*)mem + (char*)mem->self); + return true; + case SYS_RECVFROM: + // While we do not anticipate any particular need to receive data on + // unconnected sockets, there is no particular risk in doing so. + // Fall through + case SYS_RECV: + if (socketcall_req.args.recv.flags & + ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) { + // Unsupported flag encountered. Deny the call. + goto deny; + } + // Receiving data on a connected socket is similar to calling read(). + // Allow it. + goto accept_complex; + case SYS_SHUTDOWN: + // Shutting down a socket is always OK. 
+ goto accept_simple; + case SYS_SETSOCKOPT: + switch (socketcall_req.args.setsockopt.level) { + case SOL_SOCKET: + switch (socketcall_req.args.setsockopt.optname) { + case SO_KEEPALIVE: + case SO_LINGER: + case SO_OOBINLINE: + case SO_RCVBUF: + case SO_RCVLOWAT: + case SO_SNDLOWAT: + case SO_RCVTIMEO: + case SO_SNDTIMEO: + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_TIMESTAMP: + goto accept_complex; + default: + break; + } + break; + case IPPROTO_TCP: + switch (socketcall_req.args.setsockopt.optname) { + case TCP_CORK: + case TCP_DEFER_ACCEPT: + case TCP_INFO: + case TCP_KEEPCNT: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_LINGER2: + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_QUICKACK: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + goto accept_complex; + default: + break; + } + break; + default: + break; + } + goto deny; + case SYS_GETSOCKOPT: + switch (socketcall_req.args.getsockopt.level) { + case SOL_SOCKET: + switch (socketcall_req.args.getsockopt.optname) { + case SO_ACCEPTCONN: + case SO_ERROR: + case SO_KEEPALIVE: + case SO_LINGER: + case SO_OOBINLINE: + case SO_RCVBUF: + case SO_RCVLOWAT: + case SO_SNDLOWAT: + case SO_RCVTIMEO: + case SO_SNDTIMEO: + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_TIMESTAMP: + case SO_TYPE: + goto accept_complex; + default: + break; + } + break; + case IPPROTO_TCP: + switch (socketcall_req.args.getsockopt.optname) { + case TCP_CORK: + case TCP_DEFER_ACCEPT: + case TCP_INFO: + case TCP_KEEPCNT: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_LINGER2: + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_QUICKACK: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + goto accept_complex; + default: + break; + } + break; + default: + break; + } + goto deny; + case SYS_SENDMSG: { + struct msghdr* msg = reinterpret_cast<struct msghdr*>(extra); + + if (sizeof(socketcall_req.args) + sizeof(*msg) + numSendmsgExtra > + sizeof(mem->pathname)) { + goto deny; + } + + if (msg->msg_namelen || + 
(socketcall_req.args.sendmsg.flags & + ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB))){ + goto deny; + } + + // The trusted process receives file handles when a new untrusted thread + // gets created. We have security checks in place that prevent any + // critical information from being tampered with during thread creation. + // But if we disallowed passing of file handles, this would add an extra + // hurdle for an attacker. + // Unfortunately, for now, this is not possible as Chrome's + // base::SendRecvMsg() needs the ability to pass file handles. + if (msg->msg_controllen) { + msg->msg_control = sendmsgExtra + msg->msg_namelen; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); + do { + if (cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) { + goto deny; + } + } while ((cmsg = CMSG_NXTHDR(msg, cmsg)) != NULL); + } + + // This must be a locked system call, because we have to ensure that + // the untrusted code does not tamper with the msghdr after we have + // examined it. 
+ SecureMem::lockSystemCall(parentMapsFd, mem); + socketcall_req.args.sendmsg.msg = + reinterpret_cast<struct msghdr*>(mem->pathname + + sizeof(socketcall_req.args) - + (char*)mem + (char*)mem->self); + memcpy(mem->pathname, &socketcall_req.args, sizeof(socketcall_req.args)); + if (numSendmsgExtra) { + if (msg->msg_namelen > 0) { + msg->msg_name = const_cast<struct msghdr*>( + socketcall_req.args.sendmsg.msg) + 1; + } + if (msg->msg_controllen > 0) { + msg->msg_control = (char *)( + socketcall_req.args.sendmsg.msg + 1) + msg->msg_namelen; + } + memcpy(mem->pathname + sizeof(socketcall_req.args) + sizeof(*msg), + sendmsgExtra, numSendmsgExtra); + } + memcpy(mem->pathname + sizeof(socketcall_req.args), msg, sizeof(*msg)); + SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, + __NR_socketcall, socketcall_req.call, + mem->pathname - (char*)mem + (char*)mem->self); + return true; + } + case SYS_RECVMSG: + // Receiving messages is general not security critical. + if (socketcall_req.args.recvmsg.flags & + ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) { + goto deny; + } + goto accept_complex; + default: + deny: + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } +} + +#endif + +} // namespace diff --git a/sandbox/linux/seccomp/stat.cc b/sandbox/linux/seccomp/stat.cc new file mode 100644 index 0000000..cdf7e4c --- /dev/null +++ b/sandbox/linux/seccomp/stat.cc @@ -0,0 +1,197 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +long Sandbox::sandbox_stat(const char *path, void *buf) { + long long tm; + Debug::syscall(&tm, __NR_stat, "Executing handler"); + size_t len = strlen(path); + struct Request { + int sysnum; + long long cookie; + Stat stat_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_stat; + request->cookie = cookie(); + request->stat_req.sysnum = __NR_stat; + request->stat_req.path_length = len; + request->stat_req.buf = buf; + memcpy(request->pathname, path, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward stat() request [sandbox]"); + } + Debug::elapsed(tm, __NR_stat); + return rc; +} + +long Sandbox::sandbox_lstat(const char *path, void *buf) { + long long tm; + Debug::syscall(&tm, __NR_lstat, "Executing handler"); + size_t len = strlen(path); + struct Request { + int sysnum; + long long cookie; + Stat stat_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_lstat; + request->cookie = cookie(); + request->stat_req.sysnum = __NR_lstat; + request->stat_req.path_length = len; + request->stat_req.buf = buf; + memcpy(request->pathname, path, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward lstat() request [sandbox]"); + } + Debug::elapsed(tm, __NR_lstat); + return rc; +} + +#if defined(__NR_stat64) +long Sandbox::sandbox_stat64(const char *path, void *buf) { + long long tm; + Debug::syscall(&tm, __NR_stat64, "Executing handler"); + size_t len 
= strlen(path); + struct Request { + int sysnum; + long long cookie; + Stat stat_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_stat64; + request->cookie = cookie(); + request->stat_req.sysnum = __NR_stat64; + request->stat_req.path_length = len; + request->stat_req.buf = buf; + memcpy(request->pathname, path, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward stat64() request [sandbox]"); + } + Debug::elapsed(tm, __NR_stat64); + return rc; +} + +long Sandbox::sandbox_lstat64(const char *path, void *buf) { + long long tm; + Debug::syscall(&tm, __NR_lstat64, "Executing handler"); + size_t len = strlen(path); + struct Request { + int sysnum; + long long cookie; + Stat stat_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_lstat64; + request->cookie = cookie(); + request->stat_req.sysnum = __NR_lstat64; + request->stat_req.path_length = len; + request->stat_req.buf = buf; + memcpy(request->pathname, path, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward lstat64() request [sandbox]"); + } + Debug::elapsed(tm, __NR_lstat64); + return rc; +} +#endif + +bool Sandbox::process_stat(int parentMapsFd, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SysCalls sys; + Stat stat_req; + if (read(sys, sandboxFd, &stat_req, sizeof(stat_req)) != sizeof(stat_req)) { + read_parm_failed: + die("Failed to read parameters for stat() [process]"); + } + int rc = -ENAMETOOLONG; + if 
(stat_req.path_length >= (int)sizeof(mem->pathname)) { + char buf[32]; + while (stat_req.path_length > 0) { + size_t len = stat_req.path_length > sizeof(buf) ? + sizeof(buf) : stat_req.path_length; + ssize_t i = read(sys, sandboxFd, buf, len); + if (i <= 0) { + goto read_parm_failed; + } + stat_req.path_length -= i; + } + if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to return data from stat() [process]"); + } + return false; + } + if (stat_req.sysnum != __NR_stat && stat_req.sysnum != __NR_lstat + #ifdef __NR_stat64 + && stat_req.sysnum != __NR_stat64 + #endif + #ifdef __NR_lstat64 + && stat_req.sysnum != __NR_lstat64 + #endif + ) { + die("Corrupted stat() request"); + } + + if (!g_policy.allow_file_namespace) { + // After locking the mutex, we can no longer abandon the system call. So, + // perform checks before clobbering the securely shared memory. + char tmp[stat_req.path_length]; + if (read(sys, sandboxFd, tmp, stat_req.path_length) != + (ssize_t)stat_req.path_length) { + goto read_parm_failed; + } + Debug::message(("Denying access to \"" + std::string(tmp) + "\"").c_str()); + SecureMem::abandonSystemCall(threadFd, -EACCES); + return false; + } + + SecureMem::lockSystemCall(parentMapsFd, mem); + if (read(sys, sandboxFd, mem->pathname, stat_req.path_length) != + (ssize_t)stat_req.path_length) { + goto read_parm_failed; + } + mem->pathname[stat_req.path_length] = '\000'; + + // TODO(markus): Implement sandboxing policy + Debug::message(("Allowing access to \"" + std::string(mem->pathname) + + "\"").c_str()); + + // Tell trusted thread to stat the file. 
+ SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, + stat_req.sysnum, + mem->pathname - (char*)mem + (char*)mem->self, + stat_req.buf); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc new file mode 100644 index 0000000..681fec9 --- /dev/null +++ b/sandbox/linux/seccomp/syscall.cc @@ -0,0 +1,380 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "debug.h" +#include "sandbox_impl.h" +#include "syscall_table.h" + +namespace playground { + +// TODO(markus): change this into a function that returns the address of the assembly code. If that isn't possible for sandbox_clone, then move that function into a *.S file +asm( + ".pushsection .text, \"ax\", @progbits\n" + + // This is the special wrapper for the clone() system call. The code + // relies on the stack layout of the system call wrapper (c.f. below). It + // passes the stack pointer as an additional argument to sandbox__clone(), + // so that upon starting the child, register values can be restored and + // the child can start executing at the correct IP, instead of trying to + // run in the trusted thread. + "playground$sandbox_clone:" + ".globl playground$sandbox_clone\n" + ".type playground$sandbox_clone, @function\n" + #if defined(__x86_64__) + // Skip the 8 byte return address into the system call wrapper. The + // following bytes are the saved register values that we need to restore + // upon return from clone() in the new thread. + "lea 8(%rsp), %r9\n" + "jmp playground$sandbox__clone\n" + #elif defined(__i386__) + // As i386 passes function arguments on the stack, we need to skip a few + // more values before we can get to the saved registers. 
+ "lea 28(%esp), %eax\n" + "mov %eax, 24(%esp)\n" + "jmp playground$sandbox__clone\n" + #else + #error Unsupported target platform + #endif + ".size playground$sandbox_clone, .-playground$sandbox_clone\n" + + + // This is the wrapper which is called by the untrusted code, trying to + // make a system call. + "playground$syscallWrapper:" + ".internal playground$syscallWrapper\n" + ".globl playground$syscallWrapper\n" + ".type playground$syscallWrapper, @function\n" + #if defined(__x86_64__) + // Check for rt_sigreturn(). It needs to be handled specially. + "cmp $15, %rax\n" // NR_rt_sigreturn + "jnz 1f\n" + "add $0x90, %rsp\n" // pop return addresses and red zone + "0:syscall\n" // rt_sigreturn() is unrestricted + "mov $66, %edi\n" // rt_sigreturn() should never return + "mov $231, %eax\n" // NR_exit_group + "jmp 0b\n" + + // Save all registers + "1:push %rbp\n" + "mov %rsp, %rbp\n" + "push %rbx\n" + "push %rcx\n" + "push %rdx\n" + "push %rsi\n" + "push %rdi\n" + "push %r8\n" + "push %r9\n" + "push %r10\n" + "push %r11\n" + "push %r12\n" + "push %r13\n" + "push %r14\n" + "push %r15\n" + + // Convert from syscall calling conventions to C calling conventions. + // System calls have a subtly different register ordering than the user- + // space x86-64 ABI. + "mov %r10, %rcx\n" + + // Check range of system call + "cmp playground$maxSyscall(%rip), %eax\n" + "ja 3f\n" + + // Retrieve function call from system call table (c.f. syscall_table.c). + // We have three different types of entries; zero for denied system calls, + // that should be handled by the defaultSystemCallHandler(); minus one + // for unrestricted system calls that need to be forwarded to the trusted + // thread; and function pointers to specific handler functions. + "mov %rax, %r10\n" + "shl $4, %r10\n" + "lea playground$syscallTable(%rip), %r11\n" + "add %r11, %r10\n" + "mov 0(%r10), %r10\n" + + // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise + // jump to fallback handler. 
+ "cmp $1, %r10\n" + "jbe 3f\n" + "call *%r10\n" + "2:" + + // Restore CPU registers, except for %rax which was set by the system call. + "pop %r15\n" + "pop %r14\n" + "pop %r13\n" + "pop %r12\n" + "pop %r11\n" + "pop %r10\n" + "pop %r9\n" + "pop %r8\n" + "pop %rdi\n" + "pop %rsi\n" + "pop %rdx\n" + "pop %rcx\n" + "pop %rbx\n" + "pop %rbp\n" + + // Remove fake return address. This is added in the patching code in + // library.cc and it makes stack traces a little cleaner. + "add $8, %rsp\n" + + // Return to caller + "ret\n" + + "3:" + // If we end up calling a specific handler, we don't need to know the + // system call number. However, in the generic case, we do. Shift + // registers so that the system call number becomes visible as the + // first function argument. + "push %r9\n" + "mov %r8, %r9\n" + "mov %rcx, %r8\n" + "mov %rdx, %rcx\n" + "mov %rsi, %rdx\n" + "mov %rdi, %rsi\n" + "mov %rax, %rdi\n" + + // Call default handler. + "call playground$defaultSystemCallHandler\n" + "pop %r9\n" + "jmp 2b\n" + #elif defined(__i386__) + "cmp $119, %eax\n" // NR_sigreturn + "jnz 1f\n" + "add $0x4, %esp\n" // pop return address + "0:int $0x80\n" // sigreturn() is unrestricted + "mov $66, %ebx\n" // sigreturn() should never return + "mov %ebx, %eax\n" // NR_exit + "jmp 0b\n" + "1:cmp $173, %eax\n" // NR_rt_sigreturn + "jnz 3f\n" + + // Convert rt_sigframe into sigframe, allowing us to call sigreturn(). + // This is possible since the first part of signal stack frames have + // stayed very stable since the earliest kernel versions. While never + // officially documented, lots of user space applications rely on this + // part of the ABI, and kernel developers have been careful to maintain + // backwards compatibility. + // In general, the rt_sigframe includes a lot of extra information that + // the signal handler can look at. Most notably, this means a complete + // siginfo record. 
+ // Fortunately though, the kernel doesn't look at any of this extra data + // when returning from a signal handler. So, we can safely convert an + // rt_sigframe to a legacy sigframe, discarding the extra data in the + // process. Interestingly, the legacy signal frame is actually larger than + // the rt signal frame, as it includes a lot more padding. + "sub $0x1C8, %esp\n" // a legacy signal stack is much larger + "mov 0x1CC(%esp), %eax\n" // push signal number + "push %eax\n" + "lea 0x270(%esp), %esi\n" // copy siginfo register values + "lea 0x4(%esp), %edi\n" // into new location + "mov $0x16, %ecx\n" + "cld\n" + "rep movsl\n" + "mov 0x2C8(%esp), %ebx\n" // copy first half of signal mask + "mov %ebx, 0x54(%esp)\n" + "lea 2f, %esi\n" + "push %esi\n" // push restorer function + "lea 0x2D4(%esp), %edi\n" // patch up retcode magic numbers + "movb $2, %cl\n" + "rep movsl\n" + "ret\n" // return to restorer function + "2:pop %eax\n" // remove dummy argument (signo) + "mov $119, %eax\n" // NR_sigaction + "int $0x80\n" + + + // Preserve all registers + "3:push %ebx\n" + "push %ecx\n" + "push %edx\n" + "push %esi\n" + "push %edi\n" + "push %ebp\n" + + // Convert from syscall calling conventions to C calling conventions + "push %ebp\n" + "push %edi\n" + "push %esi\n" + "push %edx\n" + "push %ecx\n" + "push %ebx\n" + "push %eax\n" + + // Check range of system call + "cmp playground$maxSyscall, %eax\n" + "ja 9f\n" + + // We often have long sequences of calls to gettimeofday(). This is + // needlessly expensive. Coalesce them into a single call. + // + // We keep track of state in TLS storage that we can access through + // the %fs segment register. See trusted_thread.cc for the exact + // memory layout. + // + // TODO(markus): maybe, we should proactively call gettimeofday() and + // clock_gettime(), whenever we talk to the trusted thread? + // or maybe, if we have recently seen requests to compute + // the time. There might be a repeated pattern of those. 
+ "cmp $78, %eax\n" // __NR_gettimeofday + "jnz 6f\n" + "cmp %eax, %fs:0x102C-0x58\n" // last system call + "jnz 4f\n" + + // This system call and the last system call prior to this one both are + // calls to gettimeofday(). Try to avoid making the new call and just + // return the same result as in the previous call. + // Just in case the caller is spinning on the result from gettimeofday(), + // every so often, call the actual system call. + "decl %fs:0x1030-0x58\n" // countdown calls to gettimofday() + "jz 4f\n" + + // Atomically read the 64bit word representing last-known timestamp and + // return it to the caller. On x86-32 this is a little more complicated and + // requires the use of the cmpxchg8b instruction. + "mov %ebx, %eax\n" + "mov %ecx, %edx\n" + "lock; cmpxchg8b 100f\n" + "mov %eax, 0(%ebx)\n" + "mov %edx, 4(%ebx)\n" + "xor %eax, %eax\n" + "add $28, %esp\n" + "jmp 8f\n" + + // This is a call to gettimeofday(), but we don't have a valid cached + // result, yet. + "4:mov %eax, %fs:0x102C-0x58\n" // remember syscall number + "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations + "call playground$defaultSystemCallHandler\n" + + // Returned from gettimeofday(). Remember return value, in case the + // application calls us again right away. + // Again, this has to happen atomically and requires cmpxchg8b. + "mov 4(%ebx), %ecx\n" + "mov 0(%ebx), %ebx\n" + "mov 100f, %eax\n" + "mov 101f, %edx\n" + "5:lock; cmpxchg8b 100f\n" + "jnz 5b\n" + "xor %eax, %eax\n" + "jmp 10f\n" + + // Remember the number of the last system call made. We deliberately do + // not remember calls to gettid(), as we have often seen long sequences + // of calls to just gettimeofday() and gettid(). In that situation, we + // would still like to coalesce the gettimeofday() calls. + "6:cmp $224, %eax\n" // __NR_gettid + "jz 7f\n" + "mov %eax, %fs:0x102C-0x58\n" // remember syscall number + + // Retrieve function call from system call table (c.f. syscall_table.c). 
+      // We have three different types of entries; zero for denied system calls,
+      // that should be handled by the defaultSystemCallHandler(); one
+      // (UNRESTRICTED_SYSCALL) for unrestricted system calls that need to be
+      // forwarded to the trusted thread; and function pointers to specific
+      // handler functions.
+    "7:shl  $3, %eax\n"
+      "lea  playground$syscallTable, %ebx\n"
+      "add  %ebx, %eax\n"
+      "mov  0(%eax), %eax\n"
+
+      // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
+      // jump to fallback handler.
+      "cmp  $1, %eax\n"
+      "jbe  9f\n"
+      "add  $4, %esp\n"
+      "call *%eax\n"
+      "add  $24, %esp\n"
+
+      // Restore CPU registers, except for %eax which was set by the system call.
+    "8:pop  %ebp\n"
+      "pop  %edi\n"
+      "pop  %esi\n"
+      "pop  %edx\n"
+      "pop  %ecx\n"
+      "pop  %ebx\n"
+
+      // Return to caller
+      "ret\n"
+
+      // Call default handler.
+    "9:call playground$defaultSystemCallHandler\n"
+   "10:add  $28, %esp\n"
+      "jmp  8b\n"
+
+      ".pushsection \".bss\"\n"
+      ".balign 8\n"
+"100:.byte 0, 0, 0, 0\n"
+"101:.byte 0, 0, 0, 0\n"
+      ".popsection\n"
+
+    #else
+    #error Unsupported target platform
+    #endif
+    ".size playground$syscallWrapper, .-playground$syscallWrapper\n"
+    ".popsection\n"
+);
+
+
+// Fallback handler that the assembly wrapper above calls (label 9) for every
+// system call that has no dedicated handler function in syscallTable.
+//
+// syscallNum is the system call number and arg0..arg5 are its raw arguments.
+// The return value is the system call result, or a negated errno value on
+// failure, encoded in a pointer-sized value.
+//
+// Behavior by case:
+//  - read() and write() are issued directly through a local SysCalls object.
+//  - Calls marked UNRESTRICTED_SYSCALL in syscallTable are written as a
+//    request to the descriptor returned by threadFdPub() and the result is
+//    read back from the same descriptor; a short transfer is fatal.
+//  - Any other call is answered with -ENOSYS, unless debugging is enabled,
+//    in which case it is logged and forwarded like an unrestricted call.
+void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,
+                                        void* arg2, void* arg3, void* arg4,
+                                        void* arg5) {
+  // TODO(markus): The following comment is currently not true, we do intercept these system calls. Try to fix that.
+
+  // We try to avoid intercepting read(), and write(), as these system calls
+  // are not restricted in Seccomp mode. But depending on the exact
+  // instruction sequence in libc, we might not be able to reliably
+  // filter out these system calls at the time when we instrument the code.
+  SysCalls sys;
+  long rc;
+  long long tm;
+  switch (syscallNum) {
+    case __NR_read:
+      Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
+      rc = sys.read((long)arg0, arg1, (size_t)arg2);
+      break;
+    case __NR_write:
+      Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
+      rc = sys.write((long)arg0, arg1, (size_t)arg2);
+      break;
+    default:
+      if (Debug::isEnabled()) {
+        // In debug mode, prevent stderr from being closed
+        if (syscallNum == __NR_close && arg0 == (void *)2)
+          return 0;
+      }
+
+      if ((unsigned)syscallNum <= maxSyscall &&
+          syscallTable[syscallNum].handler == UNRESTRICTED_SYSCALL) {
+        Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
+        // Also the jump target of the debug-mode branch below, which forwards
+        // calls that would otherwise be denied.
+     perform_unrestricted:
+        // Packed so that the request has no padding between the system call
+        // number and its six arguments.
+        struct {
+          int          sysnum;
+          void*        unrestricted_req[6];
+        } __attribute__((packed)) request = {
+          syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } };
+
+        int   thread = threadFdPub();
+        // Note: this pointer-typed rc shadows the outer "long rc".
+        void* rc;
+        if (write(sys, thread, &request, sizeof(request)) != sizeof(request) ||
+            read(sys, thread, &rc, sizeof(rc)) != sizeof(rc)) {
+          die("Failed to forward unrestricted system call");
+        }
+        Debug::elapsed(tm, syscallNum);
+        return rc;
+      } else if (Debug::isEnabled()) {
+        Debug::syscall(&tm, syscallNum,
+                       "In production mode, this call would be disallowed");
+        goto perform_unrestricted;
+      } else {
+        return (void *)-ENOSYS;
+      }
+  }
+  // Only the direct read()/write() cases reach this point. Convert a failure
+  // into the negated errno value recorded by the SysCalls object.
+  if (rc < 0) {
+    rc = -sys.my_errno;
+  }
+  Debug::elapsed(tm, syscallNum);
+  return (void *)rc;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/syscall.h b/sandbox/linux/seccomp/syscall.h
new file mode 100644
index 0000000..1315e12
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SYSCALL_H__
+#define SYSCALL_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Entry point of the system call wrapper. The asm() label binds this
+// declaration to the assembler symbol "playground$syscallWrapper"; on x86-64
+// the symbol is additionally given internal visibility.
+void syscallWrapper() asm("playground$syscallWrapper")
+#if defined(__x86_64__)
+    __attribute__((visibility("internal")))
+#endif
+;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // SYSCALL_H__
diff --git a/sandbox/linux/seccomp/syscall_table.c b/sandbox/linux/seccomp/syscall_table.c
new file mode 100644
index 0000000..c9dd7a4
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall_table.c
@@ -0,0 +1,153 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <asm/unistd.h>
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+// Fallback system call numbers, used only when <asm/unistd.h> does not
+// already define them. The values differ between x86-64 and i386.
+#if defined(__x86_64__)
+#ifndef __NR_set_robust_list
+#define __NR_set_robust_list 273
+#endif
+#ifndef __NR_accept4
+#define __NR_accept4         288
+#endif
+#elif defined(__i386__)
+#ifndef __NR_set_robust_list
+#define __NR_set_robust_list 311
+#endif
+#else
+#error Unsupported target platform
+#endif
+
+// TODO(markus): This is an incredibly dirty hack to make the syscallTable
+//               live in r/o memory.
+//               Unfortunately, gcc doesn't give us a clean option to do
+//               this. Ultimately, we should probably write some code that
+//               parses /usr/include/asm/unistd*.h and generates a *.S file.
+//               But we then need to figure out how to integrate this code
+//               with our build system.
+
+// Dispatch table indexed by system call number. Each entry holds the handler
+// used inside the sandbox and, where applicable, the function that services
+// the request in the trusted process. Entries that are not listed are
+// zero-initialized. The section attribute string is the hack described in
+// the TODO above.
+const struct SyscallTable syscallTable[] __attribute__((
+    section(".rodata, \"a\", @progbits\n#"))) ={
+
+  #if defined(__NR_accept)
+  [ __NR_accept          ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_accept4         ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #endif
+  [ __NR_access          ] = { (void*)&sandbox_access,    process_access     },
+  [ __NR_brk             ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_clock_gettime   ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_clone           ] = { (void*)&sandbox_clone,     process_clone      },
+  [ __NR_close           ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_dup             ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_dup2            ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_epoll_create    ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_epoll_ctl       ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_epoll_wait      ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_exit            ] = { (void*)&sandbox_exit,      process_exit       },
+  [ __NR_exit_group      ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_fcntl           ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #if defined(__NR_fcntl64)
+  [ __NR_fcntl64         ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #endif
+  [ __NR_fstat           ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #if defined(__NR_fstat64)
+  [ __NR_fstat64         ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #endif
+  [ __NR_futex           ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_getdents        ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_getdents64      ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #if defined(__NR_getpeername)
+  [ __NR_getpeername     ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #endif
+  [ __NR_getpid          ] = { (void*)&sandbox_getpid,    0                  },
+  #if defined(__NR_getsockname)
+  [ __NR_getsockname     ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_getsockopt      ] = { (void*)&sandbox_getsockopt,process_getsockopt },
+  #endif
+  [ __NR_gettid          ] = { (void*)&sandbox_gettid,    0                  },
+  [ __NR_gettimeofday    ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_ioctl           ] = { (void*)&sandbox_ioctl,     process_ioctl      },
+  #if defined(__NR_ipc)
+  [ __NR_ipc             ] = { (void*)&sandbox_ipc,       process_ipc        },
+  #endif
+  #if defined(__NR__llseek)
+  [ __NR__llseek         ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #endif
+  [ __NR_lseek           ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_lstat           ] = { (void*)&sandbox_lstat,     process_stat       },
+  #if defined(__NR_lstat64)
+  [ __NR_lstat64         ] = { (void*)&sandbox_lstat64,   process_stat       },
+  #endif
+  [ __NR_madvise         ] = { (void*)&sandbox_madvise,   process_madvise    },
+  #if defined(__NR_mmap2)
+  [ __NR_mmap2           ] =
+  #else
+  [ __NR_mmap            ] =
+  #endif
+                             { (void*)&sandbox_mmap,      process_mmap       },
+  [ __NR_mprotect        ] = { (void*)&sandbox_mprotect,  process_mprotect   },
+  [ __NR_munmap          ] = { (void*)&sandbox_munmap,    process_munmap     },
+  [ __NR_open            ] = { (void*)&sandbox_open,      process_open       },
+  [ __NR_pipe            ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_poll            ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #if defined(__NR_recvfrom)
+  [ __NR_recvfrom        ] = { (void*)&sandbox_recvfrom,  process_recvfrom   },
+  [ __NR_recvmsg         ] = { (void*)&sandbox_recvmsg,   process_recvmsg    },
+  #endif
+  #if defined(__NR_rt_sigaction)
+  [ __NR_rt_sigaction    ] = { (void*)&sandbox_rt_sigaction,process_sigaction},
+  #endif
+  #if defined(__NR_rt_sigprocmask)
+  [ __NR_rt_sigprocmask  ] = { (void*)&sandbox_rt_sigprocmask, 0             },
+  #endif
+  #if defined(__NR_sendmsg)
+  [ __NR_sendmsg         ] = { (void*)&sandbox_sendmsg,   process_sendmsg    },
+  [ __NR_sendto          ] = { (void*)&sandbox_sendto,    process_sendto     },
+  #endif
+  [ __NR_set_robust_list ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #if defined(__NR_setsockopt)
+  [ __NR_setsockopt      ] = { (void*)&sandbox_setsockopt,process_setsockopt },
+  #endif
+  #if defined(__NR_shmat)
+  [ __NR_shmat           ] = { (void*)&sandbox_shmat,     process_shmat      },
+  [ __NR_shmctl          ] = { (void*)&sandbox_shmctl,    process_shmctl     },
+  [ __NR_shmdt           ] = { (void*)&sandbox_shmdt,     process_shmdt      },
+  [ __NR_shmget          ] = { (void*)&sandbox_shmget,    process_shmget     },
+  #endif
+  #if defined(__NR_shutdown)
+  [ __NR_shutdown        ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #endif
+  #if defined(__NR_sigaction)
+  [ __NR_sigaction       ] = { (void*)&sandbox_sigaction,process_sigaction   },
+  #endif
+  #if defined(__NR_signal)
+  [ __NR_signal          ] = { (void*)&sandbox_signal,    process_sigaction  },
+  #endif
+  #if defined(__NR_sigprocmask)
+  [ __NR_sigprocmask     ] = { (void*)&sandbox_sigprocmask, 0                },
+  #endif
+  #if defined(__NR_socketpair)
+  [ __NR_socketpair      ] = { UNRESTRICTED_SYSCALL,      0                  },
+  #endif
+  #if defined(__NR_socketcall)
+  [ __NR_socketcall      ] = { (void*)&sandbox_socketcall,process_socketcall },
+  #endif
+  [ __NR_stat            ] = { (void*)&sandbox_stat,      process_stat       },
+  #if defined(__NR_stat64)
+  [ __NR_stat64          ] = { (void*)&sandbox_stat64,    process_stat       },
+  #endif
+  [ __NR_time            ] = { UNRESTRICTED_SYSCALL,      0                  },
+  [ __NR_uname           ] = { UNRESTRICTED_SYSCALL,      0                  },
+};
+
+// Highest system call number that has a slot in syscallTable. Users compare
+// against it inclusively (the assembly wrapper uses "cmp ...; ja" and the C++
+// code uses "<= maxSyscall"), so it must be the last valid index rather than
+// the element count; otherwise an index equal to the count would read one
+// entry past the end of the table.
+const unsigned maxSyscall __attribute__((section(".rodata"))) =
+    sizeof(syscallTable)/sizeof(struct SyscallTable) - 1;
+
+// Page-sized, page-aligned word array placed in read-only data and exported
+// as "playground$syscall_mutex". Only the first word is non-zero.
+const int syscall_mutex_[4096/sizeof(int)] asm("playground$syscall_mutex")
+    __attribute__((section(".rodata"),aligned(4096)
+#if defined(__x86_64__)
+                   ,visibility("internal")
+#endif
+                   )) = { 0x80000000 };
diff --git a/sandbox/linux/seccomp/syscall_table.h b/sandbox/linux/seccomp/syscall_table.h
new file mode 100644
index 0000000..5bd6791
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall_table.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SYSCALL_TABLE_H__
+#define SYSCALL_TABLE_H__
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+#include "securemem.h"
+extern "C" {
+namespace playground {
+#define SecureMemArgs SecureMem::Args
+#else
+#define SecureMemArgs void
+#define bool          int
+#endif
+  // Marker stored in SyscallTable::handler for system calls that have no
+  // dedicated handler function and are treated as unrestricted.
+  #define UNRESTRICTED_SYSCALL ((void *)1)
+
+  struct SyscallTable {
+    void   *handler;
+    bool  (*trustedProcess)(int parentMapsFd, int sandboxFd, int threadFdPub,
+                            int threadFd, SecureMemArgs* mem);
+  };
+  extern const struct SyscallTable syscallTable[]
+    asm("playground$syscallTable")
+#if defined(__x86_64__)
+    __attribute__((visibility("internal")))
+#endif
+    ;
+  extern const unsigned maxSyscall
+    asm("playground$maxSyscall")
+#if defined(__x86_64__)
+    __attribute__((visibility("internal")))
+#endif
+    ;
+#ifdef __cplusplus
+} // namespace
+}
+#endif
+
+#endif // SYSCALL_TABLE_H__
diff --git a/sandbox/linux/seccomp/tests/list_tests.py b/sandbox/linux/seccomp/tests/list_tests.py
new file mode 100644
index 0000000..011a52e
--- /dev/null
+++ b/sandbox/linux/seccomp/tests/list_tests.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import re
+import sys
+
+
+def get_tests(filename):
+  """Yields the name of every test declared as TEST(name) in filename.
+
+  Only declarations that start at the beginning of a line are recognized.
+  """
+  # Use a context manager so the file is closed once the generator finishes.
+  with open(filename) as source:
+    for line in source:
+      match = re.match(r"TEST\((\w+)\)", line)
+      if match is not None:
+        yield match.group(1)
+
+
+def main(args):
+  """Prints one C initializer line per test found in the file args[0]."""
+  for name in get_tests(args[0]):
+    # Parenthesized single argument: prints the same text on Python 2 and 3.
+    print(' { "%s", %s },' % (name, name))
+
+
+if __name__ == "__main__":
+  main(sys.argv[1:])
diff --git a/sandbox/linux/seccomp/tests/test_syscalls.cc b/sandbox/linux/seccomp/tests/test_syscalls.cc
new file mode 100644
index 0000000..3e6acd5
--- /dev/null
+++ b/sandbox/linux/seccomp/tests/test_syscalls.cc
@@ -0,0 +1,758 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <assert.h>
+#include <dirent.h>
+#include <pthread.h>
+#include <pty.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "sandbox_impl.h"
+
+// Diagnostic output that is compiled in only when DEBUG is defined.
+#ifdef DEBUG
+#define MSG(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define MSG(fmt, ...) do { } while (0)
+#endif
+
+// Write end of the pipe used to report the intended exit status to the test
+// runner, or -1 when the test runs without forking.
+int g_intended_status_fd = -1;
+
+// Declares the wait() status that the test subprocess intends to exit with.
+// "val" is a signal number if is_signal is true and an exit code otherwise.
+// The encoded status is written to g_intended_status_fd if that is set, and
+// printed to stdout otherwise.
+void intend_exit_status(int val, bool is_signal) {
+  if (is_signal) {
+    val = W_EXITCODE(0, val);
+  } else {
+    val = W_EXITCODE(val, 0);
+  }
+  if (g_intended_status_fd != -1) {
+    int sent = write(g_intended_status_fd, &val, sizeof(val));
+    assert(sent == sizeof(val));
+  } else {
+    // This prints in cases where we run one test without forking
+    printf("Intending to exit with status %i...\n", val);
+  }
+}
+
+
+// This is basically a marker to grep for.
+#define TEST(name) void name()
+
+// dup() and close() keep working after the sandbox has been started.
+TEST(test_dup) {
+  StartSeccompSandbox();
+  // Test a simple syscall that is marked as UNRESTRICTED_SYSCALL.
+  int fd = dup(1);
+  assert(fd >= 0);
+  int rc = close(fd);
+  assert(rc == 0);
+}
+
+// A genuine fault inside the sandbox still kills the process with SIGSEGV.
+TEST(test_segfault) {
+  StartSeccompSandbox();
+  // Check that the sandbox's SIGSEGV handler does not stop the
+  // process from dying cleanly in the event of a real segfault.
+  intend_exit_status(SIGSEGV, true);
+  asm("hlt");
+}
+
+// _exit() inside the sandbox terminates with the requested exit code.
+TEST(test_exit) {
+  StartSeccompSandbox();
+  intend_exit_status(123, false);
+  _exit(123);
+}
+
+// This has an off-by-three error because it counts ".", "..", and the
+// FD for the /proc/self/fd directory. This doesn't matter because it
+// is only used to check for differences in the number of open FDs.
+static int count_fds() {
+  DIR *dir = opendir("/proc/self/fd");
+  assert(dir != NULL);
+  int count = 0;
+  while (1) {
+    struct dirent *d = readdir(dir);
+    if (d == NULL)
+      break;
+    count++;
+  }
+  int rc = closedir(dir);
+  assert(rc == 0);
+  return count;
+}
+
+// Thread body for test_thread: stores 123 through the argument and returns
+// a recognizable value.
+static void *thread_func(void *x) {
+  int *ptr = (int *) x;
+  *ptr = 123;
+  MSG("In new thread\n");
+  return (void *) 456;
+}
+
+TEST(test_thread) {
+  playground::g_policy.allow_file_namespace = true;  // To allow count_fds()
+  StartSeccompSandbox();
+  int fd_count1 = count_fds();
+  pthread_t tid;
+  int x = 999;
+  void *result;
+  pthread_create(&tid, NULL, thread_func, &x);
+  MSG("Waiting for thread\n");
+  pthread_join(tid, &result);
+  assert(result == (void *) 456);
+  assert(x == 123);
+  // Check that the process has not leaked FDs.
+  int fd_count2 = count_fds();
+  assert(fd_count2 == fd_count1);
+}
+
+// Thread body for test_clone: stores 124 through the argument and exits the
+// thread explicitly.
+static int clone_func(void *x) {
+  int *ptr = (int *) x;
+  *ptr = 124;
+  MSG("In thread\n");
+  // On x86-64, returning from this function calls the __NR_exit_group
+  // syscall instead of __NR_exit.
+  syscall(__NR_exit, 100);
+  // Not reached.
+  return 200;
+}
+
+#if defined(__i386__)
+// Returns the current value of the %gs segment register.
+static int get_gs() {
+  int gs;
+  asm volatile("mov %%gs, %0" : "=r"(gs));
+  return gs;
+}
+#endif
+
+// Returns the word stored at offset 0 of the thread-local segment (%fs on
+// x86-64, %gs on i386).
+static void *get_tls_base() {
+  void *base;
+#if defined(__x86_64__)
+  asm volatile("mov %%fs:0, %0" : "=r"(base));
+#elif defined(__i386__)
+  asm volatile("mov %%gs:0, %0" : "=r"(base));
+#else
+#error Unsupported target platform
+#endif
+  return base;
+}
+
+TEST(test_clone) {
+  playground::g_policy.allow_file_namespace = true;  // To allow count_fds()
+  StartSeccompSandbox();
+  int fd_count1 = count_fds();
+  int stack_size = 0x1000;
+  char *stack = (char *) malloc(stack_size);
+  assert(stack != NULL);
+  int flags = CLONE_VM | CLONE_FS | CLONE_FILES |
+    CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+    CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID;
+  int tid = -1;
+  int x = 999;
+
+  // The sandbox requires us to pass CLONE_SETTLS. Pass settings that
+  // are enough to copy the parent thread's TLS setup. This allows us
+  // to invoke libc in the child thread.
+#if defined(__x86_64__)
+  void *tls = get_tls_base();
+#elif defined(__i386__)
+  struct user_desc tls_desc, *tls = &tls_desc;
+  tls_desc.entry_number = get_gs() >> 3;
+  tls_desc.base_addr = (long) get_tls_base();
+  tls_desc.limit = 0xfffff;
+  tls_desc.seg_32bit = 1;
+  tls_desc.contents = 0;
+  tls_desc.read_exec_only = 0;
+  tls_desc.limit_in_pages = 1;
+  tls_desc.seg_not_present = 0;
+  tls_desc.useable = 1;
+#else
+#error Unsupported target platform
+#endif
+
+  int rc = clone(clone_func, (void *) (stack + stack_size), flags, &x,
+                 &tid, tls, &tid);
+  assert(rc > 0);
+  // CLONE_CHILD_CLEARTID makes the kernel zero "tid" and wake the futex when
+  // the child exits.
+  while (tid == rc) {
+    syscall(__NR_futex, &tid, FUTEX_WAIT, rc, NULL);
+  }
+  assert(tid == 0);
+  assert(x == 124);
+  // Check that the process has not leaked FDs.
+  int fd_count2 = count_fds();
+  assert(fd_count2 == fd_count1);
+}
+
+static int uncalled_clone_func(void *x) {
+  printf("In thread func, which shouldn't happen\n");
+  return 1;
+}
+
+TEST(test_clone_disallowed_flags) {
+  StartSeccompSandbox();
+  int stack_size = 4096;
+  char *stack = (char *) malloc(stack_size);
+  assert(stack != NULL);
+  /* We omit the flags CLONE_SETTLS, CLONE_PARENT_SETTID and
+     CLONE_CHILD_CLEARTID. The sandbox disallows clone() without them. */
+  int flags = CLONE_VM | CLONE_FS | CLONE_FILES |
+    CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM;
+  int rc = clone(uncalled_clone_func, (void *) (stack + stack_size),
+                 flags, NULL, NULL, NULL, NULL);
+  assert(rc == -1);
+  assert(errno == EPERM);
+}
+
+// Thread body for test_fp_regs: reads the low 32 bits of %xmm0.
+static void *fp_thread(void *x) {
+  int val;
+  asm("movss %%xmm0, %0" : "=m"(val));
+  MSG("val=%i\n", val);
+  return NULL;
+}
+
+TEST(test_fp_regs) {
+  StartSeccompSandbox();
+  int val = 1234;
+  // "val" is read by the instruction, so it must be an input operand. As an
+  // output operand the compiler may treat the store of 1234 as dead.
+  asm("movss %0, %%xmm0" : : "m"(val));
+  pthread_t tid;
+  pthread_create(&tid, NULL, fp_thread, NULL);
+  pthread_join(tid, NULL);
+  MSG("thread done OK\n");
+}
+
+// Executes rdtsc and returns the 64-bit counter value.
+static long long read_tsc() {
+  long long rc;
+  asm volatile(
+      "rdtsc\n"
+      "mov %%eax, (%0)\n"
+      "mov %%edx, 4(%0)\n"
+      :
+      : "c"(&rc), "a"(-1), "d"(-1));
+  return rc;
+}
+
+TEST(test_rdtsc) {
+  StartSeccompSandbox();
+  // Just check that we can do the instruction.
+  read_tsc();
+}
+
+TEST(test_getpid) {
+  int pid1 = getpid();
+  StartSeccompSandbox();
+  int pid2 = getpid();
+  assert(pid1 == pid2);
+  // Bypass any caching that glibc's getpid() wrapper might do.
+  int pid3 = syscall(__NR_getpid);
+  assert(pid1 == pid3);
+}
+
+TEST(test_gettid) {
+  // glibc doesn't provide a gettid() wrapper.
+  int tid1 = syscall(__NR_gettid);
+  assert(tid1 > 0);
+  StartSeccompSandbox();
+  int tid2 = syscall(__NR_gettid);
+  assert(tid1 == tid2);
+}
+
+// Maps one anonymous read-only page and returns its address.
+static void *map_something() {
+  void *addr = mmap(NULL, 0x1000, PROT_READ,
+                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  assert(addr != MAP_FAILED);
+  return addr;
+}
+
+TEST(test_mmap_disallows_remapping) {
+  void *addr = map_something();
+  StartSeccompSandbox();
+  // Overwriting a mapping that was created before the sandbox was
+  // enabled is not allowed.
+  void *result = mmap(addr, 0x1000, PROT_READ,
+                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+  assert(result == MAP_FAILED);
+  assert(errno == EINVAL);
+}
+
+TEST(test_mmap_disallows_low_address) {
+  StartSeccompSandbox();
+  // Mapping pages at low addresses is not allowed because this helps
+  // with exploiting buggy kernels.
+  void *result = mmap(NULL, 0x1000, PROT_READ,
+                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+  assert(result == MAP_FAILED);
+  assert(errno == EINVAL);
+}
+
+TEST(test_munmap_allowed) {
+  StartSeccompSandbox();
+  void *addr = map_something();
+  int result = munmap(addr, 0x1000);
+  assert(result == 0);
+}
+
+TEST(test_munmap_disallowed) {
+  void *addr = map_something();
+  StartSeccompSandbox();
+  int result = munmap(addr, 0x1000);
+  assert(result == -1);
+  assert(errno == EINVAL);
+}
+
+TEST(test_mprotect_allowed) {
+  StartSeccompSandbox();
+  void *addr = map_something();
+  int result = mprotect(addr, 0x1000, PROT_READ | PROT_WRITE);
+  assert(result == 0);
+}
+
+TEST(test_mprotect_disallowed) {
+  void *addr = map_something();
+  StartSeccompSandbox();
+  int result = mprotect(addr, 0x1000, PROT_READ | PROT_WRITE);
+  assert(result == -1);
+  assert(errno == EINVAL);
+}
+
+// Opens a pseudo-terminal pair and returns the slave side. The master side
+// is deliberately left open.
+static int get_tty_fd() {
+  int master_fd, tty_fd;
+  int rc = openpty(&master_fd, &tty_fd, NULL, NULL, NULL);
+  assert(rc == 0);
+  return tty_fd;
+}
+
+TEST(test_ioctl_tiocgwinsz_allowed) {
+  int tty_fd = get_tty_fd();
+  StartSeccompSandbox();
+  int size[2];
+  // Get terminal width and height.
+  int result = ioctl(tty_fd, TIOCGWINSZ, size);
+  assert(result == 0);
+}
+
+TEST(test_ioctl_disallowed) {
+  int tty_fd = get_tty_fd();
+  StartSeccompSandbox();
+  // This ioctl call inserts a character into the tty's input queue,
+  // which provides a way to send commands to an interactive shell.
+  char c = 'x';
+  int result = ioctl(tty_fd, TIOCSTI, &c);
+  assert(result == -1);
+  assert(errno == EINVAL);
+}
+
+TEST(test_socket) {
+  StartSeccompSandbox();
+  int fd = socket(AF_UNIX, SOCK_STREAM, 0);
+  assert(fd == -1);
+  // TODO: Make it consistent between i386 and x86-64.
+  assert(errno == EINVAL || errno == ENOSYS);
+}
+
+TEST(test_open_disabled) {
+  StartSeccompSandbox();
+  int fd = open("/dev/null", O_RDONLY);
+  assert(fd == -1);
+  assert(errno == EACCES);
+
+  // Writing to the policy flag does not change this.
+  playground::g_policy.allow_file_namespace = true;
+  fd = open("/dev/null", O_RDONLY);
+  assert(fd == -1);
+  assert(errno == EACCES);
+}
+
+TEST(test_open_enabled) {
+  playground::g_policy.allow_file_namespace = true;
+  StartSeccompSandbox();
+  int fd = open("/dev/null", O_RDONLY);
+  assert(fd >= 0);
+  int rc = close(fd);
+  assert(rc == 0);
+  fd = open("/dev/null", O_WRONLY);
+  assert(fd == -1);
+  assert(errno == EACCES);
+}
+
+TEST(test_access_disabled) {
+  StartSeccompSandbox();
+  int rc = access("/dev/null", R_OK);
+  assert(rc == -1);
+  assert(errno == EACCES);
+}
+
+TEST(test_access_enabled) {
+  playground::g_policy.allow_file_namespace = true;
+  StartSeccompSandbox();
+  int rc = access("/dev/null", R_OK);
+  assert(rc == 0);
+  rc = access("path-that-does-not-exist", R_OK);
+  assert(rc == -1);
+  assert(errno == ENOENT);
+}
+
+TEST(test_stat_disabled) {
+  StartSeccompSandbox();
+  struct stat st;
+  int rc = stat("/dev/null", &st);
+  assert(rc == -1);
+  assert(errno == EACCES);
+}
+
+TEST(test_stat_enabled) {
+  playground::g_policy.allow_file_namespace = true;
+  StartSeccompSandbox();
+  struct stat st;
+  int rc = stat("/dev/null", &st);
+  assert(rc == 0);
+  rc = stat("path-that-does-not-exist", &st);
+  assert(rc == -1);
+  assert(errno == ENOENT);
+}
+
+// Written by the signal handlers below and checked by the tests after the
+// faulting instruction. It is volatile so that the compiler re-reads it after
+// the inline asm that raises the signal.
+static volatile int g_value;
+
+static void signal_handler(int sig) {
+  g_value = 300;
+  MSG("In signal handler\n");
+}
+
+static void sigaction_handler(int sig, siginfo_t *a, void *b) {
+  g_value = 300;
+  MSG("In sigaction handler\n");
+}
+
+// Handler that the assembly trampoline in generic_signal_handler() calls.
+static void (*g_sig_handler_ptr)(int sig, void *addr) asm("g_sig_handler_ptr");
+
+static void non_fatal_sig_handler(int sig, void *addr) {
+  g_value = 300;
+  MSG("Caught signal %d at %p\n", sig, addr);
+}
+
+static void fatal_sig_handler(int sig, void *addr) {
+  // Recursively trigger another segmentation fault while already in the SEGV
+  // handler. This should terminate the program if SIGSEGV is marked as a
+  // deferred signal.
+  // Only do this on the first entry to this function. Otherwise, the signal
+  // handler was probably marked as SA_NODEFER and we want to continue
+  // execution.
+  if (!g_value++) {
+    MSG("Caught signal %d at %p\n", sig, addr);
+    if (sig == SIGSEGV) {
+      asm volatile("hlt");
+    } else {
+      asm volatile("int3");
+    }
+  }
+}
+
+// Returns the address of an assembly trampoline that can be installed as a
+// signal handler. The trampoline steps the saved instruction pointer past a
+// faulting "hlt" instruction and then calls *g_sig_handler_ptr with the
+// signal number and the original instruction pointer.
+static void (*generic_signal_handler(void))
+  (int signo, siginfo_t *info, void *context) {
+  void (*hdl)(int, siginfo_t *, void *);
+  asm volatile(
+    "lea  0f, %0\n"
+    "jmp  999f\n"
+  "0:\n"
+
+#if defined(__x86_64__)
+    "mov  0xB0(%%rsp), %%rsi\n"    // Pass original %rip to signal handler
+    "cmpb $0xF4, 0(%%rsi)\n"       // hlt
+    "jnz  1f\n"
+    "addq $1, 0xB0(%%rsp)\n"       // Adjust %eip past failing instruction
+  "1:jmp  *g_sig_handler_ptr\n"    // Call actual signal handler
+#elif defined(__i386__)
+    // TODO(markus): We currently don't guarantee that signal handlers always
+    //               have the correct "magic" restorer function. If we fix
+    //               this, we should add a test for it (both for SEGV and
+    //               non-SEGV).
+    "cmpw $0, 0xA(%%esp)\n"
+    "lea  0x40(%%esp), %%eax\n"    // %eip at time of exception
+    "jz   1f\n"
+    "add  $0x9C, %%eax\n"          // %eip at time of exception
+  "1:mov  0(%%eax), %%ecx\n"
+    "cmpb $0xF4, 0(%%ecx)\n"       // hlt
+    "jnz  2f\n"
+    "addl $1, 0(%%eax)\n"          // Adjust %eip past failing instruction
+  "2:push %%ecx\n"                 // Pass original %eip to signal handler
+    "mov  8(%%esp), %%eax\n"
+    "push %%eax\n"                 // Pass signal number to signal handler
+    "call *g_sig_handler_ptr\n"    // Call actual signal handler
+    "pop  %%eax\n"
+    "pop  %%ecx\n"
+    "ret\n"
+#else
+#error Unsupported target platform
+#endif
+
+"999:\n"
+    : "=r"(hdl));
+  return hdl;
+}
+
+TEST(test_signal_handler) {
+  sighandler_t result = signal(SIGTRAP, signal_handler);
+  assert(result != SIG_ERR);
+
+  StartSeccompSandbox();
+
+  result = signal(SIGTRAP, signal_handler);
+  assert(result != SIG_ERR);
+
+  g_value = 200;
+  asm("int3");
+  assert(g_value == 300);
+}
+
+TEST(test_sigaction_handler) {
+  struct sigaction act;
+  act.sa_sigaction = sigaction_handler;
+  sigemptyset(&act.sa_mask);
+  act.sa_flags = SA_SIGINFO;
+  int rc = sigaction(SIGTRAP, &act, NULL);
+  assert(rc == 0);
+
+  StartSeccompSandbox();
+
+  rc = sigaction(SIGTRAP, &act, NULL);
+  assert(rc == 0);
+
+  g_value = 200;
+  asm("int3");
+  assert(g_value == 300);
+}
+
+TEST(test_blocked_signal) {
+  sighandler_t result = signal(SIGTRAP, signal_handler);
+  assert(result != SIG_ERR);
+  StartSeccompSandbox();
+
+  // Initially the signal should not be blocked.
+  sigset_t sigs;
+  sigfillset(&sigs);
+  int rc = sigprocmask(0, NULL, &sigs);
+  assert(rc == 0);
+  assert(!sigismember(&sigs, SIGTRAP));
+
+  sigemptyset(&sigs);
+  sigaddset(&sigs, SIGTRAP);
+  rc = sigprocmask(SIG_BLOCK, &sigs, NULL);
+  assert(rc == 0);
+
+  // Check that we can read back the blocked status.
+  sigemptyset(&sigs);
+  rc = sigprocmask(0, NULL, &sigs);
+  assert(rc == 0);
+  assert(sigismember(&sigs, SIGTRAP));
+
+  // Check that the signal handler really is blocked.
+  intend_exit_status(SIGTRAP, true);
+  asm("int3");
+}
+
+TEST(test_sigaltstack) {
+  // The sandbox does not support sigaltstack() yet. Just test that
+  // it returns an error.
+  StartSeccompSandbox();
+  stack_t st;
+  st.ss_size = 0x4000;
+  st.ss_sp = malloc(st.ss_size);
+  assert(st.ss_sp != NULL);
+  st.ss_flags = 0;
+  int rc = sigaltstack(&st, NULL);
+  assert(rc == -1);
+  assert(errno == ENOSYS);
+}
+
+TEST(test_sa_flags) {
+  StartSeccompSandbox();
+  int flags[4] = { 0, SA_NODEFER, SA_SIGINFO, SA_SIGINFO | SA_NODEFER };
+  for (int i = 0; i < 4; ++i) {
+    struct sigaction sa;
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_sigaction = generic_signal_handler();
+    g_sig_handler_ptr = non_fatal_sig_handler;
+    sa.sa_flags = flags[i];
+
+    // Test SEGV handling
+    g_value = 200;
+    sigaction(SIGSEGV, &sa, NULL);
+    asm volatile("hlt");
+    assert(g_value == 300);
+
+    // Test non-SEGV handling
+    g_value = 200;
+    sigaction(SIGTRAP, &sa, NULL);
+    asm volatile("int3");
+    assert(g_value == 300);
+  }
+}
+
+TEST(test_segv_defer) {
+  StartSeccompSandbox();
+  struct sigaction sa;
+  memset(&sa, 0, sizeof(sa));
+  sa.sa_sigaction = generic_signal_handler();
+  g_sig_handler_ptr = fatal_sig_handler;
+
+  // Test non-deferred SEGV (should continue execution)
+  sa.sa_flags = SA_NODEFER;
+  sigaction(SIGSEGV, &sa, NULL);
+  g_value = 0;
+  asm volatile("hlt");
+
+  // Test deferred SEGV (should terminate program)
+  sa.sa_flags = 0;
+  sigaction(SIGSEGV, &sa, NULL);
+  g_value = 0;
+  intend_exit_status(SIGSEGV, true);
+  asm volatile("hlt");
+}
+
+TEST(test_trap_defer) {
+  StartSeccompSandbox();
+  struct sigaction sa;
+  memset(&sa, 0, sizeof(sa));
+  sa.sa_sigaction = generic_signal_handler();
+  g_sig_handler_ptr = fatal_sig_handler;
+
+  // Test non-deferred TRAP (should continue execution)
+  sa.sa_flags = SA_NODEFER;
+  sigaction(SIGTRAP, &sa, NULL);
+  g_value = 0;
+  asm volatile("int3");
+
+  // Test deferred TRAP (should terminate program)
+  sa.sa_flags = 0;
+  sigaction(SIGTRAP, &sa, NULL);
+  g_value = 0;
+  intend_exit_status(SIGTRAP, true);
+  asm volatile("int3");
+}
+
+TEST(test_segv_resethand) {
+  StartSeccompSandbox();
+  struct sigaction sa;
+  memset(&sa, 0, sizeof(sa));
+  sa.sa_sigaction = generic_signal_handler();
+  g_sig_handler_ptr = non_fatal_sig_handler;
+  sa.sa_flags = SA_RESETHAND;
+  sigaction(SIGSEGV, &sa, NULL);
+
+  // Test first invocation of signal handler (should continue execution)
+  asm volatile("hlt");
+
+  // Test second invocation of signal handler (should terminate program)
+  intend_exit_status(SIGSEGV, true);
+  asm volatile("hlt");
+}
+
+TEST(test_trap_resethand) {
+  StartSeccompSandbox();
+  struct sigaction sa;
+  memset(&sa, 0, sizeof(sa));
+  sa.sa_sigaction = generic_signal_handler();
+  g_sig_handler_ptr = non_fatal_sig_handler;
+  sa.sa_flags = SA_RESETHAND;
+  sigaction(SIGTRAP, &sa, NULL);
+
+  // Test first invocation of signal handler (should continue execution)
+  asm volatile("int3");
+
+  // Test second invocation of signal handler (should terminate program)
+  intend_exit_status(SIGTRAP, true);
+  asm volatile("int3");
+}
+
+// One entry of the test registry: the test's name and its entry point.
+struct testcase {
+  const char *test_name;
+  void (*test_func)();
+};
+
+// Registry of all tests, terminated by a { NULL, NULL } entry.
+struct testcase all_tests[] = {
+#include "test-list.h"
+  { NULL, NULL },
+};
+
+// Runs one test in a forked child and compares the child's wait() status
+// with the status the child announced over a pipe. Returns 0 if they match
+// and 1 otherwise.
+static int run_test_forked(struct testcase *test) {
+  printf("** %s\n", test->test_name);
+  int pipe_fds[2];
+  int rc = pipe(pipe_fds);
+  assert(rc == 0);
+  int pid = fork();
+  // Fail here rather than later in waitpid() if the fork did not succeed.
+  assert(pid >= 0);
+  if (pid == 0) {
+    rc = close(pipe_fds[0]);
+    assert(rc == 0);
+    g_intended_status_fd = pipe_fds[1];
+
+    test->test_func();
+    intend_exit_status(0, false);
+    _exit(0);
+  }
+  rc = close(pipe_fds[1]);
+  assert(rc == 0);
+
+  int intended_status;
+  int got = read(pipe_fds[0], &intended_status, sizeof(intended_status));
+  bool got_intended_status = got == sizeof(intended_status);
+  if (!got_intended_status) {
+    printf("Test runner: Did not receive intended status\n");
+  }
+  // Release the read end, so that the runner does not accumulate one open
+  // descriptor per test and pass it on to every later child.
+  rc = close(pipe_fds[0]);
+  assert(rc == 0);
+
+  int status;
+  int pid2 = waitpid(pid, &status, 0);
+  assert(pid2 == pid);
+  if (!got_intended_status) {
+    printf("Test returned exit status %i\n", status);
+    return 1;
+  }
+  else if ((status & ~WCOREFLAG) != intended_status) {
+    printf("Test failed with exit status %i, expected %i\n",
+           status, intended_status);
+    return 1;
+  }
+  else {
+    return 0;
+  }
+}
+
+// Runs the named test in the current process. Returns 0 if the test was
+// found and returned, and 1 if no test has that name.
+static int run_test_by_name(const char *name) {
+  struct testcase *test;
+  for (test = all_tests; test->test_name != NULL; test++) {
+    if (strcmp(name, test->test_name) == 0) {
+      printf("Running test %s...\n", name);
+      test->test_func();
+      printf("OK\n");
+      return 0;
+    }
+  }
+  fprintf(stderr, "Test '%s' not found\n", name);
+  return 1;
+}
+
+int main(int argc, char **argv) {
+  setvbuf(stdout, NULL, _IONBF, 0);
+  setvbuf(stderr, NULL, _IONBF, 0);
+  if (argc == 2) {
+    // Run one test without forking, to aid debugging.
+    return run_test_by_name(argv[1]);
+  }
+  else if (argc > 2) {
+    // TODO: run multiple tests.
+    fprintf(stderr, "Too many arguments\n");
+    return 1;
+  }
+  else {
+    // Run all tests.
+    struct testcase *test;
+    int failures = 0;
+    for (test = all_tests; test->test_name != NULL; test++) {
+      failures += run_test_forked(test);
+    }
+    if (failures == 0) {
+      printf("OK\n");
+      return 0;
+    }
+    else {
+      printf("%i FAILURE(S)\n", failures);
+      return 1;
+    }
+  }
+}
diff --git a/sandbox/linux/seccomp/timestats.cc b/sandbox/linux/seccomp/timestats.cc
new file mode 100644
index 0000000..5d9b66a
--- /dev/null
+++ b/sandbox/linux/seccomp/timestats.cc
@@ -0,0 +1,191 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Helper program to analyze the time that Chrome's renderers spend in system
+// calls. Start Chrome like this:
+//
+// SECCOMP_SANDBOX_DEBUGGING=1 chrome --enable-seccomp-sandbox 2>&1 | timestats
+//
+// The program prints CPU time (0-100%) spent within system calls. This gives
+// a general idea of where it is worthwhile to spend effort optimizing Chrome.
#include <set>
#include <vector>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

static const int kAvgWindowSizeMs = 500;
static const int kPeakWindowSizeMs = 2*1000;

// Class containing information on a single system call. Most notably, it
// contains the time when the system call happened, and the time that it
// took to complete.
class Datum {
  friend class Data;
 public:
  // Records "name" and "ms" and stamps the entry with the current wall-clock
  // time in milliseconds. Only the pointer is stored for "name"; the string
  // itself is not copied.
  Datum(const char* name, double ms)
      : name_(name),
        ms_(ms) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    timestamp_ = tv.tv_sec*1000.0 + tv.tv_usec/1000.0;
  }
  virtual ~Datum() { }

  // Difference between the two timestamps, in milliseconds.
  double operator-(const Datum& b) const {
    return timestamp_ - b.timestamp_;
  }

 protected:
  const char* name_;
  double      ms_;
  double      timestamp_;
};

// Class containing data on the most recent system calls. It maintains
// sliding averages for total CPU time used, and it also maintains a peak
// CPU usage. The peak usage is usually updated slower than the average
// usage, as that makes it easier to inspect visually.
class Data {
 public:
  Data() { }
  virtual ~Data() { }

  // Adds one system call that took "ms" milliseconds, drops entries that
  // have left the averaging and peak windows, and redraws the one-line
  // status display on stdout (terminated by '\r', so it overwrites itself).
  void addData(const char* name, double ms) {
    const Datum sample(name, ms);
    average_.push_back(sample);
    peak_.push_back(sample);

    // Prune entries outside of the window. Both loops terminate at the
    // newest entry at the latest, because its distance to itself is zero.
    std::vector<Datum>::iterator iter;
    for (iter = average_.begin();
         average_.back() - *iter > kAvgWindowSizeMs;
         ++iter) {
    }
    average_.erase(average_.begin(), iter);

    for (iter = peak_.begin();
         peak_.back() - *iter > kPeakWindowSizeMs;
         ++iter) {
    }
    peak_.erase(peak_.begin(), iter);

    // Add the total usage of all system calls inside of the window
    double total = 0;
    for (iter = average_.begin(); iter != average_.end(); ++iter) {
      total += iter->ms_;
    }

    // Compute the peak CPU usage during the last window: slide a window of
    // kAvgWindowSizeMs over the longer peak history and keep the largest sum.
    double running = 0;
    double peakMax = 0;
    std::vector<Datum>::iterator tail = peak_.begin();
    for (iter = tail; iter != peak_.end(); ++iter) {
      while (*iter - *tail > kAvgWindowSizeMs) {
        running -= tail->ms_;
        ++tail;
      }
      running += iter->ms_;
      if (running > peakMax) {
        peakMax = running;
      }
    }

    // Print the average CPU usage in the last window. The buffer holds the
    // visible line plus the trailing '\r'. snprintf() is limited to the
    // visible part, so an unexpectedly wide number cannot overrun it.
    static const int kLineWidth = 80;
    char buf[kLineWidth + 1];
    total   *= 100.0/kAvgWindowSizeMs;
    peakMax *= 100.0/kAvgWindowSizeMs;
    int len = snprintf(buf, kLineWidth, "%6.2f%% (peak=%6.2f%%) ",
                       total, peakMax);
    if (len < 0) {
      len = 0;
    } else if (len > kLineWidth - 1) {
      len = kLineWidth - 1;
    }

    // Animate the actual usage, displaying both average and peak values.
    // Values above 100% are clamped to the right edge of the display.
    const int space = kLineWidth - len - 1;
    const int mark  = scaleToColumns(total, space);
    const int bar   = scaleToColumns(peakMax, space);
    for (int i = 0; i < mark; ++i) {
      buf[len++] = '*';
    }
    if (mark == bar) {
      if (bar) {
        // The peak marker replaces the last '*'.
        len--;
      }
    } else {
      for (int i = 0; i < bar - mark - 1; ++i) {
        buf[len++] = ' ';
      }
    }
    buf[len++] = '|';
    while (len < kLineWidth) {
      buf[len++] = ' ';
    }
    buf[len++] = '\r';
    fwrite(buf, len, 1, stdout);
    fflush(stdout);
  }

 private:
  // Converts a percentage into a column count in [0, space], rounding the
  // same way as the display always did for in-range values.
  static int scaleToColumns(double percent, int space) {
    const double columns = (percent * space + 50)/100;
    if (!(columns > 0)) {
      return 0;
    }
    if (columns >= space) {
      return space;
    }
    return static_cast<int>(columns);
  }

  std::vector<Datum> average_;
  std::vector<Datum> peak_;
};
static Data data;
while (fgets(buf, sizeof(buf), stdin)) { + // Allow longer delays for expensive system calls + if (strstr(buf, "This is an expensive system call")) { + expensive = true; + continue; + } + + // Parse the string and extract the elapsed time + const char elapsed[] = "Elapsed time: "; + char* ms_string = strstr(buf, elapsed); + char* endptr; + double ms; + char* colon = strchr(buf, ':'); + + // If this string doesn't match, then it must be some other type of + // message. Just ignore it. + // It is quite likely that we will regularly encounter debug messages + // that either should be parsed by a completely different tool, or + // messages that were intended for humans to read. + if (!ms_string || + ((ms = strtod(ms_string + sizeof(elapsed) - 1, &endptr)), + endptr == ms_string) || + !colon) { + continue; + } + + // Filter out system calls that were probably just blocking + // TODO(markus): automatically compute the cut-off for blocking calls + if (!expensive && ms > 0.05) { + continue; + } + expensive = false; + + // Extract the name of the system call + *colon = '\000'; + + // Add the data point and update the display + data.addData(buf, ms); + } + puts(""); + return 0; +} diff --git a/sandbox/linux/seccomp/tls.h b/sandbox/linux/seccomp/tls.h new file mode 100644 index 0000000..7ec5a28 --- /dev/null +++ b/sandbox/linux/seccomp/tls.h @@ -0,0 +1,155 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TLS_H__ +#define TLS_H__ + +#include <asm/ldt.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/prctl.h> + +namespace playground { + +class TLS { + private: + class SysCalls { + public: + #define SYS_CPLUSPLUS + #define SYS_ERRNO my_errno + #define SYS_INLINE inline + #define SYS_PREFIX -1 + #undef SYS_LINUX_SYSCALL_SUPPORT_H + #include "linux_syscall_support.h" + SysCalls() : my_errno(0) { } + int my_errno; + }; + + public: + static void *allocateTLS() { + SysCalls sys; + #if defined(__x86_64__) + void *addr = sys.mmap(0, 4096, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (sys.arch_prctl(ARCH_SET_GS, addr) < 0) { + return NULL; + } + #elif defined(__i386__) + void *addr = sys.mmap2(0, 4096, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + struct user_desc u; + u.entry_number = (typeof u.entry_number)-1; + u.base_addr = (int)addr; + u.limit = 0xfffff; + u.seg_32bit = 1; + u.contents = 0; + u.read_exec_only = 0; + u.limit_in_pages = 1; + u.seg_not_present = 0; + u.useable = 1; + if (sys.set_thread_area(&u) < 0) { + return NULL; + } + asm volatile( + "movw %w0, %%fs" + : + : "q"(8*u.entry_number+3)); + #else + #error Unsupported target platform + #endif + return addr; + } + + static void freeTLS() { + SysCalls sys; + void *addr; + #if defined(__x86_64__) + sys.arch_prctl(ARCH_GET_GS, &addr); + #elif defined(__i386__) + struct user_desc u; + sys.get_thread_area(&u); + addr = (void *)u.base_addr; + #else + #error Unsupported target platform + #endif + sys.munmap(addr, 4096); + } + + template<class T> static inline bool setTLSValue(int idx, T val) { + #if defined(__x86_64__) + if (idx < 0 || idx >= 4096/8) { + return false; + } + asm volatile( + "movq %0, %%gs:(%1)\n" + : + : "q"((void *)val), "q"(8ll * idx)); + #elif defined(__i386__) + if (idx < 0 || idx >= 4096/8) { + return false; + } + if (sizeof(T) == 8) { + asm volatile( + "movl %0, %%fs:(%1)\n" + : + : "r"((unsigned)val), "r"(8 * idx)); + asm volatile( + 
"movl %0, %%fs:(%1)\n" + : + : "r"((unsigned)((unsigned long long)val >> 32)), "r"(8 * idx + 4)); + } else { + asm volatile( + "movl %0, %%fs:(%1)\n" + : + : "r"(val), "r"(8 * idx)); + } + #else + #error Unsupported target platform + #endif + return true; + } + + template<class T> static inline T getTLSValue(int idx) { + #if defined(__x86_64__) + long long rc; + if (idx < 0 || idx >= 4096/8) { + return 0; + } + asm volatile( + "movq %%gs:(%1), %0\n" + : "=q"(rc) + : "q"(8ll * idx)); + return (T)rc; + #elif defined(__i386__) + if (idx < 0 || idx >= 4096/8) { + return 0; + } + if (sizeof(T) == 8) { + unsigned lo, hi; + asm volatile( + "movl %%fs:(%1), %0\n" + : "=r"(lo) + : "r"(8 * idx)); + asm volatile( + "movl %%fs:(%1), %0\n" + : "=r"(hi) + : "r"(8 * idx + 4)); + return (T)((unsigned long long)lo + ((unsigned long long)hi << 32)); + } else { + long rc; + asm volatile( + "movl %%fs:(%1), %0\n" + : "=r"(rc) + : "r"(8 * idx)); + return (T)rc; + } + #else + #error Unsupported target platform + #endif + } + +}; + +} // namespace +#endif diff --git a/sandbox/linux/seccomp/trusted_process.cc b/sandbox/linux/seccomp/trusted_process.cc new file mode 100644 index 0000000..5c62b0f --- /dev/null +++ b/sandbox/linux/seccomp/trusted_process.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include <dirent.h> +#include <map> + +#include "debug.h" +#include "sandbox_impl.h" +#include "syscall_table.h" + +namespace playground { + +struct SandboxPolicy g_policy; + +struct Thread { + int fdPub, fd; + SecureMem::Args* mem; +}; + +SecureMem::Args* Sandbox::getNewSecureMem() { + if (!secureMemPool_.empty()) { + SecureMem::Args* rc = secureMemPool_.back(); + secureMemPool_.pop_back(); + memset(rc->scratchPage, 0, sizeof(rc->scratchPage)); + return rc; + } + return NULL; +} + +void Sandbox::trustedProcess(int parentMapsFd, int processFdPub, int sandboxFd, + int cloneFd, SecureMem::Args* secureArena) { + // The trusted process doesn't have access to TLS. Zero out the segment + // registers so that we can later test that we are in the trusted process. + #if defined(__x86_64__) + asm volatile("mov %0, %%gs\n" : : "r"(0)); + #elif defined(__i386__) + asm volatile("mov %0, %%fs\n" : : "r"(0)); + #else + #error Unsupported target platform + #endif + + std::map<long long, struct Thread> threads; + SysCalls sys; + long long cookie = 0; + + // The very first entry in the secure memory arena has been assigned to the + // initial thread. The remaining entries are available for allocation. + SecureMem::Args* startAddress = secureArena; + SecureMem::Args* nextThread = startAddress; + for (int i = 0; i < kMaxThreads-1; i++) { + secureMemPool_.push_back(++startAddress); + } + +newThreadCreated: + // Receive information from newly created thread + Thread *newThread = &threads[++cookie]; + memset(newThread, 0, sizeof(Thread)); + struct { + SecureMem::Args* self; + int tid; + int fdPub; + } __attribute__((packed)) data; + + size_t dataLen = sizeof(data); + if (!getFd(cloneFd, &newThread->fdPub, &newThread->fd, &data, &dataLen) || + dataLen != sizeof(data)) { + // We get here either because the sandbox got corrupted, or because our + // parent process has terminated. 
// Main loop of the trusted process.
//
// First registers a newly created thread: it reads the thread's announcement
// from cloneFd, verifies it against the secure memory entry that was set
// aside for that thread, and fills in the entry. It then serves system call
// requests arriving on sandboxFd by dispatching them through syscallTable.
// A successful clone() request jumps back to the registration step; an exit()
// request releases the thread's descriptors and returns its secure memory
// entry to secureMemPool_.
//
// There is no return statement: the dispatch loop is unconditional, and
// every failure path calls die().
//
// parentMapsFd, sandboxFd: passed through to the per-syscall handlers;
//                          sandboxFd is also where requests are read from.
// processFdPub:            not referenced in this function body.
// cloneFd:                 where new threads announce themselves.
// secureArena:             first of kMaxThreads secure memory entries.
void Sandbox::trustedProcess(int parentMapsFd, int processFdPub, int sandboxFd,
                             int cloneFd, SecureMem::Args* secureArena) {
  // The trusted process doesn't have access to TLS. Zero out the segment
  // registers so that we can later test that we are in the trusted process.
  #if defined(__x86_64__)
  asm volatile("mov %0, %%gs\n" : : "r"(0));
  #elif defined(__i386__)
  asm volatile("mov %0, %%fs\n" : : "r"(0));
  #else
  #error Unsupported target platform
  #endif

  // Known threads, keyed by the cookie handed out at registration.
  std::map<long long, struct Thread> threads;
  SysCalls  sys;
  long long cookie               = 0;

  // The very first entry in the secure memory arena has been assigned to the
  // initial thread. The remaining entries are available for allocation.
  SecureMem::Args* startAddress  = secureArena;
  SecureMem::Args* nextThread    = startAddress;
  for (int i = 0; i < kMaxThreads-1; i++) {
    secureMemPool_.push_back(++startAddress);
  }

newThreadCreated:
  // Receive information from newly created thread. Each registration uses a
  // fresh cookie; operator[] creates the map entry.
  Thread *newThread              = &threads[++cookie];
  memset(newThread, 0, sizeof(Thread));
  struct {
    SecureMem::Args* self;
    int              tid;
    int              fdPub;
  } __attribute__((packed)) data;

  // NOTE(review): getFd() presumably receives passed file descriptors along
  // with a data payload and updates dataLen -- confirm in its definition.
  size_t dataLen                 = sizeof(data);
  if (!getFd(cloneFd, &newThread->fdPub, &newThread->fd, &data, &dataLen) ||
      dataLen != sizeof(data)) {
    // We get here either because the sandbox got corrupted, or because our
    // parent process has terminated.
    if (newThread->fdPub || dataLen) {
      die("Failed to receive new thread information");
    }
    die();
  }
  if (data.self != nextThread) {
    // The only potentially security critical information received from the
    // newly created thread is "self". The "tid" is for informational purposes
    // (and for use in the new thread's TLS), and "fdPub" is uncritical as all
    // file descriptors are considered untrusted.
    // Thus, we only use "self" for a sanity check, but don't actually trust
    // it beyond that.
    die("Received corrupted thread information");
  }
  newThread->mem                 = nextThread;

  // Set up TLS area and let thread know that the data is now ready
  nextThread->cookie             = cookie;
  nextThread->threadId           = data.tid;
  nextThread->threadFdPub        = data.fdPub;
  // A single byte serves as the "ready" notification; the result of the
  // write is not checked.
  write(sys, newThread->fd, "", 1);

  // Dispatch system calls that have been forwarded from the trusted thread(s).
  for (;;) {
    // Request header: system call number plus the sender's cookie.
    struct {
      unsigned int sysnum;
      long long    cookie;
    } __attribute__((packed)) header;

    int rc;
    if ((rc = read(sys, sandboxFd, &header, sizeof(header))) !=sizeof(header)){
      // A result of zero ends the process without a message; anything else
      // is reported as an error.
      if (rc) {
        die("Failed to read system call number and thread id");
      }
      die();
    }
    std::map<long long, struct Thread>::iterator iter =
                                                   threads.find(header.cookie);
    if (iter == threads.end()) {
      die("Received request from unknown thread");
    }
    struct Thread* currentThread = &iter->second;
    // Reject numbers beyond the table and entries without a handler.
    if (header.sysnum > maxSyscall ||
        !syscallTable[header.sysnum].trustedProcess) {
      die("Trusted process encountered unexpected system call");
    }

    // Dispatch system call to handler function. Treat both exit() and clone()
    // specially.
    if (syscallTable[header.sysnum].trustedProcess(parentMapsFd,
                                                   sandboxFd,
                                                   currentThread->fdPub,
                                                   currentThread->fd,
                                                   currentThread->mem) &&
        header.sysnum == __NR_clone) {
      // The handler reported success for clone(): expect the new thread to
      // announce itself with the entry stored in newSecureMem.
      nextThread = currentThread->mem->newSecureMem;
      goto newThreadCreated;
    } else if (header.sysnum == __NR_exit) {
      // Forget the thread and make its secure memory entry available again.
      NOINTR_SYS(sys.close(iter->second.fdPub));
      NOINTR_SYS(sys.close(iter->second.fd));
      SecureMem::Args* secureMem = currentThread->mem;
      threads.erase(iter);
      secureMemPool_.push_back(secureMem);
    }
  }
}
+ if (syscallTable[header.sysnum].trustedProcess(parentMapsFd, + sandboxFd, + currentThread->fdPub, + currentThread->fd, + currentThread->mem) && + header.sysnum == __NR_clone) { + nextThread = currentThread->mem->newSecureMem; + goto newThreadCreated; + } else if (header.sysnum == __NR_exit) { + NOINTR_SYS(sys.close(iter->second.fdPub)); + NOINTR_SYS(sys.close(iter->second.fd)); + SecureMem::Args* secureMem = currentThread->mem; + threads.erase(iter); + secureMemPool_.push_back(secureMem); + } + } +} + +int Sandbox::initializeProtectedMap(int fd) { + int mapsFd; + if (!getFd(fd, &mapsFd, NULL, NULL, NULL)) { + maps_failure: + die("Cannot access /proc/self/maps"); + } + + // Read the memory mappings as they were before the sandbox takes effect. + // These mappings cannot be changed by the sandboxed process. + char line[80]; + FILE *fp = fdopen(mapsFd, "r"); + for (bool truncated = false;;) { + if (fgets(line, sizeof(line), fp) == NULL) { + if (feof(fp) || errno != EINTR) { + break; + } + continue; + } + if (!truncated) { + unsigned long start, stop; + char *ptr = line; + errno = 0; + start = strtoul(ptr, &ptr, 16); + if (errno || *ptr++ != '-') { + parse_failure: + die("Failed to parse /proc/self/maps"); + } + stop = strtoul(ptr, &ptr, 16); + if (errno || *ptr++ != ' ') { + goto parse_failure; + } + protectedMap_[reinterpret_cast<void *>(start)] = stop - start; + } + truncated = strchr(line, '\n') == NULL; + } + + // Prevent low address memory allocations. Some buggy kernels allow those + if (protectedMap_[0] < (64 << 10)) { + protectedMap_[0] = 64 << 10; + } + + // Let the sandbox know that we are done parsing the memory map. + SysCalls sys; + if (write(sys, fd, &mapsFd, sizeof(mapsFd)) != sizeof(mapsFd)) { + goto maps_failure; + } + + return mapsFd; +} + +SecureMem::Args* Sandbox::createTrustedProcess(int processFdPub, int sandboxFd, + int cloneFdPub, int cloneFd) { + // Allocate memory that will be used by an arena for storing the secure + // memory. 
While we allow this memory area to be empty at times (e.g. when + // not all threads are in use), we make sure that it never gets overwritten + // by user-allocated memory. This happens in initializeProtectedMap() and + // snapshotMemoryMappings(). + SecureMem::Args* secureArena = reinterpret_cast<SecureMem::Args*>( + mmap(NULL, 8192*kMaxThreads, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS, -1, 0)); + if (secureArena == MAP_FAILED) { + die("Failed to allocate secure memory arena"); + } + + // Set up the mutex to be accessible from the trusted process and from + // children of the trusted thread(s) + if (mmap(&syscall_mutex_, 4096, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, -1, 0) != &syscall_mutex_) { + die("Failed to initialize secure mutex"); + } + syscall_mutex_ = 0x80000000; + + + // Create a trusted process that can evaluate system call parameters and + // decide whether a system call should execute. This process runs outside of + // the seccomp sandbox. It communicates with the sandbox'd process through + // a socketpair() and through securely shared memory. + pid_t pid = fork(); + if (pid < 0) { + die("Failed to create trusted process"); + } + if (!pid) { + // Close all file handles except for sandboxFd, cloneFd, and stdio + DIR *dir = opendir("/proc/self/fd"); + if (dir == 0) { + // If we don't know the list of our open file handles, just try closing + // all valid ones. 
+ for (int fd = sysconf(_SC_OPEN_MAX); --fd > 2; ) { + if (fd != sandboxFd && fd != cloneFd) { + close(fd); + } + } + } else { + // If available, if is much more efficient to just close the file + // handles that show up in /proc/self/fd/ + struct dirent de, *res; + while (!readdir_r(dir, &de, &res) && res) { + if (res->d_name[0] < '0') + continue; + int fd = atoi(res->d_name); + if (fd > 2 && + fd != sandboxFd && fd != cloneFd && fd != dirfd(dir)) { + close(fd); + } + } + closedir(dir); + } + + // Initialize secure memory used for threads + for (int i = 0; i < kMaxThreads; i++) { + SecureMem::Args* args = secureArena + i; + args->self = args; + #ifndef NDEBUG + args->allowAllSystemCalls= Debug::isEnabled(); + #endif + } + + int parentMapsFd = initializeProtectedMap(sandboxFd); + trustedProcess(parentMapsFd, processFdPub, sandboxFd, + cloneFd, secureArena); + die(); + } + + // We are still in the untrusted code. Deny access to restricted resources. + mprotect(secureArena, 8192*kMaxThreads, PROT_NONE); + mprotect(&syscall_mutex_, 4096, PROT_NONE); + close(sandboxFd); + + return secureArena; +} + +} // namespace diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc new file mode 100644 index 0000000..6d6a3f5 --- /dev/null +++ b/sandbox/linux/seccomp/trusted_thread.cc @@ -0,0 +1,1483 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "sandbox_impl.h" +#include "syscall_table.h" + +namespace playground { + +void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, + SecureMem::Args* secureMem) { + SecureMem::Args args = { { { { { 0 } } } } }; + args.self = &args; + args.newSecureMem = secureMem; + args.processFdPub = processFdPub; + args.cloneFdPub = cloneFdPub; +#if defined(__x86_64__) + asm volatile( + "push %%rbx\n" + "push %%rbp\n" + "mov %0, %%rbp\n" // %rbp = args + "xor %%rbx, %%rbx\n" // initial sequence number + "lea 999f(%%rip), %%r15\n" // continue in same thread + + // Signal handlers are process-wide. This means that for security + // reasons, we cannot allow that the trusted thread ever executes any + // signal handlers. + // We prevent the execution of signal handlers by setting a signal + // mask that blocks all signals. In addition, we make sure that the + // stack pointer is invalid. + // We cannot reset the signal mask until after we have enabled + // Seccomp mode. Our sigprocmask() wrapper would normally do this by + // raising a signal, modifying the signal mask in the kernel-generated + // signal frame, and then calling sigreturn(). This presents a bit of + // a Catch-22, as all signals are masked and we can therefore not + // raise any signal that would allow us to generate the signal stack + // frame. + // Instead, we have to create the signal stack frame prior to entering + // Seccomp mode. This incidentally also helps us to restore the + // signal mask to the same value that it had prior to entering the + // sandbox. + // The signal wrapper for clone() is the second entry point into this + // code (by means of sending an IPC to its trusted thread). It goes + // through the same steps of creating a signal stack frame on the + // newly created thread's stacks prior to cloning. See clone.cc for + // details. 
+ "mov $56+0xF000, %%eax\n" // __NR_clone + 0xF000 + "mov %%rsp, %%rcx\n" + "int $0\n" // push a signal stack frame (see clone.cc) + "mov %%rcx, 0xA0(%%rsp)\n" // pop stack upon call to sigreturn() + "mov %%rsp, %%r9\n" + "mov $2, %%rdi\n" // how = SIG_SETMASK + "pushq $-1\n" + "mov %%rsp, %%rsi\n" // set = full mask + "xor %%rdx, %%rdx\n" // old_set = NULL + "mov $8, %%r10\n" // mask all 64 signals + "mov $14, %%eax\n" // NR_rt_sigprocmask + "syscall\n" + "xor %%rsp, %%rsp\n" // invalidate the stack in all trusted code + "jmp 20f\n" // create trusted thread + + // TODO(markus): Coalesce the read() operations by reading into a bigger + // buffer. + + // Parameters: + // *%fs: secure memory region + // the page following this one contains the scratch space + // %r13: thread's side of threadFd + // %r15: processFdPub + + // Local variables: + // %rbx: sequence number for trusted calls + + // Temporary variables: + // %r8: child stack + // %r9: system call number, child stack + // %rbp: secure memory of previous thread + + // Layout of secure shared memory region (c.f. securemem.h): + // 0x00: pointer to the secure shared memory region (i.e. 
self) + // 0x08: sequence number; must match %rbx + // 0x10: call type; must match %eax, iff %eax == -1 || %eax == -2 + // 0x18: system call number; passed to syscall in %rax + // 0x20: first argument; passed to syscall in %rdi + // 0x28: second argument; passed to syscall in %rsi + // 0x30: third argument; passed to syscall in %rdx + // 0x38: fourth argument; passed to syscall in %r10 + // 0x40: fifth argument; passed to syscall in %r8 + // 0x48: sixth argument; passed to syscall in %r9 + // 0x50: stored return address for clone() system call + // 0x58: stored %rbp value for clone() system call + // 0x60: stored %rbx value for clone() system call + // 0x68: stored %rcx value for clone() system call + // 0x70: stored %rdx value for clone() system call + // 0x78: stored %rsi value for clone() system call + // 0x80: stored %rdi value for clone() system call + // 0x88: stored %r8 value for clone() system call + // 0x90: stored %r9 value for clone() system call + // 0x98: stored %r10 value for clone() system call + // 0xA0: stored %r11 value for clone() system call + // 0xA8: stored %r12 value for clone() system call + // 0xB0: stored %r13 value for clone() system call + // 0xB8: stored %r14 value for clone() system call + // 0xC0: stored %r15 value for clone() system call + // 0xC8: new shared memory for clone() + // 0xD0: processFdPub for talking to trusted process + // 0xD4: cloneFdPub for talking to trusted process + // 0xD8: set to non-zero, if in debugging mode + // 0xDC: most recent SHM id returned by shmget(IPC_PRIVATE) + // 0xE0: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0xE8: thread id (TLS_TID) + // 0xF0: threadFdPub (TLS_THREAD_FD) + // 0x200-0x1000: securely passed verified file name(s) + + // Layout of (untrusted) scratch space: + // 0x00: syscall number; passed in %rax + // 0x04: first argument; passed in %rdi + // 0x0C: second argument; passed in %rsi + // 0x14: third argument; passed in %rdx + // 0x1C: fourth argument; passed in 
%r10 + // 0x24: fifth argument; passed in %r8 + // 0x2C: sixth argument; passed in %r9 + // 0x34: return value + // 0x3C: RDTSCP result (%eax) + // 0x40: RDTSCP result (%edx) + // 0x44: RDTSCP result (%ecx) + // 0x48: last system call (not used on x86-64) + // 0x4C: number of consecutive calls to a time fnc (not used on x86-64) + // 0x50: nesting level of system calls (for debugging purposes only) + // 0x54: signal mask + // 0x5C: in SEGV handler + + // We use the %fs register for accessing the secure read-only page, and + // the untrusted scratch space immediately following it. The segment + // register and the local descriptor table is set up by passing + // appropriate arguments to clone(). + + "0:xor %%rsp, %%rsp\n" + "mov $2, %%ebx\n" // %rbx = initial sequence number + + // Read request from untrusted thread, or from trusted process. In either + // case, the data that we read has to be considered untrusted. + // read(threadFd, &scratch, 4) + "1:xor %%rax, %%rax\n" // NR_read + "mov %%r13, %%rdi\n" // fd = threadFd + "mov %%fs:0x0, %%rsi\n" // secure_mem + "add $0x1000, %%rsi\n" // buf = &scratch + "mov $4, %%edx\n" // len = 4 + "2:syscall\n" + "cmp $-4, %%rax\n" // EINTR + "jz 2b\n" + "cmp %%rdx, %%rax\n" + "jnz 25f\n" // exit process + + // Retrieve system call number. It is crucial that we only dereference + // %fs:0x1000 exactly once. Afterwards, memory becomes untrusted and + // we must use the value that we have read the first time. 
+ "mov 0(%%rsi), %%eax\n" + + // If syscall number is -1, execute an unlocked system call from the + // secure memory area + "cmp $-1, %%eax\n" + "jnz 5f\n" + "3:cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + "cmp %%fs:0x10, %%eax\n" + "jne 25f\n" // exit process + "mov %%fs:0x18, %%rax\n" + "mov %%fs:0x20, %%rdi\n" + "mov %%fs:0x28, %%rsi\n" + "mov %%fs:0x30, %%rdx\n" + "mov %%fs:0x38, %%r10\n" + "mov %%fs:0x40, %%r8\n" + "mov %%fs:0x48, %%r9\n" + "cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + "add $2, %%rbx\n" + + // shmget() gets some special treatment. Whenever we return from this + // system call, we remember the most recently returned SysV shm id. + "cmp $29, %%eax\n" // NR_shmget + "jnz 4f\n" + "syscall\n" + "mov %%rax, %%r8\n" + "mov $56, %%eax\n" // NR_clone + "mov $17, %%edi\n" // flags = SIGCHLD + "mov $1, %%esi\n" // stack = 1 + "syscall\n" + "test %%rax, %%rax\n" + "js 25f\n" // exit process + "mov %%rax, %%rdi\n" + "jnz 8f\n" // wait for child, then return result + "mov %%fs:0x0, %%rdi\n" // start = secure_mem + "mov $4096, %%esi\n" // len = 4096 + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "mov $10, %%eax\n" // NR_mprotect + "syscall\n" + "mov %%r8d, 0xDC(%%rdi)\n" // set most recently returned SysV shm id + "xor %%rdi, %%rdi\n" + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "cmpw $0, %%fs:0xD8\n" // debug mode + "jz 27f\n" + "mov $1, %%eax\n" // NR_write + "mov $2, %%edi\n" // fd = stderr + "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "syscall\n" + "xor %%rdi, %%rdi\n" + #endif + + "jmp 27f\n" // exit program, no message + "4:syscall\n" + "jmp 15f\n" // return result + + // If syscall number is -2, execute locked system call from the + // secure memory area + "5:jg 12f\n" + "cmp $-2, %%eax\n" + "jnz 9f\n" + "cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + "cmp %%eax, %%fs:0x10\n" + "jne 25f\n" // 
exit process + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "cmpw $0, %%fs:0xD8\n" // debug mode + "jz 6f\n" + "mov $1, %%eax\n" // NR_write + "mov $2, %%edi\n" // fd = stderr + "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "syscall\n" + "6:" + #endif + + "mov %%fs:0x18, %%rax\n" + "mov %%fs:0x20, %%rdi\n" + "mov %%fs:0x28, %%rsi\n" + "mov %%fs:0x30, %%rdx\n" + "mov %%fs:0x38, %%r10\n" + "mov %%fs:0x40, %%r8\n" + "mov %%fs:0x48, %%r9\n" + "cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + + // clone() has unusual calling conventions and must be handled specially + "cmp $56, %%rax\n" // NR_clone + "jz 19f\n" + + // exit() terminates trusted thread + "cmp $60, %%eax\n" // NR_exit + "jz 18f\n" + + // Perform requested system call + "syscall\n" + + // Unlock mutex + "7:cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + "add $2, %%rbx\n" + "mov %%rax, %%r8\n" + "mov $56, %%eax\n" // NR_clone + "mov $17, %%rdi\n" // flags = SIGCHLD + "mov $1, %%rsi\n" // stack = 1 + "syscall\n" + "test %%rax, %%rax\n" + "js 25f\n" // exit process + "jz 22f\n" // unlock and exit + "mov %%rax, %%rdi\n" + "8:xor %%rsi, %%rsi\n" + "xor %%rdx, %%rdx\n" + "xor %%r10, %%r10\n" + "mov $61, %%eax\n" // NR_wait4 + "syscall\n" + "cmp $-4, %%eax\n" // EINTR + "jz 8b\n" + "mov %%r8, %%rax\n" + "jmp 15f\n" // return result + + // If syscall number is -3, read the time stamp counter + "9:cmp $-3, %%eax\n" + "jnz 10f\n" + "rdtsc\n" // sets %edx:%eax + "xor %%rcx, %%rcx\n" + "jmp 11f\n" + "10:cmp $-4, %%eax\n" + "jnz 12f\n" + "rdtscp\n" // sets %edx:%eax and %ecx + "11:add $0x3C, %%rsi\n" + "mov %%eax, 0(%%rsi)\n" + "mov %%edx, 4(%%rsi)\n" + "mov %%ecx, 8(%%rsi)\n" + "mov $12, %%edx\n" + "jmp 16f\n" // return result + + // Check in syscallTable whether this system call is unrestricted + "12:mov %%rax, %%r9\n" + #ifndef NDEBUG + "cmpw $0, %%fs:0xD8\n" // debug mode + "jnz 13f\n" 
+ #endif + "cmp playground$maxSyscall(%%rip), %%eax\n" + "ja 25f\n" // exit process + "shl $4, %%rax\n" + "lea playground$syscallTable(%%rip), %%rdi\n" + "add %%rdi, %%rax\n" + "mov 0(%%rax), %%rax\n" + "cmp $1, %%rax\n" + "jne 25f\n" // exit process + + // Default behavior for unrestricted system calls is to just execute + // them. Read the remaining arguments first. + "13:mov %%rsi, %%r8\n" + "xor %%rax, %%rax\n" // NR_read + "mov %%r13, %%rdi\n" // fd = threadFd + "add $4, %%rsi\n" // buf = &scratch + 4 + "mov $48, %%edx\n" // len = 6*sizeof(void *) + "14:syscall\n" + "cmp $-4, %%rax\n" // EINTR + "jz 14b\n" + "cmp %%rdx, %%rax\n" + "jnz 25f\n" // exit process + "mov %%r9, %%rax\n" + "mov 0x04(%%r8), %%rdi\n" + "mov 0x0C(%%r8), %%rsi\n" + "mov 0x14(%%r8), %%rdx\n" + "mov 0x1C(%%r8), %%r10\n" + "mov 0x2C(%%r8), %%r9\n" + "mov 0x24(%%r8), %%r8\n" + "cmp $231, %%rax\n" // NR_exit_group + "jz 27f\n" // exit program, no message + "syscall\n" + + // Return result of system call to sandboxed thread + "15:mov %%fs:0x0, %%rsi\n" // secure_mem + "add $0x1034, %%rsi\n" // buf = &scratch + 52 + "mov %%rax, (%%rsi)\n" + "mov $8, %%edx\n" // len = 8 + "16:mov %%r13, %%rdi\n" // fd = threadFd + "mov $1, %%eax\n" // NR_write + "17:syscall\n" + "cmp %%rdx, %%rax\n" + "jz 1b\n" + "cmp $-4, %%rax\n" // EINTR + "jz 17b\n" + "jmp 25f\n" // exit process + + // NR_exit: + // Exit trusted thread after cleaning up resources + "18:mov %%fs:0x0, %%rsi\n" // secure_mem + "mov 0xF0(%%rsi), %%rdi\n" // fd = threadFdPub + "mov $3, %%eax\n" // NR_close + "syscall\n" + "mov %%rsi, %%rdi\n" // start = secure_mem + "mov $8192, %%esi\n" // length = 8192 + "xor %%rdx, %%rdx\n" // prot = PROT_NONE + "mov $10, %%eax\n" // NR_mprotect + "syscall\n" + "mov %%r13, %%rdi\n" // fd = threadFd + "mov $3, %%eax\n" // NR_close + "syscall\n" + "mov $56, %%eax\n" // NR_clone + "mov $17, %%rdi\n" // flags = SIGCHLD + "mov $1, %%rsi\n" // stack = 1 + "syscall\n" + "mov %%rax, %%rdi\n" + "test %%rax, %%rax\n" + 
"js 27f\n" // exit process + "jne 21f\n" // reap helper, exit thread + "jmp 22f\n" // unlock mutex + + // NR_clone: + // Original trusted thread calls clone() to create new nascent + // thread. This thread is (typically) fully privileged and shares all + // resources with the caller (i.e. the previous trusted thread), + // and by extension it shares all resources with the sandbox'd + // threads. + "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem + "mov %%rsi, %%r15\n" // remember child stack + "mov $1, %%rsi\n" // stack = 1 + "syscall\n" // calls NR_clone + "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values + "jae 7b\n" // unlock mutex, return result + "add $2, %%rbx\n" + "test %%rax, %%rax\n" + "jne 15b\n" // return result + + // In nascent thread, now. + "sub $2, %%rbx\n" + + // We want to maintain an invalid %rsp whenver we access untrusted + // memory. This ensures that even if an attacker can trick us into + // triggering a SIGSEGV, we will never successfully execute a signal + // handler. + // Signal handlers are inherently dangerous, as an attacker could trick + // us into returning to the wrong address by adjusting the signal stack + // right before the handler returns. + // N.B. While POSIX is curiously silent about this, it appears that on + // Linux, alternate signal stacks are a per-thread property. That is + // good. It means that this security mechanism works, even if the + // sandboxed thread manages to set up an alternate signal stack. + // + // TODO(markus): We currently do not support emulating calls to + // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc + // for a discussion on how to fix this, if this ever becomes neccessary. 
+ "mov %%r15, %%r9\n" // %r9 = child_stack + "xor %%r15, %%r15\n" // Request to return from clone() when done + + // Get thread id of nascent thread + "20:mov $186, %%eax\n" // NR_gettid + "syscall\n" + "mov %%rax, %%r14\n" + + // Nascent thread creates socketpair() for sending requests to + // trusted thread. + // We can create the filehandles on the child's stack. Filehandles are + // always treated as untrusted. + // socketpair(AF_UNIX, SOCK_STREAM, 0, fds) + "sub $0x10, %%r9\n" + "mov %%r15, 8(%%r9)\n" // preserve return address on child stack + "mov $53, %%eax\n" // NR_socketpair + "mov $1, %%edi\n" // domain = AF_UNIX + "mov $1, %%esi\n" // type = SOCK_STREAM + "xor %%rdx, %%rdx\n" // protocol = 0 + "mov %%r9, %%r10\n" // sv = child_stack + "syscall\n" + "test %%rax, %%rax\n" + "jz 28f\n" + + // If things went wrong, we don't have an (easy) way of signaling + // the parent. For our purposes, it is sufficient to fail with a + // fatal error. + "jmp 25f\n" // exit process + "21:xor %%rsi, %%rsi\n" + "xor %%rdx, %%rdx\n" + "xor %%r10, %%r10\n" + "mov $61, %%eax\n" // NR_wait4 + "syscall\n" + "cmp $-4, %%eax\n" // EINTR + "jz 21b\n" + "jmp 23f\n" // exit thread (no message) + "22:lea playground$syscall_mutex(%%rip), %%rdi\n" + "mov $4096, %%esi\n" + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "mov $10, %%eax\n" // NR_mprotect + "syscall\n" + "lock; addl $0x80000000, (%%rdi)\n" + "jz 23f\n" // exit thread + "mov $1, %%edx\n" + "mov %%rdx, %%rsi\n" // FUTEX_WAKE + "mov $202, %%eax\n" // NR_futex + "syscall\n" + "23:mov $60, %%eax\n" // NR_exit + "mov $1, %%edi\n" // status = 1 + "24:syscall\n" + "25:mov $1, %%eax\n" // NR_write + "mov $2, %%edi\n" // fd = stderr + "lea 100f(%%rip), %%rsi\n" // "Sandbox violation detected" + "mov $101f-100f, %%edx\n" // len = strlen(msg) + "syscall\n" + "26:mov $1, %%edi\n" + "27:mov $231, %%eax\n" // NR_exit_group + "jmp 24b\n" + + // The first page is mapped read-only for use as securely shared memory + "28:mov 
0xC8(%%rbp), %%r12\n" // %r12 = secure shared memory + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process + "mov $10, %%eax\n" // NR_mprotect + "mov %%r12, %%rdi\n" // addr = secure_mem + "mov $4096, %%esi\n" // len = 4096 + "mov $1, %%edx\n" // prot = PROT_READ + "syscall\n" + + // The second page is used as scratch space by the trusted thread. + // Make it writable. + "mov $10, %%eax\n" // NR_mprotect + "add $4096, %%rdi\n" // addr = secure_mem + 4096 + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "syscall\n" + + // Call clone() to create new trusted thread(). + // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| + // CLONE_SYSVSEM|CLONE_UNTRACED|CLONE_SETTLS, stack, NULL, NULL, + // tls) + "mov 4(%%r9), %%r13d\n" // %r13 = threadFd (on child's stack) + "mov $56, %%eax\n" // NR_clone + "mov $0x8D0F00, %%edi\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR|TLS + "mov $1, %%rsi\n" // stack = 1 + "mov %%r12, %%r8\n" // tls = new_secure_mem + "mov 0xD0(%%rbp), %%r15d\n" // %r15 = processFdPub + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process + "syscall\n" + "test %%rax, %%rax\n" + "js 25b\n" // exit process + "jz 0b\n" // invoke trustedThreadFnc() + + // Copy the caller's signal mask + "mov 0x1054(%%rbp), %%rax\n" + "mov %%rax, 0x1054(%%r12)\n" + + // Done creating trusted thread. We can now get ready to return to caller + "mov %%r9, %%r8\n" // %r8 = child_stack + "mov 0(%%r9), %%r9d\n" // %r9 = threadFdPub + + // Set up thread local storage with information on how to talk to + // trusted thread and trusted process. + "lea 0xE0(%%r12), %%rsi\n" // args = &secure_mem.TLS; + "mov $158, %%eax\n" // NR_arch_prctl + "mov $0x1001, %%edi\n" // option = ARCH_SET_GS + "syscall\n" + "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values + "jae 25b\n" // exit process + + // Check whether this is the initial thread, or a newly created one. + // At startup we run the same code as when we create a new thread. 
At + // the very top of this function, you will find that we push 999(%rip) + // on the stack. That is the signal that we should return on the same + // stack rather than return to where clone was called. + "mov 8(%%r8), %%r15\n" + "add $0x10, %%r8\n" + "test %%r15, %%r15\n" + "jne 29f\n" + + // Returning from clone() into the newly created thread is special. We + // cannot unroll the stack, as we just set up a new stack for this + // thread. We have to explicitly restore CPU registers to the values + // that they had when the program originally called clone(). + // We patch the register values in the signal stack frame so that we + // can ask sigreturn() to restore all registers for us. + "sub $0x8, %%r8\n" + "mov 0x50(%%rbp), %%rax\n" + "mov %%rax, 0x00(%%r8)\n" // return address + "xor %%rax, %%rax\n" + "mov %%rax, 0x98(%%r8)\n" // %rax = 0 + "mov 0x58(%%rbp), %%rax\n" + "mov %%rax, 0x80(%%r8)\n" // %rbp + "mov 0x60(%%rbp), %%rax\n" + "mov %%rax, 0x88(%%r8)\n" // %rbx + "mov 0x68(%%rbp), %%rax\n" + "mov %%rax, 0xA0(%%r8)\n" // %rcx + "mov 0x70(%%rbp), %%rax\n" + "mov %%rax, 0x90(%%r8)\n" // %rdx + "mov 0x78(%%rbp), %%rax\n" + "mov %%rax, 0x78(%%r8)\n" // %rsi + "mov 0x80(%%rbp), %%rax\n" + "mov %%rax, 0x70(%%r8)\n" // %rdi + "mov 0x88(%%rbp), %%rax\n" + "mov %%rax, 0x30(%%r8)\n" // %r8 + "mov 0x90(%%rbp), %%rax\n" + "mov %%rax, 0x38(%%r8)\n" // %r9 + "mov 0x98(%%rbp), %%rax\n" + "mov %%rax, 0x40(%%r8)\n" // %r10 + "mov 0xA0(%%rbp), %%rax\n" + "mov %%rax, 0x48(%%r8)\n" // %r11 + "mov 0xA8(%%rbp), %%rax\n" + "mov %%rax, 0x50(%%r8)\n" // %r12 + "mov 0xB0(%%rbp), %%rax\n" + "mov %%rax, 0x58(%%r8)\n" // %r13 + "mov 0xB8(%%rbp), %%rax\n" + "mov %%rax, 0x60(%%r8)\n" // %r14 + "mov 0xC0(%%rbp), %%rax\n" + "mov %%rax, 0x68(%%r8)\n" // %r15 + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process + + // Nascent thread launches a helper that doesn't share any of our + // resources, except for pages mapped as MAP_SHARED. 
+ // clone(SIGCHLD, stack=1) + "29:mov $56, %%eax\n" // NR_clone + "mov $17, %%rdi\n" // flags = SIGCHLD + "mov $1, %%rsi\n" // stack = 1 + "syscall\n" + "test %%rax, %%rax\n" + "js 25b\n" // exit process + "jne 31f\n" + + // Use sendmsg() to send to the trusted process the file handles for + // communicating with the new trusted thread. We also send the address + // of the secure memory area (for sanity checks) and the thread id. + "mov 0xD4(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub() + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process + + // 0x00 msg: + // 0x00 msg_name ($0) + // 0x08 msg_namelen ($0) + // 0x10 msg_iov (%r8 + 0x44) + // 0x18 msg_iovlen ($1) + // 0x20 msg_control (%r8 + 0x54) + // 0x28 msg_controllen ($0x18) + // 0x30 data: + // 0x30 msg_flags/err ($0) + // 0x34 secure_mem (%r12) + // 0x3C threadId (%r14d) + // 0x40 threadFdPub (%r9d) + // 0x44 iov: + // 0x44 iov_base (%r8 + 0x30) + // 0x4C iov_len ($0x14) + // 0x54 cmsg: + // 0x54 cmsg_len ($0x18) + // 0x5C cmsg_level ($1, SOL_SOCKET) + // 0x60 cmsg_type ($1, SCM_RIGHTS) + // 0x64 threadFdPub (%r9d) + // 0x68 threadFd (%r13d) + // 0x6C + "sub $0x6C, %%r8\n" + "xor %%rdx, %%rdx\n" // flags = 0 + "mov %%rdx, 0x00(%%r8)\n" // msg_name + "mov %%edx, 0x08(%%r8)\n" // msg_namelen + "mov %%edx, 0x30(%%r8)\n" // msg_flags + "mov $1, %%r11d\n" + "mov %%r11, 0x18(%%r8)\n" // msg_iovlen + "mov %%r11d, 0x5C(%%r8)\n" // cmsg_level + "mov %%r11d, 0x60(%%r8)\n" // cmsg_type + "lea 0x30(%%r8), %%r11\n" + "mov %%r11, 0x44(%%r8)\n" // iov_base + "add $0x14, %%r11\n" + "mov %%r11, 0x10(%%r8)\n" // msg_iov + "add $0x10, %%r11\n" + "mov %%r11, 0x20(%%r8)\n" // msg_control + "mov $0x14, %%r11d\n" + "mov %%r11, 0x4C(%%r8)\n" // iov_len + "add $4, %%r11d\n" + "mov %%r11, 0x28(%%r8)\n" // msg_controllen + "mov %%r11, 0x54(%%r8)\n" // cmsg_len + "mov %%r12, 0x34(%%r8)\n" // secure_mem + "mov %%r14d, 0x3C(%%r8)\n" // threadId + "mov %%r9d, 0x40(%%r8)\n" // threadFdPub + "mov %%r9d, 0x64(%%r8)\n" // 
threadFdPub + "mov %%r13d, 0x68(%%r8)\n" // threadFd + "mov $46, %%eax\n" // NR_sendmsg + "mov %%r8, %%rsi\n" // msg + "syscall\n" + + // Release syscall_mutex_. This signals the trusted process that + // it can write into the original thread's secure memory again. + "mov $10, %%eax\n" // NR_mprotect + "lea playground$syscall_mutex(%%rip), %%rdi\n" + "mov $4096, %%esi\n" + "mov $3, %%edx\n" // PROT_READ | PROT_WRITE + "syscall\n" + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process + "lock; addl $0x80000000, (%%rdi)\n" + "jz 30f\n" // exit process (no error message) + "mov $1, %%edx\n" + "mov %%rdx, %%rsi\n" // FUTEX_WAKE + "mov $202, %%eax\n" // NR_futex + "syscall\n" + "30:xor %%rdi, %%rdi\n" + "jmp 27b\n" // exit process (no error message) + + // Reap helper + "31:mov %%rax, %%rdi\n" + "32:lea -4(%%r8), %%rsi\n" + "xor %%rdx, %%rdx\n" + "xor %%r10, %%r10\n" + "mov $61, %%eax\n" // NR_wait4 + "syscall\n" + "cmp $-4, %%eax\n" // EINTR + "jz 32b\n" + "mov -4(%%r8), %%eax\n" + "test %%rax, %%rax\n" + "jnz 26b\n" // exit process (no error message) + + // Release privileges by entering seccomp mode. + "mov $157, %%eax\n" // NR_prctl + "mov $22, %%edi\n" // PR_SET_SECCOMP + "mov $1, %%esi\n" + "syscall\n" + "test %%rax, %%rax\n" + "jnz 25b\n" // exit process + + // We can finally start using the stack. Signal handlers no longer pose + // a threat to us. + "mov %%r8, %%rsp\n" + + // Back in the newly created sandboxed thread, wait for trusted process + // to receive request. It is possible for an attacker to make us + // continue even before the trusted process is done. This is OK. It'll + // result in us putting stale values into the new thread's TLS. But that + // data is considered untrusted anyway. 
+ "push %%rax\n" + "mov $1, %%edx\n" // len = 1 + "mov %%rsp, %%rsi\n" // buf = %rsp + "mov %%r9, %%rdi\n" // fd = threadFdPub + "33:xor %%rax, %%rax\n" // NR_read + "syscall\n" + "cmp $-4, %%rax\n" // EINTR + "jz 33b\n" + "cmp %%rdx, %%rax\n" + "jne 25b\n" // exit process + "pop %%rax\n" + + // Return to caller. We are in the new thread, now. + "test %%r15, %%r15\n" + "jnz 34f\n" // Returning to createTrustedThread() + + // Returning to the place where clone() had been called. We rely on + // using rt_sigreturn() for restoring our registers. The caller already + // created a signal stack frame, and we patched the register values + // with the ones that were in effect prior to calling sandbox_clone(). + "pop %%r15\n" + "34:mov %%r15, 0xA8(%%rsp)\n" // compute new %rip + "mov $15, %%eax\n" // NR_rt_sigreturn + "syscall\n" + + ".pushsection \".rodata\"\n" + "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" + "101:.ascii \"WARNING! This is an expensive system call\\n\"\n" + "102:\n" + ".popsection\n" + + "999:pop %%rbp\n" + "pop %%rbx\n" + : + : "g"(&args) + : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15", "rsp", "memory" +#elif defined(__i386__) + struct user_desc u; + u.entry_number = (typeof u.entry_number)-1; + u.base_addr = 0; + u.limit = 0xfffff; + u.seg_32bit = 1; + u.contents = 0; + u.read_exec_only = 0; + u.limit_in_pages = 1; + u.seg_not_present = 0; + u.useable = 1; + SysCalls sys; + if (sys.set_thread_area(&u) < 0) { + die("Cannot set up thread local storage"); + } + asm volatile("movw %w0, %%fs" + : + : "q"(8*u.entry_number+3)); + asm volatile( + "push %%ebx\n" + "push %%ebp\n" + + // Signal handlers are process-wide. This means that for security + // reasons, we cannot allow that the trusted thread ever executes any + // signal handlers. + // We prevent the execution of signal handlers by setting a signal + // mask that blocks all signals. 
In addition, we make sure that the + // stack pointer is invalid. + // We cannot reset the signal mask until after we have enabled + // Seccomp mode. Our sigprocmask() wrapper would normally do this by + // raising a signal, modifying the signal mask in the kernel-generated + // signal frame, and then calling sigreturn(). This presents a bit of + // a Catch-22, as all signals are masked and we can therefore not + // raise any signal that would allow us to generate the signal stack + // frame. + // Instead, we have to create the signal stack frame prior to entering + // Seccomp mode. This incidentally also helps us to restore the + // signal mask to the same value that it had prior to entering the + // sandbox. + // The signal wrapper for clone() is the second entry point into this + // code (by means of sending an IPC to its trusted thread). It goes + // through the same steps of creating a signal stack frame on the + // newly created thread's stacks prior to cloning. See clone.cc for + // details. + "mov %0, %%edi\n" // create signal stack before accessing MMX + "mov $120+0xF000, %%eax\n" // __NR_clone + 0xF000 + "mov %%esp, %%ebp\n" + "int $0\n" // push a signal stack frame (see clone.cc) + "mov %%ebp, 0x1C(%%esp)\n" // pop stack upon call to sigreturn() + "mov %%esp, %%ebp\n" + "mov $2, %%ebx\n" // how = SIG_SETMASK + "pushl $-1\n" + "pushl $-1\n" + "mov %%esp, %%ecx\n" // set = full mask + "xor %%edx, %%edx\n" // old_set = NULL + "mov $8, %%esi\n" // mask all 64 signals + "mov $175, %%eax\n" // NR_rt_sigprocmask + "int $0x80\n" + "mov $126, %%eax\n" // NR_sigprocmask + "int $0x80\n" + "xor %%esp, %%esp\n" // invalidate the stack in all trusted code + "movd %%edi, %%mm6\n" // %mm6 = args + "lea 999f, %%edi\n" // continue in same thread + "movd %%edi, %%mm3\n" + "xor %%edi, %%edi\n" // initial sequence number + "movd %%edi, %%mm2\n" + "jmp 20f\n" // create trusted thread + + // TODO(markus): Coalesce the read() operations by reading into a bigger + // buffer. 
+ + // Parameters: + // %mm0: thread's side of threadFd + // %mm1: processFdPub + // %mm3: return address after creation of new trusted thread + // %mm5: secure memory region + // the page following this one contains the scratch space + + // Local variables: + // %mm2: sequence number for trusted calls + // %mm4: thread id + + // Temporary variables: + // %ebp: system call number + // %mm6: secure memory of previous thread + // %mm7: temporary variable for spilling data + + // Layout of secure shared memory region (c.f. securemem.h): + // 0x00: pointer to the secure shared memory region (i.e. self) + // 0x04: sequence number; must match %mm2 + // 0x08: call type; must match %eax, iff %eax == -1 || %eax == -2 + // 0x0C: system call number; passed to syscall in %eax + // 0x10: first argument; passed to syscall in %ebx + // 0x14: second argument; passed to syscall in %ecx + // 0x18: third argument; passed to syscall in %edx + // 0x1C: fourth argument; passed to syscall in %esi + // 0x20: fifth argument; passed to syscall in %edi + // 0x24: sixth argument; passed to syscall in %ebp + // 0x28: stored return address for clone() system call + // 0x2C: stored %ebp value for clone() system call + // 0x30: stored %edi value for clone() system call + // 0x34: stored %esi value for clone() system call + // 0x38: stored %edx value for clone() system call + // 0x3C: stored %ecx value for clone() system call + // 0x40: stored %ebx value for clone() system call + // 0x44: new shared memory for clone() + // 0x48: processFdPub for talking to trusted process + // 0x4C: cloneFdPub for talking to trusted process + // 0x50: set to non-zero, if in debugging mode + // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE) + // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0x60: thread id (TLS_TID) + // 0x68: threadFdPub (TLS_THREAD_FD) + // 0x200-0x1000: securely passed verified file name(s) + + // Layout of (untrusted) scratch space: + // 0x00: syscall number; 
passed in %eax + // 0x04: first argument; passed in %ebx + // 0x08: second argument; passed in %ecx + // 0x0C: third argument; passed in %edx + // 0x10: fourth argument; passed in %esi + // 0x14: fifth argument; passed in %edi + // 0x18: sixth argument; passed in %ebp + // 0x1C: return value + // 0x20: RDTSCP result (%eax) + // 0x24: RDTSCP result (%edx) + // 0x28: RDTSCP result (%ecx) + // 0x2C: last system call (updated in syscall.cc) + // 0x30: number of consecutive calls to a time fnc. (e.g. gettimeofday) + // 0x34: nesting level of system calls (for debugging purposes only) + // 0x38: signal mask + // 0x40: in SEGV handler + + "0:xor %%esp, %%esp\n" + "mov $2, %%eax\n" // %mm2 = initial sequence number + "movd %%eax, %%mm2\n" + + // Read request from untrusted thread, or from trusted process. In either + // case, the data that we read has to be considered untrusted. + // read(threadFd, &scratch, 4) + "1:mov $3, %%eax\n" // NR_read + "movd %%mm0, %%ebx\n" // fd = threadFd + "movd %%mm5, %%ecx\n" // secure_mem + "add $0x1000, %%ecx\n" // buf = &scratch + "mov $4, %%edx\n" // len = 4 + "2:int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 2b\n" + "cmp %%edx, %%eax\n" + "jnz 25f\n" // exit process + + // Retrieve system call number. It is crucial that we only dereference + // 0x1000(%mm5) exactly once. Afterwards, memory becomes untrusted and + // we must use the value that we have read the first time. 
+ "mov 0(%%ecx), %%eax\n" + + // If syscall number is -1, execute an unlocked system call from the + // secure memory area + "cmp $-1, %%eax\n" + "jnz 5f\n" + "3:movd %%mm2, %%ebp\n" + "cmp %%ebp, 0x4-0x1000(%%ecx)\n" + "jne 25f\n" // exit process + "cmp 0x08-0x1000(%%ecx), %%eax\n" + "jne 25f\n" // exit process + "mov 0x0C-0x1000(%%ecx), %%eax\n" + "mov 0x10-0x1000(%%ecx), %%ebx\n" + "mov 0x18-0x1000(%%ecx), %%edx\n" + "mov 0x1C-0x1000(%%ecx), %%esi\n" + "mov 0x20-0x1000(%%ecx), %%edi\n" + "mov 0x24-0x1000(%%ecx), %%ebp\n" + "mov 0x14-0x1000(%%ecx), %%ecx\n" + "movd %%edi, %%mm4\n" + "movd %%ebp, %%mm7\n" + "movd %%mm2, %%ebp\n" + "movd %%mm5, %%edi\n" + "cmp %%ebp, 4(%%edi)\n" + "jne 25f\n" // exit process + "add $2, %%ebp\n" + "movd %%ebp, %%mm2\n" + "movd %%mm4, %%edi\n" + "movd %%mm7, %%ebp\n" + + // shmget() gets some special treatment. Whenever we return from this + // system call, we remember the most recently returned SysV shm id. + "cmp $117, %%eax\n" // NR_ipc + "jnz 4f\n" + "cmp $23, %%ebx\n" // shmget() + "jnz 4f\n" + "int $0x80\n" + "mov %%eax, %%ebp\n" + "mov $120, %%eax\n" // NR_clone + "mov $17, %%ebx\n" // flags = SIGCHLD + "mov $1, %%ecx\n" // stack = 1 + "int $0x80\n" + "test %%eax, %%eax\n" + "js 25f\n" // exit process + "mov %%eax, %%ebx\n" + "jnz 8f\n" // wait for child, then return result + "movd %%mm5, %%ebx\n" // start = secure_mem + "mov $4096, %%ecx\n" // len = 4096 + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "mov $125, %%eax\n" // NR_mprotect + "int $0x80\n" + "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id + "xor %%ebx, %%ebx\n" + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "movd %%mm5, %%ecx\n" + "cmpw $0, 0x50(%%ecx)\n" // debug mode + "jz 27f\n" + "mov $4, %%eax\n" // NR_write + "mov $2, %%ebx\n" // fd = stderr + "lea 101f, %%ecx\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "int $0x80\n" + "xor %%ebx, 
%%ebx\n" + #endif + + "jmp 27f\n" // exit program, no message + "4:int $0x80\n" + "jmp 15f\n" // return result + + // If syscall number is -2, execute locked system call from the + // secure memory area + "5:jg 12f\n" + "cmp $-2, %%eax\n" + "jnz 9f\n" + "movd %%mm2, %%ebp\n" + "cmp %%ebp, 0x4-0x1000(%%ecx)\n" + "jne 25f\n" // exit process + "cmp %%eax, 0x8-0x1000(%%ecx)\n" + "jne 25f\n" // exit process + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "cmpw $0, 0x50-0x1000(%%ecx)\n" + "jz 6f\n" // debug mode + "mov %%ecx, %%ebp\n" + "mov $4, %%eax\n" // NR_write + "mov $2, %%ebx\n" // fd = stderr + "lea 101f, %%ecx\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "int $0x80\n" + "mov %%ebp, %%ecx\n" + "6:" + #endif + + "mov 0x0C-0x1000(%%ecx), %%eax\n" + "mov 0x10-0x1000(%%ecx), %%ebx\n" + "mov 0x18-0x1000(%%ecx), %%edx\n" + "mov 0x1C-0x1000(%%ecx), %%esi\n" + "mov 0x20-0x1000(%%ecx), %%edi\n" + "mov 0x24-0x1000(%%ecx), %%ebp\n" + "mov 0x14-0x1000(%%ecx), %%ecx\n" + "movd %%edi, %%mm4\n" + "movd %%ebp, %%mm7\n" + "movd %%mm2, %%ebp\n" + "movd %%mm5, %%edi\n" + "cmp %%ebp, 4(%%edi)\n" + "jne 25f\n" // exit process + + // clone() has unusual calling conventions and must be handled specially + "cmp $120, %%eax\n" // NR_clone + "jz 19f\n" + + // exit() terminates trusted thread + "cmp $1, %%eax\n" // NR_exit + "jz 18f\n" + + // Perform requested system call + "movd %%mm4, %%edi\n" + "movd %%mm7, %%ebp\n" + "int $0x80\n" + + // Unlock mutex + "7:movd %%mm2, %%ebp\n" + "movd %%mm5, %%edi\n" + "cmp %%ebp, 4(%%edi)\n" + "jne 25f\n" // exit process + "add $2, %%ebp\n" + "movd %%ebp, %%mm2\n" + "mov %%eax, %%ebp\n" + "mov $120, %%eax\n" // NR_clone + "mov $17, %%ebx\n" // flags = SIGCHLD + "mov $1, %%ecx\n" // stack = 1 + "int $0x80\n" + "test %%eax, %%eax\n" + "js 25f\n" // exit process + "jz 22f\n" // unlock and exit + "mov %%eax, %%ebx\n" + "8:xor %%ecx, %%ecx\n" + "xor %%edx, 
%%edx\n" + "mov $7, %%eax\n" // NR_waitpid + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 8b\n" + "mov %%ebp, %%eax\n" + "jmp 15f\n" // return result + + // If syscall number is -3, read the time stamp counter + "9:cmp $-3, %%eax\n" + "jnz 10f\n" + "rdtsc\n" // sets %edx:%eax + "xor %%ecx, %%ecx\n" + "jmp 11f\n" + "10:cmp $-4, %%eax\n" + "jnz 12f\n" + "rdtscp\n" // sets %edx:%eax and %ecx + "11:movd %%mm5, %%ebx\n" + "add $0x1020, %%ebx\n" + "mov %%eax, 0(%%ebx)\n" + "mov %%edx, 4(%%ebx)\n" + "mov %%ecx, 8(%%ebx)\n" + "mov %%ebx, %%ecx\n" + "mov $12, %%edx\n" + "jmp 16f\n" // return result + + // Check in syscallTable whether this system call is unrestricted + "12:mov %%eax, %%ebp\n" + #ifndef NDEBUG + "cmpw $0, 0x50-0x1000(%%ecx)\n" + "jnz 13f\n" // debug mode + #endif + "cmp playground$maxSyscall, %%eax\n" + "ja 25f\n" // exit process + "shl $3, %%eax\n" + "add $playground$syscallTable, %%eax\n" + "mov 0(%%eax), %%eax\n" + "cmp $1, %%eax\n" + "jne 25f\n" // exit process + + // Default behavior for unrestricted system calls is to just execute + // them. Read the remaining arguments first. 
+ "13:mov $3, %%eax\n" // NR_read + "movd %%mm0, %%ebx\n" // fd = threadFd + "add $4, %%ecx\n" // buf = &scratch + 4 + "mov $24, %%edx\n" // len = 6*sizeof(void *) + "14:int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 14b\n" + "cmp %%edx, %%eax\n" + "jnz 25f\n" // exit process + "mov %%ebp, %%eax\n" + "mov 0x00(%%ecx), %%ebx\n" + "mov 0x08(%%ecx), %%edx\n" + "mov 0x0C(%%ecx), %%esi\n" + "mov 0x10(%%ecx), %%edi\n" + "mov 0x14(%%ecx), %%ebp\n" + "mov 0x04(%%ecx), %%ecx\n" + "cmp $252, %%eax\n" // NR_exit_group + "jz 27f\n" // exit program, no message + "int $0x80\n" + + // Return result of system call to sandboxed thread + "15:movd %%mm5, %%ecx\n" // secure_mem + "add $0x101C, %%ecx\n" // buf = &scratch + 28 + "mov %%eax, (%%ecx)\n" + "mov $4, %%edx\n" // len = 4 + "16:movd %%mm0, %%ebx\n" // fd = threadFd + "mov $4, %%eax\n" // NR_write + "17:int $0x80\n" + "cmp %%edx, %%eax\n" + "jz 1b\n" + "cmp $-4, %%eax\n" // EINTR + "jz 17b\n" + "jmp 25f\n" // exit process + + // NR_exit: + // Exit trusted thread after cleaning up resources + "18:mov %%edi, %%ecx\n" // secure_mem + "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub + "mov $6, %%eax\n" // NR_close + "int $0x80\n" + "mov %%ecx, %%ebx\n" // start = secure_mem + "mov $8192, %%ecx\n" // length = 8192 + "xor %%edx, %%edx\n" // prot = PROT_NONE + "mov $125, %%eax\n" // NR_mprotect + "int $0x80\n" + "movd %%mm0, %%ebx\n" // fd = threadFd + "mov $6, %%eax\n" // NR_close + "int $0x80\n" + "mov $120, %%eax\n" // NR_clone + "mov $17, %%ebx\n" // flags = SIGCHLD + "mov $1, %%ecx\n" // stack = 1 + "int $0x80\n" + "mov %%eax, %%ebx\n" + "test %%eax, %%eax\n" + "js 25f\n" // exit process + "jne 21f\n" // reap helper, exit thread + "jmp 22f\n" // unlock mutex + + // NR_clone: + // Original trusted thread calls clone() to create new nascent + // thread. This thread is (typically) fully privileged and shares all + // resources with the caller (i.e. 
the previous trusted thread), + // and by extension it shares all resources with the sandbox'd + // threads. + "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem + "movd %%mm4, %%edi\n" // child_tidptr + "mov %%ecx, %%ebp\n" // remember child stack + "mov $1, %%ecx\n" // stack = 1 + "int $0x80\n" // calls NR_clone + "cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values + "jae 7b\n" // unlock mutex, return result + "movd %%mm2, %%edi\n" + "add $2, %%edi\n" + "movd %%edi, %%mm2\n" + "test %%eax, %%eax\n" + "jne 15b\n" // return result + + // In nascent thread, now. + "sub $2, %%edi\n" + "movd %%edi, %%mm2\n" + + // We want to maintain an invalid %esp whenever we access untrusted + // memory. This ensures that even if an attacker can trick us into + // triggering a SIGSEGV, we will never successfully execute a signal + // handler. + // Signal handlers are inherently dangerous, as an attacker could trick + // us into returning to the wrong address by adjusting the signal stack + // right before the handler returns. + // N.B. While POSIX is curiously silent about this, it appears that on + // Linux, alternate signal stacks are a per-thread property. That is + // good. It means that this security mechanism works, even if the + // sandboxed thread manages to set up an alternate signal stack. + // + // TODO(markus): We currently do not support emulating calls to + // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc + // for a discussion on how to fix this, if this ever becomes necessary. + "movd %%eax, %%mm3\n" // Request to return from clone() when done + + // Get thread id of nascent thread + "20:mov $224, %%eax\n" // NR_gettid + "int $0x80\n" + "movd %%eax, %%mm4\n" + + // Nascent thread creates socketpair() for sending requests to + // trusted thread. + // We can create the filehandles on the child's stack. Filehandles are + // always treated as untrusted. 
+ // socketpair(AF_UNIX, SOCK_STREAM, 0, fds) + "mov $102, %%eax\n" // NR_socketcall + "mov $8, %%ebx\n" // socketpair + "sub $8, %%ebp\n" // sv = child_stack + "mov %%ebp, -0x04(%%ebp)\n" + "movl $0, -0x08(%%ebp)\n" // protocol = 0 + "movl $1, -0x0C(%%ebp)\n" // type = SOCK_STREAM + "movl $1, -0x10(%%ebp)\n" // domain = AF_UNIX + "lea -0x10(%%ebp), %%ecx\n" + "int $0x80\n" + "test %%eax, %%eax\n" + "jz 28f\n" + + // If things went wrong, we don't have an (easy) way of signaling + // the parent. For our purposes, it is sufficient to fail with a + // fatal error. + "jmp 25f\n" // exit process + "21:xor %%ecx, %%ecx\n" + "xor %%edx, %%edx\n" + "mov $7, %%eax\n" // NR_waitpid + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 21b\n" + "jmp 23f\n" // exit thread (no message) + "22:lea playground$syscall_mutex, %%ebx\n" + "mov $4096, %%ecx\n" + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "mov $125, %%eax\n" // NR_mprotect + "int $0x80\n" + "lock; addl $0x80000000, (%%ebx)\n" + "jz 23f\n" // exit thread + "mov $1, %%edx\n" + "mov %%edx, %%ecx\n" // FUTEX_WAKE + "mov $240, %%eax\n" // NR_futex + "int $0x80\n" + "23:mov $1, %%eax\n" // NR_exit + "mov $1, %%ebx\n" // status = 1 + "24:int $0x80\n" + "25:mov $4, %%eax\n" // NR_write + "mov $2, %%ebx\n" // fd = stderr + "lea 100f, %%ecx\n" // "Sandbox violation detected" + "mov $101f-100f, %%edx\n" // len = strlen(msg) + "int $0x80\n" + "26:mov $1, %%ebx\n" + "27:mov $252, %%eax\n" // NR_exit_group + "jmp 24b\n" + + // The first page is mapped read-only for use as securely shared memory + "28:movd %%mm6, %%edi\n" // %edi = old_shared_mem + "mov 0x44(%%edi), %%ebx\n" // addr = secure_mem + "movd %%ebx, %%mm5\n" // %mm5 = secure_mem + "movd %%mm2, %%esi\n" + "cmp %%esi, 4(%%edi)\n" + "jne 25b\n" // exit process + "mov $125, %%eax\n" // NR_mprotect + "mov $4096, %%ecx\n" // len = 4096 + "mov $1, %%edx\n" // prot = PROT_READ + "int $0x80\n" + + // The second page is used as scratch space by the trusted thread. 
+ // Make it writable. + "mov $125, %%eax\n" // NR_mprotect + "add $4096, %%ebx\n" // addr = secure_mem + 4096 + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "int $0x80\n" + + // Call clone() to create new trusted thread(). + // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| + // CLONE_SYSVSEM|CLONE_UNTRACED, stack, NULL, NULL, NULL) + "mov 4(%%ebp), %%eax\n" // threadFd (on child's stack) + "movd %%eax, %%mm0\n" // %mm0 = threadFd + "mov $120, %%eax\n" // NR_clone + "mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR + "mov $1, %%ecx\n" // stack = 1 + "movd 0x48(%%edi), %%mm1\n" // %mm1 = processFdPub + "cmp %%esi, 4(%%edi)\n" + "jne 25b\n" // exit process + "int $0x80\n" + "test %%eax, %%eax\n" + "js 25b\n" // exit process + "jz 0b\n" // invoke trustedThreadFnc() + + // Set up thread local storage + "mov $0x51, %%eax\n" // seg_32bit, limit_in_pages, useable + "mov %%eax, -0x04(%%ebp)\n" + "mov $0xFFFFF, %%eax\n" // limit + "mov %%eax, -0x08(%%ebp)\n" + "movd %%mm5, %%eax\n" + "add $0x58, %%eax\n" + "mov %%eax, -0x0C(%%ebp)\n" // base_addr = &secure_mem.TLS + "mov %%fs, %%eax\n" + "shr $3, %%eax\n" + "mov %%eax, -0x10(%%ebp)\n" // entry_number + "mov $243, %%eax\n" // NR_set_thread_area + "lea -0x10(%%ebp), %%ebx\n" + "int $0x80\n" + "test %%eax, %%eax\n" + "jnz 25b\n" // exit process + + // Copy the caller's signal mask + "movd %%mm5, %%edx\n" + "mov 0x1038(%%edi), %%eax\n" + "mov %%eax, 0x1038(%%edx)\n" + "mov 0x103C(%%edi), %%eax\n" + "mov %%eax, 0x103C(%%edx)\n" + + // Done creating trusted thread. We can now get ready to return to caller + "mov 0(%%ebp), %%esi\n" // %esi = threadFdPub + "add $8, %%ebp\n" + + // Check whether this is the initial thread, or a newly created one. + // At startup we run the same code as when we create a new thread. At + // the very top of this function, you will find that we store 999f + // in %%mm3. 
That is the signal that we should return on the same + // stack rather than return to where clone was called. + "movd %%mm3, %%eax\n" + "movd %%mm2, %%edx\n" + "test %%eax, %%eax\n" + "jne 29f\n" + + // Returning from clone() into the newly created thread is special. We + // cannot unroll the stack, as we just set up a new stack for this + // thread. We have to explicitly restore CPU registers to the values + // that they had when the program originally called clone(). + // We patch the register values in the signal stack frame so that we + // can ask sigreturn() to restore all registers for us. + "sub $0x4, %%ebp\n" + "mov 0x28(%%edi), %%eax\n" + "mov %%eax, 0x00(%%ebp)\n" // return address + "xor %%eax, %%eax\n" + "mov %%eax, 0x30(%%ebp)\n" // %eax = 0 + "mov 0x2C(%%edi), %%eax\n" + "mov %%eax, 0x1C(%%ebp)\n" // %ebp + "mov 0x30(%%edi), %%eax\n" + "mov %%eax, 0x14(%%ebp)\n" // %edi + "mov 0x34(%%edi), %%eax\n" + "mov %%eax, 0x18(%%ebp)\n" // %esi + "mov 0x38(%%edi), %%eax\n" + "mov %%eax, 0x28(%%ebp)\n" // %edx + "mov 0x3C(%%edi), %%eax\n" + "mov %%eax, 0x2C(%%ebp)\n" // %ecx + "mov 0x40(%%edi), %%eax\n" + "mov %%eax, 0x24(%%ebp)\n" // %ebx + "cmp %%edx, 4(%%edi)\n" + "jne 25b\n" // exit process + + // Nascent thread launches a helper that doesn't share any of our + // resources, except for pages mapped as MAP_SHARED. + // clone(SIGCHLD, stack=1) + "29:mov $120, %%eax\n" // NR_clone + "mov $17, %%ebx\n" // flags = SIGCHLD + "mov $1, %%ecx\n" // stack = 1 + "int $0x80\n" + "test %%eax, %%eax\n" + "js 25b\n" // exit process + "jne 31f\n" + + // Use sendmsg() to send to the trusted process the file handles for + // communicating with the new trusted thread. We also send the address + // of the secure memory area (for sanity checks) and the thread id. 
+ "cmp %%edx, 4(%%edi)\n" + "jne 25b\n" // exit process + + // 0x00 socketcall: + // 0x00 socket (0x4C(%edi)) + // 0x04 msg (%ecx + 0x0C) + // 0x08 flags ($0) + // 0x0C msg: + // 0x0C msg_name ($0) + // 0x10 msg_namelen ($0) + // 0x14 msg_iov (%ecx + 0x34) + // 0x18 msg_iovlen ($1) + // 0x1C msg_control (%ecx + 0x3C) + // 0x20 msg_controllen ($0x14) + // 0x24 data: + // 0x24 msg_flags/err ($0) + // 0x28 secure_mem (%mm5) + // 0x2C threadId (%mm4) + // 0x30 threadFdPub (%esi) + // 0x34 iov: + // 0x34 iov_base (%ecx + 0x24) + // 0x38 iov_len ($0x10) + // 0x3C cmsg: + // 0x3C cmsg_len ($0x14) + // 0x40 cmsg_level ($1, SOL_SOCKET) + // 0x44 cmsg_type ($1, SCM_RIGHTS) + // 0x48 threadFdPub (%esi) + // 0x4C threadFd (%mm0) + // 0x50 + "lea -0x50(%%ebp), %%ecx\n" + "xor %%eax, %%eax\n" + "mov %%eax, 0x08(%%ecx)\n" // flags + "mov %%eax, 0x0C(%%ecx)\n" // msg_name + "mov %%eax, 0x10(%%ecx)\n" // msg_namelen + "mov %%eax, 0x24(%%ecx)\n" // msg_flags + "inc %%eax\n" + "mov %%eax, 0x18(%%ecx)\n" // msg_iovlen + "mov %%eax, 0x40(%%ecx)\n" // cmsg_level + "mov %%eax, 0x44(%%ecx)\n" // cmsg_type + "movl $0x10, 0x38(%%ecx)\n" // iov_len + "mov $0x14, %%eax\n" + "mov %%eax, 0x20(%%ecx)\n" // msg_controllen + "mov %%eax, 0x3C(%%ecx)\n" // cmsg_len + "mov 0x4C(%%edi), %%eax\n" // cloneFdPub + "mov %%eax, 0x00(%%ecx)\n" // socket + "lea 0x0C(%%ecx), %%eax\n" + "mov %%eax, 0x04(%%ecx)\n" // msg + "add $0x18, %%eax\n" + "mov %%eax, 0x34(%%ecx)\n" // iov_base + "add $0x10, %%eax\n" + "mov %%eax, 0x14(%%ecx)\n" // msg_iov + "add $8, %%eax\n" + "mov %%eax, 0x1C(%%ecx)\n" // msg_control + "mov %%esi, 0x30(%%ecx)\n" // threadFdPub + "mov %%esi, 0x48(%%ecx)\n" // threadFdPub + "movd %%mm5, %%eax\n" + "mov %%eax, 0x28(%%ecx)\n" // secure_mem + "movd %%mm4, %%eax\n" + "mov %%eax, 0x2C(%%ecx)\n" // threadId + "movd %%mm0, %%eax\n" + "mov %%eax, 0x4C(%%ecx)\n" // threadFd + "mov $16, %%ebx\n" // sendmsg() + "mov $102, %%eax\n" // NR_socketcall + "int $0x80\n" + + // Release syscall_mutex_. 
This signals the trusted process that + // it can write into the original thread's secure memory again. + "mov $125, %%eax\n" // NR_mprotect + "lea playground$syscall_mutex, %%ebx\n" + "mov $4096, %%ecx\n" + "mov $3, %%edx\n" // PROT_READ | PROT_WRITE + "int $0x80\n" + "movd %%mm2, %%edx\n" + "cmp %%edx, 0x4(%%edi)\n" + "jnz 25b\n" // exit process + "lock; addl $0x80000000, (%%ebx)\n" + "jz 30f\n" // exit process (no error message) + "mov $1, %%edx\n" + "mov %%edx, %%ecx\n" // FUTEX_WAKE + "mov $240, %%eax\n" // NR_futex + "int $0x80\n" + "30:xor %%ebx, %%ebx\n" + "jmp 27b\n" // exit process (no error message) + + // Reap helper + "31:mov %%eax, %%ebx\n" + "32:lea -4(%%ebp), %%ecx\n" + "xor %%edx, %%edx\n" + "mov $7, %%eax\n" // NR_waitpid + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 32b\n" + "mov -4(%%ebp), %%eax\n" + "test %%eax, %%eax\n" + "jnz 26b\n" // exit process (no error message) + + // Release privileges by entering seccomp mode. + "33:mov $172, %%eax\n" // NR_prctl + "mov $22, %%ebx\n" // PR_SET_SECCOMP + "mov $1, %%ecx\n" + "int $0x80\n" + "test %%eax, %%eax\n" + "jnz 25b\n" // exit process + + // We can finally start using the stack. Signal handlers no longer pose + // a threat to us. + "mov %%ebp, %%esp\n" + + // Back in the newly created sandboxed thread, wait for trusted process + // to receive request. It is possible for an attacker to make us + // continue even before the trusted process is done. This is OK. It'll + // result in us putting stale values into the new thread's TLS. But that + // data is considered untrusted anyway. + "push %%eax\n" + "mov $1, %%edx\n" // len = 1 + "mov %%esp, %%ecx\n" // buf = %esp + "mov %%esi, %%ebx\n" // fd = threadFdPub + "34:mov $3, %%eax\n" // NR_read + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 34b\n" + "cmp %%edx, %%eax\n" + "jne 25b\n" // exit process + "pop %%eax\n" + + // Return to caller. We are in the new thread, now. 
+ "movd %%mm3, %%ebx\n" + "test %%ebx, %%ebx\n" + "jnz 35f\n" // Returning to createTrustedThread() + + // Returning to the place where clone() had been called. We rely on + // using sigreturn() for restoring our registers. The caller already + // created a signal stack frame, and we patched the register values + // with the ones that were in effect prior to calling sandbox_clone(). + "pop %%ebx\n" + "35:mov %%ebx, 0x38(%%esp)\n" // compute new %eip + "mov $119, %%eax\n" // NR_sigreturn + "int $0x80\n" + + ".pushsection \".rodata\"\n" + "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" + "101:.ascii \"WARNING! This is an expensive system call\\n\"\n" + "102:\n" + ".popsection\n" + + "999:pop %%ebp\n" + "pop %%ebx\n" + : + : "g"(&args) + : "eax", "ecx", "edx", "edi", "esi", "esp", "memory" +#else +#error Unsupported target platform +#endif +); +} + +} // namespace diff --git a/sandbox/linux/seccomp/x86_decode.cc b/sandbox/linux/seccomp/x86_decode.cc new file mode 100644 index 0000000..1b55139 --- /dev/null +++ b/sandbox/linux/seccomp/x86_decode.cc @@ -0,0 +1,310 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
// ---------------------------------------------------------------------------
// sandbox/linux/seccomp/x86_decode.h
// ---------------------------------------------------------------------------
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef X86_DECODE_H__
#define X86_DECODE_H__
namespace playground {
// Bit masks for the low nibble of a 64 bit REX prefix byte (0x40 - 0x4F).
enum {
  REX_B = 0x01,
  REX_X = 0x02,
  REX_R = 0x04,
  REX_W = 0x08
};

// Decodes the length of the single x86 instruction at *ip and advances *ip
// past it. See the definition for a description of the parameters.
unsigned short next_inst(const char **ip, bool is64bit, bool *has_prefix = 0,
                         char **rex_ptr = 0, char **mod_rm_ptr = 0,
                         char **sib_ptr = 0, bool *is_group = 0);
} // namespace
#endif // X86_DECODE_H__

// ---------------------------------------------------------------------------
// sandbox/linux/seccomp/x86_decode.cc (continued)
// ---------------------------------------------------------------------------

// The guard makes the include a no-op when the declarations above are already
// visible, and pulls in the header when this part is built as its own file.
#ifndef X86_DECODE_H__
#include "x86_decode.h"
#endif

namespace playground {

#if defined(__x86_64__) || defined(__i386__)
// The decoder only reads the instruction stream, but its callers receive
// non-const "char *" pointers to individual instruction bytes. Keep the
// const-removing cast in one place.
static inline char *mutable_insn_ptr(const unsigned char *ptr) {
  return const_cast<char *>(reinterpret_cast<const char *>(ptr));
}

// Decodes one instruction starting at *ip and advances *ip to the first byte
// following it.
//
//   ip          in/out: position in the instruction stream.
//   is64bit     decode as 64 bit code (REX prefixes, 8 byte addresses) when
//               true; otherwise 0x40 - 0x4F are treated as ordinary opcodes.
//   has_prefix  optional out: true if any prefix byte preceded the opcode.
//   rex_ptr     optional out: address of the last REX prefix byte seen, or
//               0 if none was seen.
//   mod_rm_ptr  optional out: address of the ModR/M byte, or 0 if none.
//   sib_ptr     optional out: address of the SIB byte, or 0 if none.
//   is_group    optional out: true if the opcode is one whose meaning is
//               selected by the "reg" field of the ModR/M byte.
//
// Returns the opcode; two-byte opcodes are returned as 0x0Fxx. Opcodes that
// the tables do not describe (type 0) are returned with *ip pointing just
// past the opcode byte(s).
unsigned short next_inst(const char **ip, bool is64bit, bool *has_prefix,
                         char **rex_ptr, char **mod_rm_ptr, char **sib_ptr,
                         bool *is_group) {
  // Flag bits used by the entries of the two tables below. If GROUP is set,
  // the remaining bits (GROUP_MASK) are an offset into group_table.
  enum {
    BYTE_OP     = (1<<1), // 0x02
    IMM         = (1<<2), // 0x04
    IMM_BYTE    = (2<<2), // 0x08
    MEM_ABS     = (3<<2), // 0x0C
    MODE_MASK   = (7<<2), // 0x1C
    MOD_RM      = (1<<5), // 0x20
    STACK       = (1<<6), // 0x40
    GROUP       = (1<<7), // 0x80
    GROUP_MASK  = 0x7F,
  };

  // Indexed by opcode; the second half holds the two-byte (0x0F xx) opcodes.
  // A value of zero marks an opcode that we do not decode.
  static const unsigned char opcode_types[512] = {
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x00 - 0x07
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x00, // 0x08 - 0x0F
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x10 - 0x17
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x18 - 0x1F
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x20 - 0x27
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x28 - 0x2F
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x30 - 0x37
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x38 - 0x3F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x40 - 0x47
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x48 - 0x4F
    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0x50 - 0x57
    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0x58 - 0x5F
    0x01, 0x01, 0x21, 0x21, 0x00, 0x00, 0x00, 0x00, // 0x60 - 0x67
    0x45, 0x25, 0x49, 0x29, 0x03, 0x01, 0x03, 0x01, // 0x68 - 0x6F
    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0x70 - 0x77
    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0x78 - 0x7F
    0x27, 0x25, 0x27, 0x29, 0x23, 0x21, 0x23, 0x21, // 0x80 - 0x87
    0x23, 0x21, 0x23, 0x21, 0x21, 0x21, 0x21, 0x80, // 0x88 - 0x8F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x90 - 0x97
    0x01, 0x01, 0x05, 0x01, 0x41, 0x41, 0x01, 0x01, // 0x98 - 0x9F
    0x0F, 0x0D, 0x0F, 0x0D, 0x03, 0x01, 0x03, 0x01, // 0xA0 - 0xA7
    0x09, 0x05, 0x03, 0x01, 0x03, 0x01, 0x03, 0x01, // 0xA8 - 0xAF
    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // 0xB0 - 0xB7
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0xB8 - 0xBF
    0x27, 0x29, 0x01, 0x01, 0x21, 0x21, 0x27, 0x25, // 0xC0 - 0xC7
    0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x01, // 0xC8 - 0xCF
    0x23, 0x21, 0x23, 0x21, 0x09, 0x09, 0x01, 0x01, // 0xD0 - 0xD7
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xD8 - 0xDF
    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0xE0 - 0xE7
    0x05, 0x05, 0x05, 0x09, 0x03, 0x01, 0x03, 0x01, // 0xE8 - 0xEF
    0x00, 0x01, 0x00, 0x00, 0x01, 0x01, 0x88, 0x90, // 0xF0 - 0xF7
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x98, 0xA0, // 0xF8 - 0xFF
    0x00, 0xA8, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, // 0xF00 - 0xF07
    0x01, 0x01, 0x00, 0x01, 0x00, 0x21, 0x01, 0x00, // 0xF08 - 0xF0F
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF10 - 0xF17
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF18 - 0xF1F
    0x21, 0x21, 0x21, 0x21, 0x00, 0x00, 0x00, 0x00, // 0xF20 - 0xF27
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF28 - 0xF2F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, // 0xF30 - 0xF37
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF38 - 0xF3F
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF40 - 0xF47
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF48 - 0xF4F
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF50 - 0xF57
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF58 - 0xF5F
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF60 - 0xF67
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF68 - 0xF6F
    0x21, 0x00, 0x00, 0x00, 0x21, 0x21, 0x21, 0x00, // 0xF70 - 0xF77
    0x21, 0x21, 0x00, 0x00, 0x21, 0x21, 0x21, 0x21, // 0xF78 - 0xF7F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xF80 - 0xF87
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xF88 - 0xF8F
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF90 - 0xF97
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF98 - 0xF9F
    0x01, 0x01, 0x01, 0x21, 0x29, 0x21, 0x00, 0x00, // 0xFA0 - 0xFA7
    0x01, 0x01, 0x01, 0x21, 0x29, 0x21, 0x21, 0x21, // 0xFA8 - 0xFAF
    0x23, 0x21, 0x00, 0x21, 0x00, 0x00, 0x23, 0x21, // 0xFB0 - 0xFB7
    0x21, 0x00, 0x29, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFB8 - 0xFBF
    0x21, 0x21, 0x00, 0x21, 0x00, 0x00, 0x00, 0x21, // 0xFC0 - 0xFC7
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xFC8 - 0xFCF
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFD0 - 0xFD7
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFD8 - 0xFDF
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFE0 - 0xFE7
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFE8 - 0xFEF
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xFF0 - 0xFF7
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xFF8 - 0xFFF
  };

  // Indexed by (group offset from opcode_types) + (ModR/M "reg" field).
  static const unsigned char group_table[56] = {
    0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Group 1A
    0x27, 0x27, 0x23, 0x23, 0x23, 0x23, 0x23, 0x23, // Group 3 (Byte)
    0x25, 0x25, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // Group 3
    0x23, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Group 4
    0x21, 0x21, 0x61, 0x21, 0x61, 0x21, 0x61, 0x00, // Group 5
    0x00, 0x00, 0x21, 0x21, 0x21, 0x00, 0x21, 0x23, // Group 7
    0x21, 0x00, 0x00, 0x21, 0x21, 0x00, 0x21, 0x00, // Group 7 (Alternate)
  };

  const unsigned char *insn_ptr = reinterpret_cast<const unsigned char *>(*ip);
  int operand_width = 4;
  int address_width = is64bit ? 8 : 4;
  if (rex_ptr) {
    *rex_ptr = 0;
  }
  if (mod_rm_ptr) {
    *mod_rm_ptr = 0;
  }
  if (sib_ptr) {
    *sib_ptr = 0;
  }

  // Consume prefix bytes. When the loop exits, "byte" holds the first opcode
  // byte and insn_ptr points just past it.
  unsigned char byte;
  unsigned char rex = 0;
  bool found_prefix = false;
  for (;;) {
    byte = *insn_ptr++;
    bool is_legacy_prefix = true;
    switch (byte) {
      case 0x66: // Operand width prefix
        operand_width ^= 6;              // toggles between 4 and 2
        break;
      case 0x67: // Address width prefix
        address_width ^= is64bit ? 12 : 6; // toggles 8 <-> 4, or 4 <-> 2
        break;
      case 0x26: // Segment selector prefixes
      case 0x2e:
      case 0x36:
      case 0x3e:
      case 0x64:
      case 0x65:
      case 0xF0:
      case 0xF2:
      case 0xF3:
        break;
      case 0x40: case 0x41: case 0x42: case 0x43: // 64 bit REX prefixes
      case 0x44: case 0x45: case 0x46: case 0x47:
      case 0x48: case 0x49: case 0x4A: case 0x4B:
      case 0x4C: case 0x4D: case 0x4E: case 0x4F:
        if (is64bit) {
          if (rex_ptr) {
            *rex_ptr = mutable_insn_ptr(insn_ptr - 1);
          }
          rex = byte;
          found_prefix = true;
          continue;
        }
        // In 32 bit code these bytes are INC/DEC opcodes, not prefixes.
        is_legacy_prefix = false;
        break;
      default:
        is_legacy_prefix = false;
        break;
    }
    if (!is_legacy_prefix) {
      break;
    }
    // A REX byte followed by another prefix is discarded: only a REX byte
    // directly in front of the opcode is kept.
    rex = 0;
    found_prefix = true;
  }
  if (has_prefix) {
    *has_prefix = found_prefix;
  }
  if (rex & REX_W) {
    operand_width = 8;
  }

  // Look up the opcode, following the 0x0F escape to the two-byte table.
  unsigned short insn = byte;
  unsigned int idx = 0;
  if (byte == 0x0F) {
    byte = *insn_ptr++;
    insn = static_cast<unsigned short>((insn << 8) | byte);
    idx = 256;
  }
  unsigned char type = opcode_types[idx + byte];

  // For group opcodes, the "reg" field of the ModR/M byte selects the actual
  // instruction. Peek at the byte without consuming it; it is consumed below
  // together with all other ModR/M bytes.
  bool found_group = false;
  if (type & GROUP) {
    found_group = true;
    const unsigned char group_mod_rm = *insn_ptr;
    if (mod_rm_ptr) {
      *mod_rm_ptr = mutable_insn_ptr(insn_ptr);
    }
    unsigned char group = (type & GROUP_MASK) + ((group_mod_rm >> 3) & 0x7);
    if ((type & GROUP_MASK) == 40 && (group_mod_rm >> 6) == 3) {
      group += 8; // Group 7 uses the alternate row for register operands.
    }
    type = group_table[group];
  }

  // We know that we still don't decode some of the more obscure
  // instructions (type == 0), but for all practical purposes that doesn't
  // matter. Compilers are unlikely to output them, and even if we encounter
  // hand-coded assembly, we will soon synchronize to the instruction
  // stream again.
  //
  // Debugging aid for undecoded instructions:
  //
  // std::cerr << "Unsupported instruction at 0x" << std::hex <<
  //   std::uppercase << reinterpret_cast<long>(*ip) << " [ ";
  // for (const unsigned char *ptr =
  //        reinterpret_cast<const unsigned char *>(*ip);
  //      ptr < insn_ptr; ) {
  //   std::cerr << std::hex << std::uppercase << std::setw(2) <<
  //     std::setfill('0') << (unsigned int)*ptr++ << ' ';
  // }
  // std::cerr << "]" << std::endl;
  if (type) {
    if (is64bit && (type & STACK)) {
      operand_width = 8;
    }
    if (type & MOD_RM) {
      if (mod_rm_ptr) {
        *mod_rm_ptr = mutable_insn_ptr(insn_ptr);
      }
      const unsigned char mod_rm = *insn_ptr++;
      const int mod = (mod_rm >> 6) & 0x3;
      // Only the three bit r/m field matters for length decoding. REX.B must
      // not be folded in: it selects a different register, but it neither
      // changes whether a SIB byte follows nor whether mod==00/rm==101 means
      // "disp32" (RIP-relative in 64 bit code).
      const int rm = mod_rm & 0x7;
      if (mod != 3) {
        if (address_width == 2) {
          // 16 bit addressing: no SIB byte, 0/1/2 displacement bytes.
          switch (mod) {
            case 0:
              if (rm != 6 /* SI */) {
                break;
              }
              // fall through
            case 2:
              insn_ptr++;
              // fall through
            case 1:
              insn_ptr++;
              break;
          }
        } else {
          if (rm == 4) {
            if (sib_ptr) {
              *sib_ptr = mutable_insn_ptr(insn_ptr);
            }
            const unsigned char sib = *insn_ptr++;
            if (!mod && (sib & 0x7) == 5 /* BP */) {
              insn_ptr += 4;
            }
          }
          switch (mod) {
            case 0:
              if (rm != 5 /* BP */) {
                break;
              }
              // fall through
            case 2:
              insn_ptr += 3;
              // fall through
            case 1:
              insn_ptr++;
              break;
          }
        }
      }
    }

    // Instructions with operands that the type flags do not describe.
    switch (insn) {
      case 0xC8: // ENTER
        insn_ptr++;
        // fall through
      case 0x9A: // CALL (far)
      case 0xC2: // RET (near)
      case 0xCA: // LRET
      case 0xEA: // JMP (far)
        insn_ptr += 2;
        break;
      case 0xF80: case 0xF81: case 0xF82: case 0xF83: // Jcc (rel)
      case 0xF84: case 0xF85: case 0xF86: case 0xF87:
      case 0xF88: case 0xF89: case 0xF8A: case 0xF8B:
      case 0xF8C: case 0xF8D: case 0xF8E: case 0xF8F:
        // The displacement never grows to 8 bytes, even with REX.W.
        insn_ptr += (operand_width == 8) ? 4 : operand_width;
        break;
    }

    // Immediate operands.
    switch (type & MODE_MASK) {
      case IMM:
        if (!(type & BYTE_OP)) {
          switch (insn) {
            case 0xB8: case 0xB9: case 0xBA: case 0xBB:
            case 0xBC: case 0xBD: case 0xBE: case 0xBF:
              // Allow MOV to/from 64bit addresses
              insn_ptr += operand_width;
              break;
            default:
              insn_ptr += (operand_width == 8) ? 4 : operand_width;
              break;
          }
          break;
        }
        // fall through
      case IMM_BYTE:
        insn_ptr++;
        break;
      case MEM_ABS:
        insn_ptr += address_width;
        break;
    }
  }

  if (is_group) {
    *is_group = found_group;
  }
  *ip = reinterpret_cast<const char *>(insn_ptr);
  return insn;
}
#endif

} // namespace