summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authornsylvain@chromium.org <nsylvain@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-08-31 01:16:35 +0000
committernsylvain@chromium.org <nsylvain@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-08-31 01:16:35 +0000
commitfb7b5328a5fd3aecfc27f765dea94b961c657597 (patch)
tree84adc617db0031a881265e95f9c569de66fa733d
parent7302ea910ce937d482780649d6a84bbfff4ac521 (diff)
downloadchromium_src-fb7b5328a5fd3aecfc27f765dea94b961c657597.zip
chromium_src-fb7b5328a5fd3aecfc27f765dea94b961c657597.tar.gz
chromium_src-fb7b5328a5fd3aecfc27f765dea94b961c657597.tar.bz2
Revert 57921 - Pull seccomp-sandbox in via DEPS rather than using an in-tree copy
This means changes to the sandbox won't have to be committed twice, to both trees. BUG=none TEST=smoke test of running chromium with --enable-seccomp-sandbox Review URL: http://codereview.chromium.org/3249003 TBR=mseaborn@chromium.org Review URL: http://codereview.chromium.org/3245011 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@57933 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--DEPS3
-rw-r--r--sandbox/linux/seccomp/Makefile59
-rw-r--r--sandbox/linux/seccomp/access.cc97
-rw-r--r--sandbox/linux/seccomp/allocator.cc136
-rw-r--r--sandbox/linux/seccomp/allocator.h88
-rw-r--r--sandbox/linux/seccomp/clone.cc179
-rw-r--r--sandbox/linux/seccomp/debug.cc363
-rw-r--r--sandbox/linux/seccomp/debug.h80
-rw-r--r--sandbox/linux/seccomp/exit.cc38
-rw-r--r--sandbox/linux/seccomp/getpid.cc17
-rw-r--r--sandbox/linux/seccomp/gettid.cc18
-rw-r--r--sandbox/linux/seccomp/ioctl.cc61
-rw-r--r--sandbox/linux/seccomp/ipc.cc351
-rw-r--r--sandbox/linux/seccomp/library.cc1208
-rw-r--r--sandbox/linux/seccomp/library.h199
-rw-r--r--sandbox/linux/seccomp/linux_syscall_support.h3208
-rw-r--r--sandbox/linux/seccomp/madvise.cc81
-rw-r--r--sandbox/linux/seccomp/maps.cc267
-rw-r--r--sandbox/linux/seccomp/maps.h94
-rw-r--r--sandbox/linux/seccomp/mmap.cc75
-rw-r--r--sandbox/linux/seccomp/mprotect.cc73
-rw-r--r--sandbox/linux/seccomp/munmap.cc70
-rw-r--r--sandbox/linux/seccomp/mutex.h153
-rw-r--r--sandbox/linux/seccomp/open.cc99
-rw-r--r--sandbox/linux/seccomp/sandbox.cc838
-rw-r--r--sandbox/linux/seccomp/sandbox.h12
-rw-r--r--sandbox/linux/seccomp/sandbox_impl.h715
-rw-r--r--sandbox/linux/seccomp/seccomp.gyp93
-rw-r--r--sandbox/linux/seccomp/securemem.cc105
-rw-r--r--sandbox/linux/seccomp/securemem.h205
-rw-r--r--sandbox/linux/seccomp/sigaction.cc177
-rw-r--r--sandbox/linux/seccomp/sigprocmask.cc120
-rw-r--r--sandbox/linux/seccomp/socketcall.cc1039
-rw-r--r--sandbox/linux/seccomp/stat.cc197
-rw-r--r--sandbox/linux/seccomp/syscall.cc380
-rw-r--r--sandbox/linux/seccomp/syscall.h22
-rw-r--r--sandbox/linux/seccomp/syscall_table.c153
-rw-r--r--sandbox/linux/seccomp/syscall_table.h43
-rw-r--r--sandbox/linux/seccomp/tests/list_tests.py22
-rw-r--r--sandbox/linux/seccomp/tests/test_syscalls.cc758
-rw-r--r--sandbox/linux/seccomp/timestats.cc191
-rw-r--r--sandbox/linux/seccomp/tls.h155
-rw-r--r--sandbox/linux/seccomp/trusted_process.cc268
-rw-r--r--sandbox/linux/seccomp/trusted_thread.cc1483
-rw-r--r--sandbox/linux/seccomp/x86_decode.cc310
-rw-r--r--sandbox/linux/seccomp/x86_decode.h19
46 files changed, 14319 insertions, 3 deletions
diff --git a/DEPS b/DEPS
index c356536..004c6f4 100644
--- a/DEPS
+++ b/DEPS
@@ -21,9 +21,6 @@ deps = {
"src/googleurl":
"http://google-url.googlecode.com/svn/trunk@145",
- "src/sandbox/linux/seccomp":
- "http://seccompsandbox.googlecode.com/svn/trunk@91",
-
"src/sdch/open-vcdiff":
"http://open-vcdiff.googlecode.com/svn/trunk@28",
diff --git a/sandbox/linux/seccomp/Makefile b/sandbox/linux/seccomp/Makefile
new file mode 100644
index 0000000..141d8c3
--- /dev/null
+++ b/sandbox/linux/seccomp/Makefile
@@ -0,0 +1,59 @@
+# Copyright (c) 2010 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# This Makefile temporarily has been checked into the source tree so that
+# we can run the tests. It will be replaced with a proper gyp file.
+
+CFLAGS = -g -O0 -Wall -Werror -Wextra -Wno-missing-field-initializers \
+ -Wno-unused-parameter -I.
+LDFLAGS = -g
+CPPFLAGS =
+MODS := allocator library debug maps x86_decode securemem sandbox \
+ syscall syscall_table trusted_thread trusted_process \
+ access exit clone getpid gettid ioctl ipc madvise mmap mprotect \
+ munmap open sigaction sigprocmask socketcall stat
+OBJS64 := $(shell echo ${MODS} | xargs -n 1 | sed -e 's/$$/.o64/')
+OBJS32 := $(shell echo ${MODS} | xargs -n 1 | sed -e 's/$$/.o32/')
+HEADERS:= $(shell for i in ${MODS}; do [ -r "$$i" ] && echo "$$i"; done)
+
+.SUFFIXES: .o64 .o32
+
+all: test
+
+# Remove all build products: 32-bit and 64-bit objects, test binaries, the
+# generated test list, and leftover core/valgrind/strace files.
+clean:
+ -rm -f *.o *.o32 *.o64 tests/*.o32 tests/*.o64
+ -rm -f core core.* vgcore vgcore.* strace.log*
+ -rm -f run_tests_32 run_tests_64
+ -rm -f tests/test_syscalls.o64 tests/test_syscalls.o32
+ -rm -f tests/test-list.h
+
+test: run_tests_64 run_tests_32
+ ./run_tests_64
+ ./run_tests_32
+
+# TODO: Track header file dependencies properly
+tests/test_syscalls.o64 tests/test_syscalls.o32: tests/test-list.h
+
+tests/test-list.h: tests/list_tests.py tests/test_syscalls.cc
+ python tests/list_tests.py tests/test_syscalls.cc > $@
+
+run_tests_64: $(OBJS64) tests/test_syscalls.o64 tests/test-list.h
+ g++ -m64 tests/test_syscalls.o64 $(OBJS64) -lpthread -lutil -o $@
+run_tests_32: $(OBJS32) tests/test_syscalls.o32 tests/test-list.h
+ g++ -m32 tests/test_syscalls.o32 $(OBJS32) -lpthread -lutil -o $@
+
+.cc.o: ${HEADERS}
+ ${CXX} ${CFLAGS} ${CPPFLAGS} -c -o $@ $<
+
+.cc.o64: ${HEADERS}
+ ${CXX} ${CFLAGS} ${CPPFLAGS} -fPIC -c -o $@ $<
+
+.c.o64: ${HEADERS}
+ ${CC} ${CFLAGS} ${CPPFLAGS} --std=gnu99 -fPIC -c -o $@ $<
+
+.cc.o32: ${HEADERS}
+ ${CXX} ${CFLAGS} ${CPPFLAGS} -m32 -fPIC -c -o $@ $<
+
+.c.o32: ${HEADERS}
+ ${CC} ${CFLAGS} ${CPPFLAGS} -m32 --std=gnu99 -fPIC -c -o $@ $<
diff --git a/sandbox/linux/seccomp/access.cc b/sandbox/linux/seccomp/access.cc
new file mode 100644
index 0000000..fbe7e53
--- /dev/null
+++ b/sandbox/linux/seccomp/access.cc
@@ -0,0 +1,97 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+long Sandbox::sandbox_access(const char *pathname, int mode) {
+ long long tm;
+ Debug::syscall(&tm, __NR_access, "Executing handler");
+ size_t len = strlen(pathname);
+ struct Request {
+ int sysnum;
+ long long cookie;
+ Access access_req;
+ char pathname[0];
+ } __attribute__((packed)) *request;
+ char data[sizeof(struct Request) + len];
+ request = reinterpret_cast<struct Request*>(data);
+ request->sysnum = __NR_access;
+ request->cookie = cookie();
+ request->access_req.path_length = len;
+ request->access_req.mode = mode;
+ memcpy(request->pathname, pathname, len);
+
+ long rc;
+ SysCalls sys;
+ if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward access() request [sandbox]");
+ }
+ Debug::elapsed(tm, __NR_access);
+ return rc;
+}
+
+// Trusted-process handler for access(). Reads the Access header and the
+// pathname that follows it from sandboxFd, then either refuses the call or
+// forwards it to the trusted thread through the secure memory page "mem".
+// Returns true if the call was handed to SecureMem::sendSystemCall(), and
+// false if it was refused (over-long path or file namespace not allowed).
+bool Sandbox::process_access(int parentMapsFd, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ SysCalls sys;
+ Access access_req;
+ if (read(sys, sandboxFd, &access_req, sizeof(access_req)) !=
+ sizeof(access_req)) {
+ read_parm_failed:
+ die("Failed to read parameters for access() [process]");
+ }
+ int rc = -ENAMETOOLONG;
+ if (access_req.path_length >= sizeof(mem->pathname)) {
+ // The path does not fit into mem->pathname. Read and discard it in small
+ // pieces, then report -ENAMETOOLONG.
+ // NOTE(review): rc is written as an int here, while sandbox_access() reads
+ // sizeof(long) bytes for its result; confirm the reply width on 64-bit.
+ char buf[32];
+ while (access_req.path_length > 0) {
+ size_t len = access_req.path_length > sizeof(buf) ?
+ sizeof(buf) : access_req.path_length;
+ ssize_t i = read(sys, sandboxFd, buf, len);
+ if (i <= 0) {
+ goto read_parm_failed;
+ }
+ access_req.path_length -= i;
+ }
+ if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to return data from access() [process]");
+ }
+ return false;
+ }
+
+ if (!g_policy.allow_file_namespace) {
+ // After locking the mutex, we can no longer abandon the system call. So,
+ // perform checks before clobbering the securely shared memory.
+ char tmp[access_req.path_length];
+ if (read(sys, sandboxFd, tmp, access_req.path_length) !=
+ (ssize_t)access_req.path_length) {
+ goto read_parm_failed;
+ }
+ // The bytes read from the socket are not NUL-terminated, so build the
+ // string from an explicit length instead of scanning for a terminator.
+ Debug::message(("Denying access to \"" +
+ std::string(tmp, access_req.path_length) + "\"").c_str());
+ SecureMem::abandonSystemCall(threadFd, -EACCES);
+ return false;
+ }
+
+ SecureMem::lockSystemCall(parentMapsFd, mem);
+ if (read(sys, sandboxFd, mem->pathname, access_req.path_length) !=
+ (ssize_t)access_req.path_length) {
+ goto read_parm_failed;
+ }
+ // path_length < sizeof(mem->pathname) was verified above, so the terminator
+ // always fits.
+ mem->pathname[access_req.path_length] = '\000';
+
+ // TODO(markus): Implement sandboxing policy
+ Debug::message(("Allowing access to \"" + std::string(mem->pathname) +
+ "\"").c_str());
+
+ // Tell trusted thread to access the file.
+ SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, __NR_access,
+ mem->pathname - (char*)mem + (char*)mem->self,
+ access_req.mode);
+ return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/allocator.cc b/sandbox/linux/seccomp/allocator.cc
new file mode 100644
index 0000000..6e11a4a
--- /dev/null
+++ b/sandbox/linux/seccomp/allocator.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// The allocator is very simplistic. It requests memory pages directly from
+// the system. Each page starts with a header describing the allocation. This
+// makes sure that we can return the memory to the system when it is
+// deallocated.
+// For allocations that are smaller than a single page, we try to squeeze
+// multiple of them into the same page.
+// We expect to use this allocator for a moderate number of small allocations.
+// In most cases, it will only need to ever make a single request to the
+// operating system for the lifetime of the STL container object.
+// We don't worry about memory fragmentation as the allocator is expected to
+// be short-lived.
+
+#include <stdint.h>
+#include <sys/mman.h>
+
+#include "allocator.h"
+#include "linux_syscall_support.h"
+
+namespace playground {
+
+class SysCalls {
+ public:
+ #define SYS_CPLUSPLUS
+ #define SYS_ERRNO my_errno
+ #define SYS_INLINE inline
+ #define SYS_PREFIX -1
+ #undef SYS_LINUX_SYSCALL_SUPPORT_H
+ #include "linux_syscall_support.h"
+ SysCalls() : my_errno(0) { }
+ int my_errno;
+};
+#ifdef __NR_mmap2
+ #define MMAP mmap2
+ #define __NR_MMAP __NR_mmap2
+#else
+ #define MMAP mmap
+ #define __NR_MMAP __NR_mmap
+#endif
+
+// We only ever keep track of the very last partial page that was used for
+// allocations. This approach simplifies the code a lot. It can theoretically
+// lead to more memory fragmentation, but for our use case that is unlikely
+// to happen.
+struct Header {
+ // The total amount of memory allocated for this chunk of memory. Typically,
+ // this would be a single page.
+ size_t total_len;
+
+ // "used" keeps track of the number of bytes currently allocated in this
+ // page. Note that as elements are freed from this page, "used" is updated
+ // allowing us to track when the page is free. However, these holes in the
+ // page are never re-used, so "tail" is the only way to find out how much
+ // free space remains and when we need to request another chunk of memory
+ // from the system.
+ size_t used;
+ void *tail;
+};
+static Header* last_alloc;
+
+// Allocates "size" bytes (rounded up to a multiple of four) directly from the
+// system. Small requests are carved out of the most recent single-page chunk
+// when it still has room; otherwise a new chunk is obtained with mmap().
+// Returns NULL if the size computation would overflow or if mmap() fails.
+void* SystemAllocatorHelper::sys_allocate(size_t size) {
+ // Number of bytes that need to be allocated
+ if (size + 3 < size) {
+ return NULL;
+ }
+ size_t len = (size + 3) & ~3;
+
+ if (last_alloc) {
+ // Remaining space in the last chunk of memory allocated from system
+ size_t remainder = last_alloc->total_len -
+ (reinterpret_cast<char *>(last_alloc->tail) -
+ reinterpret_cast<char *>(last_alloc));
+
+ if (remainder >= len) {
+ void* ret = last_alloc->tail;
+ last_alloc->tail = reinterpret_cast<char *>(last_alloc->tail) + len;
+ last_alloc->used += len;
+ return ret;
+ }
+ }
+
+ SysCalls sys;
+ if (sizeof(Header) + len + 4095 < len) {
+ return NULL;
+ }
+ size_t total_len = (sizeof(Header) + len + 4095) & ~4095;
+ Header* mem = reinterpret_cast<Header *>(
+ sys.MMAP(NULL, total_len, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS, -1, 0));
+ if (mem == MAP_FAILED) {
+ return NULL;
+ }
+
+ // If we were only asked to allocate a single page, then we will use any
+ // remaining space for other small allocations. Multi-page chunks must not
+ // be shared: sys_deallocate() locates the header by masking the address
+ // down to its 4096-byte page, which only works for addresses that lie in
+ // the first page of a chunk.
+ if (total_len == 4096 && total_len - sizeof(Header) - len >= 4) {
+ last_alloc = mem;
+ }
+ mem->total_len = total_len;
+ mem->used = len;
+ char* ret = reinterpret_cast<char *>(mem) + sizeof(Header);
+ mem->tail = ret + len;
+
+ return ret;
+}
+
+void SystemAllocatorHelper::sys_deallocate(void* p, size_t size) {
+ // Number of bytes in this allocation
+ if (size + 3 < size) {
+ return;
+ }
+ size_t len = (size + 3) & ~3;
+
+ // All allocations (small and large) have starting addresses in the
+ // first page that was allocated from the system. This page starts with
+ // a header that keeps track of how many bytes are currently used. The
+ // header can be found by truncating the last few bits of the address.
+ Header* header = reinterpret_cast<Header *>(
+ reinterpret_cast<uintptr_t>(p) & ~4095);
+ header->used -= len;
+
+ // After the last allocation has been freed, return the page(s) to the
+ // system
+ if (!header->used) {
+ SysCalls sys;
+ sys.munmap(header, header->total_len);
+ if (last_alloc == header) {
+ last_alloc = NULL;
+ }
+ }
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/allocator.h b/sandbox/linux/seccomp/allocator.h
new file mode 100644
index 0000000..29e0065
--- /dev/null
+++ b/sandbox/linux/seccomp/allocator.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Implement a very basic memory allocator that makes direct system calls
+// instead of relying on libc.
+// This allocator is not thread-safe.
+
+#ifndef ALLOCATOR_H__
+#define ALLOCATOR_H__
+
+#include <cstddef>
+
+namespace playground {
+
+class SystemAllocatorHelper {
+ protected:
+ static void *sys_allocate(size_t size);
+ static void sys_deallocate(void* p, size_t size);
+};
+
+template <class T>
+class SystemAllocator : SystemAllocatorHelper {
+ public:
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ template <class U>
+ struct rebind {
+ typedef SystemAllocator<U> other;
+ };
+
+ pointer address(reference value) const {
+ return &value;
+ }
+
+ const_pointer address(const_reference value) const {
+ return &value;
+ }
+
+ SystemAllocator() throw() { }
+ SystemAllocator(const SystemAllocator& src) throw() { }
+ template <class U> SystemAllocator(const SystemAllocator<U>& src) throw() { }
+ ~SystemAllocator() throw() { }
+
+ size_type max_size() const throw() {
+ return (1 << 30) / sizeof(T);
+ }
+
+ pointer allocate(size_type num, const void* = 0) {
+ if (num > max_size()) {
+ return NULL;
+ }
+ return (pointer)sys_allocate(num * sizeof(T));
+ }
+
+ void construct(pointer p, const T& value) {
+ new(reinterpret_cast<void *>(p))T(value);
+ }
+
+ void destroy(pointer p) {
+ p->~T();
+ }
+
+ void deallocate(pointer p, size_type num) {
+ sys_deallocate(p, num * sizeof(T));
+ }
+};
+
+template <class T1, class T2>
+bool operator== (const SystemAllocator<T1>&, const SystemAllocator<T2>&)
+ throw() {
+ return true;
+}
+template <class T1, class T2>
+bool operator!= (const SystemAllocator<T1>&, const SystemAllocator<T2>&)
+ throw() {
+ return false;
+}
+
+} // namespace
+
+#endif // ALLOCATOR_H__
diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc
new file mode 100644
index 0000000..0d35181
--- /dev/null
+++ b/sandbox/linux/seccomp/clone.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+long Sandbox::sandbox_clone(int flags, char* stack, int* pid, int* ctid,
+ void* tls, void *wrapper_sp) {
+ long long tm;
+ Debug::syscall(&tm, __NR_clone, "Executing handler");
+ struct {
+ int sysnum;
+ long long cookie;
+ Clone clone_req;
+ } __attribute__((packed)) request;
+ request.sysnum = __NR_clone;
+ request.cookie = cookie();
+ request.clone_req.flags = flags;
+ request.clone_req.stack = stack;
+ request.clone_req.pid = pid;
+ request.clone_req.ctid = ctid;
+ request.clone_req.tls = tls;
+
+ // TODO(markus): Passing stack == 0 currently does not do the same thing
+ // that the kernel would do without the sandbox. This is just going to
+ // cause a crash. We should detect this case, and replace the stack pointer
+ // with the correct value, instead.
+ // This is complicated by the fact that we will temporarily be executing
+ // both threads from the same stack. Some synchronization will be necessary.
+ // Fortunately, this complication also explains why hardly anybody ever
+ // does this.
+ // See trusted_thread.cc for more information.
+ long rc;
+ if (stack == 0) {
+ rc = -EINVAL;
+ } else {
+ // Pass along the address on the stack where syscallWrapper() stored the
+ // original CPU registers. These registers will be restored in the newly
+ // created thread prior to returning from the wrapped system call.
+ #if defined(__x86_64__)
+ memcpy(&request.clone_req.regs64, wrapper_sp,
+ sizeof(request.clone_req.regs64) + sizeof(void *));
+ #elif defined(__i386__)
+ memcpy(&request.clone_req.regs32, wrapper_sp,
+ sizeof(request.clone_req.regs32) + sizeof(void *));
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // In order to unblock the signal mask in the newly created thread and
+ // after entering Seccomp mode, we have to call sigreturn(). But that
+ // requires access to a proper stack frame describing a valid signal.
+ // We trigger a signal now and make sure the stack frame ends up on the
+ // new stack. Our segv() handler (in sandbox.cc) does that for us.
+ // See trusted_thread.cc for more details on how threads get created.
+ //
+ // In general we rely on the kernel for generating the signal stack
+ // frame, as the exact binary format has been extended several times over
+ // the course of the kernel's development. Fortunately, the kernel
+ // developers treat the initial part of the stack frame as a stable part
+ // of the ABI. So, we can rely on fixed, well-defined offsets for accessing
+ // register values and for accessing the signal mask.
+ #if defined(__x86_64__)
+ // Red zone compensation. The instrumented system call will remove 128
+ // bytes from the thread's stack prior to returning to the original
+ // call site.
+ stack -= 128;
+ request.clone_req.stack = stack;
+ void *dummy;
+ asm volatile("mov %%rsp, %%rcx\n"
+ "mov %3, %%rsp\n"
+ "int $0\n"
+ "mov %%rcx, %%rsp\n"
+ : "=a"(request.clone_req.stack), "=&c"(dummy)
+ : "a"(__NR_clone + 0xF000), "m"(request.clone_req.stack)
+ : "memory");
+ #elif defined(__i386__)
+ void *dummy;
+ asm volatile("mov %%esp, %%ecx\n"
+ "mov %3, %%esp\n"
+ "int $0\n"
+ "mov %%ecx, %%esp\n"
+ : "=a"(request.clone_req.stack), "=&c"(dummy)
+ : "a"(__NR_clone + 0xF000), "m"(request.clone_req.stack)
+ : "memory");
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Adjust the signal stack frame so that it contains the correct stack
+ // pointer upon returning from sigreturn().
+ #if defined(__x86_64__)
+ *(char **)(request.clone_req.stack + 0xA0) = stack;
+ #elif defined(__i386__)
+ *(char **)(request.clone_req.stack + 0x1C) = stack;
+ #else
+ #error Unsupported target platform
+ #endif
+
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward clone() request [sandbox]");
+ }
+ }
+ Debug::elapsed(tm, __NR_clone);
+ return rc;
+}
+
+bool Sandbox::process_clone(int parentMapsFd, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ Clone clone_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &clone_req, sizeof(clone_req)) !=sizeof(clone_req)){
+ die("Failed to read parameters for clone() [process]");
+ }
+
+ // TODO(markus): add policy restricting parameters for clone
+ if ((clone_req.flags & ~CLONE_DETACHED) != (CLONE_VM|CLONE_FS|CLONE_FILES|
+ CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|
+ CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID)) {
+ SecureMem::abandonSystemCall(threadFd, -EPERM);
+ return false;
+ } else {
+ SecureMem::Args* newMem = getNewSecureMem();
+ if (!newMem) {
+ SecureMem::abandonSystemCall(threadFd, -ENOMEM);
+ return false;
+ } else {
+ // clone() has unusual semantics. We don't want to return back into the
+ // trusted thread, but instead we need to continue execution at the IP
+ // where we got called initially.
+ SecureMem::lockSystemCall(parentMapsFd, mem);
+ mem->ret = clone_req.ret;
+ #if defined(__x86_64__)
+ mem->rbp = clone_req.regs64.rbp;
+ mem->rbx = clone_req.regs64.rbx;
+ mem->rcx = clone_req.regs64.rcx;
+ mem->rdx = clone_req.regs64.rdx;
+ mem->rsi = clone_req.regs64.rsi;
+ mem->rdi = clone_req.regs64.rdi;
+ mem->r8 = clone_req.regs64.r8;
+ mem->r9 = clone_req.regs64.r9;
+ mem->r10 = clone_req.regs64.r10;
+ mem->r11 = clone_req.regs64.r11;
+ mem->r12 = clone_req.regs64.r12;
+ mem->r13 = clone_req.regs64.r13;
+ mem->r14 = clone_req.regs64.r14;
+ mem->r15 = clone_req.regs64.r15;
+ #elif defined(__i386__)
+ mem->ebp = clone_req.regs32.ebp;
+ mem->edi = clone_req.regs32.edi;
+ mem->esi = clone_req.regs32.esi;
+ mem->edx = clone_req.regs32.edx;
+ mem->ecx = clone_req.regs32.ecx;
+ mem->ebx = clone_req.regs32.ebx;
+ #else
+ #error Unsupported target platform
+ #endif
+ newMem->sequence = 0;
+ newMem->shmId = -1;
+ mem->newSecureMem = newMem;
+ mem->processFdPub = processFdPub_;
+ mem->cloneFdPub = cloneFdPub_;
+
+ SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem,
+ __NR_clone, clone_req.flags, clone_req.stack,
+ clone_req.pid, clone_req.ctid, clone_req.tls);
+ return true;
+ }
+ }
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/debug.cc b/sandbox/linux/seccomp/debug.cc
new file mode 100644
index 0000000..5d6de49
--- /dev/null
+++ b/sandbox/linux/seccomp/debug.cc
@@ -0,0 +1,363 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef NDEBUG
+
+#include "debug.h"
+
+namespace playground {
+
+bool Debug::enabled_;
+int Debug::numSyscallNames_;
+const char **Debug::syscallNames_;
+std::map<int, std::string> Debug::syscallNamesMap_;
+
+Debug Debug::debug_;
+
+// Global constructor: decides whether debugging output is enabled and, if so,
+// builds the table that maps system call numbers to symbolic names by parsing
+// the "#define __NR_xxx" lines of the first readable unistd header.
+Debug::Debug() {
+ // Logging is disabled by default, but can be turned on by setting an
+ // appropriate environment variable. Initialize this code from a global
+ // constructor, so that it runs before the sandbox is turned on.
+ enabled_ = !!getenv("SECCOMP_SANDBOX_DEBUGGING");
+
+ // Read names of system calls from header files, if available. Symbolic
+ // names make debugging so much nicer.
+ if (enabled_) {
+ static const char *filenames[] = {
+ #if __WORDSIZE == 64
+ "/usr/include/asm/unistd_64.h",
+ #elif __WORDSIZE == 32
+ "/usr/include/asm/unistd_32.h",
+ #endif
+ "/usr/include/asm/unistd.h",
+ NULL };
+ numSyscallNames_ = 0;
+ for (const char **fn = filenames; *fn; ++fn) {
+ FILE *fp = fopen(*fn, "r");
+ if (fp) {
+ std::string baseName;
+ int baseNum = -1;
+ char buf[80];
+ while (fgets(buf, sizeof(buf), fp)) {
+ // Check if the line starts with "#define"
+ static const char* whitespace = " \t\r\n";
+ char *token, *save;
+ token = strtok_r(buf, whitespace, &save);
+ if (token && !strcmp(token, "#define")) {
+
+ // Only parse identifiers that start with "__NR_"
+ token = strtok_r(NULL, whitespace, &save);
+ if (token) {
+ if (strncmp(token, "__NR_", 5)) {
+ continue;
+ }
+ std::string syscallName(token + 5);
+
+ // Parse the value of the symbol. Try to be forgiving in what
+ // we accept, as the file format might change over time.
+ token = strtok_r(NULL, "\r\n", &save);
+ if (token) {
+ // Some values are defined relative to previous values, we
+ // detect these examples by finding an earlier symbol name
+ // followed by a '+' character.
+ bool isRelative = false;
+ char *base = strstr(token, baseName.c_str());
+ if (baseNum >= 0 && base) {
+ base += baseName.length();
+ while (*base == ' ' || *base == '\t') {
+ ++base;
+ }
+ if (*base == '+') {
+ isRelative = true;
+ token = base;
+ }
+ }
+
+ // Skip any characters that are not part of the syscall number.
+ // Stop at the terminating NUL, so that a value without any
+ // digits does not make us scan past the end of the buffer.
+ while (*token && (*token < '0' || *token > '9')) {
+ token++;
+ }
+
+ // If we now have a valid datum, enter it into our map.
+ if (*token) {
+ int sysnum = atoi(token);
+
+ // Deal with symbols that are defined relative to earlier
+ // ones.
+ if (isRelative) {
+ sysnum += baseNum;
+ } else {
+ baseNum = sysnum;
+ baseName = syscallName;
+ }
+
+ // Keep track of the highest syscall number that we know
+ // about.
+ if (sysnum >= numSyscallNames_) {
+ numSyscallNames_ = sysnum + 1;
+ }
+
+ syscallNamesMap_[sysnum] = syscallName;
+ }
+ }
+ }
+ }
+ }
+ fclose(fp);
+ break;
+ }
+ }
+ if (numSyscallNames_) {
+ // We cannot make system calls at the time, when we are looking up
+ // the names. So, copy them into a data structure that can be
+ // accessed without having to allocate memory (i.e. no more STL).
+ syscallNames_ = reinterpret_cast<const char **>(
+ calloc(sizeof(char *), numSyscallNames_));
+ if (!syscallNames_) {
+ // Without the table, no names can be looked up. Report zero known
+ // names so that the NULL table is never indexed.
+ numSyscallNames_ = 0;
+ } else {
+ for (std::map<int, std::string>::const_iterator iter =
+ syscallNamesMap_.begin();
+ iter != syscallNamesMap_.end();
+ ++iter) {
+ syscallNames_[iter->first] = iter->second.c_str();
+ }
+ }
+ }
+ }
+}
+
+bool Debug::enter() {
+ // Increment the recursion level in TLS storage. This allows us to
+ // make system calls from within our debugging functions, without triggering
+ // additional debugging output.
+ //
+ // This function can be called from both the sandboxed process and from the
+ // trusted process. Only the sandboxed process needs to worry about
+ // recursively calling system calls. The trusted process doesn't intercept
+ // system calls and thus doesn't have this problem. It also doesn't have
+ // a TLS. We explicitly set the segment selector to zero, when in the
+ // trusted process, so that we can avoid tracking recursion levels.
+ int level;
+ #if defined(__x86_64__)
+ asm volatile("mov %%gs, %0\n"
+ "test %0, %0\n"
+ "jz 1f\n"
+ "movl %%gs:0x1050-0xE0, %0\n"
+ "incl %%gs:0x1050-0xE0\n"
+ "1:\n"
+ : "=r"(level)
+ :
+ : "memory");
+ #elif defined(__i386__)
+ asm volatile("mov %%fs, %0\n"
+ "test %0, %0\n"
+ "jz 1f\n"
+ "movl %%fs:0x1034-0x58, %0\n"
+ "incl %%fs:0x1034-0x58\n"
+ "1:\n"
+ : "=r"(level)
+ :
+ : "memory");
+ #else
+ #error "Unsupported target platform"
+ #endif
+ return !level;
+}
+
+bool Debug::leave() {
+ // Decrement the recursion level in TLS storage. This allows us to
+ // make system calls from within our debugging functions, without triggering
+ // additional debugging output.
+ //
+ // This function can be called from both the sandboxed process and from the
+ // trusted process. Only the sandboxed process needs to worry about
+ // recursively calling system calls. The trusted process doesn't intercept
+ // system calls and thus doesn't have this problem. It also doesn't have
+ // a TLS. We explicitly set the segment selector to zero, when in the
+ // trusted process, so that we can avoid tracking recursion levels.
+ int level;
+ #if defined(__x86_64__)
+ asm volatile("mov %%gs, %0\n"
+ "test %0, %0\n"
+ "jz 1f\n"
+ "decl %%gs:0x1050-0xE0\n"
+ "movl %%gs:0x1050-0xE0, %0\n"
+ "1:\n"
+ : "=r"(level)
+ :
+ : "memory");
+ #elif defined(__i386__)
+ asm volatile("mov %%fs, %0\n"
+ "test %0, %0\n"
+ "jz 1f\n"
+ "decl %%fs:0x1034-0x58\n"
+ "movl %%fs:0x1034-0x58, %0\n"
+ "1:\n"
+ : "=r"(level)
+ :
+ : "memory");
+ #else
+ #error Unsupported target platform
+ #endif
+ return !level;
+}
+
+void Debug::_message(const char* msg) {
+ if (enabled_) {
+ Sandbox::SysCalls sys;
+ size_t len = strlen(msg);
+ if (len && msg[len-1] != '\n') {
+ // Write operations should be atomic, so that we don't interleave
+ // messages from multiple threads. Append a newline, if it is not
+ // already there.
+ char copy[len + 1];
+ memcpy(copy, msg, len);
+ copy[len] = '\n';
+ Sandbox::write(sys, 2, copy, len + 1);
+ } else {
+ Sandbox::write(sys, 2, msg, len);
+ }
+ }
+}
+
+void Debug::message(const char* msg) {
+ if (enabled_) {
+ if (enter()) {
+ _message(msg);
+ }
+ leave();
+ }
+}
+
+void Debug::gettimeofday(long long* tm) {
+ if (tm) {
+ struct timeval tv;
+ #if defined(__i386__)
+ // Zero out the lastSyscallNum, so that we don't try to coalesce
+ // calls to gettimeofday(). For debugging purposes, we need the
+ // exact time.
+ asm volatile("movl $0, %fs:0x102C-0x58");
+ #elif !defined(__x86_64__)
+ #error Unsupported target platform
+ #endif
+ ::gettimeofday(&tv, NULL);
+ *tm = 1000ULL*1000ULL*static_cast<unsigned>(tv.tv_sec) +
+ static_cast<unsigned>(tv.tv_usec);
+ }
+}
+
+void Debug::syscall(long long* tm, int sysnum, const char* msg, int call) {
+ // This function gets called from the system call wrapper. Avoid calling
+ // any library functions that themselves need system calls.
+ if (enabled_) {
+ if (enter() || !tm) {
+ gettimeofday(tm);
+
+ const char *sysname = NULL;
+ if (sysnum >= 0 && sysnum < numSyscallNames_) {
+ sysname = syscallNames_[sysnum];
+ }
+ static const char kUnnamedMessage[] = "Unnamed syscall #";
+ char unnamed[40];
+ if (!sysname) {
+ memcpy(unnamed, kUnnamedMessage, sizeof(kUnnamedMessage) - 1);
+ itoa(unnamed + sizeof(kUnnamedMessage) - 1, sysnum);
+ sysname = unnamed;
+ }
+ #if defined(__NR_socketcall) || defined(__NR_ipc)
+ char extra[40];
+ *extra = '\000';
+ #if defined(__NR_socketcall)
+ if (sysnum == __NR_socketcall) {
+ static const char* socketcall_name[] = {
+ 0, "socket", "bind", "connect", "listen", "accept", "getsockname",
+ "getpeername", "socketpair", "send", "recv", "sendto","recvfrom",
+ "shutdown", "setsockopt", "getsockopt", "sendmsg", "recvmsg",
+ "accept4"
+ };
+ if (call >= 1 &&
+ call < (int)(sizeof(socketcall_name)/sizeof(char *))) {
+ strcat(strcpy(extra, " "), socketcall_name[call]);
+ } else {
+ itoa(strcpy(extra, " #") + 2, call);
+ }
+ }
+ #endif
+ #if defined(__NR_ipc)
+ if (sysnum == __NR_ipc) {
+ static const char* ipc_name[] = {
+ 0, "semop", "semget", "semctl", "semtimedop", 0, 0, 0, 0, 0, 0,
+ "msgsnd", "msgrcv", "msgget", "msgctl", 0, 0, 0, 0, 0, 0,
+ "shmat", "shmdt", "shmget", "shmctl" };
+ if (call >= 1 && call < (int)(sizeof(ipc_name)/sizeof(char *)) &&
+ ipc_name[call]) {
+ strcat(strcpy(extra, " "), ipc_name[call]);
+ } else {
+ itoa(strcpy(extra, " #") + 2, call);
+ }
+ }
+ #endif
+ #else
+ static const char extra[1] = { 0 };
+ #endif
+ char buf[strlen(sysname) + strlen(extra) + (msg ? strlen(msg) : 0) + 4];
+ strcat(strcat(strcat(strcat(strcpy(buf, sysname), extra), ": "),
+ msg ? msg : ""), "\n");
+ _message(buf);
+ }
+ leave();
+ }
+}
+
+char* Debug::itoa(char* s, int n) {
+ // Remember return value
+ char *ret = s;
+
+ // Insert sign for negative numbers
+ if (n < 0) {
+ *s++ = '-';
+ n = -n;
+ }
+
+ // Convert to decimal (in reverse order)
+ char *start = s;
+ do {
+ *s++ = '0' + (n % 10);
+ n /= 10;
+ } while (n);
+ *s-- = '\000';
+
+ // Reverse order of digits
+ while (start < s) {
+ char ch = *s;
+ *s-- = *start;
+ *start++ = ch;
+ }
+
+ return ret;
+}
+
+void Debug::elapsed(long long tm, int sysnum, int call) {
+ if (enabled_) {
+ if (enter()) {
+ // Compute the time that has passed since the system call started.
+ long long delta;
+ gettimeofday(&delta);
+ delta -= tm;
+
+ // Format "Elapsed time: %d.%03dms" without using sprintf().
+ char buf[80];
+ itoa(strrchr(strcpy(buf, "Elapsed time: "), '\000'), delta/1000);
+ delta %= 1000;
+ strcat(buf, delta < 100 ? delta < 10 ? ".00" : ".0" : ".");
+ itoa(strrchr(buf, '\000'), delta);
+ strcat(buf, "ms");
+
+ // Print system call name and elapsed time.
+ syscall(NULL, sysnum, buf, call);
+ }
+ leave();
+ }
+}
+
+} // namespace
+
+#endif // NDEBUG
diff --git a/sandbox/linux/seccomp/debug.h b/sandbox/linux/seccomp/debug.h
new file mode 100644
index 0000000..eb5a194
--- /dev/null
+++ b/sandbox/linux/seccomp/debug.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef DEBUG_H__
+#define DEBUG_H__
+
+#include <map>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <string.h>
+
+#include "sandbox_impl.h"
+
+namespace playground {
+
+ // Debugging helpers for the sandbox. When NDEBUG is defined, every public
+ // entry point collapses into an empty inline function and the private state
+ // is not declared at all.
+ class Debug {
+  public:
+   // If debugging is enabled, write a message to stderr. In debug builds the
+   // function is bound to the assembler-level name "playground$debugMessage".
+ #ifdef NDEBUG
+   static void message(const char* msg) { }
+ #elif defined(__x86_64__)
+   static void message(const char* msg) asm("playground$debugMessage")
+       __attribute__((visibility("internal")));
+ #else
+   static void message(const char* msg) asm("playground$debugMessage");
+ #endif
+
+   // If debugging is enabled, write the name of the syscall "sysnum" and an
+   // optional message to stderr. "call" defaults to -1.
+ #ifdef NDEBUG
+   static void syscall(long long* tm, int sysnum,
+                       const char* msg, int call = -1) { }
+ #else
+   static void syscall(long long* tm, int sysnum,
+                       const char* msg, int call = -1);
+ #endif
+
+   // Print how much wall-time has elapsed since the timestamp "tm", which is
+   // normally obtained from the preceding call to syscall().
+ #ifdef NDEBUG
+   static void elapsed(long long tm, int sysnum, int call = -1) { }
+ #else
+   static void elapsed(long long tm, int sysnum, int call = -1);
+ #endif
+
+   // Check whether debugging is enabled. Always false in NDEBUG builds.
+   static bool isEnabled() {
+ #ifdef NDEBUG
+     return false;
+ #else
+     return enabled_;
+ #endif
+   }
+
+  private:
+ #ifndef NDEBUG
+   // Implementation details; only present in debug builds.
+   Debug();
+   static bool enter();
+   static bool leave();
+   static void _message(const char* msg);
+   static void gettimeofday(long long* tm);
+   static char* itoa(char* s, int n);
+
+   static Debug debug_;
+
+   static bool enabled_;
+   static int numSyscallNames_;
+   static const char **syscallNames_;
+   static std::map<int, std::string> syscallNamesMap_;
+ #endif
+ };
+
+} // namespace
+
+#endif // DEBUG_H__
diff --git a/sandbox/linux/seccomp/exit.cc b/sandbox/linux/seccomp/exit.cc
new file mode 100644
index 0000000..f4db643
--- /dev/null
+++ b/sandbox/linux/seccomp/exit.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+ // Sandbox-side handler for exit(). Announces the exit by writing a request
+ // to processFdPub() and then terminates the caller; it never returns.
+ long Sandbox::sandbox_exit(int status) {
+   long long tm;
+   Debug::syscall(&tm, __NR_exit, "Executing handler");
+
+   // Request record: system call number followed by the cookie, no padding.
+   struct {
+     int       sysnum;
+     long long cookie;
+   } __attribute__((packed)) request;
+   request.sysnum = __NR_exit;
+   request.cookie = cookie();
+
+   SysCalls sys;
+   const bool sent =
+       write(sys, processFdPub(), &request, sizeof(request)) ==
+       sizeof(request);
+   if (!sent) {
+     die("Failed to forward exit() request [sandbox]");
+   }
+
+   // Keep calling _exit() in case it ever returns.
+   while (true) {
+     sys._exit(status);
+   }
+ }
+
+ // Process-side handler for exit(). Takes the system call lock and then
+ // issues __NR_exit (with argument 0) through SecureMem. Always returns true.
+ bool Sandbox::process_exit(int parentMapsFd, int sandboxFd, int threadFdPub,
+                            int threadFd, SecureMem::Args* mem) {
+   SecureMem::lockSystemCall(parentMapsFd, mem);
+   SecureMem::sendSystemCall(
+       threadFdPub, true, parentMapsFd, mem, __NR_exit, 0);
+   return true;
+ }
+
+} // namespace
diff --git a/sandbox/linux/seccomp/getpid.cc b/sandbox/linux/seccomp/getpid.cc
new file mode 100644
index 0000000..be5449b
--- /dev/null
+++ b/sandbox/linux/seccomp/getpid.cc
@@ -0,0 +1,17 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+ // Sandbox-side handler for getpid(). Answers from the stored pid_ without
+ // forwarding anything; only the debug trace is emitted.
+ long Sandbox::sandbox_getpid() {
+   long long started;
+   Debug::syscall(&started, __NR_getpid, "Executing handler");
+   Debug::elapsed(started, __NR_getpid);
+   return pid_;
+ }
+
+} // namespace
diff --git a/sandbox/linux/seccomp/gettid.cc b/sandbox/linux/seccomp/gettid.cc
new file mode 100644
index 0000000..699774a
--- /dev/null
+++ b/sandbox/linux/seccomp/gettid.cc
@@ -0,0 +1,18 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+ // Sandbox-side handler for gettid(). Returns the value reported by tid()
+ // and records the elapsed time in the debug trace.
+ long Sandbox::sandbox_gettid() {
+   long long started;
+   Debug::syscall(&started, __NR_gettid, "Executing handler");
+   const pid_t threadId = tid();
+   Debug::elapsed(started, __NR_gettid);
+   return threadId;
+ }
+
+} // namespace
diff --git a/sandbox/linux/seccomp/ioctl.cc b/sandbox/linux/seccomp/ioctl.cc
new file mode 100644
index 0000000..4d2b3c5c5
--- /dev/null
+++ b/sandbox/linux/seccomp/ioctl.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+ // Sandbox-side handler for ioctl(). Packs the arguments into a request,
+ // writes it to processFdPub(), and returns the result that is read back
+ // from threadFdPub(). Dies if either transfer is incomplete.
+ long Sandbox::sandbox_ioctl(int d, int req, void *arg) {
+   long long tm;
+   Debug::syscall(&tm, __NR_ioctl, "Executing handler");
+
+   // Request record: header (sysnum, cookie) followed by the arguments.
+   struct {
+     int       sysnum;
+     long long cookie;
+     IOCtl     ioctl_req;
+   } __attribute__((packed)) request;
+   request.sysnum        = __NR_ioctl;
+   request.cookie        = cookie();
+   request.ioctl_req.d   = d;
+   request.ioctl_req.req = req;
+   request.ioctl_req.arg = arg;
+
+   long rc;
+   SysCalls sys;
+   const bool sent =
+       write(sys, processFdPub(), &request, sizeof(request)) ==
+       sizeof(request);
+   // Only wait for the answer if the request actually went out.
+   const bool answered =
+       sent && read(sys, threadFdPub(), &rc, sizeof(rc)) == sizeof(rc);
+   if (!answered) {
+     die("Failed to forward ioctl() request [sandbox]");
+   }
+
+   Debug::elapsed(tm, __NR_ioctl);
+   return rc;
+ }
+
+ // Process-side handler for ioctl(). Reads the IOCtl request from
+ // "sandboxFd" and only lets TCGETS and TIOCGWINSZ through; every other
+ // request is abandoned with -EINVAL. Returns true if the system call was
+ // forwarded and false if it was denied.
+ bool Sandbox::process_ioctl(int parentMapsFd, int sandboxFd, int threadFdPub,
+                             int threadFd, SecureMem::Args* mem) {
+   // Read request
+   IOCtl ioctl_req;
+   SysCalls sys;
+   if (read(sys, sandboxFd, &ioctl_req, sizeof(ioctl_req)) !=
+       sizeof(ioctl_req)) {
+     die("Failed to read parameters for ioctl() [process]");
+   }
+
+   switch (ioctl_req.req) {
+     case TCGETS:
+     case TIOCGWINSZ:
+       SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_ioctl,
+                                 ioctl_req.d, ioctl_req.req, ioctl_req.arg);
+       return true;
+     default:
+       if (Debug::isEnabled()) {
+         // Bounded formatting; "%X" expects an unsigned argument, so convert
+         // the request number explicitly instead of passing a signed int.
+         char buf[80];
+         snprintf(buf, sizeof(buf), "Unsupported ioctl: 0x%04X\n",
+                  static_cast<unsigned int>(ioctl_req.req));
+         Debug::message(buf);
+       }
+       SecureMem::abandonSystemCall(threadFd, -EINVAL);
+       return false;
+   }
+ }
+
+} // namespace
diff --git a/sandbox/linux/seccomp/ipc.cc b/sandbox/linux/seccomp/ipc.cc
new file mode 100644
index 0000000..67a4e34
--- /dev/null
+++ b/sandbox/linux/seccomp/ipc.cc
@@ -0,0 +1,351 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+ #ifndef IPC_PRIVATE  // fallback definitions, used only if no included header provides these constants
+ #define IPC_PRIVATE 0
+ #endif
+ #ifndef IPC_RMID
+ #define IPC_RMID 0
+ #endif
+ #ifndef IPC_64
+ #define IPC_64 256
+ #endif
+
+#if defined(__NR_shmget)
+ // Sandbox-side handler for shmat(). Forwards the arguments through
+ // processFdPub() and returns the result read from threadFdPub(), converted
+ // back into a pointer. Dies if either transfer is incomplete.
+ void* Sandbox::sandbox_shmat(int shmid, const void* shmaddr, int shmflg) {
+   long long tm;
+   Debug::syscall(&tm, __NR_shmat, "Executing handler");
+
+   // Request record: header (sysnum, cookie) followed by the arguments.
+   struct {
+     int       sysnum;
+     long long cookie;
+     ShmAt     shmat_req;
+   } __attribute__((packed)) request;
+   request.sysnum            = __NR_shmat;
+   request.cookie            = cookie();
+   request.shmat_req.shmid   = shmid;
+   request.shmat_req.shmaddr = shmaddr;
+   request.shmat_req.shmflg  = shmflg;
+
+   long rc;
+   SysCalls sys;
+   const bool sent =
+       write(sys, processFdPub(), &request, sizeof(request)) ==
+       sizeof(request);
+   const bool answered =
+       sent && read(sys, threadFdPub(), &rc, sizeof(rc)) == sizeof(rc);
+   if (!answered) {
+     die("Failed to forward shmat() request [sandbox]");
+   }
+
+   Debug::elapsed(tm, __NR_shmat);
+   return reinterpret_cast<void *>(rc);
+ }
+
+ // Sandbox-side handler for shmctl(). Forwards the arguments through
+ // processFdPub() and returns the result read from threadFdPub(). Dies if
+ // either transfer is incomplete.
+ long Sandbox::sandbox_shmctl(int shmid, int cmd, void* buf) {
+   long long tm;
+   Debug::syscall(&tm, __NR_shmctl, "Executing handler");
+
+   // Request record: header (sysnum, cookie) followed by the arguments.
+   struct {
+     int       sysnum;
+     long long cookie;
+     ShmCtl    shmctl_req;
+   } __attribute__((packed)) request;
+   request.sysnum           = __NR_shmctl;
+   request.cookie           = cookie();
+   request.shmctl_req.shmid = shmid;
+   request.shmctl_req.cmd   = cmd;
+   request.shmctl_req.buf   = buf;
+
+   long rc;
+   SysCalls sys;
+   const bool sent =
+       write(sys, processFdPub(), &request, sizeof(request)) ==
+       sizeof(request);
+   const bool answered =
+       sent && read(sys, threadFdPub(), &rc, sizeof(rc)) == sizeof(rc);
+   if (!answered) {
+     die("Failed to forward shmctl() request [sandbox]");
+   }
+
+   Debug::elapsed(tm, __NR_shmctl);
+   return rc;
+ }
+
+ // Sandbox-side handler for shmdt(). Forwards the address through
+ // processFdPub() and returns the result read from threadFdPub(). Dies if
+ // either transfer is incomplete.
+ long Sandbox::sandbox_shmdt(const void* shmaddr) {
+   long long tm;
+   Debug::syscall(&tm, __NR_shmdt, "Executing handler");
+
+   // Request record: header (sysnum, cookie) followed by the argument.
+   struct {
+     int       sysnum;
+     long long cookie;
+     ShmDt     shmdt_req;
+   } __attribute__((packed)) request;
+   request.sysnum            = __NR_shmdt;
+   request.cookie            = cookie();
+   request.shmdt_req.shmaddr = shmaddr;
+
+   long rc;
+   SysCalls sys;
+   const bool sent =
+       write(sys, processFdPub(), &request, sizeof(request)) ==
+       sizeof(request);
+   const bool answered =
+       sent && read(sys, threadFdPub(), &rc, sizeof(rc)) == sizeof(rc);
+   if (!answered) {
+     die("Failed to forward shmdt() request [sandbox]");
+   }
+
+   Debug::elapsed(tm, __NR_shmdt);
+   return rc;
+ }
+
+ // Sandbox-side handler for shmget(). Forwards the arguments through
+ // processFdPub() and returns the result read from threadFdPub(). Dies if
+ // either transfer is incomplete.
+ long Sandbox::sandbox_shmget(int key, size_t size, int shmflg) {
+   long long tm;
+   Debug::syscall(&tm, __NR_shmget, "Executing handler");
+
+   // Request record: header (sysnum, cookie) followed by the arguments.
+   struct {
+     int       sysnum;
+     long long cookie;
+     ShmGet    shmget_req;
+   } __attribute__((packed)) request;
+   request.sysnum            = __NR_shmget;
+   request.cookie            = cookie();
+   request.shmget_req.key    = key;
+   request.shmget_req.size   = size;
+   request.shmget_req.shmflg = shmflg;
+
+   long rc;
+   SysCalls sys;
+   const bool sent =
+       write(sys, processFdPub(), &request, sizeof(request)) ==
+       sizeof(request);
+   const bool answered =
+       sent && read(sys, threadFdPub(), &rc, sizeof(rc)) == sizeof(rc);
+   if (!answered) {
+     die("Failed to forward shmget() request [sandbox]");
+   }
+
+   Debug::elapsed(tm, __NR_shmget);
+   return rc;
+ }
+
+ // Process-side handler for shmat(). Reads the ShmAt request from
+ // "sandboxFd" and forwards it only if it passes the policy check below;
+ // otherwise the call is abandoned with -EINVAL. mem->shmId is reset to -1
+ // in either case. Returns true if forwarded, false if denied.
+ bool Sandbox::process_shmat(int parentMapsFd, int sandboxFd, int threadFdPub,
+                             int threadFd, SecureMem::Args* mem) {
+   // Read request
+   ShmAt shmat_req;
+   SysCalls sys;
+   if (read(sys, sandboxFd, &shmat_req, sizeof(shmat_req)) !=
+       sizeof(shmat_req)) {
+     die("Failed to read parameters for shmat() [process]");
+   }
+
+   // We only allow attaching to the shm identifier that was returned by
+   // the most recent call to shmget(IPC_PRIVATE), and only with a NULL
+   // address and no flags.
+   const bool permitted = !shmat_req.shmaddr && !shmat_req.shmflg &&
+                          shmat_req.shmid == mem->shmId;
+
+   // The remembered identifier is consumed regardless of the verdict.
+   mem->shmId = -1;
+
+   if (!permitted) {
+     SecureMem::abandonSystemCall(threadFd, -EINVAL);
+     return false;
+   }
+   SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                             __NR_shmat, shmat_req.shmid, shmat_req.shmaddr,
+                             shmat_req.shmflg);
+   return true;
+ }
+
+ // Process-side handler for shmctl(). Reads the ShmCtl request from
+ // "sandboxFd" and forwards it only if the command consists solely of
+ // IPC_RMID / IPC_64 bits and no buffer is passed; otherwise the call is
+ // abandoned with -EINVAL. mem->shmId is reset to -1 in either case.
+ bool Sandbox::process_shmctl(int parentMapsFd, int sandboxFd, int threadFdPub,
+                              int threadFd, SecureMem::Args* mem) {
+   // Read request
+   ShmCtl shmctl_req;
+   SysCalls sys;
+   if (read(sys, sandboxFd, &shmctl_req, sizeof(shmctl_req)) !=
+       sizeof(shmctl_req)) {
+     die("Failed to read parameters for shmctl() [process]");
+   }
+
+   // The only shmctl() operation that we need to support is removal. This
+   // operation is generally safe.
+   const bool permitted =
+       !(shmctl_req.cmd & ~(IPC_64 | IPC_RMID)) && !shmctl_req.buf;
+
+   mem->shmId = -1;
+
+   if (!permitted) {
+     SecureMem::abandonSystemCall(threadFd, -EINVAL);
+     return false;
+   }
+   SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                             __NR_shmctl, shmctl_req.shmid, shmctl_req.cmd,
+                             shmctl_req.buf);
+   return true;
+ }
+
+ // Process-side handler for shmdt(). Reads the ShmDt request from
+ // "sandboxFd" and forwards it unless the address lies inside one of the
+ // regions recorded in protectedMap_, in which case the call is abandoned
+ // with -EINVAL. mem->shmId is reset to -1 in either case.
+ bool Sandbox::process_shmdt(int parentMapsFd, int sandboxFd, int threadFdPub,
+                             int threadFd, SecureMem::Args* mem) {
+   // Read request
+   ShmDt shmdt_req;
+   SysCalls sys;
+   if (read(sys, sandboxFd, &shmdt_req, sizeof(shmdt_req)) !=
+       sizeof(shmdt_req)) {
+     die("Failed to read parameters for shmdt() [process]");
+   }
+
+   // Detaching shared memory segments is generally safe, but just in case
+   // of a kernel bug, we make sure that the address does not fall into any
+   // of the reserved memory regions.
+   bool reserved = false;
+   ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+       (void *)shmdt_req.shmaddr);
+   // Step back one entry so that a region starting below the address is
+   // examined as well.
+   if (iter != protectedMap_.begin()) {
+     --iter;
+   }
+   while (!reserved && iter != protectedMap_.end() &&
+          iter->first <= shmdt_req.shmaddr) {
+     // Each entry maps a start address to the length of the region.
+     char* const regionEnd =
+         reinterpret_cast<char *>(iter->first) + iter->second;
+     reserved = shmdt_req.shmaddr < reinterpret_cast<void *>(regionEnd) &&
+                shmdt_req.shmaddr >= iter->first;
+     ++iter;
+   }
+
+   mem->shmId = -1;
+
+   if (reserved) {
+     SecureMem::abandonSystemCall(threadFd, -EINVAL);
+     return false;
+   }
+   SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                             __NR_shmdt, shmdt_req.shmaddr);
+   return true;
+ }
+
+ // Process-side handler for shmget(). Reads the ShmGet request from
+ // "sandboxFd" and forwards it only for IPC_PRIVATE keys whose flags contain
+ // nothing but the low nine permission bits; otherwise the call is abandoned
+ // with -EINVAL. mem->shmId is reset to -1 in either case.
+ bool Sandbox::process_shmget(int parentMapsFd, int sandboxFd, int threadFdPub,
+                              int threadFd, SecureMem::Args* mem) {
+   // Read request
+   ShmGet shmget_req;
+   SysCalls sys;
+   if (read(sys, sandboxFd, &shmget_req, sizeof(shmget_req)) !=
+       sizeof(shmget_req)) {
+     die("Failed to read parameters for shmget() [process]");
+   }
+
+   // We do not want to allow the sandboxed application to access arbitrary
+   // shared memory regions. We only allow it to access regions that it
+   // created itself.
+   const bool permitted = shmget_req.key == IPC_PRIVATE &&
+                          !(shmget_req.shmflg & ~0777);
+
+   mem->shmId = -1;
+
+   if (!permitted) {
+     SecureMem::abandonSystemCall(threadFd, -EINVAL);
+     return false;
+   }
+   SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                             __NR_shmget, shmget_req.key, shmget_req.size,
+                             shmget_req.shmflg);
+   return true;
+ }
+#endif
+
+#if defined(__NR_ipc)
+ #ifndef SHMAT  // fallback values for the ipc() sub-call numbers that process_ipc() switches on
+ #define SHMAT 21
+ #endif
+ #ifndef SHMDT
+ #define SHMDT 22
+ #endif
+ #ifndef SHMGET
+ #define SHMGET 23
+ #endif
+ #ifndef SHMCTL
+ #define SHMCTL 24
+ #endif
+
+ // Sandbox-side handler for the multiplexed ipc() system call. Forwards all
+ // six arguments through processFdPub() and returns the result read from
+ // threadFdPub(). Dies if either transfer is incomplete.
+ long Sandbox::sandbox_ipc(unsigned call, int first, int second, int third,
+                           void* ptr, long fifth) {
+   long long tm;
+   Debug::syscall(&tm, __NR_ipc, "Executing handler", call);
+
+   // Request record: header (sysnum, cookie) followed by the arguments.
+   struct {
+     int       sysnum;
+     long long cookie;
+     IPC       ipc_req;
+   } __attribute__((packed)) request;
+   request.sysnum         = __NR_ipc;
+   request.cookie         = cookie();
+   request.ipc_req.call   = call;
+   request.ipc_req.first  = first;
+   request.ipc_req.second = second;
+   request.ipc_req.third  = third;
+   request.ipc_req.ptr    = ptr;
+   request.ipc_req.fifth  = fifth;
+
+   long rc;
+   SysCalls sys;
+   const bool sent =
+       write(sys, processFdPub(), &request, sizeof(request)) ==
+       sizeof(request);
+   const bool answered =
+       sent && read(sys, threadFdPub(), &rc, sizeof(rc)) == sizeof(rc);
+   if (!answered) {
+     die("Failed to forward ipc() request [sandbox]");
+   }
+
+   Debug::elapsed(tm, __NR_ipc, call);
+   return rc;
+ }
+
+ // Process-side handler for the multiplexed ipc() system call. Reads the IPC
+ // request from "sandboxFd", applies a per-sub-call policy, and either
+ // forwards the call unchanged or abandons it with -EINVAL. mem->shmId is
+ // reset to -1 in either case. Returns true if forwarded, false if denied.
+ bool Sandbox::process_ipc(int parentMapsFd, int sandboxFd, int threadFdPub,
+                           int threadFd, SecureMem::Args* mem) {
+   // Read request
+   IPC ipc_req;
+   SysCalls sys;
+   if (read(sys, sandboxFd, &ipc_req, sizeof(ipc_req)) != sizeof(ipc_req)) {
+     die("Failed to read parameters for ipc() [process]");
+   }
+
+   // We do not support all of the SysV IPC calls. In fact, we only support
+   // the minimum feature set necessary for Chrome's renderers to share memory
+   // with the X server.
+   bool permitted;
+   switch (ipc_req.call) {
+     case SHMAT:
+       // We only allow attaching to the shm identifier that was returned by
+       // the most recent call to shmget(IPC_PRIVATE)
+       permitted = !ipc_req.ptr && !ipc_req.second &&
+                   ipc_req.first == mem->shmId;
+       break;
+     case SHMCTL:
+       // The only shmctl() operation that we need to support is removal. This
+       // operation is generally safe.
+       permitted = !(ipc_req.second & ~(IPC_64 | IPC_RMID)) && !ipc_req.ptr;
+       break;
+     case SHMDT: {
+       // Detaching shared memory segments is generally safe, but just in case
+       // of a kernel bug, we make sure that the address does not fall into
+       // any of the reserved memory regions.
+       permitted = true;
+       ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+           (void *)ipc_req.ptr);
+       // Step back one entry so that a region starting below the address is
+       // examined as well.
+       if (iter != protectedMap_.begin()) {
+         --iter;
+       }
+       while (permitted && iter != protectedMap_.end() &&
+              iter->first <= ipc_req.ptr) {
+         char* const regionEnd =
+             reinterpret_cast<char *>(iter->first) + iter->second;
+         if (ipc_req.ptr < reinterpret_cast<void *>(regionEnd) &&
+             ipc_req.ptr >= iter->first) {
+           permitted = false;
+         }
+         ++iter;
+       }
+       break;
+     }
+     case SHMGET:
+       // We do not want to allow the sandboxed application to access
+       // arbitrary shared memory regions. We only allow it to access regions
+       // that it created itself.
+       permitted = ipc_req.first == IPC_PRIVATE && !(ipc_req.third & ~0777);
+       break;
+     default:
+       // Other than SysV shared memory, we do not actually need to support
+       // any other SysV IPC calls.
+       permitted = false;
+       break;
+   }
+
+   mem->shmId = -1;
+
+   if (!permitted) {
+     SecureMem::abandonSystemCall(threadFd, -EINVAL);
+     return false;
+   }
+   SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+                             __NR_ipc, ipc_req.call, ipc_req.first,
+                             ipc_req.second, ipc_req.third, ipc_req.ptr,
+                             ipc_req.fifth);
+   return true;
+ }
+#endif
+
+} // namespace
diff --git a/sandbox/linux/seccomp/library.cc b/sandbox/linux/seccomp/library.cc
new file mode 100644
index 0000000..8dd9b93
--- /dev/null
+++ b/sandbox/linux/seccomp/library.cc
@@ -0,0 +1,1208 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#define XOPEN_SOURCE 500
+#include <algorithm>
+#include <elf.h>
+#include <errno.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+#include <set>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "allocator.h"
+#include "debug.h"
+#include "library.h"
+#include "sandbox_impl.h"
+#include "syscall.h"
+#include "syscall_table.h"
+#include "x86_decode.h"
+
+ #if defined(__x86_64__)  // x86-64: map the width-neutral Elf_* names onto the Elf64_* types and macros
+ typedef Elf64_Phdr Elf_Phdr;
+ typedef Elf64_Rela Elf_Rel;  // note: the RELA record type is used here, matching ".rela.plt" below
+
+ typedef Elf64_Half Elf_Half;
+ typedef Elf64_Word Elf_Word;
+ typedef Elf64_Sword Elf_Sword;
+ typedef Elf64_Xword Elf_Xword;
+ typedef Elf64_Sxword Elf_Sxword;
+ typedef Elf64_Off Elf_Off;
+ typedef Elf64_Section Elf_Section;
+ typedef Elf64_Versym Elf_Versym;
+
+ #define ELF_ST_BIND ELF64_ST_BIND
+ #define ELF_ST_TYPE ELF64_ST_TYPE
+ #define ELF_ST_INFO ELF64_ST_INFO
+ #define ELF_R_SYM ELF64_R_SYM
+ #define ELF_R_TYPE ELF64_R_TYPE
+ #define ELF_R_INFO ELF64_R_INFO
+
+ #define ELF_REL_PLT ".rela.plt"  // name of the PLT relocation section on this target
+ #define ELF_JUMP_SLOT R_X86_64_JUMP_SLOT  // relocation type of PLT jump slots on this target
+ #elif defined(__i386__)  // x86-32: same names, mapped onto the Elf32_* types and macros
+ typedef Elf32_Phdr Elf_Phdr;
+ typedef Elf32_Rel Elf_Rel;  // note: the REL record type is used here, matching ".rel.plt" below
+
+ typedef Elf32_Half Elf_Half;
+ typedef Elf32_Word Elf_Word;
+ typedef Elf32_Sword Elf_Sword;
+ typedef Elf32_Xword Elf_Xword;
+ typedef Elf32_Sxword Elf_Sxword;
+ typedef Elf32_Off Elf_Off;
+ typedef Elf32_Section Elf_Section;
+ typedef Elf32_Versym Elf_Versym;
+
+ #define ELF_ST_BIND ELF32_ST_BIND
+ #define ELF_ST_TYPE ELF32_ST_TYPE
+ #define ELF_ST_INFO ELF32_ST_INFO
+ #define ELF_R_SYM ELF32_R_SYM
+ #define ELF_R_TYPE ELF32_R_TYPE
+ #define ELF_R_INFO ELF32_R_INFO
+
+ #define ELF_REL_PLT ".rel.plt"  // name of the PLT relocation section on this target
+ #define ELF_JUMP_SLOT R_386_JMP_SLOT  // relocation type of PLT jump slots on this target
+ #else  // any other architecture is rejected at compile time
+ #error Unsupported target platform
+ #endif
+
+namespace playground {
+
+ char* Library::__kernel_vsyscall;  // out-of-class definitions of Library's static pointers; without an initializer they start out as NULL
+ char* Library::__kernel_sigreturn;
+ char* Library::__kernel_rt_sigreturn;
+
+ // Undoes the temporary re-mapping that getOriginal() set up: the extended
+ // image is shrunk back to a single page and moved to the address where the
+ // first page of the library was originally found.
+ Library::~Library() {
+   // getOriginal() can leave image_ == NULL while image_size_ is already
+   // non-zero (its mremap() call failed). Touching a NULL image would crash,
+   // so require both before restoring anything.
+   if (!image_ || !image_size_) {
+     return;
+   }
+
+   // We no longer need access to a full mapping of the underlying library
+   // file. Move the temporarily extended mapping back to where we originally
+   // found it. Make sure to preserve any changes that we might have made
+   // since.
+   Sandbox::SysCalls sys;
+   void* const original = memory_ranges_.rbegin()->second.start;
+   sys.mprotect(image_, 4096, PROT_READ | PROT_WRITE | PROT_EXEC);
+   if (memcmp(image_, original, 4096)) {
+     // Only copy data, if we made any changes in this data. Otherwise there
+     // is no need to create another modified COW mapping.
+     memcpy(image_, original, 4096);
+   }
+   sys.mprotect(image_, 4096, PROT_READ | PROT_EXEC);
+   sys.mremap(image_, image_size_, 4096, MREMAP_MAYMOVE | MREMAP_FIXED,
+              original);
+ }
+
+ // Reads up to "len" bytes from "src" and copies them to "dst" without
+ // risking a segmentation fault on unreadable source memory. Short copies
+ // are possible, if we are at the end of a mapping. Returns "dst", or NULL
+ // if the operation failed completely.
+ char* Library::getBytes(char* dst, const char* src, ssize_t len) {
+   // Some kernels don't allow accessing the VDSO from write(), so data inside
+   // the first entry of memory_ranges_ is copied directly.
+   if (isVDSO_ &&
+       src >= memory_ranges_.begin()->second.start &&
+       src <= memory_ranges_.begin()->second.stop) {
+     ssize_t max =
+         reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) - src;
+     if (len > max) {
+       len = max;
+     }
+     // A negative length would be converted to a huge size_t by memcpy().
+     if (len > 0) {
+       memcpy(dst, src, len);
+     }
+     return dst;
+   }
+
+   static int helper_socket[2];
+   Sandbox::SysCalls sys;
+   if (!helper_socket[0] && !helper_socket[1]) {
+     // Copy data through a socketpair, as this allows us to access it
+     // without incurring a segmentation fault.
+     if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, helper_socket)) {
+       // Without a helper socket we would end up writing to file descriptor
+       // zero. Report failure instead and retry on the next call.
+       helper_socket[0] = helper_socket[1] = 0;
+       return NULL;
+     }
+   }
+   char* ptr = dst;
+   int inc = 4096;
+   while (len > 0) {
+     // Normally transfer up to the end of the current page. After the first
+     // EFAULT, fall back to single bytes to find the exact end of the
+     // readable data.
+     ssize_t l = inc == 1 ? inc : 4096 - (reinterpret_cast<long>(src) & 0xFFF);
+     if (l > len) {
+       l = len;
+     }
+     l = NOINTR_SYS(sys.write(helper_socket[0], src, l));
+     if (l == -1) {
+       if (sys.my_errno == EFAULT) {
+         if (inc == 1) {
+           if (ptr == dst) {
+             return NULL;
+           }
+           break;
+         }
+         inc = 1;
+         continue;
+       } else {
+         return NULL;
+       }
+     }
+     // Retry on EINTR, exactly as for the write() above. Otherwise a signal
+     // would make us fail and leave stale data queued in the socket.
+     l = NOINTR_SYS(sys.read(helper_socket[1], ptr, l));
+     if (l <= 0) {
+       return NULL;
+     }
+     ptr += l;
+     src += l;
+     len -= l;
+   }
+   return dst;
+ }
+
+ // Copies "len" bytes located at "offset" into "buf". The buffer is always
+ // zeroed first, so it holds zeros on every failure path. Returns "buf" on
+ // success and NULL if the library is invalid, no matching memory range
+ // exists, the request does not fit into that range, or the copy failed.
+ char *Library::get(Elf_Addr offset, char *buf, size_t len) {
+   memset(buf, 0, len);
+   if (!valid_) {
+     return NULL;
+   }
+   RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset);
+   if (iter == memory_ranges_.end()) {
+     return NULL;
+   }
+
+   // Translate into an offset relative to the start of the selected range.
+   offset -= iter->first;
+   const long size = reinterpret_cast<char *>(iter->second.stop) -
+                     reinterpret_cast<char *>(iter->second.start);
+
+   // Reject requests that do not fit. "len" has to be checked on its own:
+   // "size - len" is evaluated in unsigned arithmetic and would wrap around
+   // to a huge value for len > size, letting any offset pass.
+   if (size < 0 || len > static_cast<size_t>(size) ||
+       offset > static_cast<size_t>(size) - len) {
+     return NULL;
+   }
+
+   char *src = reinterpret_cast<char *>(iter->second.start) + offset;
+   if (!getBytes(buf, src, len)) {
+     return NULL;
+   }
+   return buf;
+ }
+
+ // Returns the NUL-terminated string located at "offset", truncated to at
+ // most 4095 characters. Returns an empty string if the library is invalid,
+ // no matching memory range exists, or nothing could be read.
+ Library::string Library::get(Elf_Addr offset) {
+   if (!valid_) {
+     return "";
+   }
+   RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset);
+   if (iter == memory_ranges_.end()) {
+     return "";
+   }
+   offset -= iter->first;
+
+   // NOTE(review): both ends are shifted by "offset", so the requested length
+   // always equals the size of the whole range rather than the bytes that
+   // remain after "offset". getBytes() tolerates over-long requests; confirm
+   // whether this is intended.
+   const char *first = reinterpret_cast<char *>(iter->second.start) + offset;
+   const char *last = reinterpret_cast<char *>(iter->second.stop) + offset;
+
+   // The buffer starts out zeroed and at most sizeof(buf) - 1 bytes are
+   // requested, so the contents are always NUL-terminated.
+   char buf[4096] = { 0 };
+   const long available = last - first;
+   const ssize_t wanted = available >= (int)sizeof(buf)
+                              ? static_cast<ssize_t>(sizeof(buf) - 1)
+                              : static_cast<ssize_t>(available);
+   getBytes(buf, first, wanted);
+
+   // Determine the length of whatever was copied.
+   size_t length = 0;
+   while (buf[length]) {
+     ++length;
+   }
+   if (!length) {
+     return "";
+   }
+   return string(buf, length);
+ }
+
+ // Copies "len" bytes of the original file contents at "offset" into "buf".
+ // On first use the mapping of the library's first page is extended into
+ // "image_" so that the whole file becomes readable; later calls grow that
+ // mapping on demand. "buf" may be NULL, in which case the call only makes
+ // sure the mapping covers the requested range and returns NULL. If no image
+ // mapping is available, falls back to get(). Returns "buf" on success and
+ // NULL on failure.
+ char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) {
+   if (!valid_) {
+     // "buf" is allowed to be NULL (see above); do not hand that to memset().
+     if (buf) {
+       memset(buf, 0, len);
+     }
+     return NULL;
+   }
+   Sandbox::SysCalls sys;
+   if (!image_ && !isVDSO_ && !memory_ranges_.empty() &&
+       memory_ranges_.rbegin()->first == 0) {
+     // Extend the mapping of the very first page of the underlying library
+     // file. This way, we can read the original file contents of the entire
+     // library.
+     // We have to be careful, because doing so temporarily removes the first
+     // 4096 bytes of the library from memory. And we don't want to accidentally
+     // unmap code that we are executing. So, only use functions that can be
+     // inlined.
+     void* start = memory_ranges_.rbegin()->second.start;
+     image_size_ = memory_ranges_.begin()->first +
+       (reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) -
+        reinterpret_cast<char *>(memory_ranges_.begin()->second.start));
+     if (image_size_ < 8192) {
+       // It is possible to create a library that is only a single page in
+       // size. In that case, we have to make sure that we artificially map
+       // one extra page past the end of it, as our code relies on mremap()
+       // actually moving the mapping.
+       image_size_ = 8192;
+     }
+     image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_,
+                                                  MREMAP_MAYMOVE));
+     if (image_size_ == 8192 && image_ == start) {
+       // We really mean it, when we say we want the memory to be moved.
+       image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_,
+                                                    MREMAP_MAYMOVE));
+       sys.munmap(reinterpret_cast<char *>(start) + 4096, 4096);
+     }
+     if (image_ == MAP_FAILED) {
+       // Forget the size as well, so that no other code mistakes the object
+       // for one that owns an image mapping.
+       image_ = NULL;
+       image_size_ = 0;
+     } else {
+       sys.MMAP(start, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
+                MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+       // Hand-rolled copy of the complete page, including word zero; the
+       // post-decrement in the condition lets the index run down to 0.
+       for (int i = 4096 / sizeof(long); i--;
+            reinterpret_cast<long *>(start)[i] =
+              reinterpret_cast<long *>(image_)[i]);
+     }
+   }
+
+   if (image_) {
+     if (offset + len > image_size_) {
+       // It is quite likely that we initially did not map the entire file as
+       // we did not know how large it is. So, if necessary, try to extend the
+       // mapping. Note that this may move image_ to a different address.
+       size_t new_size = (offset + len + 4095) & ~4095;
+       char* tmp =
+         reinterpret_cast<char *>(sys.mremap(image_, image_size_, new_size,
+                                             MREMAP_MAYMOVE));
+       if (tmp != MAP_FAILED) {
+         image_ = tmp;
+         image_size_ = new_size;
+       }
+     }
+     if (buf && offset + len <= image_size_) {
+       return reinterpret_cast<char *>(memcpy(buf, image_ + offset, len));
+     }
+     return NULL;
+   }
+   return buf ? get(offset, buf, len) : NULL;
+ }
+
+ // Returns the NUL-terminated string stored at "offset" in the original file
+ // contents. Uses the image mapping when one is available (growing it as
+ // needed) and falls back to get() otherwise. Returns an empty string if the
+ // library is invalid or "offset" lies beyond the image.
+ Library::string Library::getOriginal(Elf_Addr offset) {
+   if (!valid_) {
+     return "";
+   }
+   // Make sure we actually have a mapping that we can access. If the string
+   // is located at the end of the image, we might not yet have extended the
+   // mapping sufficiently.
+   if (!image_ || image_size_ <= offset) {
+     getOriginal(offset, NULL, 1);
+   }
+
+   if (!image_) {
+     return get(offset);
+   }
+   if (offset >= image_size_) {
+     return "";
+   }
+
+   // Scan using offsets rather than pointers: extending the mapping below may
+   // move image_ to a new address, which would leave raw pointers into the
+   // old mapping dangling.
+   Elf_Addr end = offset;
+   while (end < image_size_ && image_[end]) {
+     ++end;
+     if (end >= image_size_) {
+       // Reached the end of the mapping without finding the terminator; try
+       // to map one more byte (rounded up to whole pages by the callee).
+       getOriginal(end, NULL, 1);
+     }
+   }
+   return string(image_ + offset, end - offset);
+ }
+
+ // Returns the stored ELF header, or NULL if the library is not valid.
+ const Elf_Ehdr* Library::getEhdr() {
+   return valid_ ? &ehdr_ : NULL;
+ }
+
+ // Looks up the section header for the section called "section". Returns
+ // NULL if the library is not valid or no such section is recorded.
+ const Elf_Shdr* Library::getSection(const string& section) {
+   if (valid_) {
+     SectionTable::const_iterator entry = section_table_.find(section);
+     if (entry != section_table_.end()) {
+       // Table values are (index, header) pairs.
+       return &entry->second.second;
+     }
+   }
+   return NULL;
+ }
+
+ // Looks up the index of the section called "section". Returns -1 if the
+ // library is not valid or no such section is recorded.
+ int Library::getSectionIndex(const string& section) {
+   if (valid_) {
+     SectionTable::const_iterator entry = section_table_.find(section);
+     if (entry != section_table_.end()) {
+       // Table values are (index, header) pairs.
+       return entry->second.first;
+     }
+   }
+   return -1;
+ }
+
+ // Re-applies the recorded protection bits to every memory range of the
+ // library, additionally setting PROT_WRITE when "state" is true.
+ void Library::makeWritable(bool state) const {
+   const int extraProt = state ? PROT_WRITE : 0;
+   RangeMap::const_iterator iter = memory_ranges_.begin();
+   while (iter != memory_ranges_.end()) {
+     const Range& range = iter->second;
+     const long bytes = reinterpret_cast<char *>(range.stop) -
+                        reinterpret_cast<char *>(range.start);
+     Sandbox::SysCalls sys;
+     sys.mprotect(range.start, bytes, range.prot | extraProt);
+     ++iter;
+   }
+ }
+
+ // Check if the instruction has no unexpected side-effects. If so, it can
+ // be safely relocated from the function that we are patching into the
+ // out-of-line scratch space that we are setting up. This is often necessary
+ // to make room for the JMP into the scratch space.
+ //
+ // "insn" is the opcode as reported by the instruction decoder; two-byte
+ // opcodes carry their 0x0F escape in the high byte (e.g. 0x0F05 for
+ // SYSCALL).
+ bool Library::isSafeInsn(unsigned short insn) {
+   // The ALU group only covers the one-byte opcodes 0x00..0x3F. Testing just
+   // the low byte would also accept two-byte opcodes such as 0x0F05 (SYSCALL)
+   // or 0x0F34 (SYSENTER), which are anything but free of side-effects.
+   return (insn < 0x40 && (insn & 0x7) < 0x6
+           /* ADD, OR, ADC, SBB, AND, SUB, XOR, CMP */) ||
+ #if defined(__x86_64__)
+          insn == 0x63 /* MOVSXD */ ||
+ #endif
+          (insn >= 0x80 && insn <= 0x8E /* ADD, OR, ADC,
+          SBB, AND, SUB, XOR, CMP, TEST, XCHG, MOV, LEA */) ||
+          (insn == 0x90) || /* NOP */
+          (insn >= 0xA0 && insn <= 0xA9) /* MOV, TEST */ ||
+          (insn >= 0xB0 && insn <= 0xBF /* MOV */) ||
+          (insn >= 0xC0 && insn <= 0xC1) || /* Bit Shift */
+          (insn >= 0xD0 && insn <= 0xD3) || /* Bit Shift */
+          (insn >= 0xC6 && insn <= 0xC7 /* MOV */) ||
+          (insn == 0xF7) /* TEST, NOT, NEG, MUL, IMUL, DIV, IDIV */;
+ }
+
+ // Carves "needed" bytes out of the current scratch page and returns their
+ // address. Space is handed out from the end of the page downwards;
+ // "*extraSpace" and "*extraLength" track the page and what is left of it.
+ // A fresh page near "near" is allocated when the current one is too small
+ // or too far away. Dies if no scratch page is available.
+ char* Library::getScratchSpace(const Maps* maps, char* near, int needed,
+                                char** extraSpace, int* extraLength) {
+   const bool needNewPage =
+       needed > *extraLength ||
+       labs(*extraSpace - reinterpret_cast<char *>(near)) > (1536 << 20);
+   if (needNewPage) {
+     if (*extraSpace) {
+       // Start a new scratch page and mark any previous page as
+       // write-protected
+       Sandbox::SysCalls sys;
+       sys.mprotect(*extraSpace, 4096, PROT_READ|PROT_EXEC);
+     }
+     // Our new scratch space is initially executable and writable.
+     *extraLength = 4096;
+     *extraSpace = maps->allocNearAddr(near, *extraLength,
+                                       PROT_READ|PROT_WRITE|PROT_EXEC);
+   }
+
+   char* const page = *extraSpace;
+   if (page) {
+     const int remaining = *extraLength - needed;
+     *extraLength = remaining;
+     return page + remaining;
+   }
+   Sandbox::die("Insufficient space to intercept system call");
+ }
+
+void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
+ char *end, char** extraSpace,
+ int* extraLength) {
+ std::set<char *, std::less<char *>, SystemAllocator<char *> > branch_targets;
+ for (char *ptr = start; ptr < end; ) {
+ unsigned short insn = next_inst((const char **)&ptr, __WORDSIZE == 64);
+ char *target;
+ if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ || insn == 0xEB /* JMP */) {
+ target = ptr + (reinterpret_cast<signed char *>(ptr))[-1];
+ } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ ||
+ (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) {
+ target = ptr + (reinterpret_cast<int *>(ptr))[-1];
+ } else {
+ continue;
+ }
+ branch_targets.insert(target);
+ }
+ struct Code {
+ char* addr;
+ int len;
+ unsigned short insn;
+ bool is_ip_relative;
+ } code[5] = { { 0 } };
+ int codeIdx = 0;
+ char* ptr = start;
+ while (ptr < end) {
+ // Keep a ring-buffer of the last few instruction in order to find the
+ // correct place to patch the code.
+ char *mod_rm;
+ code[codeIdx].addr = ptr;
+ code[codeIdx].insn = next_inst((const char **)&ptr, __WORDSIZE == 64,
+ 0, 0, &mod_rm, 0, 0);
+ code[codeIdx].len = ptr - code[codeIdx].addr;
+ code[codeIdx].is_ip_relative =
+ #if defined(__x86_64__)
+ mod_rm && (*mod_rm & 0xC7) == 0x5;
+ #else
+ false;
+ #endif
+
+ // Whenever we find a system call, we patch it with a jump to out-of-line
+ // code that redirects to our system call wrapper.
+ bool is_syscall = true;
+ #if defined(__x86_64__)
+ bool is_indirect_call = false;
+ if (code[codeIdx].insn == 0x0F05 /* SYSCALL */ ||
+ // In addition, on x86-64, we need to redirect all CALLs between the
+ // VDSO and the VSyscalls page. We want these to jump to our own
+ // modified copy of the VSyscalls. As we know that the VSyscalls are
+ // always more than 2GB away from the VDSO, the compiler has to
+ // generate some form of indirect jumps. We can find all indirect
+ // CALLs and redirect them to a separate scratch area, where we can
+ // inspect the destination address. If it indeed points to the
+ // VSyscall area, we then adjust the destination address accordingly.
+ (is_indirect_call =
+ (isVDSO_ && vsys_offset_ && code[codeIdx].insn == 0xFF &&
+ !code[codeIdx].is_ip_relative &&
+ mod_rm && (*mod_rm & 0x38) == 0x10 /* CALL (indirect) */))) {
+ is_syscall = !is_indirect_call;
+ #elif defined(__i386__)
+ bool is_gs_call = false;
+ if (code[codeIdx].len == 7 &&
+ code[codeIdx].insn == 0xFF &&
+ code[codeIdx].addr[2] == '\x15' /* CALL (indirect) */ &&
+ code[codeIdx].addr[0] == '\x65' /* %gs prefix */) {
+ char* target;
+ asm volatile("mov %%gs:(%1), %0\n"
+ : "=a"(target)
+ : "c"(*reinterpret_cast<int *>(code[codeIdx].addr+3)));
+ if (target == __kernel_vsyscall) {
+ is_gs_call = true;
+ // TODO(markus): also handle the other vsyscalls
+ }
+ }
+ if (is_gs_call ||
+ (code[codeIdx].insn == 0xCD &&
+ code[codeIdx].addr[1] == '\x80' /* INT $0x80 */)) {
+ #else
+ #error Unsupported target platform
+ #endif
+ // Found a system call. Search backwards to figure out how to redirect
+ // the code. We will need to overwrite a couple of instructions and,
+ // of course, move these instructions somewhere else.
+ int startIdx = codeIdx;
+ int endIdx = codeIdx;
+ int length = code[codeIdx].len;
+ for (int idx = codeIdx;
+ (idx = (idx + (sizeof(code) / sizeof(struct Code)) - 1) %
+ (sizeof(code) / sizeof(struct Code))) != codeIdx; ) {
+ std::set<char *>::const_iterator iter =
+ std::upper_bound(branch_targets.begin(), branch_targets.end(),
+ code[idx].addr);
+ if (iter != branch_targets.end() && *iter < ptr) {
+ // Found a branch pointing to somewhere past our instruction. This
+ // instruction cannot be moved safely. Leave it in place.
+ break;
+ }
+ if (code[idx].addr && !code[idx].is_ip_relative &&
+ isSafeInsn(code[idx].insn)) {
+ // These are all benign instructions with no side-effects and no
+ // dependency on the program counter. We should be able to safely
+ // relocate them.
+ startIdx = idx;
+ length = ptr - code[startIdx].addr;
+ } else {
+ break;
+ }
+ }
+ // Search forward past the system call, too. Sometimes, we can only
+ // find relocatable instructions following the system call.
+ #if defined(__i386__)
+ findEndIdx:
+ #endif
+ char *next = ptr;
+ for (int i = codeIdx;
+ next < end &&
+ (i = (i + 1) % (sizeof(code) / sizeof(struct Code))) != startIdx;
+ ) {
+ std::set<char *>::const_iterator iter =
+ std::lower_bound(branch_targets.begin(), branch_targets.end(),
+ next);
+ if (iter != branch_targets.end() && *iter == next) {
+ // Found branch target pointing to our instruction
+ break;
+ }
+ char *tmp_rm;
+ code[i].addr = next;
+ code[i].insn = next_inst((const char **)&next, __WORDSIZE == 64,
+ 0, 0, &tmp_rm, 0, 0);
+ code[i].len = next - code[i].addr;
+ code[i].is_ip_relative = tmp_rm && (*tmp_rm & 0xC7) == 0x5;
+ if (!code[i].is_ip_relative && isSafeInsn(code[i].insn)) {
+ endIdx = i;
+ length = next - code[startIdx].addr;
+ } else {
+ break;
+ }
+ }
+ // We now know, how many instructions neighboring the system call we
+ // can safely overwrite. On x86-32 we need six bytes, and on x86-64
+ // We need five bytes to insert a JMPQ and a 32bit address. We then
+ // jump to a code fragment that safely forwards to our system call
+ // wrapper.
+ // On x86-64, this is complicated by the fact that the API allows up
+ // to 128 bytes of red-zones below the current stack pointer. So, we
+ // cannot write to the stack until we have adjusted the stack
+ // pointer.
+ // On both x86-32 and x86-64 we take care to leave the stack unchanged
+ // while we are executing the preamble and postamble. This allows us
+ // to treat instructions that reference %esp/%rsp as safe for
+ // relocation.
+ // In particular, this means that on x86-32 we cannot use CALL, but
+ // have to use a PUSH/RET combination to change the instruction pointer.
+ // On x86-64, we can instead use a 32bit JMPQ.
+ //
+ // .. .. .. .. ; any leading instructions copied from original code
+ // 48 81 EC 80 00 00 00 SUB $0x80, %rsp
+ // 50 PUSH %rax
+ // 48 8D 05 .. .. .. .. LEA ...(%rip), %rax
+ // 50 PUSH %rax
+ // 48 B8 .. .. .. .. MOV $syscallWrapper, %rax
+ // .. .. .. ..
+ // 50 PUSH %rax
+ // 48 8D 05 06 00 00 00 LEA 6(%rip), %rax
+ // 48 87 44 24 10 XCHG %rax, 16(%rsp)
+ // C3 RETQ
+ // 48 81 C4 80 00 00 00 ADD $0x80, %rsp
+ // .. .. .. .. ; any trailing instructions copied from original code
+ // E9 .. .. .. .. JMPQ ...
+ //
+ // Total: 52 bytes + any bytes that were copied
+ //
+ // On x86-32, the stack is available and we can do:
+ //
+ // TODO(markus): Try to maintain frame pointers on x86-32
+ //
+ // .. .. .. .. ; any leading instructions copied from original code
+ // 68 .. .. .. .. PUSH return_addr
+ // 68 .. .. .. .. PUSH $syscallWrapper
+ // C3 RET
+ // .. .. .. .. ; any trailing instructions copied from original code
+ // 68 .. .. .. .. PUSH return_addr
+ // C3 RET
+ //
+ // Total: 17 bytes + any bytes that were copied
+ //
+ // For indirect jumps from the VDSO to the VSyscall page, we instead
+ // replace the following code (this is only necessary on x86-64). This
+ // time, we don't have to worry about red zones:
+ //
+ // .. .. .. .. ; any leading instructions copied from original code
+ // E8 00 00 00 00 CALL .
+ // 48 83 04 24 .. ADDQ $.., (%rsp)
+ // FF .. .. .. .. .. PUSH .. ; from original CALL instruction
+ // 48 81 3C 24 00 00 00 FF CMPQ $0xFFFFFFFFFF000000, 0(%rsp)
+ // 72 10 JB . + 16
+ // 81 2C 24 .. .. .. .. SUBL ..., 0(%rsp)
+ // C7 44 24 04 00 00 00 00 MOVL $0, 4(%rsp)
+ // C3 RETQ
+ // 48 87 04 24 XCHG %rax,(%rsp)
+ // 48 89 44 24 08 MOV %rax,0x8(%rsp)
+ // 58 POP %rax
+ // C3 RETQ
+ // .. .. .. .. ; any trailing instructions copied from original code
+ // E9 .. .. .. .. JMPQ ...
+ //
+ // Total: 52 bytes + any bytes that were copied
+
+ if (length < (__WORDSIZE == 32 ? 6 : 5)) {
+ // There are a very small number of instruction sequences that we
+ // cannot easily intercept, and that have been observed in real world
+ // examples. Handle them here:
+ #if defined(__i386__)
+ int diff;
+ if (!memcmp(code[codeIdx].addr, "\xCD\x80\xEB", 3) &&
+ (diff = *reinterpret_cast<signed char *>(
+ code[codeIdx].addr + 3)) < 0 && diff >= -6) {
+ // We have seen...
+ // for (;;) {
+ // _exit(0);
+ // }
+ // ..get compiled to:
+ // B8 01 00 00 00 MOV $__NR_exit, %eax
+ // 66 90 XCHG %ax, %ax
+ // 31 DB 0:XOR %ebx, %ebx
+ // CD 80 INT $0x80
+ // EB FA JMP 0b
+ // The JMP is really superfluous as the system call never returns.
+ // And there are in fact no returning system calls that need to be
+ // unconditionally repeated in an infinite loop.
+ // If we replace the JMP with NOPs, the system call can successfully
+ // be intercepted.
+ *reinterpret_cast<unsigned short *>(code[codeIdx].addr + 2) = 0x9090;
+ goto findEndIdx;
+ }
+ #elif defined(__x86_64__)
+ std::set<char *>::const_iterator iter;
+ #endif
+ // If we cannot figure out any other way to intercept this system call,
+ // we replace it with a call to INT0. This causes a SEGV which we then
+ // handle in the signal handler. That's a lot slower than rewriting the
+ // instruction with a jump, but it should only happen very rarely.
+ if (is_syscall) {
+ memcpy(code[codeIdx].addr, "\xCD", 2);
+ if (code[codeIdx].len > 2) {
+ memset(code[codeIdx].addr + 2, 0x90, code[codeIdx].len - 2);
+ }
+ goto replaced;
+ }
+ #if defined(__x86_64__)
+ // On x86-64, we occasionally see code like this in the VDSO:
+ // 48 8B 05 CF FE FF FF MOV -0x131(%rip),%rax
+ // FF 50 20 CALLQ *0x20(%rax)
+ // By default, we would not replace the MOV instruction, as it is
+ // IP relative. But if the following instruction is also IP relative,
+ // we are left with only three bytes which is not enough to insert a
+ // jump.
+ // We recognize this particular situation, and as long as the CALLQ
+ // is not a branch target, we decide to still relocate the entire
+ // sequence. We just have to make sure that we then patch up the
+ // IP relative addressing.
+ else if (is_indirect_call && startIdx == codeIdx &&
+ code[startIdx = (startIdx + (sizeof(code) /
+ sizeof(struct Code)) - 1) %
+ (sizeof(code) / sizeof(struct Code))].addr &&
+ ptr - code[startIdx].addr >= 5 &&
+ code[startIdx].is_ip_relative &&
+ isSafeInsn(code[startIdx].insn) &&
+ ((iter = std::upper_bound(branch_targets.begin(),
+ branch_targets.end(),
+ code[startIdx].addr)) ==
+ branch_targets.end() || *iter >= ptr)) {
+ // We changed startIdx to include the IP relative instruction.
+ // When copying this preamble, we make sure to patch up the
+ // offset.
+ }
+ #endif
+ else {
+ Sandbox::die("Cannot intercept system call");
+ }
+ }
+ int needed = (__WORDSIZE == 32 ? 6 : 5) - code[codeIdx].len;
+ int first = codeIdx;
+ while (needed > 0 && first != startIdx) {
+ first = (first + (sizeof(code) / sizeof(struct Code)) - 1) %
+ (sizeof(code) / sizeof(struct Code));
+ needed -= code[first].len;
+ }
+ int second = codeIdx;
+ while (needed > 0) {
+ second = (second + 1) % (sizeof(code) / sizeof(struct Code));
+ needed -= code[second].len;
+ }
+ int preamble = code[codeIdx].addr - code[first].addr;
+ int postamble = code[second].addr + code[second].len -
+ code[codeIdx].addr - code[codeIdx].len;
+
+ // The following is all the code that construct the various bits of
+ // assembly code.
+ #if defined(__x86_64__)
+ if (is_indirect_call) {
+ needed = 52 + preamble + code[codeIdx].len + postamble;
+ } else {
+ needed = 52 + preamble + postamble;
+ }
+ #elif defined(__i386__)
+ needed = 17 + preamble + postamble;
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Allocate scratch space and copy the preamble of code that was moved
+ // from the function that we are patching.
+ char* dest = getScratchSpace(maps, code[first].addr, needed,
+ extraSpace, extraLength);
+ memcpy(dest, code[first].addr, preamble);
+
+ // For jumps from the VDSO to the VSyscalls we sometimes allow exactly
+ // one IP relative instruction in the preamble.
+ if (code[first].is_ip_relative) {
+ *reinterpret_cast<int *>(dest + (code[codeIdx].addr -
+ code[first].addr) - 4)
+ -= dest - code[first].addr;
+ }
+
+ // For indirect calls, we need to copy the actual CALL instruction and
+ // turn it into a PUSH instruction.
+ #if defined(__x86_64__)
+ if (is_indirect_call) {
+ memcpy(dest + preamble, "\xE8\x00\x00\x00\x00\x48\x83\x04\x24", 9);
+ dest[preamble + 9] = code[codeIdx].len + 42;
+ memcpy(dest + preamble + 10, code[codeIdx].addr, code[codeIdx].len);
+
+ // Convert CALL -> PUSH
+ dest[preamble + 10 + (mod_rm - code[codeIdx].addr)] |= 0x20;
+ preamble += 10 + code[codeIdx].len;
+ }
+ #endif
+
+ // Copy the static body of the assembly code.
+ memcpy(dest + preamble,
+ #if defined(__x86_64__)
+ is_indirect_call ?
+ "\x48\x81\x3C\x24\x00\x00\x00\xFF\x72\x10\x81\x2C\x24\x00\x00\x00"
+ "\x00\xC7\x44\x24\x04\x00\x00\x00\x00\xC3\x48\x87\x04\x24\x48\x89"
+ "\x44\x24\x08\x58\xC3" :
+ "\x48\x81\xEC\x80\x00\x00\x00\x50\x48\x8D\x05\x00\x00\x00\x00\x50"
+ "\x48\xB8\x00\x00\x00\x00\x00\x00\x00\x00\x50\x48\x8D\x05\x06\x00"
+ "\x00\x00\x48\x87\x44\x24\x10\xC3\x48\x81\xC4\x80\x00\x00",
+ is_indirect_call ? 37 : 47
+ #elif defined(__i386__)
+ "\x68\x00\x00\x00\x00\x68\x00\x00\x00\x00\xC3", 11
+ #else
+ #error Unsupported target platform
+ #endif
+ );
+
+ // Copy the postamble that was moved from the function that we are
+ // patching.
+ memcpy(dest + preamble +
+ #if defined(__x86_64__)
+ (is_indirect_call ? 37 : 47),
+ #elif defined(__i386__)
+ 11,
+ #else
+ #error Unsupported target platform
+ #endif
+ code[codeIdx].addr + code[codeIdx].len,
+ postamble);
+
+ // Patch up the various computed values
+ #if defined(__x86_64__)
+ int post = preamble + (is_indirect_call ? 37 : 47) + postamble;
+ dest[post] = '\xE9';
+ *reinterpret_cast<int *>(dest + post + 1) =
+ (code[second].addr + code[second].len) - (dest + post + 5);
+ if (is_indirect_call) {
+ *reinterpret_cast<int *>(dest + preamble + 13) = vsys_offset_;
+ } else {
+ *reinterpret_cast<int *>(dest + preamble + 11) =
+ (code[second].addr + code[second].len) - (dest + preamble + 15);
+ *reinterpret_cast<void **>(dest + preamble + 18) =
+ reinterpret_cast<void *>(&syscallWrapper);
+ }
+ #elif defined(__i386__)
+ *(dest + preamble + 11 + postamble) = '\x68'; // PUSH
+ *reinterpret_cast<char **>(dest + preamble + 12 + postamble) =
+ code[second].addr + code[second].len;
+ *(dest + preamble + 16 + postamble) = '\xC3'; // RET
+ *reinterpret_cast<char **>(dest + preamble + 1) =
+ dest + preamble + 11;
+ *reinterpret_cast<void (**)()>(dest + preamble + 6) = syscallWrapper;
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Pad unused space in the original function with NOPs
+ memset(code[first].addr, 0x90 /* NOP */,
+ code[second].addr + code[second].len - code[first].addr);
+
+ // Replace the system call with an unconditional jump to our new code.
+ #if defined(__x86_64__)
+ *code[first].addr = '\xE9'; // JMPQ
+ *reinterpret_cast<int *>(code[first].addr + 1) =
+ dest - (code[first].addr + 5);
+ #elif defined(__i386__)
+ code[first].addr[0] = '\x68'; // PUSH
+ *reinterpret_cast<char **>(code[first].addr + 1) = dest;
+ code[first].addr[5] = '\xC3'; // RET
+ #else
+ #error Unsupported target platform
+ #endif
+ }
+ replaced:
+ codeIdx = (codeIdx + 1) % (sizeof(code) / sizeof(struct Code));
+ }
+}
+
+ // Redirects the x86-32 VDSO entry points to the sandbox's syscallWrapper.
+ //
+ // extraSpace / extraLength: in/out cursor for scratch memory; they are simply
+ // forwarded to getScratchSpace() whenever a trampoline has to be emitted.
+ //
+ // On any platform other than i386 the function body is compiled out and the
+ // call is a no-op.
+ void Library::patchVDSO(char** extraSpace, int* extraLength){
+ #if defined(__i386__)
+ Sandbox::SysCalls sys;
+ // Bail out if the VDSO did not export __kernel_vsyscall, or if the page that
+ // contains it cannot be made writable (a non-zero mprotect() result is
+ // treated as failure).
+ // NOTE(review): only the single 4kB page containing __kernel_vsyscall is made
+ // writable here, yet __kernel_sigreturn and __kernel_rt_sigreturn are written
+ // to below as well -- presumably they live on the same page; confirm.
+ if (!__kernel_vsyscall ||
+ sys.mprotect(reinterpret_cast<void *>(
+ reinterpret_cast<long>(__kernel_vsyscall) & ~0xFFF),
+ 4096, PROT_READ|PROT_WRITE|PROT_EXEC)) {
+ return;
+ }
+
+ // x86-32 has a small number of well-defined functions in the VDSO library.
+ // These functions do not easily lend themselves to be rewritten by the
+ // automatic code. Instead, we explicitly find new definitions for them.
+ //
+ // We don't bother with optimizing the syscall instruction instead always
+ // use INT $0x80, no matter whether the hardware supports more modern
+ // calling conventions.
+ //
+ // TODO(markus): Investigate whether it is worthwhile to optimize this
+ // code path and use the platform-specific entry code.
+ // This test is always true at this point: a NULL __kernel_vsyscall already
+ // caused the early return above.
+ if (__kernel_vsyscall) {
+ // Replace the kernel entry point with:
+ //
+ // E9 .. .. .. .. JMP syscallWrapper
+ *__kernel_vsyscall = '\xE9';
+ // The 32bit displacement is relative to the end of the five byte JMP.
+ *reinterpret_cast<long *>(__kernel_vsyscall + 1) =
+ reinterpret_cast<char *>(&syscallWrapper) -
+ reinterpret_cast<char *>(__kernel_vsyscall + 5);
+ }
+ if (__kernel_sigreturn) {
+ // Replace the sigreturn() system call with a jump to code that does:
+ //
+ // 58 POP %eax
+ // B8 77 00 00 00 MOV $0x77, %eax
+ // E8 .. .. .. .. CALL syscallWrapper
+ char* dest = getScratchSpace(maps_, __kernel_sigreturn, 11, extraSpace,
+ extraLength);
+ memcpy(dest, "\x58\xB8\x77\x00\x00\x00\xE8", 7);
+ // CALL displacement, relative to the end of the 11 byte trampoline.
+ *reinterpret_cast<long *>(dest + 7) =
+ reinterpret_cast<char *>(&syscallWrapper) - dest - 11;;
+ // Overwrite the original entry point with a JMP to the trampoline.
+ *__kernel_sigreturn = '\xE9';
+ *reinterpret_cast<long *>(__kernel_sigreturn + 1) =
+ dest - reinterpret_cast<char *>(__kernel_sigreturn) - 5;
+ }
+ if (__kernel_rt_sigreturn) {
+ // Replace the rt_sigreturn() system call with a jump to code that does:
+ //
+ // B8 AD 00 00 00 MOV $0xAD, %eax
+ // E8 .. .. .. .. CALL syscallWrapper
+ char* dest = getScratchSpace(maps_, __kernel_rt_sigreturn, 10, extraSpace,
+ extraLength);
+ memcpy(dest, "\xB8\xAD\x00\x00\x00\xE8", 6);
+ // CALL displacement, relative to the end of the 10 byte trampoline.
+ *reinterpret_cast<long *>(dest + 6) =
+ reinterpret_cast<char *>(&syscallWrapper) - dest - 10;
+ // Overwrite the original entry point with a JMP to the trampoline.
+ *__kernel_rt_sigreturn = '\xE9';
+ *reinterpret_cast<long *>(__kernel_rt_sigreturn + 1) =
+ dest - reinterpret_cast<char *>(__kernel_rt_sigreturn) - 5;
+ }
+ #endif
+ }
+
+ // Creates a patched, private copy of the x86-64 VSyscall page.
+ //
+ // The copy is allocated through maps_->allocNearAddr(), its IP-relative
+ // operands are adjusted so that they still refer to the variables in the
+ // original page, and every function found in it is passed to
+ // patchSystemCallsInFunction(). Finally the copy is made read-only and
+ // executable.
+ //
+ // Returns the distance between the original VSyscall page and the copy
+ // (original minus copy, truncated to int), or 0 if there is no VSyscall page
+ // or the target is not x86-64.
+ int Library::patchVSystemCalls() {
+ #if defined(__x86_64__)
+ // VSyscalls live in a shared 4kB page at the top of the address space. This
+ // page cannot be unmapped nor remapped. We have to create a copy within
+ // 2GB of the page, and rewrite all IP-relative accesses to shared variables.
+ // As the top of the address space is not accessible by mmap(), this means
+ // that we need to wrap around addresses to the bottom 2GB of the address
+ // space.
+ // Only x86-64 has VSyscalls.
+ if (maps_->vsyscall()) {
+ // NOTE(review): the result of allocNearAddr() is used without a NULL check;
+ // confirm that it cannot fail (or dies internally) before relying on this.
+ char* copy = maps_->allocNearAddr(maps_->vsyscall(), 0x1000,
+ PROT_READ|PROT_WRITE|PROT_EXEC);
+ // Scratch-space cursor handed to patchSystemCallsInFunction(); it starts out
+ // describing the copied page itself.
+ char* extraSpace = copy;
+ int extraLength = 0x1000;
+ memcpy(copy, maps_->vsyscall(), 0x1000);
+ // Amount to add to an IP-relative displacement in the copy so that it keeps
+ // addressing the same location as it did in the original page.
+ long adjust = (long)maps_->vsyscall() - (long)copy;
+ // Walk the page in four slots of 0x400 bytes each.
+ for (int vsys = 0; vsys < 0x1000; vsys += 0x400) {
+ char* start = copy + vsys;
+ char* end = start + 0x400;
+
+ // There can only be up to four VSyscalls starting at an offset of
+ // n*0x400, each. VSyscalls are invoked by functions in the VDSO
+ // and provide fast implementations of a time source. We don't exactly
+ // know where the code and where the data is in the VSyscalls page.
+ // So, we disassemble the code for each function and find all branch
+ // targets within the function in order to find the last address of
+ // function.
+ //
+ // last: highest forward branch target seen so far in this function.
+ // vars: lowest address known to hold data rather than code.
+ // ptr: decode cursor; next_inst() advances it past each instruction.
+ for (char *last = start, *vars = end, *ptr = start; ptr < end; ) {
+ new_function:
+ char* mod_rm;
+ unsigned short insn = next_inst((const char **)&ptr, true, 0, 0,
+ &mod_rm, 0, 0);
+ if (mod_rm && (*mod_rm & 0xC7) == 0x5) {
+ // Instruction has IP relative addressing mode. Adjust to reference
+ // the variables in the original VSyscall segment.
+ long offset = *reinterpret_cast<int *>(mod_rm + 1);
+ char* var = ptr + offset;
+ if (var >= ptr && var < vars) {
+ // Variables are stored somewhere past all the functions. Remember
+ // the first variable in the VSyscall slot, so that we stop
+ // scanning for instructions once we reach that address.
+ vars = var;
+ }
+ offset += adjust;
+ // The adjusted displacement is written back as a 32bit value, so its
+ // upper 32 bits must be all zeros or all ones.
+ if ((offset >> 32) && (offset >> 32) != -1) {
+ Sandbox::die("Cannot patch [vsystemcall]");
+ }
+ *reinterpret_cast<int *>(mod_rm + 1) = offset;
+ }
+
+ // Check for jump targets to higher addresses (but within our own
+ // VSyscall slot). They extend the possible end-address of this
+ // function.
+ char *target = 0;
+ if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ ||
+ insn == 0xEB /* JMP */) {
+ // Short branch: signed 8bit displacement is the last instruction byte.
+ target = ptr + (reinterpret_cast<signed char *>(ptr))[-1];
+ } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ ||
+ (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) {
+ // Near branch: signed 32bit displacement is the last four bytes.
+ target = ptr + (reinterpret_cast<int *>(ptr))[-1];
+ }
+
+ // The function end is found, once the loop reaches the last valid
+ // address in the VSyscall slot, or once it finds a RET instruction
+ // that is not followed by any jump targets. Unconditional jumps that
+ // point backwards are treated the same as a RET instruction.
+ if (insn == 0xC3 /* RET */ ||
+ (target < ptr &&
+ (insn == 0xEB /* JMP */ || insn == 0xE9 /* JMP */))) {
+ if (last >= ptr) {
+ // A forward branch still lands at or behind this instruction, so the
+ // function has not ended yet.
+ continue;
+ } else {
+ // The function can optionally be followed by more functions in
+ // the same VSyscall slot. Allow for alignment to a 16 byte
+ // boundary. If we then find more non-zero bytes, and if this is
+ // not the known start of the variables, assume a new function
+ // started.
+ for (; ptr < vars; ++ptr) {
+ if ((long)ptr & 0xF) {
+ if (*ptr && *ptr != '\x90' /* NOP */) {
+ goto new_function;
+ }
+ // Padding byte before the next 16 byte boundary: turn it into a NOP.
+ *ptr = '\x90'; // NOP
+ } else {
+ if (*ptr && *ptr != '\x90' /* NOP */) {
+ goto new_function;
+ }
+ break;
+ }
+ }
+
+ // Translate all SYSCALLs to jumps into our system call handler.
+ patchSystemCallsInFunction(NULL, start, ptr,
+ &extraSpace, &extraLength);
+ break;
+ }
+ }
+
+ // Adjust assumed end address for this function, if a valid jump
+ // target has been found that originates from the current instruction.
+ // NOTE(review): the upper bound is start + 0x100 although each slot spans
+ // 0x400 bytes (see `end` above); confirm whether this is intentional.
+ if (target > last && target < start + 0x100) {
+ last = target;
+ }
+ }
+ }
+
+ // We are done. Write-protect our code and make it executable.
+ Sandbox::SysCalls sys;
+ sys.mprotect(copy, 0x1000, PROT_READ|PROT_EXEC);
+ return maps_->vsyscall() - copy;
+ }
+ #endif
+ return 0;
+ }
+
+ // Finds and patches all system call instructions of this library.
+ //
+ // Does nothing unless the library has been parsed successfully (valid_).
+ // For the VDSO, patchVSystemCalls() is run first and its result is stored in
+ // vsys_offset_; on x86-32 the VDSO is then handled entirely by patchVDSO().
+ // For everything else the ".text" section is scanned quickly for byte
+ // patterns that look like system calls; each function that contains such a
+ // pattern is handed to patchSystemCallsInFunction() for a thorough scan.
+ //
+ // Any scratch space that was obtained while patching is made read-only and
+ // executable before the function returns.
+ void Library::patchSystemCalls() {
+   if (!valid_) {
+     return;
+   }
+   int extraLength = 0;
+   char* extraSpace = NULL;
+   if (isVDSO_) {
+     // patchVDSO() calls patchSystemCallsInFunction() which needs vsys_offset_
+     // iff processing the VDSO library. So, make sure we call
+     // patchVSystemCalls() first.
+     vsys_offset_ = patchVSystemCalls();
+     #if defined(__i386__)
+     patchVDSO(&extraSpace, &extraLength);
+     // The early return below would otherwise skip the write-protection at
+     // the end of this function and leave the trampolines writable.
+     if (extraSpace) {
+       Sandbox::SysCalls sys;
+       sys.mprotect(extraSpace, 4096, PROT_READ|PROT_EXEC);
+     }
+     return;
+     #endif
+   }
+   SectionTable::const_iterator iter;
+   if ((iter = section_table_.find(".text")) == section_table_.end()) {
+     return;
+   }
+   const Elf_Shdr& shdr = iter->second.second;
+   char* start = reinterpret_cast<char *>(shdr.sh_addr + asr_offset_);
+   char* stop = start + shdr.sh_size;
+   char* func = start;     // Assumed start of the function being scanned
+   int nopcount = 0;       // Number of consecutive NOP bytes just seen
+   bool has_syscall = false;
+   for (char *ptr = start; ptr < stop; ptr++) {
+     // Multi-byte patterns are only tested when all of their bytes lie inside
+     // the section; we must not read past "stop".
+     #if defined(__x86_64__)
+     if ((ptr + 1 < stop &&
+          *ptr == '\x0F' && ptr[1] == '\x05' /* SYSCALL */) ||
+         (isVDSO_ && *ptr == '\xFF')) {
+     #elif defined(__i386__)
+     if ((ptr + 1 < stop &&
+          *ptr == '\xCD' && ptr[1] == '\x80' /* INT $0x80 */) ||
+         (ptr + 2 < stop &&
+          *ptr == '\x65' && ptr[1] == '\xFF' &&
+          ptr[2] == '\x15' /* CALL %gs:.. */)) {
+     #else
+     #error Unsupported target platform
+     #endif
+       ptr++;
+       has_syscall = true;
+       nopcount = 0;
+     } else if (*ptr == '\x90' /* NOP */) {
+       nopcount++;
+     } else if (!(reinterpret_cast<long>(ptr) & 0xF)) {
+       if (nopcount > 2) {
+         // This is very likely the beginning of a new function. Functions
+         // are aligned on 16 byte boundaries and the preceding function is
+         // padded out with NOPs.
+         //
+         // For performance reasons, we quickly scan the entire text segment
+         // for potential SYSCALLs, and then patch the code in increments of
+         // individual functions.
+         if (has_syscall) {
+           has_syscall = false;
+           // Our quick scan of the function found a potential system call.
+           // Do a more thorough scan, now.
+           patchSystemCallsInFunction(maps_, func, ptr, &extraSpace,
+                                      &extraLength);
+         }
+         func = ptr;
+       }
+       nopcount = 0;
+     } else {
+       nopcount = 0;
+     }
+   }
+   if (has_syscall) {
+     // Patch any remaining system calls that were in the last function before
+     // the loop terminated.
+     patchSystemCallsInFunction(maps_, func, stop, &extraSpace, &extraLength);
+   }
+
+   // Mark our scratch space as write-protected and executable.
+   if (extraSpace) {
+     Sandbox::SysCalls sys;
+     sys.mprotect(extraSpace, 4096, PROT_READ|PROT_EXEC);
+   }
+ }
+
+ // Reads the ELF header and section table of this library, fills
+ // section_table_ and computes asr_offset_ from the location of the text
+ // section.
+ //
+ // Returns false (and clears valid_) if the headers fail the size checks
+ // below, cannot be read, or no text section can be matched against a known
+ // memory range. For the VDSO, the result of parseSymbols() is returned;
+ // for all other libraries a successful parse returns true.
+ bool Library::parseElf() {
+ // getOriginal<T>() refuses to read anything while valid_ is false, so the
+ // flag has to be set optimistically before the first read.
+ valid_ = true;
+
+ // Verify ELF header
+ // NOTE(review): e_shstrndx is not checked against e_shnum before it is used
+ // to locate the section name table -- confirm that getOriginal() rejects
+ // out-of-range offsets.
+ Elf_Shdr str_shdr;
+ if (!getOriginal(0, &ehdr_) ||
+ ehdr_.e_ehsize < sizeof(Elf_Ehdr) ||
+ ehdr_.e_phentsize < sizeof(Elf_Phdr) ||
+ ehdr_.e_shentsize < sizeof(Elf_Shdr) ||
+ !getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize,
+ &str_shdr)) {
+ // Not all memory mappings are necessarily ELF files. Skip memory
+ // mappings that we cannot identify.
+ // The "error" label is also the target of the goto statements further down.
+ error:
+ valid_ = false;
+ return false;
+ }
+
+ // Parse section table and find all sections in this ELF file
+ for (int i = 0; i < ehdr_.e_shnum; i++) {
+ Elf_Shdr shdr;
+ if (!getOriginal(ehdr_.e_shoff + i*ehdr_.e_shentsize, &shdr)) {
+ // Unreadable section headers are silently skipped.
+ continue;
+ }
+ // Key: section name read from the section name string table.
+ // Value: (section index, section header).
+ section_table_.insert(
+ std::make_pair(getOriginal(str_shdr.sh_offset + shdr.sh_name),
+ std::make_pair(i, shdr)));
+ }
+
+ // Compute the offset of entries in the .text segment
+ const Elf_Shdr* text = getSection(".text");
+ if (text == NULL) {
+ // On x86-32, the VDSO is unusual in as much as it does not have a single
+ // ".text" section. Instead, it has one section per function. Each
+ // section name starts with ".text". We just need to pick an arbitrary
+ // one in order to find the asr_offset_ -- which would typically be zero
+ // for the VDSO.
+ for (SectionTable::const_iterator iter = section_table_.begin();
+ iter != section_table_.end(); ++iter) {
+ if (!strncmp(iter->first.c_str(), ".text", 5)) {
+ text = &iter->second.second;
+ break;
+ }
+ }
+ }
+
+ // Now that we know where the .text segment is located, we can compute the
+ // asr_offset_.
+ if (text) {
+ // memory_ranges_ is ordered by descending file offset, so lower_bound()
+ // yields the range with the largest file offset that is <= sh_offset.
+ RangeMap::const_iterator iter =
+ memory_ranges_.lower_bound(text->sh_offset);
+ if (iter != memory_ranges_.end()) {
+ // asr_offset_ is the value that has to be added to a section's sh_addr in
+ // order to obtain its address in this process.
+ asr_offset_ = reinterpret_cast<char *>(iter->second.start) -
+ (text->sh_addr - (text->sh_offset - iter->first));
+ } else {
+ goto error;
+ }
+ } else {
+ goto error;
+ }
+
+ // Symbols are only needed (and only parsed) for the VDSO.
+ return !isVDSO_ || parseSymbols();
+ }
+
+ // Parses the PLT relocations and the ".dynsym" symbol table of this library.
+ //
+ // Fills plt_entries_ (symbol name -> relocation offset) and symbols_
+ // (symbol name -> symbol table entry); entries with an empty name are
+ // skipped. Afterwards, the addresses of __kernel_vsyscall,
+ // __kernel_sigreturn and __kernel_rt_sigreturn are recorded for each of
+ // these symbols that is present with a non-zero value.
+ //
+ // Returns false if the library is not valid. Returns false and clears valid_
+ // if a table entry cannot be read or refers to an out-of-range index.
+ // Returns true otherwise.
+ bool Library::parseSymbols() {
+   if (!valid_) {
+     return false;
+   }
+
+   // Find PLT and symbol tables
+   const Elf_Shdr* plt = getSection(ELF_REL_PLT);
+   const Elf_Shdr* symtab = getSection(".dynsym");
+   Elf_Shdr strtab = { 0 };
+   if (symtab) {
+     // sh_link of the symbol table names the section holding its strings.
+     if (symtab->sh_link >= ehdr_.e_shnum ||
+         !getOriginal(ehdr_.e_shoff + symtab->sh_link * ehdr_.e_shentsize,
+                      &strtab)) {
+       Debug::message("Cannot find valid symbol table\n");
+       valid_ = false;
+       return false;
+     }
+   }
+
+   if (plt && symtab) {
+     // Parse PLT table and add its entries
+     for (int i = plt->sh_size/sizeof(Elf_Rel); --i >= 0; ) {
+       Elf_Rel rel;
+       if (!getOriginal(plt->sh_offset + i * sizeof(Elf_Rel), &rel) ||
+           ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym) >= symtab->sh_size) {
+         Debug::message("Encountered invalid plt entry\n");
+         valid_ = false;
+         return false;
+       }
+
+       if (ELF_R_TYPE(rel.r_info) != ELF_JUMP_SLOT) {
+         continue;
+       }
+       Elf_Sym sym;
+       if (!getOriginal(symtab->sh_offset +
+                        ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym), &sym) ||
+           sym.st_shndx >= ehdr_.e_shnum) {
+         Debug::message("Encountered invalid symbol for plt entry\n");
+         valid_ = false;
+         return false;
+       }
+       string name = getOriginal(strtab.sh_offset + sym.st_name);
+       if (name.empty()) {
+         continue;
+       }
+       plt_entries_.insert(std::make_pair(name, rel.r_offset));
+     }
+   }
+
+   if (symtab) {
+     // Parse symbol table and add its entries. Only complete entries are
+     // visited; a trailing fragment smaller than one Elf_Sym is ignored
+     // rather than read past the end of the section.
+     const Elf_Addr num_syms = symtab->sh_size / sizeof(Elf_Sym);
+     for (Elf_Addr n = 0; n < num_syms; ++n) {
+       const Elf_Addr addr = n * sizeof(Elf_Sym);
+       Elf_Sym sym;
+       // Section indexes in the reserved range (>= SHN_LORESERVE) are
+       // accepted; any other index has to name an existing section.
+       if (!getOriginal(symtab->sh_offset + addr, &sym) ||
+           (sym.st_shndx >= ehdr_.e_shnum &&
+            sym.st_shndx < SHN_LORESERVE)) {
+         Debug::message("Encountered invalid symbol\n");
+         valid_ = false;
+         return false;
+       }
+       string name = getOriginal(strtab.sh_offset + sym.st_name);
+       if (name.empty()) {
+         continue;
+       }
+       symbols_.insert(std::make_pair(name, sym));
+     }
+   }
+
+   // Remember where the well-known VDSO entry points ended up in memory.
+   SymbolTable::const_iterator iter = symbols_.find("__kernel_vsyscall");
+   if (iter != symbols_.end() && iter->second.st_value) {
+     __kernel_vsyscall = asr_offset_ + iter->second.st_value;
+   }
+   iter = symbols_.find("__kernel_sigreturn");
+   if (iter != symbols_.end() && iter->second.st_value) {
+     __kernel_sigreturn = asr_offset_ + iter->second.st_value;
+   }
+   iter = symbols_.find("__kernel_rt_sigreturn");
+   if (iter != symbols_.end() && iter->second.st_value) {
+     __kernel_rt_sigreturn = asr_offset_ + iter->second.st_value;
+   }
+
+   return true;
+ }
+
+} // namespace
diff --git a/sandbox/linux/seccomp/library.h b/sandbox/linux/seccomp/library.h
new file mode 100644
index 0000000..e27bfde
--- /dev/null
+++ b/sandbox/linux/seccomp/library.h
@@ -0,0 +1,199 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef LIBRARY_H__
+#define LIBRARY_H__
+
+#include <elf.h>
+#include <functional>
+#include <map>
+#include <set>
+#include <string>
+#include <string.h>
+#include <sys/mman.h>
+
+#include "maps.h"
+
+ // Word-size independent names for the ELF structures from <elf.h>. The
+ // 64bit variants are selected on x86-64 and the 32bit variants on x86-32;
+ // any other target is rejected at compile time.
+ #if defined(__x86_64__)
+ typedef Elf64_Ehdr Elf_Ehdr;
+ typedef Elf64_Shdr Elf_Shdr;
+ typedef Elf64_Sym Elf_Sym;
+ typedef Elf64_Addr Elf_Addr;
+ #elif defined(__i386__)
+ typedef Elf32_Ehdr Elf_Ehdr;
+ typedef Elf32_Shdr Elf_Shdr;
+ typedef Elf32_Sym Elf_Sym;
+ typedef Elf32_Addr Elf_Addr;
+ #else
+ #error Unsupported target platform
+ #endif
+
+struct SyscallTable;
+namespace playground {
+
+ // Describes a single ELF object that is mapped into this process and knows
+ // how to parse it and how to patch the system calls that it contains.
+ //
+ // A Library collects the memory ranges that belong to the object
+ // (addMemoryRange()), parses its headers (parseElf()) and then rewrites its
+ // code (patchSystemCalls()). Most accessors do nothing useful until
+ // parseElf() has succeeded and valid_ is set.
+ class Library {
+   friend class Maps;
+  public:
+   typedef Maps::string string;
+
+   Library() :
+       valid_(false),
+       isVDSO_(false),
+       asr_offset_(0),
+       vsys_offset_(0),
+       maps_(0),
+       image_(0),
+       image_size_(0) {
+   }
+
+   ~Library();
+
+   // Associates this library with its Maps object. Only the first call has
+   // an effect; later calls leave the existing association untouched.
+   void setLibraryInfo(Maps* maps) {
+     if (!maps_) {
+       maps_ = maps;
+     }
+   }
+
+   // Registers the mapping [start, stop) that backs the file contents
+   // beginning at "offset". "prot" holds the PROT_xxx flags of the mapping
+   // and "isVDSO" tells whether the library is the VDSO.
+   void addMemoryRange(void* start, void* stop, Elf_Addr offset,
+                       int prot, int isVDSO) {
+     isVDSO_ = isVDSO;
+     RangeMap::iterator iter = memory_ranges_.find(offset);
+     if (iter != memory_ranges_.end()) {
+       // It is possible to have overlapping mappings. This is particularly
+       // likely to happen with very small programs or libraries. If it does
+       // happen, we really only care about the text segment. Look for a
+       // mapping that is mapped executable.
+       if ((prot & PROT_EXEC) == 0) {
+         return;
+       }
+       // std::map::insert() never overwrites an existing key. Drop the old
+       // entry first, so that the executable mapping actually replaces it.
+       memory_ranges_.erase(iter);
+     }
+     memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot)));
+   }
+
+   char *get(Elf_Addr offset, char *buf, size_t len);
+   string get(Elf_Addr offset);
+   char *getOriginal(Elf_Addr offset, char *buf, size_t len);
+   string getOriginal(Elf_Addr offset);
+
+   // Typed convenience wrapper around get(offset, buf, len). If the library
+   // is not valid, "*t" is zero-filled and NULL is returned.
+   template<class T>T* get(Elf_Addr offset, T* t) {
+     if (!valid_) {
+       memset(t, 0, sizeof(T));
+       return NULL;
+     }
+     return reinterpret_cast<T *>(get(offset, reinterpret_cast<char *>(t),
+                                      sizeof(T)));
+   }
+
+   // Typed convenience wrapper around getOriginal(offset, buf, len). If the
+   // library is not valid, "*t" is zero-filled and NULL is returned.
+   template<class T>T* getOriginal(Elf_Addr offset, T* t) {
+     if (!valid_) {
+       memset(t, 0, sizeof(T));
+       return NULL;
+     }
+     return reinterpret_cast<T *>(getOriginal(offset,
+                                              reinterpret_cast<char *>(t),
+                                              sizeof(T)));
+   }
+
+   // Stores "*value" at the absolute address "addr". Fails only if the
+   // library is not valid; the address itself is not checked.
+   template<class T>bool set(void *addr, T* value) {
+     if (!valid_) {
+       return false;
+     }
+     *reinterpret_cast<T *>(addr) = *value;
+     return true;
+   }
+
+   // Stores "*value" at file offset "offset", which is translated into a
+   // memory address with the help of the registered memory ranges. Returns
+   // false if the library is not valid, if no range covers the offset, or if
+   // the value would not fit entirely inside the range.
+   template<class T>bool set(Elf_Addr offset, T* value) {
+     if (!valid_) {
+       return false;
+     }
+     RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset);
+     if (iter == memory_ranges_.end()) {
+       return false;
+     }
+     offset -= iter->first;
+     const size_t range_size =
+         reinterpret_cast<char *>(iter->second.stop) -
+         reinterpret_cast<char *>(iter->second.start);
+     // Test the range size first; "range_size - sizeof(T)" would otherwise
+     // wrap around for ranges that are smaller than T.
+     if (range_size < sizeof(T) || offset > range_size - sizeof(T)) {
+       return false;
+     }
+     *reinterpret_cast<T *>(
+         reinterpret_cast<char *>(iter->second.start) + offset) = *value;
+     return true;
+   }
+
+   bool parseElf();
+   const Elf_Ehdr* getEhdr();
+   const Elf_Shdr* getSection(const string& section);
+   int getSectionIndex(const string& section);
+   void makeWritable(bool state) const;
+   void patchSystemCalls();
+   bool isVDSO() const { return isVDSO_; }
+
+  protected:
+   bool parseSymbols();
+
+  private:
+   class GreaterThan : public std::binary_function<Elf_Addr, Elf_Addr, bool> {
+     // We create the RangeMap with a GreaterThan rather than the default
+     // comparator, as that allows us to use lower_bound() to find memory
+     // mappings.
+    public:
+     bool operator() (Elf_Addr s1, Elf_Addr s2) const {
+       return s1 > s2;
+     }
+   };
+
+   // One mapping of the library: the address range [start, stop) and its
+   // PROT_xxx protection flags.
+   struct Range {
+     Range(void* start, void* stop, int prot) :
+         start(start), stop(stop), prot(prot) { }
+     void* start;
+     void* stop;
+     int   prot;
+   };
+
+   // File offset -> mapping, ordered by descending file offset.
+   typedef std::map<Elf_Addr, Range, GreaterThan,
+                    SystemAllocator<std::pair<const Elf_Addr,
+                                              Range> > > RangeMap;
+   // Section name -> (section index, section header).
+   typedef std::map<string, std::pair<int, Elf_Shdr>, std::less<string>,
+                    SystemAllocator<std::pair<const string,
+                                              std::pair<int, Elf_Shdr> > > >
+                    SectionTable;
+   // Symbol name -> symbol table entry.
+   typedef std::map<string, Elf_Sym, std::less<string>,
+                    SystemAllocator<std::pair<const string,
+                                              Elf_Sym> > > SymbolTable;
+   // Symbol name -> offset of its PLT relocation.
+   typedef std::map<string, Elf_Addr, std::less<string>,
+                    SystemAllocator<std::pair<const string,
+                                              Elf_Addr> > > PltTable;
+
+   char* getBytes(char* dst, const char* src, ssize_t len);
+   static bool isSafeInsn(unsigned short insn);
+   static int isSimpleSystemCall(char *start, char *end);
+   static char* getScratchSpace(const Maps* maps, char* near, int needed,
+                                char** extraSpace, int* extraLength);
+   void patchSystemCallsInFunction(const Maps* maps, char *start, char *end,
+                                   char** extraSpace, int* extraLength);
+   int patchVSystemCalls();
+   void patchVDSO(char** extraSpace, int* extraLength);
+
+   RangeMap memory_ranges_;
+   bool valid_;          // Cleared whenever parsing fails
+   bool isVDSO_;
+   char* asr_offset_;    // Added to ELF addresses to obtain memory addresses
+   int vsys_offset_;     // Result of patchVSystemCalls()
+   Maps* maps_;
+   Elf_Ehdr ehdr_;
+   SectionTable section_table_;
+   SymbolTable symbols_;
+   PltTable plt_entries_;
+   char* image_;
+   size_t image_size_;
+   static char* __kernel_vsyscall;
+   static char* __kernel_sigreturn;
+   static char* __kernel_rt_sigreturn;
+ };
+
+} // namespace
+
+#endif // LIBRARY_H__
diff --git a/sandbox/linux/seccomp/linux_syscall_support.h b/sandbox/linux/seccomp/linux_syscall_support.h
new file mode 100644
index 0000000..2ee0426
--- /dev/null
+++ b/sandbox/linux/seccomp/linux_syscall_support.h
@@ -0,0 +1,3208 @@
+/* Copyright (c) 2005-2010, Google Inc.
+ * Author: Markus Gutschke
+ *
+ * All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium LICENSE file.
+ */
+
+/* This file includes Linux-specific support functions common to the
+ * coredumper and the thread lister; primarily, this is a collection
+ * of direct system calls, and a couple of symbols missing from
+ * standard header files.
+ * There are a few options that the including file can set to control
+ * the behavior of this file:
+ *
+ * SYS_CPLUSPLUS:
+ * The entire header file will normally be wrapped in 'extern "C" { }',
+ * making it suitable for compilation as both C and C++ source. If you
+ * do not want to do this, you can set the SYS_CPLUSPLUS macro to inhibit
+ * the wrapping. N.B. doing so will suppress inclusion of all prerequisite
+ * system header files, too. It is the caller's responsibility to provide
+ * the necessary definitions.
+ *
+ * SYS_ERRNO:
+ * All system calls will update "errno" unless overridden by setting the
+ * SYS_ERRNO macro prior to including this file. SYS_ERRNO should be
+ * an l-value.
+ *
+ * SYS_INLINE:
+ * New symbols will be defined "static inline", unless overridden by
+ * the SYS_INLINE macro.
+ *
+ * SYS_LINUX_SYSCALL_SUPPORT_H
+ * This macro is used to avoid multiple inclusions of this header file.
+ * If you need to include this file more than once, make sure to
+ * unset SYS_LINUX_SYSCALL_SUPPORT_H before each inclusion.
+ *
+ * SYS_PREFIX:
+ * New system calls will have a prefix of "sys_" unless overridden by
+ * the SYS_PREFIX macro. Valid values for this macro are [0..9] which
+ * results in prefixes "sys[0..9]_". It is also possible to set this
+ * macro to -1, which avoids all prefixes.
+ *
+ * This file defines a few internal symbols that all start with "LSS_".
+ * Do not access these symbols from outside this file. They are not part
+ * of the supported API.
+ */
+#ifndef SYS_LINUX_SYSCALL_SUPPORT_H
+#define SYS_LINUX_SYSCALL_SUPPORT_H
+
+/* We currently only support x86-32, x86-64, ARM, MIPS, and PPC on Linux.
+ * Porting to other related platforms should not be difficult.
+ */
+#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__) || \
+ defined(__mips__) || defined(__PPC__)) && defined(__linux)
+
+#ifndef SYS_CPLUSPLUS
+#ifdef __cplusplus
+/* Some system header files in older versions of gcc neglect to properly
+ * handle being included from C++. As it appears to be harmless to have
+ * multiple nested 'extern "C"' blocks, just add another one here.
+ */
+extern "C" {
+#endif
+
+#include <errno.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <linux/unistd.h>
+#include <endian.h>
+
+#ifdef __mips__
+/* Include definitions of the ABI currently in use. */
+#include <sgidefs.h>
+#endif
+
+#endif
+
+/* As glibc often provides subtly incompatible data structures (and implicit
+ * wrapper functions that convert them), we provide our own kernel data
+ * structures for use by the system calls.
+ * These structures have been developed by using Linux 2.6.23 headers for
+ * reference. Note though, we do not care about exact API compatibility
+ * with the kernel, and in fact the kernel often does not have a single
+ * API that works across architectures. Instead, we try to mimic the glibc
+ * API where reasonable, and only guarantee ABI compatibility with the
+ * kernel headers.
+ * Most notably, here are a few changes that were made to the structures
+ * defined by kernel headers:
+ *
+ * - we only define structures, but not symbolic names for kernel data
+ * types. For the latter, we directly use the native C datatype
+ * (i.e. "unsigned" instead of "mode_t").
+ * - in a few cases, it is possible to define identical structures for
+ * both 32bit (e.g. i386) and 64bit (e.g. x86-64) platforms by
+ * standardizing on the 64bit version of the data types. In particular,
+ * this means that we use "unsigned" where the 32bit headers say
+ * "unsigned long".
+ * - overall, we try to minimize the number of cases where we need to
+ * conditionally define different structures.
+ * - the "struct kernel_sigaction" class of structures have been
+ * modified to more closely mimic glibc's API by introducing an
+ * anonymous union for the function pointer.
+ * - a small number of field names had to have an underscore appended to
+ * them, because glibc defines a global macro by the same name.
+ */
+
+/* include/linux/dirent.h: directory entry with 64-bit inode and offset. */
+struct kernel_dirent64 {
+ unsigned long long d_ino;
+ long long d_off;
+ unsigned short d_reclen;
+ unsigned char d_type; /* has no counterpart in struct kernel_dirent */
+ char d_name[256];
+};
+
+/* include/linux/dirent.h: variant whose d_ino/d_off are native longs. */
+struct kernel_dirent {
+ long d_ino;
+ long d_off;
+ unsigned short d_reclen;
+ char d_name[256]; /* follows d_reclen directly; this variant has no d_type member */
+};
+
+/* include/linux/uio.h: one buffer descriptor (address plus length). */
+struct kernel_iovec {
+ void *iov_base;
+ unsigned long iov_len; /* native C type; this file avoids symbolic kernel type names */
+};
+
+/* include/linux/socket.h: message header; its buffers use struct kernel_iovec. */
+struct kernel_msghdr {
+ void *msg_name;
+ int msg_namelen;
+ struct kernel_iovec*msg_iov; /* this file's own iovec type, not the libc one */
+ unsigned long msg_iovlen;
+ void *msg_control;
+ unsigned long msg_controllen;
+ unsigned msg_flags;
+};
+
+/* include/asm-generic/poll.h: a descriptor followed by two short event fields. */
+struct kernel_pollfd {
+ int fd;
+ short events;
+ short revents;
+};
+
+/* include/linux/resource.h: a pair of limits, both unsigned long. */
+struct kernel_rlimit {
+ unsigned long rlim_cur;
+ unsigned long rlim_max;
+};
+
+/* include/linux/time.h: both members are long, so the size follows the native word. */
+struct kernel_timespec {
+ long tv_sec;
+ long tv_nsec;
+};
+
+/* include/linux/time.h: same shape as kernel_timespec, with tv_usec as the second member. */
+struct kernel_timeval {
+ long tv_sec;
+ long tv_usec;
+};
+
+/* include/linux/resource.h: two kernel_timeval members followed by 14 long counters. */
+struct kernel_rusage {
+ struct kernel_timeval ru_utime; /* uses this file's timeval, not the libc one */
+ struct kernel_timeval ru_stime;
+ long ru_maxrss;
+ long ru_ixrss;
+ long ru_idrss;
+ long ru_isrss;
+ long ru_minflt;
+ long ru_majflt;
+ long ru_nswap;
+ long ru_inblock;
+ long ru_oublock;
+ long ru_msgsnd;
+ long ru_msgrcv;
+ long ru_nsignals;
+ long ru_nvcsw;
+ long ru_nivcsw;
+};
+
+struct siginfo; /* forward declaration; the handler prototypes below only take a pointer to it */
+#if defined(__i386__) || defined(__ARM_ARCH_3__) || defined(__PPC__)
+
+/* include/asm-{arm,i386,mips,ppc}/signal.h */
+struct kernel_old_sigaction {
+ union { /* anonymous union, so both handler names alias the same slot */
+ void (*sa_handler_)(int); /* trailing '_' avoids a glibc macro of the same name */
+ void (*sa_sigaction_)(int, struct siginfo *, void *);
+ };
+ unsigned long sa_mask; /* a single word here, not a struct kernel_sigset_t */
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+} __attribute__((packed,aligned(4)));
+#elif (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
+ #define kernel_old_sigaction kernel_sigaction /* 32-bit-ABI MIPS: the old name aliases the new struct */
+#endif
+
+/* Some kernel functions (e.g. sigaction() in 2.6.23) require that the
+ * size passed in exactly match the size of the signal set, even though the
+ * API was intended to be extensible. We define our own KERNEL_NSIG to deal
+ * with this.
+ * Please note that glibc provides signals [1.._NSIG-1], whereas the
+ * kernel (and this header) provides the range [1..KERNEL_NSIG]. The
+ * actual number of signals is obviously the same, but the constants
+ * differ by one.
+ */
+#ifdef __mips__
+#define KERNEL_NSIG 128 /* MIPS: signals 1..128 */
+#else
+#define KERNEL_NSIG 64 /* every other supported architecture: signals 1..64 */
+#endif
+
+/* include/asm-{arm,i386,mips,x86_64}/signal.h: bitmap with room for KERNEL_NSIG bits. */
+struct kernel_sigset_t {
+ unsigned long sig[(KERNEL_NSIG + 8*sizeof(unsigned long) - 1)/ /* KERNEL_NSIG bits ... */
+ (8*sizeof(unsigned long))]; /* ... divided by bits per word, rounded up */
+};
+
+/* include/asm-{arm,i386,mips,x86_64,ppc}/signal.h: member order differs between MIPS and the rest. */
+struct kernel_sigaction {
+#ifdef __mips__
+ unsigned long sa_flags; /* MIPS layout: flags come first and there is no sa_restorer */
+ union {
+ void (*sa_handler_)(int);
+ void (*sa_sigaction_)(int, struct siginfo *, void *);
+ };
+ struct kernel_sigset_t sa_mask;
+#else
+ union { /* anonymous union, so both handler names alias the same slot */
+ void (*sa_handler_)(int);
+ void (*sa_sigaction_)(int, struct siginfo *, void *);
+ };
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+ struct kernel_sigset_t sa_mask; /* full-width signal set, sized by KERNEL_NSIG */
+#endif
+};
+
+/* include/linux/socket.h: family tag followed by a fixed 14-byte payload. */
+struct kernel_sockaddr {
+ unsigned short sa_family;
+ char sa_data[14];
+};
+
+/* include/asm-{arm,i386,mips,ppc}/stat.h: one layout per architecture family. */
+#ifdef __mips__
+#if _MIPS_SIM == _MIPS_SIM_ABI64
+struct kernel_stat { /* 64-bit MIPS ABI: this layout is exposed as kernel_stat */
+#else
+struct kernel_stat64 { /* other MIPS ABIs: the same layout is exposed as kernel_stat64 */
+#endif
+ unsigned st_dev;
+ unsigned __pad0[3];
+ unsigned long long st_ino;
+ unsigned st_mode;
+ unsigned st_nlink;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned st_rdev;
+ unsigned __pad1[3];
+ long long st_size;
+ unsigned st_atime_; /* trailing '_' on the time fields avoids glibc macros of the same name */
+ unsigned st_atime_nsec_;
+ unsigned st_mtime_;
+ unsigned st_mtime_nsec_;
+ unsigned st_ctime_;
+ unsigned st_ctime_nsec_;
+ unsigned st_blksize;
+ unsigned __pad2;
+ unsigned long long st_blocks;
+};
+#elif defined __PPC__
+struct kernel_stat64 { /* PowerPC layout: size and block fields precede the timestamps */
+ unsigned long long st_dev;
+ unsigned long long st_ino;
+ unsigned st_mode;
+ unsigned st_nlink;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned long long st_rdev;
+ unsigned short int __pad2;
+ long long st_size;
+ long st_blksize;
+ long long st_blocks;
+ long st_atime_;
+ unsigned long st_atime_nsec_;
+ long st_mtime_;
+ unsigned long st_mtime_nsec_;
+ long st_ctime_;
+ unsigned long st_ctime_nsec_;
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+#else
+struct kernel_stat64 { /* layout used when neither __mips__ nor __PPC__ is defined */
+ unsigned long long st_dev;
+ unsigned char __pad0[4];
+ unsigned __st_ino; /* 32-bit inode slot; the 64-bit st_ino is the last member */
+ unsigned st_mode;
+ unsigned st_nlink;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned long long st_rdev;
+ unsigned char __pad3[4];
+ long long st_size;
+ unsigned st_blksize;
+ unsigned long long st_blocks;
+ unsigned st_atime_;
+ unsigned st_atime_nsec_;
+ unsigned st_mtime_;
+ unsigned st_mtime_nsec_;
+ unsigned st_ctime_;
+ unsigned st_ctime_nsec_;
+ unsigned long long st_ino;
+};
+#endif
+
+/* include/asm-{arm,i386,mips,x86_64,ppc}/stat.h: one layout per architecture. */
+#if defined(__i386__) || defined(__ARM_ARCH_3__)
+struct kernel_stat {
+ /* The kernel headers suggest that st_dev and st_rdev should be 32bit
+ * quantities encoding 12bit major and 20bit minor numbers in an interleaved
+ * format. In reality, we do not see useful data in the top bits. So,
+ * we'll leave the padding in here, until we find a better solution.
+ */
+ unsigned short st_dev;
+ short pad1; /* padding referred to by the comment above */
+ unsigned st_ino;
+ unsigned short st_mode;
+ unsigned short st_nlink;
+ unsigned short st_uid;
+ unsigned short st_gid;
+ unsigned short st_rdev;
+ short pad2; /* padding referred to by the comment above */
+ unsigned st_size;
+ unsigned st_blksize;
+ unsigned st_blocks;
+ unsigned st_atime_; /* trailing '_' on the time fields avoids glibc macros of the same name */
+ unsigned st_atime_nsec_;
+ unsigned st_mtime_;
+ unsigned st_mtime_nsec_;
+ unsigned st_ctime_;
+ unsigned st_ctime_nsec_;
+ unsigned __unused4;
+ unsigned __unused5;
+};
+#elif defined(__x86_64__)
+struct kernel_stat {
+ unsigned long st_dev;
+ unsigned long st_ino;
+ unsigned long st_nlink; /* st_nlink precedes st_mode in this layout */
+ unsigned st_mode;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned __pad0;
+ unsigned long st_rdev;
+ long st_size;
+ long st_blksize;
+ long st_blocks;
+ unsigned long st_atime_;
+ unsigned long st_atime_nsec_;
+ unsigned long st_mtime_;
+ unsigned long st_mtime_nsec_;
+ unsigned long st_ctime_;
+ unsigned long st_ctime_nsec_;
+ long __unused[3];
+};
+#elif defined(__PPC__)
+struct kernel_stat {
+ unsigned st_dev;
+ unsigned long st_ino; /* ino_t */
+ unsigned long st_mode; /* mode_t */
+ unsigned short st_nlink; /* nlink_t */
+ unsigned st_uid; /* uid_t */
+ unsigned st_gid; /* gid_t */
+ unsigned st_rdev;
+ long st_size; /* off_t */
+ unsigned long st_blksize;
+ unsigned long st_blocks;
+ unsigned long st_atime_;
+ unsigned long st_atime_nsec_;
+ unsigned long st_mtime_;
+ unsigned long st_mtime_nsec_;
+ unsigned long st_ctime_;
+ unsigned long st_ctime_nsec_;
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+#elif (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64)
+struct kernel_stat { /* the 64-bit MIPS ABI gets its kernel_stat from the stat64 section instead */
+ unsigned st_dev;
+ int st_pad1[3];
+ unsigned st_ino;
+ unsigned st_mode;
+ unsigned st_nlink;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned st_rdev;
+ int st_pad2[2];
+ long st_size;
+ int st_pad3;
+ long st_atime_;
+ long st_atime_nsec_;
+ long st_mtime_;
+ long st_mtime_nsec_;
+ long st_ctime_;
+ long st_ctime_nsec_;
+ int st_blksize; /* block fields follow the timestamps in this layout */
+ int st_blocks;
+ int st_pad4[14];
+};
+#endif
+
+/* include/asm-{arm,i386,mips,x86_64,ppc}/statfs.h: not defined for x86-64 or 64-bit-ABI MIPS. */
+#ifdef __mips__
+#if _MIPS_SIM != _MIPS_SIM_ABI64
+struct kernel_statfs64 {
+ unsigned long f_type;
+ unsigned long f_bsize;
+ unsigned long f_frsize; /* MIPS places f_frsize third; the generic layout has it near the end */
+ unsigned long __pad;
+ unsigned long long f_blocks;
+ unsigned long long f_bfree;
+ unsigned long long f_files; /* MIPS orders f_files/f_ffree before f_bavail */
+ unsigned long long f_ffree;
+ unsigned long long f_bavail;
+ struct { int val[2]; } f_fsid;
+ unsigned long f_namelen;
+ unsigned long f_spare[6];
+};
+#endif
+#elif !defined(__x86_64__)
+struct kernel_statfs64 {
+ unsigned long f_type;
+ unsigned long f_bsize;
+ unsigned long long f_blocks;
+ unsigned long long f_bfree;
+ unsigned long long f_bavail;
+ unsigned long long f_files;
+ unsigned long long f_ffree;
+ struct { int val[2]; } f_fsid;
+ unsigned long f_namelen;
+ unsigned long f_frsize;
+ unsigned long f_spare[5]; /* five spare words here versus six in the MIPS layout */
+};
+#endif
+
+/* include/asm-{arm,i386,mips,x86_64,ppc,generic}/statfs.h: a MIPS layout and a generic one. */
+#ifdef __mips__
+struct kernel_statfs {
+ long f_type; /* the MIPS layout uses signed long throughout */
+ long f_bsize;
+ long f_frsize; /* third member on MIPS; near the end in the generic layout */
+ long f_blocks;
+ long f_bfree;
+ long f_files;
+ long f_ffree;
+ long f_bavail;
+ struct { int val[2]; } f_fsid;
+ long f_namelen;
+ long f_spare[6];
+};
+#else
+struct kernel_statfs {
+ /* x86_64 actually defines all these fields as signed, whereas all other */
+ /* platforms define them as unsigned. Leaving them at unsigned should not */
+ /* cause any problems. */
+ unsigned long f_type;
+ unsigned long f_bsize;
+ unsigned long f_blocks;
+ unsigned long f_bfree;
+ unsigned long f_bavail;
+ unsigned long f_files;
+ unsigned long f_ffree;
+ struct { int val[2]; } f_fsid;
+ unsigned long f_namelen;
+ unsigned long f_frsize;
+ unsigned long f_spare[5]; /* five spare words here versus six in the MIPS layout */
+};
+#endif
+
+
+/* Definitions missing from the standard header files; each is only supplied when not already defined. */
+#ifndef O_DIRECTORY
+#if defined(__ARM_ARCH_3__)
+#define O_DIRECTORY 0040000 /* octal; ARM uses a different bit from the other ports */
+#else
+#define O_DIRECTORY 0200000 /* octal */
+#endif
+#endif
+#ifndef NT_PRXFPREG
+#define NT_PRXFPREG 0x46e62b7f
+#endif
+#ifndef PTRACE_GETFPXREGS
+#define PTRACE_GETFPXREGS ((enum __ptrace_request)18) /* typed as the ptrace request enum, not a bare int */
+#endif
+#ifndef PR_GET_DUMPABLE
+#define PR_GET_DUMPABLE 3
+#endif
+#ifndef PR_SET_DUMPABLE
+#define PR_SET_DUMPABLE 4
+#endif
+#ifndef PR_GET_SECCOMP
+#define PR_GET_SECCOMP 21
+#endif
+#ifndef PR_SET_SECCOMP
+#define PR_SET_SECCOMP 22
+#endif
+#ifndef AT_FDCWD
+#define AT_FDCWD (-100) /* parenthesized because the value is negative */
+#endif
+#ifndef AT_SYMLINK_NOFOLLOW
+#define AT_SYMLINK_NOFOLLOW 0x100
+#endif
+#ifndef AT_REMOVEDIR
+#define AT_REMOVEDIR 0x200
+#endif
+#ifndef MREMAP_FIXED
+#define MREMAP_FIXED 2
+#endif
+#ifndef SA_RESTORER
+#define SA_RESTORER 0x04000000
+#endif
+#ifndef CPUCLOCK_PROF
+#define CPUCLOCK_PROF 0
+#endif
+#ifndef CPUCLOCK_VIRT
+#define CPUCLOCK_VIRT 1
+#endif
+#ifndef CPUCLOCK_SCHED
+#define CPUCLOCK_SCHED 2
+#endif
+#ifndef CPUCLOCK_PERTHREAD_MASK
+#define CPUCLOCK_PERTHREAD_MASK 4
+#endif
+#ifndef MAKE_PROCESS_CPUCLOCK
+#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
+ ((~(int)(pid) << 3) | (int)(clock)) /* complemented id above bit 3, clock selector in the low 3 bits; NOTE(review): left-shifting a negative int is formally undefined in ISO C */
+#endif
+#ifndef MAKE_THREAD_CPUCLOCK
+#define MAKE_THREAD_CPUCLOCK(tid, clock) \
+ ((~(int)(tid) << 3) | (int)((clock) | CPUCLOCK_PERTHREAD_MASK)) /* same encoding with CPUCLOCK_PERTHREAD_MASK set */
+#endif
+
+#ifndef FUTEX_WAIT
+#define FUTEX_WAIT 0 /* base futex operation codes 0..8 follow */
+#endif
+#ifndef FUTEX_WAKE
+#define FUTEX_WAKE 1
+#endif
+#ifndef FUTEX_FD
+#define FUTEX_FD 2
+#endif
+#ifndef FUTEX_REQUEUE
+#define FUTEX_REQUEUE 3
+#endif
+#ifndef FUTEX_CMP_REQUEUE
+#define FUTEX_CMP_REQUEUE 4
+#endif
+#ifndef FUTEX_WAKE_OP
+#define FUTEX_WAKE_OP 5
+#endif
+#ifndef FUTEX_LOCK_PI
+#define FUTEX_LOCK_PI 6
+#endif
+#ifndef FUTEX_UNLOCK_PI
+#define FUTEX_UNLOCK_PI 7
+#endif
+#ifndef FUTEX_TRYLOCK_PI
+#define FUTEX_TRYLOCK_PI 8
+#endif
+#ifndef FUTEX_PRIVATE_FLAG
+#define FUTEX_PRIVATE_FLAG 128 /* a single bit above every base operation code */
+#endif
+#ifndef FUTEX_CMD_MASK
+#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG /* every bit except the private flag; expansion is not parenthesized */
+#endif
+#ifndef FUTEX_WAIT_PRIVATE
+#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) /* each _PRIVATE code is its base code with the flag set */
+#endif
+#ifndef FUTEX_WAKE_PRIVATE
+#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
+#endif
+#ifndef FUTEX_REQUEUE_PRIVATE
+#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG)
+#endif
+#ifndef FUTEX_CMP_REQUEUE_PRIVATE
+#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG)
+#endif
+#ifndef FUTEX_WAKE_OP_PRIVATE
+#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG)
+#endif
+#ifndef FUTEX_LOCK_PI_PRIVATE
+#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
+#endif
+#ifndef FUTEX_UNLOCK_PI_PRIVATE
+#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
+#endif
+#ifndef FUTEX_TRYLOCK_PI_PRIVATE
+#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
+#endif
+
+
+#if defined(__x86_64__) /* these two codes are only supplied for x86-64 builds */
+#ifndef ARCH_SET_GS
+#define ARCH_SET_GS 0x1001
+#endif
+#ifndef ARCH_GET_GS
+#define ARCH_GET_GS 0x1004
+#endif
+#endif
+
+#if defined(__i386__)
+#ifndef __NR_quotactl
+#define __NR_quotactl 131
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid 164
+#define __NR_getresuid 165
+#define __NR_setresgid 170
+#define __NR_getresgid 171
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigreturn 173
+#define __NR_rt_sigaction 174
+#define __NR_rt_sigprocmask 175
+#define __NR_rt_sigpending 176
+#define __NR_rt_sigsuspend 179
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64 180
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 181
+#endif
+#ifndef __NR_ugetrlimit
+#define __NR_ugetrlimit 191
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64 195
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64 197
+#endif
+#ifndef __NR_setresuid32
+#define __NR_setresuid32 208
+#define __NR_getresuid32 209
+#define __NR_setresgid32 210
+#define __NR_getresgid32 211
+#endif
+#ifndef __NR_setfsuid32
+#define __NR_setfsuid32 215
+#define __NR_setfsgid32 216
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 220
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead 225
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr 226
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr 227
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr 229
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr 230
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr 232
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr 233
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill 238
+#endif
+#ifndef __NR_futex
+#define __NR_futex 240
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 241
+#define __NR_sched_getaffinity 242
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address 258
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime 265
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres 266
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 268
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 269
+#endif
+#ifndef __NR_fadvise64_64
+#define __NR_fadvise64_64 272
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set 289
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get 290
+#endif
+#ifndef __NR_openat
+#define __NR_openat 295
+#endif
+#ifndef __NR_fstatat64
+#define __NR_fstatat64 300
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat 301
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages 317
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu 318
+#endif
+#ifndef __NR_fallocate
+#define __NR_fallocate 324
+#endif
+/* End of i386 definitions */
+#elif defined(__ARM_ARCH_3__)
+#ifndef __NR_setresuid
+#define __NR_setresuid (__NR_SYSCALL_BASE + 164)
+#define __NR_getresuid (__NR_SYSCALL_BASE + 165)
+#define __NR_setresgid (__NR_SYSCALL_BASE + 170)
+#define __NR_getresgid (__NR_SYSCALL_BASE + 171)
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigreturn (__NR_SYSCALL_BASE + 173)
+#define __NR_rt_sigaction (__NR_SYSCALL_BASE + 174)
+#define __NR_rt_sigprocmask (__NR_SYSCALL_BASE + 175)
+#define __NR_rt_sigpending (__NR_SYSCALL_BASE + 176)
+#define __NR_rt_sigsuspend (__NR_SYSCALL_BASE + 179)
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64 (__NR_SYSCALL_BASE + 180)
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 (__NR_SYSCALL_BASE + 181)
+#endif
+#ifndef __NR_ugetrlimit
+#define __NR_ugetrlimit (__NR_SYSCALL_BASE + 191)
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64 (__NR_SYSCALL_BASE + 195)
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64 (__NR_SYSCALL_BASE + 197)
+#endif
+#ifndef __NR_setresuid32
+#define __NR_setresuid32 (__NR_SYSCALL_BASE + 208)
+#define __NR_getresuid32 (__NR_SYSCALL_BASE + 209)
+#define __NR_setresgid32 (__NR_SYSCALL_BASE + 210)
+#define __NR_getresgid32 (__NR_SYSCALL_BASE + 211)
+#endif
+#ifndef __NR_setfsuid32
+#define __NR_setfsuid32 (__NR_SYSCALL_BASE + 215)
+#define __NR_setfsgid32 (__NR_SYSCALL_BASE + 216)
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 (__NR_SYSCALL_BASE + 217)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid (__NR_SYSCALL_BASE + 224)
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead (__NR_SYSCALL_BASE + 225)
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr (__NR_SYSCALL_BASE + 226)
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr (__NR_SYSCALL_BASE + 227)
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr (__NR_SYSCALL_BASE + 229)
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr (__NR_SYSCALL_BASE + 230)
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr (__NR_SYSCALL_BASE + 232)
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr (__NR_SYSCALL_BASE + 233)
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill (__NR_SYSCALL_BASE + 238)
+#endif
+#ifndef __NR_futex
+#define __NR_futex (__NR_SYSCALL_BASE + 240)
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity (__NR_SYSCALL_BASE + 241)
+#define __NR_sched_getaffinity (__NR_SYSCALL_BASE + 242)
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address (__NR_SYSCALL_BASE + 256)
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime (__NR_SYSCALL_BASE + 263)
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres (__NR_SYSCALL_BASE + 264)
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 (__NR_SYSCALL_BASE + 266)
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 (__NR_SYSCALL_BASE + 267)
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set (__NR_SYSCALL_BASE + 314)
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get (__NR_SYSCALL_BASE + 315)
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages (__NR_SYSCALL_BASE + 344)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu (__NR_SYSCALL_BASE + 345)
+#endif
+/* End of ARM 3 definitions */
+#elif defined(__x86_64__)
+#ifndef __NR_pread64
+#define __NR_pread64 17
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 18
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid 117
+#define __NR_getresuid 118
+#define __NR_setresgid 119
+#define __NR_getresgid 120
+#endif
+#ifndef __NR_quotactl
+#define __NR_quotactl 179
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid 186
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead 187
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr 188
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr 189
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr 191
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr 192
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr 194
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr 195
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill 200
+#endif
+#ifndef __NR_futex
+#define __NR_futex 202
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 203
+#define __NR_sched_getaffinity 204
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 217
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address 218
+#endif
+#ifndef __NR_fadvise64
+#define __NR_fadvise64 221
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime 228
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres 229
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set 251
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get 252
+#endif
+#ifndef __NR_openat
+#define __NR_openat 257
+#endif
+#ifndef __NR_newfstatat
+#define __NR_newfstatat 262
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat 263
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages 279
+#endif
+#ifndef __NR_fallocate
+#define __NR_fallocate 285
+#endif
+/* End of x86-64 definitions */
+#elif defined(__mips__)
+#if _MIPS_SIM == _MIPS_SIM_ABI32
+#ifndef __NR_setresuid
+#define __NR_setresuid (__NR_Linux + 185)
+#define __NR_getresuid (__NR_Linux + 186)
+#define __NR_setresgid (__NR_Linux + 190)
+#define __NR_getresgid (__NR_Linux + 191)
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigreturn (__NR_Linux + 193)
+#define __NR_rt_sigaction (__NR_Linux + 194)
+#define __NR_rt_sigprocmask (__NR_Linux + 195)
+#define __NR_rt_sigpending (__NR_Linux + 196)
+#define __NR_rt_sigsuspend (__NR_Linux + 199)
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64 (__NR_Linux + 200)
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 (__NR_Linux + 201)
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64 (__NR_Linux + 213)
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64 (__NR_Linux + 215)
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 (__NR_Linux + 219)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid (__NR_Linux + 222)
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead (__NR_Linux + 223)
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr (__NR_Linux + 224)
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr (__NR_Linux + 225)
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr (__NR_Linux + 227)
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr (__NR_Linux + 228)
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr (__NR_Linux + 230)
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr (__NR_Linux + 231)
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill (__NR_Linux + 236)
+#endif
+#ifndef __NR_futex
+#define __NR_futex (__NR_Linux + 238)
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity (__NR_Linux + 239)
+#define __NR_sched_getaffinity (__NR_Linux + 240)
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address (__NR_Linux + 252)
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 (__NR_Linux + 255)
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 (__NR_Linux + 256)
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime (__NR_Linux + 263)
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres (__NR_Linux + 264)
+#endif
+#ifndef __NR_openat
+#define __NR_openat (__NR_Linux + 288)
+#endif
+#ifndef __NR_fstatat
+#define __NR_fstatat (__NR_Linux + 293)
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat (__NR_Linux + 294)
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages (__NR_Linux + 308)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu (__NR_Linux + 312)
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set (__NR_Linux + 314)
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get (__NR_Linux + 315)
+#endif
+/* End of MIPS (old 32bit API) definitions */
+#elif _MIPS_SIM == _MIPS_SIM_ABI64
+#ifndef __NR_pread64
+#define __NR_pread64 (__NR_Linux + 16)
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 (__NR_Linux + 17)
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid (__NR_Linux + 115)
+#define __NR_getresuid (__NR_Linux + 116)
+#define __NR_setresgid (__NR_Linux + 117)
+#define __NR_getresgid (__NR_Linux + 118)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid (__NR_Linux + 178)
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead (__NR_Linux + 179)
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr (__NR_Linux + 180)
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr (__NR_Linux + 181)
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr (__NR_Linux + 183)
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr (__NR_Linux + 184)
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr (__NR_Linux + 186)
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr (__NR_Linux + 187)
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill (__NR_Linux + 192)
+#endif
+#ifndef __NR_futex
+#define __NR_futex (__NR_Linux + 194)
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity (__NR_Linux + 195)
+#define __NR_sched_getaffinity (__NR_Linux + 196)
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address (__NR_Linux + 212)
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime (__NR_Linux + 222)
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres (__NR_Linux + 223)
+#endif
+#ifndef __NR_openat
+#define __NR_openat (__NR_Linux + 247)
+#endif
+#ifndef __NR_fstatat
+#define __NR_fstatat (__NR_Linux + 252)
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat (__NR_Linux + 253)
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages (__NR_Linux + 267)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu (__NR_Linux + 271)
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set (__NR_Linux + 273)
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get (__NR_Linux + 274)
+#endif
+/* End of MIPS (64bit API) definitions */
+#else
+#ifndef __NR_setresuid
+#define __NR_setresuid (__NR_Linux + 115)
+#define __NR_getresuid (__NR_Linux + 116)
+#define __NR_setresgid (__NR_Linux + 117)
+#define __NR_getresgid (__NR_Linux + 118)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid (__NR_Linux + 178)
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead (__NR_Linux + 179)
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr (__NR_Linux + 180)
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr (__NR_Linux + 181)
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr (__NR_Linux + 183)
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr (__NR_Linux + 184)
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr (__NR_Linux + 186)
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr (__NR_Linux + 187)
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill (__NR_Linux + 192)
+#endif
+#ifndef __NR_futex
+#define __NR_futex (__NR_Linux + 194)
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity (__NR_Linux + 195)
+#define __NR_sched_getaffinity (__NR_Linux + 196)
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address (__NR_Linux + 213)
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 (__NR_Linux + 217)
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 (__NR_Linux + 218)
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime (__NR_Linux + 226)
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres (__NR_Linux + 227)
+#endif
+#ifndef __NR_openat
+#define __NR_openat (__NR_Linux + 251)
+#endif
+#ifndef __NR_fstatat
+#define __NR_fstatat (__NR_Linux + 256)
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat (__NR_Linux + 257)
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages (__NR_Linux + 271)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu (__NR_Linux + 275)
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set (__NR_Linux + 277)
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get (__NR_Linux + 278)
+#endif
+/* End of MIPS (new 32bit API) definitions */
+#endif
+/* End of MIPS definitions */
+#elif defined(__PPC__)
+#ifndef __NR_setfsuid
+#define __NR_setfsuid 138
+#define __NR_setfsgid 139
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid 164
+#define __NR_getresuid 165
+#define __NR_setresgid 169
+#define __NR_getresgid 170
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigreturn 172
+#define __NR_rt_sigaction 173
+#define __NR_rt_sigprocmask 174
+#define __NR_rt_sigpending 175
+#define __NR_rt_sigsuspend 178
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64 179
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 180
+#endif
+#ifndef __NR_ugetrlimit
+#define __NR_ugetrlimit 190
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead 191
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64 195
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64 197
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 202
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid 207
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill 208
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr 209
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr 210
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr 212
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr 213
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr 215
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr 216
+#endif
+#ifndef __NR_futex
+#define __NR_futex 221
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 222
+#define __NR_sched_getaffinity 223
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address 232
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime 246
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres 247
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 252
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 253
+#endif
+#ifndef __NR_fadvise64_64
+#define __NR_fadvise64_64 254
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set 273
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get 274
+#endif
+#ifndef __NR_openat
+#define __NR_openat 286
+#endif
+#ifndef __NR_fstatat64
+#define __NR_fstatat64 291
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat 292
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages 301
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu 302
+#endif
+/* End of powerpc definitions */
+#endif
+
+
+/* After forking, we must make sure to only call system calls. */
+#if __BOUNDED_POINTERS__
+ #error "Need to port invocations of syscalls for bounded ptrs"
+#else
+ /* The core dumper and the thread lister get executed after threads
+ * have been suspended. As a consequence, we cannot call any functions
+ * that acquire locks. Unfortunately, libc wraps most system calls
+ * (e.g. in order to implement pthread_atfork, and to make calls
+ * cancellable), which means we cannot call these functions. Instead,
+ * we have to call syscall() directly.
+ */
+ #undef LSS_ERRNO
+ #ifdef SYS_ERRNO
+ /* Allow the including file to override the location of errno. This can
+ * be useful when using clone() with the CLONE_VM option.
+ * LSS_ERRNO is the lvalue that LSS_RETURN stores the error code into;
+ * when SYS_ERRNO is not defined it falls back to plain errno.
+ */
+ #define LSS_ERRNO SYS_ERRNO
+ #else
+ #define LSS_ERRNO errno
+ #endif
+
+ #undef LSS_INLINE /* Specifier written in front of the wrapper definitions
+ * below. The including file may replace the default
+ * "static inline" by defining SYS_INLINE.
+ */
+ #ifdef SYS_INLINE
+ #define LSS_INLINE SYS_INLINE
+ #else
+ #define LSS_INLINE static inline
+ #endif
+
+ /* Allow the including file to override the prefix used for all new
+ * system calls. By default, it will be set to "sys_".
+ * A negative SYS_PREFIX produces unprefixed names, and the values 0
+ * through 9 produce the prefixes "sys0_" through "sys9_". Any other
+ * value leaves LSS_NAME undefined.
+ */
+ #undef LSS_NAME
+ #ifndef SYS_PREFIX
+ #define LSS_NAME(name) sys_##name
+ #elif SYS_PREFIX < 0
+ #define LSS_NAME(name) name
+ #elif SYS_PREFIX == 0
+ #define LSS_NAME(name) sys0_##name
+ #elif SYS_PREFIX == 1
+ #define LSS_NAME(name) sys1_##name
+ #elif SYS_PREFIX == 2
+ #define LSS_NAME(name) sys2_##name
+ #elif SYS_PREFIX == 3
+ #define LSS_NAME(name) sys3_##name
+ #elif SYS_PREFIX == 4
+ #define LSS_NAME(name) sys4_##name
+ #elif SYS_PREFIX == 5
+ #define LSS_NAME(name) sys5_##name
+ #elif SYS_PREFIX == 6
+ #define LSS_NAME(name) sys6_##name
+ #elif SYS_PREFIX == 7
+ #define LSS_NAME(name) sys7_##name
+ #elif SYS_PREFIX == 8
+ #define LSS_NAME(name) sys8_##name
+ #elif SYS_PREFIX == 9
+ #define LSS_NAME(name) sys9_##name
+ #endif
+
+ /* LSS_RETURN(type, res[, err]) ends a wrapper function. It translates
+ * the raw kernel result into the libc convention: on failure the error
+ * code is stored in LSS_ERRNO and -1 (cast to "type") is returned;
+ * otherwise "res" is returned cast to "type". "res" must be a modifiable
+ * lvalue because the macro assigns to it. The MIPS and PPC variants take
+ * the separate error indicator as the additional "err" argument.
+ */
+ #undef LSS_RETURN
+ #if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__))
+ /* Failing system calls return a negative result in the range of
+ * -1..-4095. These are "errno" values with the sign inverted.
+ */
+ #define LSS_RETURN(type, res) \
+ do { \
+ if ((unsigned long)(res) >= (unsigned long)(-4095)) { \
+ LSS_ERRNO = -(res); \
+ res = -1; \
+ } \
+ return (type) (res); \
+ } while (0)
+ #elif defined(__mips__)
+ /* On MIPS, failing system calls return -1, and set errno in a
+ * separate CPU register.
+ */
+ #define LSS_RETURN(type, res, err) \
+ do { \
+ if (err) { \
+ LSS_ERRNO = (res); \
+ res = -1; \
+ } \
+ return (type) (res); \
+ } while (0)
+ #elif defined(__PPC__)
+ /* On PPC, failing system calls return -1, and set errno in a
+ * separate CPU register. See linux/unistd.h.
+ * "err" is parenthesized so that an expression argument cannot
+ * re-associate with the bit mask.
+ */
+ #define LSS_RETURN(type, res, err) \
+ do { \
+ if ((err) & 0x10000000) { \
+ LSS_ERRNO = (res); \
+ res = -1; \
+ } \
+ return (type) (res); \
+ } while (0)
+ #endif
+ #if defined(__i386__)
+ /* In PIC mode (e.g. when building shared libraries), gcc for i386
+ * reserves ebx. Unfortunately, most distributions ship with implementations
+ * of _syscallX() which clobber ebx.
+ * Also, most definitions of _syscallX() neglect to mark "memory" as being
+ * clobbered. This causes problems with compilers, that do a better job
+ * at optimizing across __asm__ calls.
+ * So, we just have to redefine all of the _syscallX() macros.
+ *
+ * LSS_BODY saves %ebx on the stack, copies asm operand %2 (the first
+ * system call argument) into it, issues "int $0x80" and restores %ebx.
+ * "args" supplies the output and input operand lists; the result ends up
+ * in __res and is handed to LSS_RETURN.
+ */
+ #undef LSS_BODY
+ #define LSS_BODY(type,args...) \
+ long __res; \
+ __asm__ __volatile__("push %%ebx\n" \
+ "movl %2,%%ebx\n" \
+ "int $0x80\n" \
+ "pop %%ebx" \
+ args \
+ : "esp", "memory"); \
+ LSS_RETURN(type,__res)
+ #undef _syscall0 /* Zero-argument wrapper: only %eax is used (syscall number
+ * in, result out), so %ebx does not need to be saved.
+ */
+ #define _syscall0(type,name) \
+ type LSS_NAME(name)(void) { \
+ long __res; \
+ __asm__ volatile("int $0x80" \
+ : "=a" (__res) \
+ : "0" (__NR_##name) \
+ : "memory"); \
+ LSS_RETURN(type,__res); \
+ }
+ #undef _syscall1 /* _syscall1 through _syscall4: the syscall number is tied
+ * to %eax, arg1 is operand %2 (LSS_BODY moves it into
+ * %ebx), and arg2..arg4 are bound to %ecx, %edx and %esi.
+ */
+ #define _syscall1(type,name,type1,arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ LSS_BODY(type, \
+ : "=a" (__res) \
+ : "0" (__NR_##name), "ri" ((long)(arg1))); \
+ }
+ #undef _syscall2
+ #define _syscall2(type,name,type1,arg1,type2,arg2) \
+ type LSS_NAME(name)(type1 arg1,type2 arg2) { \
+ LSS_BODY(type, \
+ : "=a" (__res) \
+ : "0" (__NR_##name),"ri" ((long)(arg1)), "c" ((long)(arg2))); \
+ }
+ #undef _syscall3
+ #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
+ type LSS_NAME(name)(type1 arg1,type2 arg2,type3 arg3) { \
+ LSS_BODY(type, \
+ : "=a" (__res) \
+ : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \
+ "d" ((long)(arg3))); \
+ }
+ #undef _syscall4
+ #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ LSS_BODY(type, \
+ : "=a" (__res) \
+ : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \
+ "d" ((long)(arg3)),"S" ((long)(arg4))); \
+ }
+ #undef _syscall5 /* Five-argument wrapper. Does not use LSS_BODY: the
+ * syscall number is an immediate (operand %1) loaded into
+ * %eax after arg1 (operand %2) has been moved into the
+ * saved-and-restored %ebx. arg2..arg5 are bound to %ecx,
+ * %edx, %esi and %edi.
+ */
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ long __res; \
+ __asm__ __volatile__("push %%ebx\n" \
+ "movl %2,%%ebx\n" \
+ "movl %1,%%eax\n" \
+ "int $0x80\n" \
+ "pop %%ebx" \
+ : "=a" (__res) \
+ : "i" (__NR_##name), "ri" ((long)(arg1)), \
+ "c" ((long)(arg2)), "d" ((long)(arg3)), \
+ "S" ((long)(arg4)), "D" ((long)(arg5)) \
+ : "esp", "memory"); \
+ LSS_RETURN(type,__res); \
+ }
+ #undef _syscall6 /* Six-argument wrapper. arg1 and arg6 are packed into the
+ * local struct __s, whose address is passed in %eax
+ * (operand %2). The asm saves %ebp and %ebx, loads arg6
+ * into %ebp and arg1 into %ebx from that struct, then
+ * loads the syscall number into %eax. arg2..arg5 are
+ * bound to %ecx, %edx, %esi and %edi.
+ */
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ long __res; \
+ struct { long __a1; long __a6; } __s = { (long)arg1, (long) arg6 }; \
+ __asm__ __volatile__("push %%ebp\n" \
+ "push %%ebx\n" \
+ "movl 4(%2),%%ebp\n" \
+ "movl 0(%2), %%ebx\n" \
+ "movl %1,%%eax\n" \
+ "int $0x80\n" \
+ "pop %%ebx\n" \
+ "pop %%ebp" \
+ : "=a" (__res) \
+ : "i" (__NR_##name), "0" ((long)(&__s)), \
+ "c" ((long)(arg2)), "d" ((long)(arg3)), \
+ "S" ((long)(arg4)), "D" ((long)(arg5)) \
+ : "esp", "memory"); \
+ LSS_RETURN(type,__res); \
+ }
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ long __res; /* i386 clone wrapper. Yields -EINVAL without entering the
+ * kernel when fn or child_stack is NULL. Otherwise "fn" and
+ * "arg" are stored on the aligned child stack and the clone
+ * system call is issued. A non-zero result is passed through
+ * LSS_RETURN. When the result is zero (the child), fn(arg) is
+ * called and its return value is passed to system call
+ * number 1.
+ */
+ __asm__ __volatile__(/* if (fn == NULL)
+ * return -EINVAL;
+ */
+ "movl %3,%%ecx\n"
+ "jecxz 1f\n"
+
+ /* if (child_stack == NULL)
+ * return -EINVAL;
+ */
+ "movl %4,%%ecx\n"
+ "jecxz 1f\n"
+
+ /* Set up alignment of the child stack:
+ * child_stack = (child_stack & ~0xF) - 20;
+ */
+ "andl $-16,%%ecx\n"
+ "subl $20,%%ecx\n"
+
+ /* Push "arg" and "fn" onto the stack that will be
+ * used by the child.
+ */
+ "movl %6,%%eax\n"
+ "movl %%eax,4(%%ecx)\n"
+ "movl %3,%%eax\n"
+ "movl %%eax,(%%ecx)\n"
+
+ /* %eax = syscall(%eax = __NR_clone,
+ * %ebx = flags,
+ * %ecx = child_stack,
+ * %edx = parent_tidptr,
+ * %esi = newtls,
+ * %edi = child_tidptr)
+ * Also, make sure that %ebx gets preserved as it is
+ * used in PIC mode.
+ */
+ "movl %8,%%esi\n"
+ "movl %7,%%edx\n"
+ "movl %5,%%eax\n"
+ "movl %9,%%edi\n"
+ "pushl %%ebx\n"
+ "movl %%eax,%%ebx\n"
+ "movl %2,%%eax\n"
+ "int $0x80\n"
+
+ /* In the parent: restore %ebx
+ * In the child: move "fn" into %ebx
+ */
+ "popl %%ebx\n"
+
+ /* if (%eax != 0)
+ * return %eax;
+ */
+ "test %%eax,%%eax\n"
+ "jnz 1f\n"
+
+ /* In the child, now. Terminate frame pointer chain.
+ */
+ "movl $0,%%ebp\n"
+
+ /* Call "fn". "arg" is already on the stack.
+ */
+ "call *%%ebx\n"
+
+ /* Call _exit(%ebx). Unfortunately older versions
+ * of gcc restrict the number of arguments that can
+ * be passed to asm(). So, we need to hard-code the
+ * system call number.
+ */
+ "movl %%eax,%%ebx\n"
+ "movl $1,%%eax\n"
+ "int $0x80\n"
+
+ /* Return to parent.
+ */
+ "1:\n"
+ : "=a" (__res)
+ : "0"(-EINVAL), "i"(__NR_clone),
+ "m"(fn), "m"(child_stack), "m"(flags), "m"(arg),
+ "m"(parent_tidptr), "m"(newtls), "m"(child_tidptr)
+ : "esp", "memory", "ecx", "edx", "esi", "edi");
+ LSS_RETURN(int, __res);
+ }
+
+ #define __NR__fadvise64_64 __NR_fadvise64_64
+ LSS_INLINE _syscall6(int, _fadvise64_64, int, fd,
+ unsigned, offset_lo, unsigned, offset_hi,
+ unsigned, len_lo, unsigned, len_hi,
+ int, advice)
+
+ /* fadvise64() front end for i386: splits the 64-bit offset and length
+ * into 32-bit low/high halves and forwards them, together with fd and
+ * advice, to the six-argument _fadvise64_64 wrapper defined above.
+ */
+ LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset,
+ loff_t len, int advice) {
+ const unsigned offset_lo = (unsigned)offset;
+ const unsigned offset_hi = (unsigned)(offset >>32);
+ const unsigned len_lo = (unsigned)len;
+ const unsigned len_hi = (unsigned)(len >> 32);
+ return LSS_NAME(_fadvise64_64)(fd, offset_lo, offset_hi,
+ len_lo, len_hi, advice);
+ }
+
+ #define __NR__fallocate __NR_fallocate
+ LSS_INLINE _syscall6(int, _fallocate, int, fd, int, mode,
+ unsigned, offset_lo, unsigned, offset_hi,
+ unsigned, len_lo, unsigned, len_hi)
+
+ /* fallocate() front end for i386: reinterprets the 64-bit offset and
+ * length as two 32-bit words each (w[0] first, then w[1]) and forwards
+ * them to the six-argument _fallocate wrapper defined above.
+ */
+ LSS_INLINE int LSS_NAME(fallocate)(int fd, int mode,
+ loff_t offset, loff_t len) {
+ union { loff_t off; unsigned w[2]; } off_words, len_words;
+ off_words.off = offset;
+ len_words.off = len;
+ return LSS_NAME(_fallocate)(fd, mode,
+ off_words.w[0], off_words.w[1],
+ len_words.w[0], len_words.w[1]);
+ }
+
+ LSS_INLINE _syscall1(int, set_thread_area, void *, u) /* one pointer argument, passed through unchanged */
+ LSS_INLINE _syscall1(int, get_thread_area, void *, u) /* same shape as set_thread_area */
+
+ LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) {
+ /* On i386, the kernel does not know how to return from a signal
+ * handler. Instead, it relies on user space to provide a
+ * restorer function that calls the {rt_,}sigreturn() system call.
+ * Unfortunately, we cannot just reference the glibc version of this
+ * function, as glibc goes out of its way to make it inaccessible.
+ *
+ * The stub at label 1 is not executed here: "call 2f" jumps over it and
+ * pushes the address of label 0. That address is popped at label 2 and
+ * adjusted by (1b-0b), so the function returns the address of the
+ * aligned stub, which loads __NR_rt_sigreturn into %eax and traps.
+ */
+ void (*res)(void);
+ __asm__ __volatile__("call 2f\n"
+ "0:.align 16\n"
+ "1:movl %1,%%eax\n"
+ "int $0x80\n"
+ "2:popl %0\n"
+ "addl $(1b-0b),%0\n"
+ : "=a" (res)
+ : "i" (__NR_rt_sigreturn));
+ return res;
+ }
+ LSS_INLINE void (*LSS_NAME(restore)(void))(void) {
+ /* On i386, the kernel does not know how to return from a signal
+ * handler. Instead, it relies on user space to provide a
+ * restorer function that calls the {rt_,}sigreturn() system call.
+ * Unfortunately, we cannot just reference the glibc version of this
+ * function, as glibc goes out of its way to make it inaccessible.
+ *
+ * Same address-computation trick as restore_rt: "call 2f" skips the
+ * stub and the popped return address plus (1b-0b) is the stub's address.
+ * This stub differs in that it first pops one word off the stack and
+ * then invokes __NR_sigreturn instead of __NR_rt_sigreturn.
+ */
+ void (*res)(void);
+ __asm__ __volatile__("call 2f\n"
+ "0:.align 16\n"
+ "1:pop %%eax\n"
+ "movl %1,%%eax\n"
+ "int $0x80\n"
+ "2:popl %0\n"
+ "addl $(1b-0b),%0\n"
+ : "=a" (res)
+ : "i" (__NR_sigreturn));
+ return res;
+ }
+ #elif defined(__x86_64__)
+ /* There are no known problems with any of the _syscallX() macros
+ * currently shipping for x86_64, but we still need to be able to define
+ * our own version so that we can override the location of the errno
+ * location (e.g. when using the clone() system call with the CLONE_VM
+ * option).
+ *
+ * LSS_BODY issues "syscall" with __NR_<name> tied to %rax; the variadic
+ * arguments are additional input operands. r11, rcx and memory are
+ * declared clobbered, and the result in __res goes to LSS_RETURN.
+ */
+ #undef LSS_BODY
+ #define LSS_BODY(type,name, ...) \
+ long __res; \
+ __asm__ __volatile__("syscall" : "=a" (__res) : "0" (__NR_##name), \
+ ##__VA_ARGS__ : "r11", "rcx", "memory"); \
+ LSS_RETURN(type, __res)
+ /* Zero-argument system call wrapper for x86_64. The parameter list is
+ * spelled "(void)" so that the generated definition is a real prototype
+ * in C, matching the i386 and PPC variants of this macro.
+ */
+ #undef _syscall0
+ #define _syscall0(type,name) \
+ type LSS_NAME(name)(void) { \
+ LSS_BODY(type, name); \
+ }
+ #undef _syscall1 /* _syscall1 through _syscall3: arguments are bound
+ * directly to %rdi, %rsi and %rdx through the "D", "S"
+ * and "d" constraints; LSS_BODY does the rest.
+ */
+ #define _syscall1(type,name,type1,arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ LSS_BODY(type, name, "D" ((long)(arg1))); \
+ }
+ #undef _syscall2
+ #define _syscall2(type,name,type1,arg1,type2,arg2) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2) { \
+ LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2))); \
+ }
+ #undef _syscall3
+ #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \
+ LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2)), \
+ "d" ((long)(arg3))); \
+ }
+ #undef _syscall4 /* _syscall4 through _syscall6: the first three arguments
+ * are bound to %rdi, %rsi and %rdx. Arguments four to six
+ * arrive in arbitrary registers ("r") and are moved into
+ * %r10, %r8 and %r9 inside the asm, which is why those
+ * registers appear in the clobber lists.
+ */
+ #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ long __res; \
+ __asm__ __volatile__("movq %5,%%r10; syscall" : \
+ "=a" (__res) : "0" (__NR_##name), \
+ "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \
+ "r" ((long)(arg4)) : "r10", "r11", "rcx", "memory"); \
+ LSS_RETURN(type, __res); \
+ }
+ #undef _syscall5
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ long __res; \
+ __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; syscall" : \
+ "=a" (__res) : "0" (__NR_##name), \
+ "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \
+ "r" ((long)(arg4)), "r" ((long)(arg5)) : \
+ "r8", "r10", "r11", "rcx", "memory"); \
+ LSS_RETURN(type, __res); \
+ }
+ #undef _syscall6
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ long __res; \
+ __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; movq %7,%%r9;" \
+ "syscall" : \
+ "=a" (__res) : "0" (__NR_##name), \
+ "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \
+ "r" ((long)(arg4)), "r" ((long)(arg5)), "r" ((long)(arg6)) : \
+ "r8", "r9", "r10", "r11", "rcx", "memory"); \
+ LSS_RETURN(type, __res); \
+ }
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ long __res; /* x86_64 clone wrapper. Yields -EINVAL without entering the
+ * kernel when fn or child_stack is NULL. Otherwise "fn" and
+ * "arg" are stored on the child stack and the clone system
+ * call is issued. A non-zero result is passed through
+ * LSS_RETURN. When the result is zero (the child), fn(arg) is
+ * called and its return value is passed to the exit system
+ * call.
+ */
+ {
+ register void *__tls __asm__("r8") = newtls;
+ register int *__ctid __asm__("r10") = child_tidptr;
+ __asm__ __volatile__(/* if (fn == NULL)
+ * return -EINVAL;
+ */
+ "testq %4,%4\n"
+ "jz 1f\n"
+
+ /* if (child_stack == NULL)
+ * return -EINVAL;
+ */
+ "testq %5,%5\n"
+ "jz 1f\n"
+
+ /* childstack -= 2*sizeof(void *);
+ */
+ "subq $16,%5\n"
+
+ /* Push "arg" and "fn" onto the stack that will be
+ * used by the child.
+ */
+ "movq %7,8(%5)\n"
+ "movq %4,0(%5)\n"
+
+ /* %rax = syscall(%rax = __NR_clone,
+ * %rdi = flags,
+ * %rsi = child_stack,
+ * %rdx = parent_tidptr,
+ * %r8 = new_tls,
+ * %r10 = child_tidptr)
+ */
+ "movq %2,%%rax\n"
+ "syscall\n"
+
+ /* if (%rax != 0)
+ * return;
+ */
+ "testq %%rax,%%rax\n"
+ "jnz 1f\n"
+
+ /* In the child. Terminate frame pointer chain.
+ */
+ "xorq %%rbp,%%rbp\n"
+
+ /* Call "fn(arg)".
+ */
+ "popq %%rax\n"
+ "popq %%rdi\n"
+ "call *%%rax\n"
+
+ /* Call _exit(%rdi); %rdi receives the return value
+ * of "fn" from %rax.
+ */
+ "movq %%rax,%%rdi\n"
+ "movq %3,%%rax\n"
+ "syscall\n"
+
+ /* Return to parent.
+ */
+ "1:\n"
+ : "=a" (__res)
+ : "0"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit),
+ "r"(fn), "S"(child_stack), "D"(flags), "r"(arg),
+ "d"(parent_tidptr), "r"(__tls), "r"(__ctid)
+ : "rsp", "memory", "r11", "rcx");
+ }
+ LSS_RETURN(int, __res);
+ }
+ LSS_INLINE _syscall2(int, arch_prctl, int, c, void *, a) /* x86_64-only wrapper */
+ LSS_INLINE _syscall4(int, fadvise64, int, fd, loff_t, offset, loff_t, len,
+ int, advice) /* 64-bit offset/len fit in one register each, so no splitting as on i386 */
+
+ LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) {
+ /* On x86-64, the kernel does not know how to return from
+ * a signal handler. Instead, it relies on user space to provide a
+ * restorer function that calls the rt_sigreturn() system call.
+ * Unfortunately, we cannot just reference the glibc version of this
+ * function, as glibc goes out of its way to make it inaccessible.
+ *
+ * The stub at label 1 is not executed here: "call 2f" jumps over it and
+ * pushes the address of label 0. That address is popped at label 2 and
+ * adjusted by (1b-0b), so the function returns the address of the
+ * aligned stub, which loads __NR_rt_sigreturn into %rax and issues
+ * "syscall".
+ */
+ void (*res)(void);
+ __asm__ __volatile__("call 2f\n"
+ "0:.align 16\n"
+ "1:movq %1,%%rax\n"
+ "syscall\n"
+ "2:popq %0\n"
+ "addq $(1b-0b),%0\n"
+ : "=a" (res)
+ : "i" (__NR_rt_sigreturn));
+ return res;
+ }
+ #elif defined(__ARM_ARCH_3__)
+ /* Most definitions of _syscallX() neglect to mark "memory" as being
+ * clobbered. This causes problems with compilers, that do a better job
+ * at optimizing across __asm__ calls.
+ * So, we just have to redefine all of the _syscallX() macros.
+ *
+ * LSS_REG(r,a) declares a local variable __r<r> pinned to ARM register
+ * r<r> and initialized with "a". LSS_BODY emits the instruction text
+ * produced by __syscall(name) (a macro that is not defined in this part
+ * of the file), reads the result from r0 and hands it to LSS_RETURN.
+ */
+ #undef LSS_REG
+ #define LSS_REG(r,a) register long __r##r __asm__("r"#r) = (long)a
+ #undef LSS_BODY
+ #define LSS_BODY(type,name,args...) \
+ register long __res_r0 __asm__("r0"); \
+ long __res; \
+ __asm__ __volatile__ (__syscall(name) \
+ : "=r"(__res_r0) : args : "lr", "memory"); \
+ __res = __res_r0; \
+ LSS_RETURN(type, __res)
+ /* Zero-argument system call wrapper for ARM. The parameter list is
+ * spelled "(void)" so that the generated definition is a real prototype
+ * in C, matching the i386 and PPC variants of this macro.
+ */
+ #undef _syscall0
+ #define _syscall0(type, name) \
+ type LSS_NAME(name)(void) { \
+ LSS_BODY(type, name); \
+ }
+ #undef _syscall1 /* _syscall1 through _syscall6: argument N is pinned to
+ * register r(N-1) with LSS_REG and listed as an input
+ * operand of LSS_BODY.
+ */
+ #define _syscall1(type, name, type1, arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ LSS_REG(0, arg1); LSS_BODY(type, name, "r"(__r0)); \
+ }
+ #undef _syscall2
+ #define _syscall2(type, name, type1, arg1, type2, arg2) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1)); \
+ }
+ #undef _syscall3
+ #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2)); \
+ }
+ #undef _syscall4
+ #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_REG(3, arg4); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3)); \
+ }
+ #undef _syscall5
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_REG(3, arg4); LSS_REG(4, arg5); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \
+ "r"(__r4)); \
+ }
+ #undef _syscall6
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_REG(3, arg4); LSS_REG(4, arg5); LSS_REG(5, arg6); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \
+ "r"(__r4), "r"(__r5)); \
+ }
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ long __res; /* ARM clone wrapper. Yields -EINVAL without entering the
+ * kernel when fn or child_stack is NULL. Otherwise "arg" and
+ * "fn" are stored below child_stack and the clone system call
+ * is issued. A non-zero result is passed through LSS_RETURN.
+ * When the result is zero (the child), fn(arg) is called and
+ * its return value is passed to the exit system call.
+ */
+ {
+ register int __flags __asm__("r0") = flags;
+ register void *__stack __asm__("r1") = child_stack;
+ register void *__ptid __asm__("r2") = parent_tidptr;
+ register void *__tls __asm__("r3") = newtls;
+ register int *__ctid __asm__("r4") = child_tidptr;
+ __asm__ __volatile__(/* if (fn == NULL || child_stack == NULL)
+ * return -EINVAL;
+ */
+ "cmp %2,#0\n"
+ "cmpne %3,#0\n"
+ "moveq %0,%1\n"
+ "beq 1f\n"
+
+ /* Push "arg" and "fn" onto the stack that will be
+ * used by the child.
+ */
+ "str %5,[%3,#-4]!\n"
+ "str %2,[%3,#-4]!\n"
+
+ /* %r0 = syscall(%r0 = flags,
+ * %r1 = child_stack,
+ * %r2 = parent_tidptr,
+ * %r3 = newtls,
+ * %r4 = child_tidptr)
+ */
+ __syscall(clone)"\n"
+
+ /* if (%r0 != 0)
+ * return %r0;
+ */
+ "movs %0,r0\n"
+ "bne 1f\n"
+
+ /* In the child, now. Call "fn(arg)".
+ */
+ "ldr r0,[sp, #4]\n"
+ "mov lr,pc\n"
+ "ldr pc,[sp]\n"
+
+ /* Call _exit(%r0).
+ */
+ __syscall(exit)"\n"
+ "1:\n"
+ : "=r" (__res)
+ : "i"(-EINVAL),
+ "r"(fn), "r"(__stack), "r"(__flags), "r"(arg),
+ "r"(__ptid), "r"(__tls), "r"(__ctid)
+ : "lr", "memory");
+ }
+ LSS_RETURN(int, __res);
+ }
+ #elif defined(__mips__)
+ #undef LSS_REG /* LSS_REG(r,a) declares a local __r<r> pinned to MIPS register
+ * $<r> and initialized with "a".
+ */
+ #define LSS_REG(r,a) register unsigned long __r##r __asm__("$"#r) = \
+ (unsigned long)(a)
+ #undef LSS_BODY /* LSS_BODY(type,name,r7,...): "r7" is the constraint string
+ * used for the __r7 output ("=r" or "+r"), which the caller
+ * must have declared. The syscall number goes in and the
+ * result comes out through $2; __r7 is passed to LSS_RETURN
+ * as the error indicator.
+ */
+ #define LSS_BODY(type,name,r7,...) \
+ register unsigned long __v0 __asm__("$2") = __NR_##name; \
+ __asm__ __volatile__ ("syscall\n" \
+ : "=&r"(__v0), r7 (__r7) \
+ : "0"(__v0), ##__VA_ARGS__ \
+ : "$8", "$9", "$10", "$11", "$12", \
+ "$13", "$14", "$15", "$24", "memory"); \
+ LSS_RETURN(type, __v0, __r7)
+ /* Zero-argument system call wrapper for MIPS. $7 is output-only here
+ * ("=r") because no fourth argument is passed in it. The parameter list
+ * is spelled "(void)" so that the generated definition is a real
+ * prototype in C, matching the i386 and PPC variants of this macro.
+ */
+ #undef _syscall0
+ #define _syscall0(type, name) \
+ type LSS_NAME(name)(void) { \
+ register unsigned long __r7 __asm__("$7"); \
+ LSS_BODY(type, name, "=r"); \
+ }
+ #undef _syscall1 /* _syscall1 through _syscall4: arguments one to three are
+ * pinned to $4, $5 and $6. Up to three arguments $7 is an
+ * output only ("=r"); _syscall4 also passes arg4 in $7, so
+ * it uses "+r" to make the register both input and output.
+ */
+ #define _syscall1(type, name, type1, arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ register unsigned long __r7 __asm__("$7"); \
+ LSS_REG(4, arg1); LSS_BODY(type, name, "=r", "r"(__r4)); \
+ }
+ #undef _syscall2
+ #define _syscall2(type, name, type1, arg1, type2, arg2) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2) { \
+ register unsigned long __r7 __asm__("$7"); \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); \
+ LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5)); \
+ }
+ #undef _syscall3
+ #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \
+ register unsigned long __r7 __asm__("$7"); \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5), "r"(__r6)); \
+ }
+ #undef _syscall4
+ #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); \
+ LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6)); \
+ }
+ #undef _syscall5
+ #if _MIPS_SIM == _MIPS_SIM_ABI32
+ /* The old 32bit MIPS system call API passes the fifth and sixth argument
+ * on the stack, whereas the new APIs use registers "r8" and "r9".
+ * This variant loads arg5 from memory (operand %6) into $2 before the
+ * stack pointer is lowered by 32 bytes, stores it at offset 16 of the
+ * new frame, and only then loads the syscall number into $2.
+ */
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); \
+ register unsigned long __v0 __asm__("$2"); \
+ __asm__ __volatile__ (".set noreorder\n" \
+ "lw $2, %6\n" \
+ "subu $29, 32\n" \
+ "sw $2, 16($29)\n" \
+ "li $2, %2\n" \
+ "syscall\n" \
+ "addiu $29, 32\n" \
+ ".set reorder\n" \
+ : "=&r"(__v0), "+r" (__r7) \
+ : "i" (__NR_##name), "r"(__r4), "r"(__r5), \
+ "r"(__r6), "m" ((unsigned long)arg5) \
+ : "$8", "$9", "$10", "$11", "$12", \
+ "$13", "$14", "$15", "$24", "memory"); \
+ LSS_RETURN(type, __v0, __r7); \
+ }
+ #else
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); LSS_REG(8, arg5); \
+ LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \
+ "r"(__r8)); \
+ }
+ #endif
+ #undef _syscall6
+ #if _MIPS_SIM == _MIPS_SIM_ABI32
+ /* The old 32bit MIPS system call API passes the fifth and sixth argument
+ * on the stack, whereas the new APIs use registers "r8" and "r9".
+ *
+ * arg5 and arg6 are register operands (%6 and %7), so they are stored
+ * straight into the outgoing argument slots at offsets 16 and 20 of the
+ * freshly reserved 32-byte frame. (A load instruction such as "lw" cannot
+ * be used on them, as it requires a memory operand.) Neither operand can
+ * live in $2 or $8: $2 is an early-clobber output and $8 is listed as
+ * clobbered.
+ */
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); \
+ register unsigned long __v0 __asm__("$2"); \
+ __asm__ __volatile__ (".set noreorder\n" \
+ "subu $29, 32\n" \
+ "sw %6, 16($29)\n" \
+ "sw %7, 20($29)\n" \
+ "li $2, %2\n" \
+ "syscall\n" \
+ "addiu $29, 32\n" \
+ ".set reorder\n" \
+ : "=&r"(__v0), "+r" (__r7) \
+ : "i" (__NR_##name), "r"(__r4), "r"(__r5), \
+ "r"(__r6), "r" ((unsigned long)arg5), \
+ "r" ((unsigned long)arg6) \
+ : "$8", "$9", "$10", "$11", "$12", \
+ "$13", "$14", "$15", "$24", "memory"); \
+ LSS_RETURN(type, __v0, __r7); \
+ }
+ #else
+ /* The newer ABIs pass all six arguments in registers $4..$9. */
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5,type6 arg6) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); LSS_REG(8, arg5); LSS_REG(9, arg6); \
+ LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \
+ "r"(__r8), "r"(__r9)); \
+ }
+ #endif
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) { /* MIPS clone wrapper.
+ * Loads -EINVAL into the result and skips the system call when fn or
+ * child_stack is NULL. Otherwise "fn" and "arg" are stored in a 32-byte
+ * area below child_stack and the clone system call is issued. If $7 or
+ * $2 is non-zero afterwards, control goes to the common exit; otherwise
+ * (the child) fn(arg) is called and its return value is passed to the
+ * exit system call.
+ * NOTE(review): on the early-exit path $7 still holds newtls, so whether
+ * LSS_RETURN treats the -EINVAL result as an error appears to depend on
+ * that value - confirm.
+ */
+ register unsigned long __v0 __asm__("$2");
+ register unsigned long __r7 __asm__("$7") = (unsigned long)newtls;
+ {
+ register int __flags __asm__("$4") = flags;
+ register void *__stack __asm__("$5") = child_stack;
+ register void *__ptid __asm__("$6") = parent_tidptr;
+ register int *__ctid __asm__("$8") = child_tidptr;
+ __asm__ __volatile__(
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+ "subu $29,24\n"
+ #elif _MIPS_SIM == _MIPS_SIM_NABI32
+ "sub $29,16\n"
+ #else
+ "dsubu $29,16\n"
+ #endif
+
+ /* if (fn == NULL || child_stack == NULL)
+ * return -EINVAL;
+ */
+ "li %0,%2\n"
+ "beqz %5,1f\n"
+ "beqz %6,1f\n"
+
+ /* Push "arg" and "fn" onto the stack that will be
+ * used by the child.
+ */
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+ "subu %6,32\n"
+ "sw %5,0(%6)\n"
+ "sw %8,4(%6)\n"
+ #elif _MIPS_SIM == _MIPS_SIM_NABI32
+ "sub %6,32\n"
+ "sw %5,0(%6)\n"
+ "sw %8,8(%6)\n"
+ #else
+ "dsubu %6,32\n"
+ "sd %5,0(%6)\n"
+ "sd %8,8(%6)\n"
+ #endif
+
+ /* $7 = syscall($4 = flags,
+ * $5 = child_stack,
+ * $6 = parent_tidptr,
+ * $7 = newtls,
+ * $8 = child_tidptr)
+ */
+ "li $2,%3\n"
+ "syscall\n"
+
+ /* if ($7 != 0)
+ * return $2;
+ */
+ "bnez $7,1f\n"
+ "bnez $2,1f\n"
+
+ /* In the child, now. Call "fn(arg)".
+ */
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+ "lw $25,0($29)\n"
+ "lw $4,4($29)\n"
+ #elif _MIPS_SIM == _MIPS_SIM_NABI32
+ "lw $25,0($29)\n"
+ "lw $4,8($29)\n"
+ #else
+ "ld $25,0($29)\n"
+ "ld $4,8($29)\n"
+ #endif
+ "jalr $25\n"
+
+ /* Call _exit($2)
+ */
+ "move $4,$2\n"
+ "li $2,%4\n"
+ "syscall\n"
+
+ "1:\n"
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+ "addu $29, 24\n"
+ #elif _MIPS_SIM == _MIPS_SIM_NABI32
+ "add $29, 16\n"
+ #else
+ "daddu $29,16\n"
+ #endif
+ : "=&r" (__v0), "=r" (__r7)
+ : "i"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit),
+ "r"(fn), "r"(__stack), "r"(__flags), "r"(arg),
+ "r"(__ptid), "r"(__r7), "r"(__ctid)
+ : "$9", "$10", "$11", "$12", "$13", "$14", "$15",
+ "$24", "memory");
+ }
+ LSS_RETURN(int, __v0, __r7);
+ }
+ #elif defined (__PPC__)
+ #undef LSS_LOADARGS_0 /* LSS_LOADARGS_N assigns the syscall number to __sc_0
+ * and argument k (1..N) to __sc_(k+2). Each level
+ * expands the previous one and adds one assignment.
+ * The __sc_* variables are declared by LSS_BODY.
+ */
+ #define LSS_LOADARGS_0(name, dummy...) \
+ __sc_0 = __NR_##name
+ #undef LSS_LOADARGS_1
+ #define LSS_LOADARGS_1(name, arg1) \
+ LSS_LOADARGS_0(name); \
+ __sc_3 = (unsigned long) (arg1)
+ #undef LSS_LOADARGS_2
+ #define LSS_LOADARGS_2(name, arg1, arg2) \
+ LSS_LOADARGS_1(name, arg1); \
+ __sc_4 = (unsigned long) (arg2)
+ #undef LSS_LOADARGS_3
+ #define LSS_LOADARGS_3(name, arg1, arg2, arg3) \
+ LSS_LOADARGS_2(name, arg1, arg2); \
+ __sc_5 = (unsigned long) (arg3)
+ #undef LSS_LOADARGS_4
+ #define LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4) \
+ LSS_LOADARGS_3(name, arg1, arg2, arg3); \
+ __sc_6 = (unsigned long) (arg4)
+ #undef LSS_LOADARGS_5
+ #define LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5) \
+ LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4); \
+ __sc_7 = (unsigned long) (arg5)
+ #undef LSS_LOADARGS_6
+ #define LSS_LOADARGS_6(name, arg1, arg2, arg3, arg4, arg5, arg6) \
+ LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5); \
+ __sc_8 = (unsigned long) (arg6)
+ #undef LSS_ASMINPUT_0 /* LSS_ASMINPUT_N is the asm input-operand list for an
+ * N-argument call. Each entry uses a matching-digit
+ * constraint, tying the input to the output operand
+ * with the same index in LSS_BODY.
+ */
+ #define LSS_ASMINPUT_0 "0" (__sc_0)
+ #undef LSS_ASMINPUT_1
+ #define LSS_ASMINPUT_1 LSS_ASMINPUT_0, "1" (__sc_3)
+ #undef LSS_ASMINPUT_2
+ #define LSS_ASMINPUT_2 LSS_ASMINPUT_1, "2" (__sc_4)
+ #undef LSS_ASMINPUT_3
+ #define LSS_ASMINPUT_3 LSS_ASMINPUT_2, "3" (__sc_5)
+ #undef LSS_ASMINPUT_4
+ #define LSS_ASMINPUT_4 LSS_ASMINPUT_3, "4" (__sc_6)
+ #undef LSS_ASMINPUT_5
+ #define LSS_ASMINPUT_5 LSS_ASMINPUT_4, "5" (__sc_7)
+ #undef LSS_ASMINPUT_6
+ #define LSS_ASMINPUT_6 LSS_ASMINPUT_5, "6" (__sc_8)
+ #undef LSS_BODY /* LSS_BODY(nr, type, name, args...): "nr" selects the
+ * LSS_LOADARGS_<nr> / LSS_ASMINPUT_<nr> pair. After "sc",
+ * "mfcr" copies the condition register into r0 (__sc_0),
+ * which becomes the error indicator; the result is taken
+ * from r3. Both are handed to LSS_RETURN.
+ */
+ #define LSS_BODY(nr, type, name, args...) \
+ long __sc_ret, __sc_err; \
+ { \
+ register unsigned long __sc_0 __asm__ ("r0"); \
+ register unsigned long __sc_3 __asm__ ("r3"); \
+ register unsigned long __sc_4 __asm__ ("r4"); \
+ register unsigned long __sc_5 __asm__ ("r5"); \
+ register unsigned long __sc_6 __asm__ ("r6"); \
+ register unsigned long __sc_7 __asm__ ("r7"); \
+ register unsigned long __sc_8 __asm__ ("r8"); \
+ \
+ LSS_LOADARGS_##nr(name, args); \
+ __asm__ __volatile__ \
+ ("sc\n\t" \
+ "mfcr %0" \
+ : "=&r" (__sc_0), \
+ "=&r" (__sc_3), "=&r" (__sc_4), \
+ "=&r" (__sc_5), "=&r" (__sc_6), \
+ "=&r" (__sc_7), "=&r" (__sc_8) \
+ : LSS_ASMINPUT_##nr \
+ : "cr0", "ctr", "memory", \
+ "r9", "r10", "r11", "r12"); \
+ __sc_ret = __sc_3; \
+ __sc_err = __sc_0; \
+ } \
+ LSS_RETURN(type, __sc_ret, __sc_err)
+ #undef _syscall0 /* _syscall0 through _syscall6 for PPC: each wrapper only
+ * forwards its argument count, return type, name and
+ * arguments to LSS_BODY.
+ */
+ #define _syscall0(type, name) \
+ type LSS_NAME(name)(void) { \
+ LSS_BODY(0, type, name); \
+ }
+ #undef _syscall1
+ #define _syscall1(type, name, type1, arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ LSS_BODY(1, type, name, arg1); \
+ }
+ #undef _syscall2
+ #define _syscall2(type, name, type1, arg1, type2, arg2) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2) { \
+ LSS_BODY(2, type, name, arg1, arg2); \
+ }
+ #undef _syscall3
+ #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \
+ LSS_BODY(3, type, name, arg1, arg2, arg3); \
+ }
+ #undef _syscall4
+ #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ LSS_BODY(4, type, name, arg1, arg2, arg3, arg4); \
+ }
+ #undef _syscall5
+ #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ LSS_BODY(5, type, name, arg1, arg2, arg3, arg4, arg5); \
+ }
+ #undef _syscall6
+ #define _syscall6(type, name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5, type6, arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ LSS_BODY(6, type, name, arg1, arg2, arg3, arg4, arg5, arg6); \
+ }
+ /* clone function adapted from glibc 2.3.6 clone.S
+ * Skips the system call when fn or child_stack is NULL. Otherwise it
+ * aligns child_stack, issues the clone system call and, when r3 is zero
+ * and no error is flagged afterwards (the child), calls fn(arg) and
+ * passes its return value to the exit system call. At the common exit
+ * the condition register and r3 are copied out and given to LSS_RETURN.
+ */
+ /* TODO(csilvers): consider wrapping some args up in a struct, like we
+ * do for i386's _syscall6, so we can compile successfully on gcc 2.95
+ */
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ long __ret, __err;
+ {
+ register int (*__fn)(void *) __asm__ ("r8") = fn;
+ register void *__cstack __asm__ ("r4") = child_stack;
+ register int __flags __asm__ ("r3") = flags;
+ register void * __arg __asm__ ("r9") = arg;
+ register int * __ptidptr __asm__ ("r5") = parent_tidptr;
+ register void * __newtls __asm__ ("r6") = newtls;
+ register int * __ctidptr __asm__ ("r7") = child_tidptr;
+ __asm__ __volatile__(
+ /* check for fn == NULL
+ * and child_stack == NULL
+ */
+ "cmpwi cr0, %6, 0\n\t"
+ "cmpwi cr1, %7, 0\n\t"
+ "cror cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t"
+ "beq- cr0, 1f\n\t"
+
+ /* set up stack frame for child */
+ "clrrwi %7, %7, 4\n\t"
+ "li 0, 0\n\t"
+ "stwu 0, -16(%7)\n\t"
+
+ /* fn, child_stack and arg are saved across the syscall in
+ * r28, r29 and r27 respectively
+ */
+ "mr 28, %6\n\t"
+ "mr 29, %7\n\t"
+ "mr 27, %9\n\t"
+
+ /* syscall */
+ "li 0, %4\n\t"
+ /* flags already in r3
+ * child_stack already in r4
+ * ptidptr already in r5
+ * newtls already in r6
+ * ctidptr already in r7
+ */
+ "sc\n\t"
+
+ /* Test if syscall was successful */
+ "cmpwi cr1, 3, 0\n\t"
+ "crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t"
+ "bne- cr1, 1f\n\t"
+
+ /* Do the function call */
+ "mtctr 28\n\t"
+ "mr 3, 27\n\t"
+ "bctrl\n\t"
+
+ /* Call _exit(r3) */
+ "li 0, %5\n\t"
+ "sc\n\t"
+
+ /* Return to parent */
+ "1:\n"
+ "mfcr %1\n\t"
+ "mr %0, 3\n\t"
+ : "=r" (__ret), "=r" (__err)
+ : "0" (-1), "1" (EINVAL),
+ "i" (__NR_clone), "i" (__NR_exit),
+ "r" (__fn), "r" (__cstack), "r" (__flags),
+ "r" (__arg), "r" (__ptidptr), "r" (__newtls),
+ "r" (__ctidptr)
+ : "cr0", "cr1", "memory", "ctr",
+ "r0", "r29", "r27", "r28");
+ }
+ LSS_RETURN(int, __ret, __err);
+ }
+ #endif
+ /* Architecture-independent syscall wrappers.
+  *
+  * Each LSS_INLINE _syscallN(ret, name, type1, arg1, ...) line defines the
+  * function LSS_NAME(name) with the listed return type and parameters.
+  * The __NR__xxx aliases below give the underscore-prefixed wrappers
+  * (_exit, _gettid, _mremap) a syscall number under their wrapper name;
+  * presumably LSS_BODY looks the number up as __NR_<name> -- confirm
+  * against the LSS_BODY definition.
+  */
+ #define __NR__exit __NR_exit
+ #define __NR__gettid __NR_gettid
+ #define __NR__mremap __NR_mremap
+ LSS_INLINE _syscall1(int, brk, void *, e)
+ LSS_INLINE _syscall1(int, chdir, const char *,p)
+ LSS_INLINE _syscall1(int, close, int, f)
+ LSS_INLINE _syscall2(int, clock_getres, int, c,
+ struct kernel_timespec*, t)
+ LSS_INLINE _syscall2(int, clock_gettime, int, c,
+ struct kernel_timespec*, t)
+ LSS_INLINE _syscall1(int, dup, int, f)
+ LSS_INLINE _syscall2(int, dup2, int, s,
+ int, d)
+ LSS_INLINE _syscall3(int, execve, const char*, f,
+ const char*const*,a,const char*const*, e)
+ LSS_INLINE _syscall1(int, _exit, int, e)
+ LSS_INLINE _syscall1(int, exit_group, int, e)
+ LSS_INLINE _syscall3(int, fcntl, int, f,
+ int, c, long, a)
+ LSS_INLINE _syscall0(pid_t, fork)
+ LSS_INLINE _syscall2(int, fstat, int, f,
+ struct kernel_stat*, b)
+ LSS_INLINE _syscall2(int, fstatfs, int, f,
+ struct kernel_statfs*, b)
+ LSS_INLINE _syscall2(int, ftruncate, int, f,
+ off_t, l)
+ LSS_INLINE _syscall4(int, futex, int*, a,
+ int, o, int, v,
+ struct kernel_timespec*, t)
+ LSS_INLINE _syscall3(int, getdents, int, f,
+ struct kernel_dirent*, d, int, c)
+ LSS_INLINE _syscall3(int, getdents64, int, f,
+ struct kernel_dirent64*, d, int, c)
+ LSS_INLINE _syscall0(gid_t, getegid)
+ LSS_INLINE _syscall0(uid_t, geteuid)
+ LSS_INLINE _syscall0(pid_t, getpgrp)
+ LSS_INLINE _syscall0(pid_t, getpid)
+ LSS_INLINE _syscall0(pid_t, getppid)
+ LSS_INLINE _syscall2(int, getpriority, int, a,
+ int, b)
+ LSS_INLINE _syscall3(int, getresgid, gid_t *, r,
+ gid_t *, e, gid_t *, s)
+ LSS_INLINE _syscall3(int, getresuid, uid_t *, r,
+ uid_t *, e, uid_t *, s)
+ LSS_INLINE _syscall2(int, getrlimit, int, r,
+ struct kernel_rlimit*, l)
+ LSS_INLINE _syscall1(pid_t, getsid, pid_t, p)
+ /* Raw gettid; the public gettid() wrapper further down adds a fallback. */
+ LSS_INLINE _syscall0(pid_t, _gettid)
+ LSS_INLINE _syscall2(int, gettimeofday, struct timeval *, v,
+ struct timezone *, z)
+ /* Extended attribute syscalls. */
+ LSS_INLINE _syscall5(int, setxattr, const char *,p,
+ const char *, n, const void *,v,
+ size_t, s, int, f)
+ LSS_INLINE _syscall5(int, lsetxattr, const char *,p,
+ const char *, n, const void *,v,
+ size_t, s, int, f)
+ LSS_INLINE _syscall4(ssize_t, getxattr, const char *,p,
+ const char *, n, void *, v, size_t, s)
+ LSS_INLINE _syscall4(ssize_t, lgetxattr, const char *,p,
+ const char *, n, void *, v, size_t, s)
+ LSS_INLINE _syscall3(ssize_t, listxattr, const char *,p,
+ char *, l, size_t, s)
+ LSS_INLINE _syscall3(ssize_t, llistxattr, const char *,p,
+ char *, l, size_t, s)
+ LSS_INLINE _syscall3(int, ioctl, int, d,
+ int, r, void *, a)
+ LSS_INLINE _syscall2(int, ioprio_get, int, which,
+ int, who)
+ LSS_INLINE _syscall3(int, ioprio_set, int, which,
+ int, who, int, ioprio)
+ LSS_INLINE _syscall2(int, kill, pid_t, p,
+ int, s)
+ LSS_INLINE _syscall3(off_t, lseek, int, f,
+ off_t, o, int, w)
+ LSS_INLINE _syscall2(int, munmap, void*, s,
+ size_t, l)
+ LSS_INLINE _syscall6(long, move_pages, pid_t, p,
+ unsigned long, n, void **,g, int *, d,
+ int *, s, int, f)
+ LSS_INLINE _syscall3(int, mprotect, const void *,a,
+ size_t, l, int, p)
+ /* Raw five-argument mremap; the variadic mremap() wrapper further down
+  * forwards to it.
+  */
+ LSS_INLINE _syscall5(void*, _mremap, void*, o,
+ size_t, os, size_t, ns,
+ unsigned long, f, void *, a)
+ LSS_INLINE _syscall3(int, open, const char*, p,
+ int, f, int, m)
+ LSS_INLINE _syscall3(int, poll, struct kernel_pollfd*, u,
+ unsigned int, n, int, t)
+ LSS_INLINE _syscall2(int, prctl, int, o,
+ long, a)
+ LSS_INLINE _syscall4(long, ptrace, int, r,
+ pid_t, p, void *, a, void *, d)
+ #if defined(__NR_quotactl)
+ // Defined on x86_64 / i386 only
+ LSS_INLINE _syscall4(int, quotactl, int, cmd, const char *, special,
+ int, id, caddr_t, addr)
+ #endif
+ LSS_INLINE _syscall3(ssize_t, read, int, f,
+ void *, b, size_t, c)
+ LSS_INLINE _syscall3(int, readlink, const char*, p,
+ char*, b, size_t, s)
+ /* rt_* signal syscalls take the byte size of the signal set as their last
+  * argument; callers in this file pass (KERNEL_NSIG+7)/8.
+  */
+ LSS_INLINE _syscall4(int, rt_sigaction, int, s,
+ const struct kernel_sigaction*, a,
+ struct kernel_sigaction*, o, size_t, c)
+ LSS_INLINE _syscall2(int, rt_sigpending, struct kernel_sigset_t *, s,
+ size_t, c)
+ /* Remaining rt_* signal syscalls. The final size_t argument is the byte
+  * size of the signal set; callers in this file pass (KERNEL_NSIG+7)/8.
+  * The _syscallN macros expand to complete function definitions, so no
+  * trailing ';' is written (an extra ';' at file scope is rejected by
+  * pedantic ISO C and is inconsistent with the sibling declarations).
+  */
+ LSS_INLINE _syscall4(int, rt_sigprocmask, int, h,
+ const struct kernel_sigset_t*, s,
+ struct kernel_sigset_t*, o, size_t, c)
+ LSS_INLINE _syscall1(int, rt_sigreturn, unsigned long, u)
+ LSS_INLINE _syscall2(int, rt_sigsuspend,
+ const struct kernel_sigset_t*, s, size_t, c)
+ /* Scheduling, thread-id and credential/session wrappers. Each line
+  * defines LSS_NAME(name) with the listed return type and parameters.
+  */
+ LSS_INLINE _syscall3(int, sched_getaffinity,pid_t, p,
+ unsigned int, l, unsigned long *, m)
+ LSS_INLINE _syscall3(int, sched_setaffinity,pid_t, p,
+ unsigned int, l, unsigned long *, m)
+ LSS_INLINE _syscall0(int, sched_yield)
+ LSS_INLINE _syscall1(long, set_tid_address, int *, t)
+ LSS_INLINE _syscall1(int, setfsgid, gid_t, g)
+ LSS_INLINE _syscall1(int, setfsuid, uid_t, u)
+ LSS_INLINE _syscall1(int, setuid, uid_t, u)
+ LSS_INLINE _syscall1(int, setgid, gid_t, g)
+ LSS_INLINE _syscall2(int, setpgid, pid_t, p,
+ pid_t, g)
+ LSS_INLINE _syscall3(int, setpriority, int, a,
+ int, b, int, p)
+ LSS_INLINE _syscall3(int, setresgid, gid_t, r,
+ gid_t, e, gid_t, s)
+ LSS_INLINE _syscall3(int, setresuid, uid_t, r,
+ uid_t, e, uid_t, s)
+ LSS_INLINE _syscall2(int, setrlimit, int, r,
+ const struct kernel_rlimit*, l)
+ LSS_INLINE _syscall0(pid_t, setsid)
+ LSS_INLINE _syscall2(int, sigaltstack, const stack_t*, s,
+ const stack_t*, o)
+ /* File-status, signal-delivery and basic I/O wrappers. sigreturn and
+  * getcpu are only declared when the kernel headers provide a syscall
+  * number for them. The _syscallN macros expand to complete function
+  * definitions, so no trailing ';' is written after them.
+  */
+ #if defined(__NR_sigreturn)
+ LSS_INLINE _syscall1(int, sigreturn, unsigned long, u)
+ #endif
+ LSS_INLINE _syscall2(int, stat, const char*, f,
+ struct kernel_stat*, b)
+ LSS_INLINE _syscall2(int, statfs, const char*, f,
+ struct kernel_statfs*, b)
+ LSS_INLINE _syscall3(int, tgkill, pid_t, p,
+ pid_t, t, int, s)
+ LSS_INLINE _syscall2(int, tkill, pid_t, p,
+ int, s)
+ LSS_INLINE _syscall3(ssize_t, write, int, f,
+ const void *, b, size_t, c)
+ LSS_INLINE _syscall3(ssize_t, writev, int, f,
+ const struct kernel_iovec*, v, size_t, c)
+ LSS_INLINE _syscall1(int, unlink, const char*, f)
+ #if defined(__NR_getcpu)
+ LSS_INLINE _syscall3(long, getcpu, unsigned *, cpu,
+ unsigned *, node, void *, unused)
+ #endif
+ /* On x86_64 and on MIPS ABIs other than o32 the socket operations are
+  * declared as individual syscalls. Other architectures in this file
+  * route them through socketcall instead.
+  */
+ #if defined(__x86_64__) || \
+ (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32)
+ LSS_INLINE _syscall3(int, recvmsg, int, s,
+ struct kernel_msghdr*, m, int, f)
+ LSS_INLINE _syscall3(int, sendmsg, int, s,
+ const struct kernel_msghdr*, m, int, f)
+ LSS_INLINE _syscall6(int, sendto, int, s,
+ const void*, m, size_t, l,
+ int, f,
+ const struct kernel_sockaddr*, a, int, t)
+ LSS_INLINE _syscall2(int, shutdown, int, s,
+ int, h)
+ LSS_INLINE _syscall3(int, socket, int, d,
+ int, t, int, p)
+ LSS_INLINE _syscall4(int, socketpair, int, d,
+ int, t, int, p, int*, s)
+ #endif
+ #if defined(__x86_64__)
+ /* x86_64-only declarations. The *32 id functions simply forward to the
+  * plain getres[ug]id/setfs[ug]id/setres[ug]id wrappers here, so callers
+  * can use the *32 names on every architecture.
+  */
+ LSS_INLINE _syscall4(int, fallocate, int, fd, int, mode,
+ loff_t, offset, loff_t, len)
+
+ /* Forwards to getresgid(). */
+ LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid,
+ gid_t *egid,
+ gid_t *sgid) {
+ return LSS_NAME(getresgid)(rgid, egid, sgid);
+ }
+
+ /* Forwards to getresuid(). */
+ LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid,
+ uid_t *euid,
+ uid_t *suid) {
+ return LSS_NAME(getresuid)(ruid, euid, suid);
+ }
+
+ LSS_INLINE _syscall6(void*, mmap, void*, s,
+ size_t, l, int, p,
+ int, f, int, d,
+ __off64_t, o)
+
+ LSS_INLINE _syscall4(int, newfstatat, int, d,
+ const char *, p,
+ struct kernel_stat*, b, int, f)
+
+ /* Forwards to setfsgid(). */
+ LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) {
+ return LSS_NAME(setfsgid)(gid);
+ }
+
+ /* Forwards to setfsuid(). */
+ LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) {
+ return LSS_NAME(setfsuid)(uid);
+ }
+
+ /* Forwards to setresgid(). */
+ LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) {
+ return LSS_NAME(setresgid)(rgid, egid, sgid);
+ }
+
+ /* Forwards to setresuid(). */
+ LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) {
+ return LSS_NAME(setresuid)(ruid, euid, suid);
+ }
+
+ /* Installs and/or queries the handler for signum via rt_sigaction.
+  *
+  * On x86_64, the kernel requires us to always set our own SA_RESTORER in
+  * order to be able to return from a signal handler. This function must
+  * have a "magic" signature that the "gdb" (and maybe the kernel?) can
+  * recognize. When the caller did not supply a restorer, a private copy
+  * of *act is handed to the kernel with SA_RESTORER and restore_rt()
+  * filled in; the caller's structure is never modified.
+  */
+ LSS_INLINE int LSS_NAME(sigaction)(int signum,
+ const struct kernel_sigaction *act,
+ struct kernel_sigaction *oldact) {
+ struct kernel_sigaction with_restorer;
+ const struct kernel_sigaction *effective = act;
+ if (act != NULL && !(act->sa_flags & SA_RESTORER)) {
+ with_restorer = *act;
+ with_restorer.sa_flags |= SA_RESTORER;
+ with_restorer.sa_restorer = LSS_NAME(restore_rt)();
+ effective = &with_restorer;
+ }
+ return LSS_NAME(rt_sigaction)(signum, effective, oldact,
+ (KERNEL_NSIG+7)/8);
+ }
+
+ /* x86_64 signal-mask helpers: each forwards to the matching rt_* syscall
+  * and supplies (KERNEL_NSIG+7)/8 as the signal-set size in bytes.
+  */
+
+ /* Forwards to rt_sigpending(set, size). */
+ LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) {
+ return LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8);
+ }
+
+ /* Forwards to rt_sigprocmask(how, set, oldset, size). */
+ LSS_INLINE int LSS_NAME(sigprocmask)(int how,
+ const struct kernel_sigset_t *set,
+ struct kernel_sigset_t *oldset) {
+ return LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8);
+ }
+
+ /* Forwards to rt_sigsuspend(set, size). */
+ LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) {
+ return LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8);
+ }
+ #endif
+ /* Architectures where waitpid() is implemented on top of wait4. */
+ #if defined(__x86_64__) || defined(__ARM_ARCH_3__) || \
+ (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32)
+ LSS_INLINE _syscall4(pid_t, wait4, pid_t, p,
+ int*, s, int, o,
+ struct kernel_rusage*, r)
+
+ /* waitpid() expressed as wait4() with a null rusage pointer. */
+ LSS_INLINE pid_t LSS_NAME(waitpid)(pid_t pid, int *status, int options){
+ return LSS_NAME(wait4)(pid, status, options, 0);
+ }
+ #endif
+ /* Directory-relative open/unlink, declared for i386 and x86_64 only. */
+ #if defined(__i386__) || defined(__x86_64__)
+ LSS_INLINE _syscall4(int, openat, int, d, const char *, p, int, f, int, m)
+ LSS_INLINE _syscall3(int, unlinkat, int, d, const char *, p, int, f)
+ #endif
+ #if defined(__i386__) || defined(__ARM_ARCH_3__)
+ /* Raw 32-bit-id syscalls. They are declared with a leading underscore so
+  * that the public getresgid32()/setresuid32()/... functions defined
+  * afterwards can add a fallback to the legacy syscalls; the __NR__xxx
+  * aliases map the underscore names onto the real syscall numbers.
+  */
+ #define __NR__getresgid32 __NR_getresgid32
+ #define __NR__getresuid32 __NR_getresuid32
+ #define __NR__setfsgid32 __NR_setfsgid32
+ #define __NR__setfsuid32 __NR_setfsuid32
+ #define __NR__setresgid32 __NR_setresgid32
+ #define __NR__setresuid32 __NR_setresuid32
+ LSS_INLINE _syscall2(int, ugetrlimit, int, r,
+ struct kernel_rlimit*, l)
+ LSS_INLINE _syscall3(int, _getresgid32, gid_t *, r,
+ gid_t *, e, gid_t *, s)
+ LSS_INLINE _syscall3(int, _getresuid32, uid_t *, r,
+ uid_t *, e, uid_t *, s)
+ LSS_INLINE _syscall1(int, _setfsgid32, gid_t, f)
+ LSS_INLINE _syscall1(int, _setfsuid32, uid_t, f)
+ LSS_INLINE _syscall3(int, _setresgid32, gid_t, r,
+ gid_t, e, gid_t, s)
+ LSS_INLINE _syscall3(int, _setresuid32, uid_t, r,
+ uid_t, e, uid_t, s)
+
+ /* Stores the real, effective and saved group IDs into *rgid, *egid and
+  * *sgid. The 32-bit syscall is tried first; if the kernel reports ENOSYS
+  * the legacy getresgid syscall is used instead.
+  *
+  * Returns the syscall result. On the fallback path a NULL output pointer
+  * yields -1 with LSS_ERRNO set to EFAULT.
+  */
+ LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid,
+ gid_t *egid,
+ gid_t *sgid) {
+ int rc;
+ if ((rc = LSS_NAME(_getresgid32)(rgid, egid, sgid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((rgid == NULL) || (egid == NULL) || (sgid == NULL)) {
+ /* Report the error like every other wrapper: -1 plus LSS_ERRNO. A
+  * positive errno value as return code would look like success to
+  * callers that test for "< 0".
+  */
+ LSS_ERRNO = EFAULT;
+ return -1;
+ }
+ // Clear the high bits first, since getresgid only sets 16 bits
+ *rgid = *egid = *sgid = 0;
+ rc = LSS_NAME(getresgid)(rgid, egid, sgid);
+ }
+ return rc;
+ }
+
+ /* Stores the real, effective and saved user IDs into *ruid, *euid and
+  * *suid. The 32-bit syscall is tried first; if the kernel reports ENOSYS
+  * the legacy getresuid syscall is used instead.
+  *
+  * Returns the syscall result. On the fallback path a NULL output pointer
+  * yields -1 with LSS_ERRNO set to EFAULT.
+  */
+ LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid,
+ uid_t *euid,
+ uid_t *suid) {
+ int rc;
+ if ((rc = LSS_NAME(_getresuid32)(ruid, euid, suid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((ruid == NULL) || (euid == NULL) || (suid == NULL)) {
+ /* Report the error like every other wrapper: -1 plus LSS_ERRNO. A
+  * positive errno value as return code would look like success to
+  * callers that test for "< 0".
+  */
+ LSS_ERRNO = EFAULT;
+ return -1;
+ }
+ // Clear the high bits first, since getresuid only sets 16 bits
+ *ruid = *euid = *suid = 0;
+ rc = LSS_NAME(getresuid)(ruid, euid, suid);
+ }
+ return rc;
+ }
+
+ /* Sets the filesystem group ID. Tries the 32-bit syscall first; if the
+  * kernel reports ENOSYS, falls back to the legacy setfsgid syscall,
+  * provided gid fits in 16 bits.
+  *
+  * Returns the syscall result, or -1 with LSS_ERRNO set to EINVAL when the
+  * fallback cannot represent gid.
+  */
+ LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) {
+ int rc;
+ if ((rc = LSS_NAME(_setfsgid32)(gid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((unsigned int)gid & ~0xFFFFu) {
+ /* Signal the error as -1 plus LSS_ERRNO rather than returning the
+  * positive errno value, which callers could mistake for a result.
+  */
+ LSS_ERRNO = EINVAL;
+ rc = -1;
+ } else {
+ rc = LSS_NAME(setfsgid)(gid);
+ }
+ }
+ return rc;
+ }
+
+ /* Sets the filesystem user ID. Tries the 32-bit syscall first; if the
+  * kernel reports ENOSYS, falls back to the legacy setfsuid syscall,
+  * provided uid fits in 16 bits.
+  *
+  * Returns the syscall result, or -1 with LSS_ERRNO set to EINVAL when the
+  * fallback cannot represent uid.
+  */
+ LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) {
+ int rc;
+ if ((rc = LSS_NAME(_setfsuid32)(uid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((unsigned int)uid & ~0xFFFFu) {
+ /* Signal the error as -1 plus LSS_ERRNO rather than returning the
+  * positive errno value, which callers could mistake for a result.
+  */
+ LSS_ERRNO = EINVAL;
+ rc = -1;
+ } else {
+ rc = LSS_NAME(setfsuid)(uid);
+ }
+ }
+ return rc;
+ }
+
+ /* Sets the real, effective and saved group IDs. Tries the 32-bit syscall
+  * first; if the kernel reports ENOSYS, falls back to the legacy
+  * setresgid syscall, provided all three IDs fit in 16 bits.
+  *
+  * Returns the syscall result, or -1 with LSS_ERRNO set to EINVAL when the
+  * fallback cannot represent one of the IDs.
+  */
+ LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) {
+ int rc;
+ if ((rc = LSS_NAME(_setresgid32)(rgid, egid, sgid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((unsigned int)rgid & ~0xFFFFu ||
+ (unsigned int)egid & ~0xFFFFu ||
+ (unsigned int)sgid & ~0xFFFFu) {
+ /* Signal the error as -1 plus LSS_ERRNO; a positive return value
+  * would pass a caller's "< 0" failure check.
+  */
+ LSS_ERRNO = EINVAL;
+ rc = -1;
+ } else {
+ rc = LSS_NAME(setresgid)(rgid, egid, sgid);
+ }
+ }
+ return rc;
+ }
+
+ /* Sets the real, effective and saved user IDs. Tries the 32-bit syscall
+  * first; if the kernel reports ENOSYS, falls back to the legacy
+  * setresuid syscall, provided all three IDs fit in 16 bits.
+  *
+  * Returns the syscall result, or -1 with LSS_ERRNO set to EINVAL when the
+  * fallback cannot represent one of the IDs.
+  */
+ LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) {
+ int rc;
+ if ((rc = LSS_NAME(_setresuid32)(ruid, euid, suid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((unsigned int)ruid & ~0xFFFFu ||
+ (unsigned int)euid & ~0xFFFFu ||
+ (unsigned int)suid & ~0xFFFFu) {
+ /* Signal the error as -1 plus LSS_ERRNO; a positive return value
+  * would pass a caller's "< 0" failure check.
+  */
+ LSS_ERRNO = EINVAL;
+ rc = -1;
+ } else {
+ rc = LSS_NAME(setresuid)(ruid, euid, suid);
+ }
+ }
+ return rc;
+ }
+ #endif
+ /* Clears every bit of set->sig. Always returns 0. */
+ LSS_INLINE int LSS_NAME(sigemptyset)(struct kernel_sigset_t *set) {
+ memset(&set->sig, 0, sizeof(set->sig));
+ return 0;
+ }
+
+ /* Sets every bit of set->sig (memset with -1 fills each byte with 0xFF).
+  * Always returns 0.
+  */
+ LSS_INLINE int LSS_NAME(sigfillset)(struct kernel_sigset_t *set) {
+ memset(&set->sig, -1, sizeof(set->sig));
+ return 0;
+ }
+
+ /* Adds signum to *set.
+  *
+  * Signal numbers are 1-based: signal N lives in bit (N-1) of the sig[]
+  * word array. Returns 0 on success; for a signum outside
+  * 1..8*sizeof(set->sig) returns -1 with LSS_ERRNO set to EINVAL.
+  */
+ LSS_INLINE int LSS_NAME(sigaddset)(struct kernel_sigset_t *set,
+ int signum) {
+ const size_t bits_per_word = 8*sizeof(set->sig[0]);
+ size_t bit;
+ if (signum < 1 || signum > (int)(8*sizeof(set->sig))) {
+ LSS_ERRNO = EINVAL;
+ return -1;
+ }
+ bit = signum - 1;
+ set->sig[bit / bits_per_word] |= 1UL << (bit % bits_per_word);
+ return 0;
+ }
+
+ /* Removes signum from *set.
+  *
+  * Signal numbers are 1-based: signal N lives in bit (N-1) of the sig[]
+  * word array. Returns 0 on success; for a signum outside
+  * 1..8*sizeof(set->sig) returns -1 with LSS_ERRNO set to EINVAL.
+  */
+ LSS_INLINE int LSS_NAME(sigdelset)(struct kernel_sigset_t *set,
+ int signum) {
+ const size_t bits_per_word = 8*sizeof(set->sig[0]);
+ size_t bit;
+ if (signum < 1 || signum > (int)(8*sizeof(set->sig))) {
+ LSS_ERRNO = EINVAL;
+ return -1;
+ }
+ bit = signum - 1;
+ set->sig[bit / bits_per_word] &= ~(1UL << (bit % bits_per_word));
+ return 0;
+ }
+
+ /* Tests whether signum is a member of *set.
+  *
+  * Returns 1 if the signal's bit is set and 0 if it is clear; for a signum
+  * outside 1..8*sizeof(set->sig) returns -1 with LSS_ERRNO set to EINVAL.
+  */
+ LSS_INLINE int LSS_NAME(sigismember)(struct kernel_sigset_t *set,
+ int signum) {
+ const size_t bits_per_word = 8*sizeof(set->sig[0]);
+ size_t bit;
+ if (signum < 1 || signum > (int)(8*sizeof(set->sig))) {
+ LSS_ERRNO = EINVAL;
+ return -1;
+ }
+ bit = signum - 1;
+ return (set->sig[bit / bits_per_word] &
+ (1UL << (bit % bits_per_word))) != 0;
+ }
+ #if defined(__i386__) || defined(__ARM_ARCH_3__) || \
+ (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || defined(__PPC__)
+ /* Raw legacy (non-rt) signal syscalls plus a few 32-bit specific file
+  * syscalls. The underscore-prefixed signal wrappers are used as fallbacks
+  * by sigaction()/sigpending()/sigprocmask()/sigsuspend() defined after
+  * them; the __NR__xxx aliases map the underscore names onto the real
+  * syscall numbers.
+  */
+ #define __NR__sigaction __NR_sigaction
+ #define __NR__sigpending __NR_sigpending
+ #define __NR__sigprocmask __NR_sigprocmask
+ #define __NR__sigsuspend __NR_sigsuspend
+ #define __NR__socketcall __NR_socketcall
+ LSS_INLINE _syscall2(int, fstat64, int, f,
+ struct kernel_stat64 *, b)
+ LSS_INLINE _syscall5(int, _llseek, uint, fd, ulong, hi, ulong, lo,
+ loff_t *, res, uint, wh)
+ LSS_INLINE _syscall1(void*, mmap, void*, a)
+ LSS_INLINE _syscall6(void*, mmap2, void*, s,
+ size_t, l, int, p,
+ int, f, int, d,
+ __off64_t, o)
+ LSS_INLINE _syscall3(int, _sigaction, int, s,
+ const struct kernel_old_sigaction*, a,
+ struct kernel_old_sigaction*, o)
+ LSS_INLINE _syscall1(int, _sigpending, unsigned long*, s)
+ LSS_INLINE _syscall3(int, _sigprocmask, int, h,
+ const unsigned long*, s,
+ unsigned long*, o)
+ /* The raw sigsuspend takes only the mask word on PPC, and two extra
+  * leading arguments elsewhere.
+  */
+ #ifdef __PPC__
+ LSS_INLINE _syscall1(int, _sigsuspend, unsigned long, s)
+ #else
+ LSS_INLINE _syscall3(int, _sigsuspend, const void*, a,
+ int, b,
+ unsigned long, s)
+ #endif
+ LSS_INLINE _syscall2(int, stat64, const char *, p,
+ struct kernel_stat64 *, b)
+
+ /* Installs and/or queries the handler for signum.
+  *
+  * rt_sigaction is tried first, using a private copy of *act so that the
+  * i386 restorer can be filled in without touching the caller's struct.
+  * If the kernel reports ENOSYS, the request is translated to the legacy
+  * kernel_old_sigaction layout and issued through the old sigaction
+  * syscall; on success the old-format result is copied back into *oldact.
+  *
+  * Returns the result of whichever syscall was issued last.
+  */
+ LSS_INLINE int LSS_NAME(sigaction)(int signum,
+ const struct kernel_sigaction *act,
+ struct kernel_sigaction *oldact) {
+ int old_errno = LSS_ERRNO;
+ int rc;
+ struct kernel_sigaction a;
+ if (act != NULL) {
+ a = *act;
+ #ifdef __i386__
+ /* On i386, the kernel requires us to always set our own
+ * SA_RESTORER when using realtime signals. Otherwise, it does not
+ * know how to return from a signal handler. This function must have
+ * a "magic" signature that the "gdb" (and maybe the kernel?) can
+ * recognize.
+ * Apparently, a SA_RESTORER is implicitly set by the kernel, when
+ * using non-realtime signals.
+ *
+ * TODO: Test whether ARM needs a restorer
+ */
+ if (!(a.sa_flags & SA_RESTORER)) {
+ a.sa_flags |= SA_RESTORER;
+ a.sa_restorer = (a.sa_flags & SA_SIGINFO)
+ ? LSS_NAME(restore_rt)() : LSS_NAME(restore)();
+ }
+ #endif
+ }
+ rc = LSS_NAME(rt_sigaction)(signum, act ? &a : act, oldact,
+ (KERNEL_NSIG+7)/8);
+ if (rc < 0 && LSS_ERRNO == ENOSYS) {
+ /* Legacy fallback. Note that the old-format request is built from the
+  * caller's *act, not from the adjusted copy 'a'.
+  */
+ struct kernel_old_sigaction oa, ooa, *ptr_a = &oa, *ptr_oa = &ooa;
+ if (!act) {
+ ptr_a = NULL;
+ } else {
+ oa.sa_handler_ = act->sa_handler_;
+ /* Only sizeof(oa.sa_mask) bytes of the mask fit the old layout. */
+ memcpy(&oa.sa_mask, &act->sa_mask, sizeof(oa.sa_mask));
+ #ifndef __mips__
+ oa.sa_restorer = act->sa_restorer;
+ #endif
+ oa.sa_flags = act->sa_flags;
+ }
+ if (!oldact) {
+ ptr_oa = NULL;
+ }
+ /* Discard the ENOSYS from the rt_ attempt before retrying. */
+ LSS_ERRNO = old_errno;
+ rc = LSS_NAME(_sigaction)(signum, ptr_a, ptr_oa);
+ if (rc == 0 && oldact) {
+ /* Pre-fill *oldact (from *act when given, otherwise zeros) so that
+  * fields the old layout does not carry hold defined values, then
+  * overwrite the fields the kernel actually returned.
+  */
+ if (act) {
+ memcpy(oldact, act, sizeof(*act));
+ } else {
+ memset(oldact, 0, sizeof(*oldact));
+ }
+ oldact->sa_handler_ = ptr_oa->sa_handler_;
+ oldact->sa_flags = ptr_oa->sa_flags;
+ memcpy(&oldact->sa_mask, &ptr_oa->sa_mask, sizeof(ptr_oa->sa_mask));
+ #ifndef __mips__
+ oldact->sa_restorer = ptr_oa->sa_restorer;
+ #endif
+ }
+ }
+ return rc;
+ }
+
+ /* Retrieves the set of pending signals into *set.
+  *
+  * Uses rt_sigpending; if the kernel reports ENOSYS, restores the previous
+  * LSS_ERRNO, clears *set and asks the legacy sigpending syscall to fill in
+  * the first word. Returns the result of the last syscall issued.
+  */
+ LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) {
+ const int saved_errno = LSS_ERRNO;
+ const int result = LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8);
+ if (result >= 0 || LSS_ERRNO != ENOSYS) {
+ return result;
+ }
+ /* rt_ variant unavailable: fall back to the legacy syscall. */
+ LSS_ERRNO = saved_errno;
+ LSS_NAME(sigemptyset)(set);
+ return LSS_NAME(_sigpending)(&set->sig[0]);
+ }
+
+ /* Examines and/or changes the blocked-signal mask.
+  *
+  * Uses rt_sigprocmask; if the kernel reports ENOSYS, restores the
+  * previous LSS_ERRNO and retries with the legacy sigprocmask syscall,
+  * which only reads and writes the first word of each set (*oldset is
+  * cleared first so its remaining words are defined). Either pointer may
+  * be NULL. Returns the result of the last syscall issued.
+  */
+ LSS_INLINE int LSS_NAME(sigprocmask)(int how,
+ const struct kernel_sigset_t *set,
+ struct kernel_sigset_t *oldset) {
+ const int saved_errno = LSS_ERRNO;
+ const int result = LSS_NAME(rt_sigprocmask)(how, set, oldset,
+ (KERNEL_NSIG+7)/8);
+ if (result >= 0 || LSS_ERRNO != ENOSYS) {
+ return result;
+ }
+ /* rt_ variant unavailable: fall back to the legacy syscall. */
+ LSS_ERRNO = saved_errno;
+ if (oldset) {
+ LSS_NAME(sigemptyset)(oldset);
+ }
+ return LSS_NAME(_sigprocmask)(how,
+ set ? &set->sig[0] : NULL,
+ oldset ? &oldset->sig[0] : NULL);
+ }
+
+ /* Suspends the caller with *set installed as the signal mask.
+  *
+  * Uses rt_sigsuspend; if the kernel reports ENOSYS, restores the previous
+  * LSS_ERRNO and retries with the legacy sigsuspend syscall, passing the
+  * first mask word (preceded by two extra arguments on non-PPC targets).
+  * Returns the result of the last syscall issued.
+  */
+ LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) {
+ const int saved_errno = LSS_ERRNO;
+ const int result = LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8);
+ if (result >= 0 || LSS_ERRNO != ENOSYS) {
+ return result;
+ }
+ /* rt_ variant unavailable: fall back to the legacy syscall. */
+ LSS_ERRNO = saved_errno;
+ return LSS_NAME(_sigsuspend)(
+ #ifndef __PPC__
+ set, 0,
+ #endif
+ set->sig[0]);
+ }
+ #endif
+ #if defined(__PPC__)
+ /* PowerPC socketcall support.
+  *
+  * LSS_SC_LOADARGS_n(arg1..argn) assigns the n call arguments to the
+  * register variables __sc_4.. (r4 upwards) declared by LSS_SC_BODY; each
+  * variant chains to the next smaller one. Variants exist for 0 to 5
+  * arguments only.
+  */
+ #undef LSS_SC_LOADARGS_0
+ #define LSS_SC_LOADARGS_0(dummy...)
+ #undef LSS_SC_LOADARGS_1
+ #define LSS_SC_LOADARGS_1(arg1) \
+ __sc_4 = (unsigned long) (arg1)
+ #undef LSS_SC_LOADARGS_2
+ #define LSS_SC_LOADARGS_2(arg1, arg2) \
+ LSS_SC_LOADARGS_1(arg1); \
+ __sc_5 = (unsigned long) (arg2)
+ #undef LSS_SC_LOADARGS_3
+ #define LSS_SC_LOADARGS_3(arg1, arg2, arg3) \
+ LSS_SC_LOADARGS_2(arg1, arg2); \
+ __sc_6 = (unsigned long) (arg3)
+ #undef LSS_SC_LOADARGS_4
+ #define LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4) \
+ LSS_SC_LOADARGS_3(arg1, arg2, arg3); \
+ __sc_7 = (unsigned long) (arg4)
+ #undef LSS_SC_LOADARGS_5
+ #define LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5) \
+ LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4); \
+ __sc_8 = (unsigned long) (arg5)
+ /* LSS_SC_BODY(nr, type, opt, args...) is the body of a socket wrapper:
+  * nr is the argument count, type the wrapper's return type and opt the
+  * socketcall operation code. It loads r0 with __NR_socketcall and r3 with
+  * opt, pushes a 48-byte frame with stwu, stores r4..r8 into it at offsets
+  * 20..36, points r4 at that argument array and executes sc. Afterwards
+  * r3 and the condition register (read with mfcr) are passed to
+  * LSS_RETURN.
+  *
+  * NOTE(review): the asm contains no instruction that pops the frame
+  * pushed by stwu -- confirm that the stack pointer is restored elsewhere.
+  */
+ #undef LSS_SC_BODY
+ #define LSS_SC_BODY(nr, type, opt, args...) \
+ long __sc_ret, __sc_err; \
+ { \
+ register unsigned long __sc_0 __asm__ ("r0") = __NR_socketcall; \
+ register unsigned long __sc_3 __asm__ ("r3") = opt; \
+ register unsigned long __sc_4 __asm__ ("r4"); \
+ register unsigned long __sc_5 __asm__ ("r5"); \
+ register unsigned long __sc_6 __asm__ ("r6"); \
+ register unsigned long __sc_7 __asm__ ("r7"); \
+ register unsigned long __sc_8 __asm__ ("r8"); \
+ LSS_SC_LOADARGS_##nr(args); \
+ __asm__ __volatile__ \
+ ("stwu 1, -48(1)\n\t" \
+ "stw 4, 20(1)\n\t" \
+ "stw 5, 24(1)\n\t" \
+ "stw 6, 28(1)\n\t" \
+ "stw 7, 32(1)\n\t" \
+ "stw 8, 36(1)\n\t" \
+ "addi 4, 1, 20\n\t" \
+ "sc\n\t" \
+ "mfcr %0" \
+ : "=&r" (__sc_0), \
+ "=&r" (__sc_3), "=&r" (__sc_4), \
+ "=&r" (__sc_5), "=&r" (__sc_6), \
+ "=&r" (__sc_7), "=&r" (__sc_8) \
+ : LSS_ASMINPUT_##nr \
+ : "cr0", "ctr", "memory"); \
+ __sc_ret = __sc_3; \
+ __sc_err = __sc_0; \
+ } \
+ LSS_RETURN(type, __sc_ret, __sc_err)
+
+ /* PowerPC socket wrappers. Each one is LSS_SC_BODY with its argument
+  * count, return type and socketcall operation code (17 recvmsg,
+  * 16 sendmsg, 13 shutdown, 1 socket, 8 socketpair).
+  */
+ LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg,
+ int flags){
+ LSS_SC_BODY(3, ssize_t, 17, s, msg, flags);
+ }
+
+ LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s,
+ const struct kernel_msghdr *msg,
+ int flags) {
+ LSS_SC_BODY(3, ssize_t, 16, s, msg, flags);
+ }
+
+ // TODO(csilvers): why is this ifdef'ed out?
+ // NOTE(review): as written this uses LSS_BODY rather than LSS_SC_BODY and
+ // passes the operation code 11 in the syscall-name position; sendto also
+ // needs six arguments while LSS_SC_LOADARGS_n is only defined up to five.
+ // Enabling it would require a six-argument LSS_SC_LOADARGS variant.
+#if 0
+ LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len,
+ int flags,
+ const struct kernel_sockaddr *to,
+ unsigned int tolen) {
+ LSS_BODY(6, ssize_t, 11, s, buf, len, flags, to, tolen);
+ }
+#endif
+
+ LSS_INLINE int LSS_NAME(shutdown)(int s, int how) {
+ LSS_SC_BODY(2, int, 13, s, how);
+ }
+
+ LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) {
+ LSS_SC_BODY(3, int, 1, domain, type, protocol);
+ }
+
+ LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol,
+ int sv[2]) {
+ LSS_SC_BODY(4, int, 8, d, type, protocol, sv);
+ }
+ #endif
+ #if defined(__i386__) || defined(__ARM_ARCH_3__) || \
+ (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
+ /* Socket operations multiplexed through the socketcall syscall. The raw
+  * wrapper takes the operation code and the va_list of the caller's
+  * arguments.
+  * NOTE(review): passing a va_list as the syscall's argument-block pointer
+  * presumably relies on va_list being a plain pointer to the stacked
+  * arguments on these ABIs -- confirm.
+  */
+ #define __NR__socketcall __NR_socketcall
+ LSS_INLINE _syscall2(int, _socketcall, int, c,
+ va_list, a)
+
+ /* Issues socketcall operation 'op' with the variadic arguments that
+  * follow it. Returns the syscall result.
+  */
+ LSS_INLINE int LSS_NAME(socketcall)(int op, ...) {
+ int rc;
+ va_list ap;
+ va_start(ap, op);
+ rc = LSS_NAME(_socketcall)(op, ap);
+ va_end(ap);
+ return rc;
+ }
+
+ /* The wrappers below select the socketcall operation by number:
+  * 17 recvmsg, 16 sendmsg, 11 sendto, 13 shutdown, 1 socket, 8 socketpair.
+  */
+ LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg,
+ int flags){
+ return (ssize_t)LSS_NAME(socketcall)(17, s, msg, flags);
+ }
+
+ LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s,
+ const struct kernel_msghdr *msg,
+ int flags) {
+ return (ssize_t)LSS_NAME(socketcall)(16, s, msg, flags);
+ }
+
+ LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len,
+ int flags,
+ const struct kernel_sockaddr *to,
+ unsigned int tolen) {
+ return (ssize_t)LSS_NAME(socketcall)(11, s, buf, len, flags, to, tolen);
+ }
+
+ LSS_INLINE int LSS_NAME(shutdown)(int s, int how) {
+ return LSS_NAME(socketcall)(13, s, how);
+ }
+
+ LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) {
+ return LSS_NAME(socketcall)(1, domain, type, protocol);
+ }
+
+ LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol,
+ int sv[2]) {
+ return LSS_NAME(socketcall)(8, d, type, protocol, sv);
+ }
+ #endif
+ /* fstatat64 is declared for i386 and PPC only. */
+ #if defined(__i386__) || defined(__PPC__)
+ LSS_INLINE _syscall4(int, fstatat64, int, d,
+ const char *, p,
+ struct kernel_stat64 *, b, int, f)
+ #endif
+ /* These architectures get waitpid as a direct syscall; the others define
+  * waitpid() on top of wait4.
+  */
+ #if defined(__i386__) || defined(__PPC__) || \
+ (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
+ LSS_INLINE _syscall3(pid_t, waitpid, pid_t, p,
+ int*, s, int, o)
+ #endif
+ #if defined(__mips__)
+ /* sys_pipe() on MIPS has non-standard calling conventions, as it returns
+ * both file handles through CPU registers.
+ *
+ * Creates a pipe and stores the two descriptors in p[0] and p[1].
+ * Returns 0 on success; when the kernel flags an error in $7, returns -1
+ * with LSS_ERRNO set to the value left in $2.
+ */
+ LSS_INLINE int LSS_NAME(pipe)(int *p) {
+ /* $2 carries the syscall number in and the first result out; $3 carries
+  * the second result; a non-zero $7 after the syscall marks an error.
+  */
+ register unsigned long __v0 __asm__("$2") = __NR_pipe;
+ register unsigned long __v1 __asm__("$3");
+ /* NOTE(review): __r7 is declared "+r" (read and written) but is not
+  * initialized before the asm statement -- confirm this is harmless.
+  */
+ register unsigned long __r7 __asm__("$7");
+ __asm__ __volatile__ ("syscall\n"
+ : "=&r"(__v0), "=&r"(__v1), "+r" (__r7)
+ : "0"(__v0)
+ : "$8", "$9", "$10", "$11", "$12",
+ "$13", "$14", "$15", "$24", "memory");
+ if (__r7) {
+ LSS_ERRNO = __v0;
+ return -1;
+ } else {
+ p[0] = __v0;
+ p[1] = __v1;
+ return 0;
+ }
+ }
+ #else
+ /* Everywhere else pipe is an ordinary one-argument syscall. */
+ LSS_INLINE _syscall1(int, pipe, int *, p)
+ #endif
+ /* TODO(csilvers): see if ppc can/should support this as well */
+ #if defined(__i386__) || defined(__ARM_ARCH_3__) || \
+ (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64)
+ /* The raw statfs64/fstatfs64 syscalls take the size of the result buffer
+  * as their second argument; the public wrappers below supply sizeof(*b)
+  * so callers only pass the buffer.
+  */
+ #define __NR__statfs64 __NR_statfs64
+ #define __NR__fstatfs64 __NR_fstatfs64
+ LSS_INLINE _syscall3(int, _statfs64, const char*, p,
+ size_t, s,struct kernel_statfs64*, b)
+ LSS_INLINE _syscall3(int, _fstatfs64, int, f,
+ size_t, s,struct kernel_statfs64*, b)
+ /* Filesystem statistics for the path p, stored in *b. */
+ LSS_INLINE int LSS_NAME(statfs64)(const char *p,
+ struct kernel_statfs64 *b) {
+ return LSS_NAME(_statfs64)(p, sizeof(*b), b);
+ }
+ /* Filesystem statistics for the open descriptor f, stored in *b. */
+ LSS_INLINE int LSS_NAME(fstatfs64)(int f,struct kernel_statfs64 *b) {
+ return LSS_NAME(_fstatfs64)(f, sizeof(*b), b);
+ }
+ #endif
+
+ /* execve() with the current process environment: forwards path and argv
+  * and passes the global 'environ' as the environment vector.
+  */
+ LSS_INLINE int LSS_NAME(execv)(const char *path, const char *const argv[]) {
+ extern char **environ;
+ return LSS_NAME(execve)(path, argv, (const char *const *)environ);
+ }
+
+ /* Returns the caller's thread id. If the raw gettid syscall yields -1,
+  * the process id from getpid() is returned instead.
+  */
+ LSS_INLINE pid_t LSS_NAME(gettid)() {
+ const pid_t tid = LSS_NAME(_gettid)();
+ return tid != -1 ? tid : LSS_NAME(getpid)();
+ }
+
+ /* Variadic front end for the five-argument mremap syscall. The first
+  * variadic argument is always fetched as a void * and passed on as the
+  * new address. Returns the raw syscall result.
+  */
+ LSS_INLINE void *LSS_NAME(mremap)(void *old_address, size_t old_size,
+ size_t new_size, int flags, ...) {
+ va_list extra;
+ void *target;
+ va_start(extra, flags);
+ target = va_arg(extra, void *);
+ va_end(extra);
+ return LSS_NAME(_mremap)(old_address, old_size, new_size,
+ flags, target);
+ }
+
+ /* Detaches from the traced thread 'pid' and then sends it SIGCONT.
+  *
+  * Returns the result of the PTRACE_DETACH request; LSS_ERRNO on return
+  * is the value left by that request, not by the wake-up signal.
+  */
+ LSS_INLINE int LSS_NAME(ptrace_detach)(pid_t pid) {
+ /* PTRACE_DETACH can sometimes forget to wake up the tracee and it
+ * then sends job control signals to the real parent, rather than to
+ * the tracer. We reduce the risk of this happening by starting a
+ * whole new time slice, and then quickly sending a SIGCONT signal
+ * right after detaching from the tracee.
+ *
+ * We use tkill to ensure that we only issue a wakeup for the thread being
+ * detached. Large multi threaded apps can take a long time in the kernel
+ * processing SIGCONT.
+ */
+ int rc, err;
+ LSS_NAME(sched_yield)();
+ rc = LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0);
+ err = LSS_ERRNO;
+ /* Old systems don't have tkill. Only consult LSS_ERRNO when tkill itself
+  * failed: after a successful call it may still hold a stale ENOSYS,
+  * which would trigger a needless process-wide SIGCONT.
+  */
+ if (LSS_NAME(tkill)(pid, SIGCONT) < 0 && LSS_ERRNO == ENOSYS)
+ LSS_NAME(kill)(pid, SIGCONT);
+ LSS_ERRNO = err;
+ return rc;
+ }
+
+ /* Sends sig to the calling process via kill(getpid(), sig). */
+ LSS_INLINE int LSS_NAME(raise)(int sig) {
+ return LSS_NAME(kill)(LSS_NAME(getpid)(), sig);
+ }
+
+ /* Equivalent to setpgid(0, 0). */
+ LSS_INLINE int LSS_NAME(setpgrp)() {
+ return LSS_NAME(setpgid)(0, 0);
+ }
+
+ /* Minimal sysconf() supporting two queries:
+  *   _SC_OPEN_MAX  - the current RLIMIT_NOFILE soft limit, or 8192 when
+  *                   getrlimit fails;
+  *   _SC_PAGESIZE  - the value of __getpagesize().
+  * Any other name yields -1 with LSS_ERRNO set to ENOSYS.
+  */
+ LSS_INLINE int LSS_NAME(sysconf)(int name) {
+ extern int __getpagesize(void);
+ if (name == _SC_OPEN_MAX) {
+ struct kernel_rlimit limit;
+ if (LSS_NAME(getrlimit)(RLIMIT_NOFILE, &limit) < 0) {
+ return 8192;
+ }
+ return limit.rlim_cur;
+ }
+ if (name == _SC_PAGESIZE) {
+ return __getpagesize();
+ }
+ LSS_ERRNO = ENOSYS;
+ return -1;
+ }
+ /* On x86_64 and 64-bit MIPS the 64-bit file offset is passed as a single
+  * syscall argument, so these are declared directly.
+  */
+ #if defined(__x86_64__) || \
+ (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64)
+ LSS_INLINE _syscall4(ssize_t, pread64, int, f,
+ void *, b, size_t, c,
+ loff_t, o)
+ LSS_INLINE _syscall4(ssize_t, pwrite64, int, f,
+ const void *, b, size_t, c,
+ loff_t, o)
+ LSS_INLINE _syscall3(int, readahead, int, f,
+ loff_t, o, unsigned, c)
+ #else
+ /* Raw variants that take the 64-bit offset as two separate 32-bit
+  * arguments (o1, o2); the public pread64()/pwrite64()/readahead()
+  * wrappers defined after them split the offset. The _syscallN macros
+  * expand to complete function definitions, so no trailing ';' is written.
+  */
+ #define __NR__pread64 __NR_pread64
+ #define __NR__pwrite64 __NR_pwrite64
+ #define __NR__readahead __NR_readahead
+ LSS_INLINE _syscall5(ssize_t, _pread64, int, f,
+ void *, b, size_t, c, unsigned, o1,
+ unsigned, o2)
+ LSS_INLINE _syscall5(ssize_t, _pwrite64, int, f,
+ const void *, b, size_t, c, unsigned, o1,
+ long, o2)
+ LSS_INLINE _syscall4(int, _readahead, int, f,
+ unsigned, o1, unsigned, o2, size_t, c)
+ /* We force 64bit-wide parameters onto the stack, then access each
+ * 32-bit component individually. This guarantees that we build the
+ * correct parameters independent of the native byte-order of the
+ * underlying architecture.
+ *
+ * Each wrapper overlays the loff_t offset with a two-element unsigned
+ * array through a union and passes arg[0] and arg[1], in that order, as
+ * the two offset arguments of the raw syscall.
+ */
+ /* Reads up to count bytes from fd at offset off into buf. */
+ LSS_INLINE ssize_t LSS_NAME(pread64)(int fd, void *buf, size_t count,
+ loff_t off) {
+ union { loff_t off; unsigned arg[2]; } o = { off };
+ return LSS_NAME(_pread64)(fd, buf, count, o.arg[0], o.arg[1]);
+ }
+ /* Writes up to count bytes from buf to fd at offset off. */
+ LSS_INLINE ssize_t LSS_NAME(pwrite64)(int fd, const void *buf,
+ size_t count, loff_t off) {
+ union { loff_t off; unsigned arg[2]; } o = { off };
+ return LSS_NAME(_pwrite64)(fd, buf, count, o.arg[0], o.arg[1]);
+ }
+ /* Requests read-ahead of len bytes of fd starting at offset off. */
+ LSS_INLINE int LSS_NAME(readahead)(int fd, loff_t off, int len) {
+ union { loff_t off; unsigned arg[2]; } o = { off };
+ return LSS_NAME(_readahead)(fd, o.arg[0], o.arg[1], len);
+ }
+ #endif
+#endif
+
+#if defined(__cplusplus) && !defined(SYS_CPLUSPLUS)
+}
+#endif
+
+#endif
+#endif
diff --git a/sandbox/linux/seccomp/madvise.cc b/sandbox/linux/seccomp/madvise.cc
new file mode 100644
index 0000000..70c594f
--- /dev/null
+++ b/sandbox/linux/seccomp/madvise.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+// Sandbox-side handler for madvise(): packs the call into a request, writes
+// it to the process fd, then reads the result back from the thread fd.
+// Dies if either transfer is short. Returns the value read back.
+long Sandbox::sandbox_madvise(void* start, size_t length, int advice) {
+ long long startTime;
+ Debug::syscall(&startTime, __NR_madvise, "Executing handler");
+
+ // Wire format: syscall number, cookie, then the madvise parameters,
+ // packed so that no padding is transmitted.
+ struct {
+ int sysnum;
+ long long cookie;
+ MAdvise madvise_req;
+ } __attribute__((packed)) msg;
+ msg.sysnum = __NR_madvise;
+ msg.cookie = cookie();
+ msg.madvise_req.start = start;
+ msg.madvise_req.len = length;
+ msg.madvise_req.advice = advice;
+
+ long result;
+ SysCalls sys;
+ // The reply is only awaited when the request went out in full.
+ const bool sent =
+ write(sys, processFdPub(), &msg, sizeof(msg)) == sizeof(msg);
+ const bool answered =
+ sent &&
+ read(sys, threadFdPub(), &result, sizeof(result)) == sizeof(result);
+ if (!answered) {
+ die("Failed to forward madvise() request [sandbox]");
+ }
+ Debug::elapsed(startTime, __NR_madvise);
+ return result;
+}
+
+// Trusted-side handler for a forwarded madvise() request.
+//
+// Reads the MAdvise parameters from sandboxFd. The advice values listed in
+// the first case group are always forwarded with sendSystemCall(); any
+// other advice is forwarded only if the requested range does not overlap an
+// entry of protectedMap_, otherwise the call is abandoned with -EINVAL.
+// Returns true when the call was forwarded and false when it was abandoned.
+bool Sandbox::process_madvise(int parentMapsFd, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ MAdvise madvise_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &madvise_req, sizeof(madvise_req)) !=
+ sizeof(madvise_req)) {
+ die("Failed to read parameters for madvise() [process]");
+ }
+ int rc = -EINVAL;
+ switch (madvise_req.advice) {
+ case MADV_NORMAL:
+ case MADV_RANDOM:
+ case MADV_SEQUENTIAL:
+ case MADV_WILLNEED:
+ ok:
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_madvise,
+ madvise_req.start, madvise_req.len,
+ madvise_req.advice);
+ return true;
+ default:
+ // All other flags to madvise() are potentially dangerous (as opposed to
+ // merely affecting overall performance). Do not allow them on memory
+ // ranges that were part of the original mappings.
+ void *stop = reinterpret_cast<void *>(
+ (char *)madvise_req.start + madvise_req.len);
+ // Start the scan one entry before the first key >= start, because that
+ // preceding region may extend into the requested range.
+ ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+ (void *)madvise_req.start);
+ if (iter != protectedMap_.begin()) {
+ --iter;
+ }
+ for (; iter != protectedMap_.end() && iter->first < stop; ++iter) {
+ // Overlap test between [start, stop) and
+ // [iter->first, iter->first + iter->second).
+ if (madvise_req.start < reinterpret_cast<void *>(
+ reinterpret_cast<char *>(iter->first) + iter->second) &&
+ stop > iter->first) {
+ SecureMem::abandonSystemCall(threadFd, rc);
+ return false;
+ }
+ }
+
+ // Changing attributes on memory regions that were newly mapped inside of
+ // the sandbox is OK.
+ goto ok;
+ }
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/maps.cc b/sandbox/linux/seccomp/maps.cc
new file mode 100644
index 0000000..8ae218d
--- /dev/null
+++ b/sandbox/linux/seccomp/maps.cc
@@ -0,0 +1,267 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "library.h"
+#include "maps.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+Maps::Maps(int proc_self_maps) : // Parses the maps file open on proc_self_maps and records each library mapping in libs_
+ proc_self_maps_(proc_self_maps),
+ begin_iter_(this, true, false),
+ end_iter_(this, false, true),
+ vsyscall_(0) {
+ Sandbox::SysCalls sys;
+ if (proc_self_maps_ >= 0 && // nothing is parsed for a negative descriptor
+ !sys.lseek(proc_self_maps_, 0, SEEK_SET)) { // rewind; proceed only if lseek() returned 0
+ char buf[256] = { 0 }; // zero-filled; reads leave one spare byte so the data stays NUL-terminated
+ int len = 0, rc = 1; // len: valid bytes in buf; rc: result of the last read
+ bool long_line = false;
+ do {
+ if (rc > 0) { // stop reading once read() has returned 0 or an error
+ rc = Sandbox::read(sys, proc_self_maps_, buf + len,
+ sizeof(buf) - len - 1);
+ if (rc > 0) {
+ len += rc;
+ }
+ }
+ char *ptr = buf;
+ if (!long_line) {
+ long_line = true;
+ unsigned long start = strtoul(ptr, &ptr, 16); // hexadecimal start address
+ unsigned long stop = strtoul(ptr + 1, &ptr, 16); // ptr + 1 skips the separator after the start address
+ while (*ptr == ' ' || *ptr == '\t') ++ptr;
+ char *perm_ptr = ptr;
+ while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr;
+ string perm(perm_ptr, ptr - perm_ptr); // permission field, searched for 'r', 'w' and 'x' below
+ unsigned long offset = strtoul(ptr, &ptr, 16); // hexadecimal file offset
+ while (*ptr == ' ' || *ptr == '\t') ++ptr;
+ char *id_ptr = ptr;
+ while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr;
+ while (*ptr == ' ' || *ptr == '\t') ++ptr;
+ while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr;
+ string id(id_ptr, ptr - id_ptr); // spans the next two whitespace-separated fields
+ while (*ptr == ' ' || *ptr == '\t') ++ptr;
+ char *library_ptr = ptr;
+ while (*ptr && *ptr != ' ' && *ptr != '\t' && *ptr != '\n') ++ptr;
+ string library(library_ptr, ptr - library_ptr); // final field: the name of the mapping
+ bool isVDSO = false;
+ if (library == "[vdso]") {
+ // /proc/self/maps has a misleading file offset in the [vdso] entry.
+ // Override it with a sane value.
+ offset = 0;
+ isVDSO = true;
+ } else if (library == "[vsyscall]") {
+ vsyscall_ = reinterpret_cast<char *>(start); // remember where the vsyscall page starts
+ } else if (library.empty() || library[0] == '[') { // unnamed mappings and all other bracketed entries
+ goto skip_entry;
+ }
+ int prot = 0; // PROT_* bits derived from the permission field
+ if (perm.find('r') != string::npos) {
+ prot |= PROT_READ;
+ }
+ if (perm.find('w') != string::npos) {
+ prot |= PROT_WRITE;
+ }
+ if (perm.find('x') != string::npos) {
+ prot |= PROT_EXEC;
+ }
+ if ((prot & (PROT_EXEC | PROT_READ)) == 0) { // ignore mappings that are neither readable nor executable
+ goto skip_entry;
+ }
+ Library* lib = &libs_[id + ' ' + library]; // operator[] creates the entry the first time a key is seen
+ lib->setLibraryInfo(this);
+ lib->addMemoryRange(reinterpret_cast<void *>(start),
+ reinterpret_cast<void *>(stop),
+ Elf_Addr(offset),
+ prot, isVDSO);
+ }
+ skip_entry:
+ for (;;) { // advance ptr past the end of the current line
+ if (!*ptr || *ptr++ == '\n') {
+ long_line = false; // NOTE(review): also cleared when the data ended without '\n' - confirm lines longer than buf are handled as intended
+ memmove(buf, ptr, len - (ptr - buf)); // move the unconsumed bytes to the front of buf
+ memset(buf + len - (ptr - buf), 0, ptr - buf); // zero the vacated tail to keep buf NUL-terminated
+ len -= (ptr - buf);
+ break;
+ }
+ }
+ } while (len || long_line);
+ }
+}
+
+// Creates an iterator over maps->libs_. The two flags mark the iterator as
+// standing for the beginning or the end of the container; getIterator()
+// resolves them to a real position on demand.
+Maps::Iterator::Iterator(Maps* maps, bool at_beginning, bool at_end)
+    : maps_(maps), at_beginning_(at_beginning), at_end_(at_end) {
+}
+
+// Returns the underlying map iterator. While at_beginning_ (checked first) or
+// at_end_ is set, the position is re-resolved against the current contents of
+// maps_->libs_ on every call.
+Maps::LibraryMap::iterator& Maps::Iterator::getIterator() const {
+  if (at_beginning_ || at_end_) {
+    iter_ = at_beginning_ ? maps_->libs_.begin() : maps_->libs_.end();
+  }
+  return iter_;
+}
+
+// Returns a copy of the owning Maps object's begin iterator.
+Maps::Iterator Maps::Iterator::begin() {
+  return Iterator(maps_->begin_iter_);
+}
+
+// Returns a copy of the owning Maps object's end iterator.
+Maps::Iterator Maps::Iterator::end() {
+  return Iterator(maps_->end_iter_);
+}
+
+// Pre-increment: advances to the next library. at_beginning_ is cleared so
+// that getIterator() no longer resets the position to the first element.
+Maps::Iterator& Maps::Iterator::operator++() {
+  ++getIterator();
+  at_beginning_ = false;
+  return *this;
+}
+
+// Post-increment: advances to the next library and returns an iterator that
+// still refers to the element this one pointed at before the call.
+// (Previously the already-advanced iterator was returned, which made
+// "it++" behave like "++it".)
+Maps::Iterator Maps::Iterator::operator++(int i) {
+  // Snapshot first. If at_beginning_ is still set, the copy keeps the flag and
+  // therefore continues to resolve to the first element.
+  Iterator previous(*this);
+  getIterator().operator++(i);
+  at_beginning_ = false;
+  return previous;
+}
+
+// Dereference: yields a pointer to the Library stored at the current position.
+Library* Maps::Iterator::operator*() const {
+  return &getIterator()->second;
+}
+
+// Two iterators are equal when their underlying map positions are equal.
+bool Maps::Iterator::operator==(const Maps::Iterator& iter) const {
+  return getIterator() == iter.getIterator();
+}
+
+// Negation of operator==.
+bool Maps::Iterator::operator!=(const Maps::Iterator& iter) const {
+  return !(*this == iter);
+}
+
+// Returns the map key of the current entry.
+Maps::string Maps::Iterator::name() const {
+  return (*getIterator()).first;
+}
+
+// Test whether the line starting at "buf" ends with "[stack]"; used for
+// identifying the stack entry of /proc/self/maps.
+// Only bytes in [buf, end) are examined. Returns false if no '\n' is found
+// before "end", or if fewer than eight characters precede the newline.
+static bool isStackLine(char* buf, char* end) {
+  char* ptr = buf;
+  // Check the bound before dereferencing, so that *end is never read.
+  for ( ; ptr < end && *ptr != '\n'; ++ptr)
+    ;
+  // Compare distances rather than forming "ptr - 7", which could point in
+  // front of "buf" for short lines.
+  if (ptr < end && ptr - buf > 7) {
+    return (memcmp(ptr - 7, "[stack]", 7) == 0);
+  }
+  return false;
+}
+
+char* Maps::allocNearAddr(char* addr_target, size_t size, int prot) const { // Maps anonymous memory close to addr_target; returns NULL if no suitable gap could be mapped
+ // We try to allocate memory within 1.5GB of a target address. This means,
+ // we will be able to perform relative 32bit jumps from the target address.
+ const unsigned long kMaxDistance = 1536 << 20;
+ // In most of the code below, we just care about the numeric value of
+ // the address.
+ const long addr = reinterpret_cast<long>(addr_target);
+ size = (size + 4095) & ~4095; // round the size up to a multiple of 4096
+ Sandbox::SysCalls sys;
+ if (sys.lseek(proc_self_maps_, 0, SEEK_SET)) { // rewind the maps file; give up if that fails
+ return NULL;
+ }
+
+ // Iterate through lines of /proc/self/maps to consider each mapped
+ // region one at a time, looking for a gap between regions to allocate.
+ char buf[256] = { 0 }; // zero-filled; reads leave one spare byte so the data stays NUL-terminated
+ int len = 0, rc = 1; // len: valid bytes in buf; rc: result of the last read
+ bool long_line = false;
+ unsigned long gap_start = 0x10000; // the first gap considered starts at this address
+ void* new_addr;
+ do {
+ if (rc > 0) {
+ do { // keep reading until buf is full or read() returns 0 or an error
+ rc = Sandbox::read(sys, proc_self_maps_, buf + len,
+ sizeof(buf) - len - 1);
+ if (rc > 0) {
+ len += rc;
+ }
+ } while (rc > 0 && len < (int)sizeof(buf) - 1);
+ }
+ char *ptr = buf;
+ if (!long_line) {
+ long_line = true;
+ // Maps lines have the form "<start address>-<end address> ... <name>".
+ unsigned long gap_end = strtoul(ptr, &ptr, 16); // start of this mapping == end of the gap before it
+ unsigned long map_end = strtoul(ptr + 1, &ptr, 16); // ptr + 1 skips the '-' separator
+
+ // gap_start to gap_end now covers the region of empty space before
+ // the current line. Now we try to see if there's a place within the
+ // gap we can use.
+
+ if (gap_end - gap_start >= size) {
+ // Is the gap before our target address?
+ if (addr - static_cast<long>(gap_end) >= 0) {
+ if (addr - (gap_end - size) < kMaxDistance) { // the highest placement in the gap is within reach of addr
+ unsigned long position;
+ if (isStackLine(ptr, buf + len)) {
+ // If we're adjacent to the stack, try to stay away from
+ // the GROWS_DOWN region. Pick the farthest away region that
+ // is still within the gap.
+
+ if (static_cast<unsigned long>(addr) < kMaxDistance || // Underflow protection.
+ static_cast<unsigned long>(addr) - kMaxDistance < gap_start) {
+ position = gap_start;
+ } else {
+ position = (addr - kMaxDistance) & ~4095; // round down to a multiple of 4096
+ if (position < gap_start) {
+ position = gap_start;
+ }
+ }
+ } else {
+ // Otherwise, take the end of the region.
+ position = gap_end - size;
+ }
+ new_addr = reinterpret_cast<char *>(sys.MMAP
+ (reinterpret_cast<void *>(position), size, prot,
+ MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0));
+ if (new_addr != MAP_FAILED) {
+ goto done;
+ }
+ }
+ } else if (gap_start + size - addr < kMaxDistance) {
+ // Gap is after the address. Above checks that we can wrap around
+ // through 0 to a space we'd use.
+ new_addr = reinterpret_cast<char *>(sys.MMAP
+ (reinterpret_cast<void *>(gap_start), size, prot,
+ MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1 ,0));
+ if (new_addr != MAP_FAILED) {
+ goto done;
+ }
+ }
+ }
+ gap_start = map_end; // the next gap begins where this mapping ends
+ }
+ for (;;) { // advance ptr past the end of the current line
+ if (!*ptr || *ptr++ == '\n') {
+ long_line = false;
+ memmove(buf, ptr, len - (ptr - buf)); // move the unconsumed bytes to the front of buf
+ memset(buf + len - (ptr - buf), 0, ptr - buf); // zero the vacated tail to keep buf NUL-terminated
+ len -= (ptr - buf);
+ break;
+ }
+ }
+ } while (len || long_line);
+ new_addr = NULL; // every line was examined without a successful mapping
+done:
+ return reinterpret_cast<char*>(new_addr);
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/maps.h b/sandbox/linux/seccomp/maps.h
new file mode 100644
index 0000000..fbcc7672
--- /dev/null
+++ b/sandbox/linux/seccomp/maps.h
@@ -0,0 +1,94 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef MAPS_H__
+#define MAPS_H__
+
+#include <elf.h>
+#include <functional>
+#include <map>
+#include <string>
+
+#include "allocator.h"
+
+#if defined(__x86_64__)
+typedef Elf64_Addr Elf_Addr; // x86-64 builds use the 64-bit ELF address type
+#elif defined(__i386__)
+typedef Elf32_Addr Elf_Addr; // i386 builds use the 32-bit ELF address type
+#else
+#error Undefined target platform
+#endif
+
+namespace playground {
+
+class Library;
+class Maps { // Holds the library mappings parsed from a maps file descriptor
+ friend class Library;
+ public:
+ typedef std::basic_string<char, std::char_traits<char>,
+ SystemAllocator<char> > string; // string type allocated through SystemAllocator
+
+ Maps(int proc_self_maps); // proc_self_maps: file descriptor the mappings are read from
+ ~Maps() { }
+
+ protected:
+ // A map with all the libraries currently loaded into the application.
+ // The key is a unique combination of device number, inode number, and
+ // file name. It should be treated as opaque.
+ typedef std::map<string, Library, std::less<string>,
+ SystemAllocator<std::pair<const string,
+ Library> > > LibraryMap;
+ friend class Iterator;
+ class Iterator { // Iterator over the entries of libs_
+ friend class Maps;
+
+ protected:
+ explicit Iterator(Maps* maps); // NOTE(review): no definition is visible in the maps.cc added by this change
+ Iterator(Maps* maps, bool at_beginning, bool at_end);
+ Maps::LibraryMap::iterator& getIterator() const; // resolves the at_beginning_/at_end_ flags, then returns iter_
+
+ public:
+ Iterator begin(); // copy of maps_->begin_iter_
+ Iterator end(); // copy of maps_->end_iter_
+ Iterator& operator++();
+ Iterator operator++(int i);
+ Library* operator*() const; // pointer to the Library at the current position
+ bool operator==(const Iterator& iter) const;
+ bool operator!=(const Iterator& iter) const;
+ string name() const; // map key of the current entry
+
+ protected:
+ mutable LibraryMap::iterator iter_; // mutable because the const getIterator() assigns it
+ Maps *maps_; // container being iterated
+ bool at_beginning_; // while set, getIterator() yields libs_.begin()
+ bool at_end_; // while set (and at_beginning_ is not), getIterator() yields libs_.end()
+ };
+
+ public:
+ typedef class Iterator const_iterator;
+
+ const_iterator begin() {
+ return begin_iter_;
+ }
+
+ const_iterator end() {
+ return end_iter_;
+ }
+
+ char* allocNearAddr(char *addr, size_t size, int prot) const; // maps anonymous memory near addr; NULL on failure
+
+ char* vsyscall() const { return vsyscall_; }
+
+ protected:
+ const int proc_self_maps_; // descriptor passed to the constructor
+ const Iterator begin_iter_; // constructed with at_beginning = true
+ const Iterator end_iter_; // constructed with at_end = true
+
+ LibraryMap libs_; // filled in by the constructor
+ char* vsyscall_; // start address of the "[vsyscall]" entry, or 0 if none was seen
+};
+
+} // namespace
+
+#endif // MAPS_H__
diff --git a/sandbox/linux/seccomp/mmap.cc b/sandbox/linux/seccomp/mmap.cc
new file mode 100644
index 0000000..700da91
--- /dev/null
+++ b/sandbox/linux/seccomp/mmap.cc
@@ -0,0 +1,75 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+// Sandbox-side replacement for mmap(). Packs the arguments into a request,
+// writes it to processFdPub() and returns the reply that is read back from
+// threadFdPub(). Calls die() if either transfer is incomplete.
+void* Sandbox::sandbox_mmap(void *start, size_t length, int prot, int flags,
+                            int fd, off_t offset) {
+  long long timestamp;
+  Debug::syscall(&timestamp, __NR_mmap, "Executing handler");
+
+  // Wire format: system call number, cookie, then the mmap() arguments.
+  struct {
+    int       sysnum;
+    long long cookie;
+    MMap      mmap_req;
+  } __attribute__((packed)) msg;
+  msg.sysnum          = __NR_MMAP;
+  msg.cookie          = cookie();
+  msg.mmap_req.start  = start;
+  msg.mmap_req.length = length;
+  msg.mmap_req.prot   = prot;
+  msg.mmap_req.flags  = flags;
+  msg.mmap_req.fd     = fd;
+  msg.mmap_req.offset = offset;
+
+  SysCalls sys;
+  void* result;
+  const bool sent =
+      write(sys, processFdPub(), &msg, sizeof(msg)) == sizeof(msg);
+  if (!sent ||
+      read(sys, threadFdPub(), &result, sizeof(result)) != sizeof(result)) {
+    die("Failed to forward mmap() request [sandbox]");
+  }
+  Debug::elapsed(timestamp, __NR_mmap);
+  return result;
+}
+
+// Trusted-process handler for a forwarded mmap() request. MAP_FIXED requests
+// that overlap an entry of protectedMap_ are refused with -EINVAL (returning
+// false); everything else is passed on through SecureMem::sendSystemCall()
+// (returning true).
+bool Sandbox::process_mmap(int parentMapsFd, int sandboxFd, int threadFdPub,
+                           int threadFd, SecureMem::Args* mem) {
+  // Fetch the mmap() arguments sent by the sandboxed thread.
+  SysCalls sys;
+  MMap mmap_req;
+  if (read(sys, sandboxFd, &mmap_req, sizeof(mmap_req)) != sizeof(mmap_req)) {
+    die("Failed to read parameters for mmap() [process]");
+  }
+
+  if (mmap_req.flags & MAP_FIXED) {
+    // A fixed mapping must not cover any memory area that was part of the
+    // original memory mappings.
+    void* const range_end = reinterpret_cast<void *>(
+        (char *)mmap_req.start + mmap_req.length);
+    ProtectedMap::const_iterator region = protectedMap_.lower_bound(
+        (void *)mmap_req.start);
+    // The region just below the lower bound may still reach into the range.
+    if (region != protectedMap_.begin()) {
+      --region;
+    }
+    while (region != protectedMap_.end() && region->first < range_end) {
+      void* const region_end = reinterpret_cast<void *>(
+          reinterpret_cast<char *>(region->first) + region->second);
+      if (mmap_req.start < region_end && range_end > region->first) {
+        int rc = -EINVAL;
+        SecureMem::abandonSystemCall(threadFd, rc);
+        return false;
+      }
+      ++region;
+    }
+  }
+
+  // All other mmap() requests are OK
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_MMAP,
+                            mmap_req.start, mmap_req.length, mmap_req.prot,
+                            mmap_req.flags, mmap_req.fd, mmap_req.offset);
+  return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/mprotect.cc b/sandbox/linux/seccomp/mprotect.cc
new file mode 100644
index 0000000..548199d
--- /dev/null
+++ b/sandbox/linux/seccomp/mprotect.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+// Sandbox-side replacement for mprotect(). Packs the arguments into a request,
+// writes it to processFdPub() and returns the reply that is read back from
+// threadFdPub(). Calls die() if either transfer is incomplete.
+long Sandbox::sandbox_mprotect(const void *addr, size_t len, int prot) {
+  long long timestamp;
+  Debug::syscall(&timestamp, __NR_mprotect, "Executing handler");
+
+  // Wire format: system call number, cookie, then the mprotect() arguments.
+  struct {
+    int       sysnum;
+    long long cookie;
+    MProtect  mprotect_req;
+  } __attribute__((packed)) msg;
+  msg.sysnum            = __NR_mprotect;
+  msg.cookie            = cookie();
+  msg.mprotect_req.addr = addr;
+  msg.mprotect_req.len  = len;
+  msg.mprotect_req.prot = prot;
+
+  SysCalls sys;
+  long result;
+  const bool sent =
+      write(sys, processFdPub(), &msg, sizeof(msg)) == sizeof(msg);
+  if (!sent ||
+      read(sys, threadFdPub(), &result, sizeof(result)) != sizeof(result)) {
+    die("Failed to forward mprotect() request [sandbox]");
+  }
+  Debug::elapsed(timestamp, __NR_mprotect);
+  return result;
+}
+
+// Trusted-process handler for a forwarded mprotect() request. Requests that
+// overlap an entry of protectedMap_ are refused with -EINVAL (returning
+// false); everything else is passed on through SecureMem::sendSystemCall()
+// (returning true).
+bool Sandbox::process_mprotect(int parentMapsFd, int sandboxFd,
+                               int threadFdPub, int threadFd,
+                               SecureMem::Args* mem) {
+  // Fetch the mprotect() arguments sent by the sandboxed thread.
+  SysCalls sys;
+  MProtect mprotect_req;
+  if (read(sys, sandboxFd, &mprotect_req, sizeof(mprotect_req)) !=
+      sizeof(mprotect_req)) {
+    die("Failed to read parameters for mprotect() [process]");
+  }
+
+  // Permissions must not be changed on any memory region that was part of
+  // the original memory mappings.
+  int rc = -EINVAL;
+  void* const range_end = reinterpret_cast<void *>(
+      (char *)mprotect_req.addr + mprotect_req.len);
+  ProtectedMap::const_iterator region = protectedMap_.lower_bound(
+      (void *)mprotect_req.addr);
+  // The region just below the lower bound may still reach into the range.
+  if (region != protectedMap_.begin()) {
+    --region;
+  }
+  while (region != protectedMap_.end() && region->first < range_end) {
+    void* const region_end = reinterpret_cast<void *>(
+        reinterpret_cast<char *>(region->first) + region->second);
+    if (mprotect_req.addr < region_end && range_end > region->first) {
+      SecureMem::abandonSystemCall(threadFd, rc);
+      return false;
+    }
+    ++region;
+  }
+
+  // Changing permissions on memory regions that were newly mapped inside of
+  // the sandbox is OK.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_mprotect,
+                            mprotect_req.addr, mprotect_req.len,
+                            mprotect_req.prot);
+  return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/munmap.cc b/sandbox/linux/seccomp/munmap.cc
new file mode 100644
index 0000000..dde7c7a
--- /dev/null
+++ b/sandbox/linux/seccomp/munmap.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+// Sandbox-side replacement for munmap(). Packs the arguments into a request,
+// writes it to processFdPub() and returns the reply that is read back from
+// threadFdPub(). Calls die() if either transfer is incomplete.
+long Sandbox::sandbox_munmap(void* start, size_t length) {
+  long long timestamp;
+  Debug::syscall(&timestamp, __NR_munmap, "Executing handler");
+
+  // Wire format: system call number, cookie, then the munmap() arguments.
+  struct {
+    int       sysnum;
+    long long cookie;
+    MUnmap    munmap_req;
+  } __attribute__((packed)) msg;
+  msg.sysnum            = __NR_munmap;
+  msg.cookie            = cookie();
+  msg.munmap_req.start  = start;
+  msg.munmap_req.length = length;
+
+  SysCalls sys;
+  long result;
+  const bool sent =
+      write(sys, processFdPub(), &msg, sizeof(msg)) == sizeof(msg);
+  if (!sent ||
+      read(sys, threadFdPub(), &result, sizeof(result)) != sizeof(result)) {
+    die("Failed to forward munmap() request [sandbox]");
+  }
+  Debug::elapsed(timestamp, __NR_munmap);
+  return result;
+}
+
+// Trusted-process handler for a forwarded munmap() request. Requests that
+// overlap an entry of protectedMap_ are refused with -EINVAL (returning
+// false); everything else is passed on through SecureMem::sendSystemCall()
+// (returning true).
+bool Sandbox::process_munmap(int parentMapsFd, int sandboxFd, int threadFdPub,
+                             int threadFd, SecureMem::Args* mem) {
+  // Fetch the munmap() arguments sent by the sandboxed thread.
+  SysCalls sys;
+  MUnmap munmap_req;
+  if (read(sys, sandboxFd, &munmap_req, sizeof(munmap_req)) !=
+      sizeof(munmap_req)) {
+    die("Failed to read parameters for munmap() [process]");
+  }
+
+  // No memory region that was part of the original memory mappings may be
+  // unmapped.
+  int rc = -EINVAL;
+  void* const range_end = reinterpret_cast<void *>(
+      reinterpret_cast<char *>(munmap_req.start) + munmap_req.length);
+  ProtectedMap::const_iterator region = protectedMap_.lower_bound(
+      munmap_req.start);
+  // The region just below the lower bound may still reach into the range.
+  if (region != protectedMap_.begin()) {
+    --region;
+  }
+  while (region != protectedMap_.end() && region->first < range_end) {
+    void* const region_end = reinterpret_cast<void *>(
+        reinterpret_cast<char *>(region->first) + region->second);
+    if (munmap_req.start < region_end && range_end > region->first) {
+      SecureMem::abandonSystemCall(threadFd, rc);
+      return false;
+    }
+    ++region;
+  }
+
+  // Unmapping memory regions that were newly mapped inside of the sandbox
+  // is OK.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_munmap,
+                            munmap_req.start, munmap_req.length);
+  return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/mutex.h b/sandbox/linux/seccomp/mutex.h
new file mode 100644
index 0000000..d7e1c5d
--- /dev/null
+++ b/sandbox/linux/seccomp/mutex.h
@@ -0,0 +1,153 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef MUTEX_H__
+#define MUTEX_H__
+
+#include "sandbox_impl.h"
+
+namespace playground {
+
+class Mutex { // Futex-based lock kept in a plain int: bit 31 is the lock flag, the remaining value counts waiters
+ public:
+ typedef int mutex_t;
+
+ enum { kInitValue = 0 };
+
+ static void initMutex(mutex_t* mutex) {
+ // Mutex is unlocked, and nobody is waiting for it
+ *mutex = kInitValue;
+ }
+
+ static void unlockMutex(mutex_t* mutex) { // Releases the lock and wakes one waiter if the value is still non-zero
+ char status;
+ #if defined(__x86_64__) || defined(__i386__)
+ asm volatile(
+ "lock; addl %2, %0\n" // atomically add 0x80000000, which flips bit 31
+ "setz %1" // status = 1 if the result is zero
+ : "=m"(*mutex), "=qm"(status)
+ : "ir"(0x80000000), "m"(*mutex));
+ #else
+ #error Unsupported target platform
+ #endif
+ if (status) {
+ // Mutex is zero now. No other waiters. So, we can return.
+ return;
+ }
+ // We unlocked the mutex, but still need to wake up other waiters.
+ Sandbox::SysCalls sys;
+ sys.futex(mutex, FUTEX_WAKE, 1, NULL);
+ }
+
+ static bool lockMutex(mutex_t* mutex, int timeout = 0) { // timeout is in milliseconds; returns false if the futex wait timed out
+ bool rc = true;
+ // Increment mutex to add ourselves to the list of waiters
+ #if defined(__x86_64__) || defined(__i386__)
+ asm volatile(
+ "lock; incl %0\n"
+ : "=m"(*mutex)
+ : "m"(*mutex));
+ #else
+ #error Unsupported target platform
+ #endif
+ for (;;) {
+ // Atomically check whether the mutex is available and if so, acquire it
+ char status;
+ #if defined(__x86_64__) || defined(__i386__)
+ asm volatile(
+ "lock; btsl %3, %1\n" // atomically set bit 31; the carry flag receives its previous value
+ "setc %0" // status = 1 if the bit was already set
+ : "=q"(status), "=m"(*mutex)
+ : "m"(*mutex), "ir"(31));
+ #else
+ #error Unsupported target platform
+ #endif
+ if (!status) {
+ done: // also reached by goto after a timeout, with rc == false
+ // If the mutex was available, remove ourselves from list of waiters
+ #if defined(__x86_64__) || defined(__i386__)
+ asm volatile(
+ "lock; decl %0\n"
+ : "=m"(*mutex)
+ : "m"(*mutex));
+ #else
+ #error Unsupported target platform
+ #endif
+ return rc;
+ }
+ int value = *mutex;
+ if (value >= 0) { // bit 31 clear: the lock flag is not set
+ // Mutex has just become available, no need to call kernel
+ continue;
+ }
+ Sandbox::SysCalls sys;
+ Sandbox::SysCalls::kernel_timespec tm;
+ if (timeout) {
+ tm.tv_sec = timeout / 1000;
+ tm.tv_nsec = (timeout % 1000) * 1000 * 1000;
+ } else {
+ tm.tv_sec = 0; // NOTE(review): timeout == 0 passes a zero timespec rather than NULL - confirm this is the intended behaviour
+ tm.tv_nsec = 0;
+ }
+ if (NOINTR_SYS(sys.futex(mutex, FUTEX_WAIT, value, &tm)) &&
+ sys.my_errno == ETIMEDOUT) {
+ rc = false;
+ goto done;
+ }
+ }
+ }
+
+ static bool waitForUnlock(mutex_t* mutex, int timeout = 0) { // Waits until the lock flag is clear without acquiring it; returns false on timeout
+ bool rc = true;
+ // Increment mutex to add ourselves to the list of waiters
+ #if defined(__x86_64__) || defined(__i386__)
+ asm volatile(
+ "lock; incl %0\n"
+ : "=m"(*mutex)
+ : "m"(*mutex));
+ #else
+ #error Unsupported target platform
+ #endif
+ Sandbox::SysCalls sys;
+ for (;;) {
+ mutex_t value = *mutex;
+ if (value >= 0) { // bit 31 clear: the lock flag is not set
+ done: // also reached by goto after a timeout, with rc == false
+ // Mutex was not locked. Remove ourselves from list of waiters, notify
+ // any other waiters (if any), and return.
+ #if defined(__x86_64__) || defined(__i386__)
+ asm volatile(
+ "lock; decl %0\n"
+ : "=m"(*mutex)
+ : "m"(*mutex));
+ #else
+ #error Unsupported target platform
+ #endif
+ NOINTR_SYS(sys.futex(mutex, FUTEX_WAKE, 1, 0));
+ return rc;
+ }
+
+ // Wait for mutex to become unlocked
+ Sandbox::SysCalls::kernel_timespec tm;
+ if (timeout) {
+ tm.tv_sec = timeout / 1000;
+ tm.tv_nsec = (timeout % 1000) * 1000 * 1000;
+ } else {
+ tm.tv_sec = 0; // NOTE(review): timeout == 0 passes a zero timespec rather than NULL - confirm this is the intended behaviour
+ tm.tv_nsec = 0;
+ }
+
+ if (NOINTR_SYS(sys.futex(mutex, FUTEX_WAIT, value, &tm)) &&
+ sys.my_errno == ETIMEDOUT) {
+ rc = false;
+ goto done;
+ }
+ }
+ }
+
+};
+
+} // namespace
+
+#endif // MUTEX_H__
diff --git a/sandbox/linux/seccomp/open.cc b/sandbox/linux/seccomp/open.cc
new file mode 100644
index 0000000..8a9093c
--- /dev/null
+++ b/sandbox/linux/seccomp/open.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+// Sandbox-side replacement for open(). Sends a request header followed by the
+// path name to processFdPub() and returns the reply that is read back from
+// threadFdPub(). Calls die() if either transfer is incomplete.
+long Sandbox::sandbox_open(const char *pathname, int flags, mode_t mode) {
+  long long timestamp;
+  Debug::syscall(&timestamp, __NR_open, "Executing handler");
+
+  // Wire format: system call number, cookie, the fixed open() arguments, and
+  // then path_length bytes of path name (the terminating NUL is not sent).
+  struct Request {
+    int       sysnum;
+    long long cookie;
+    Open      open_req;
+    char      pathname[0];
+  } __attribute__((packed));
+
+  const size_t path_len = strlen(pathname);
+  char buffer[sizeof(struct Request) + path_len];
+  struct Request* const msg = reinterpret_cast<struct Request*>(buffer);
+  msg->sysnum               = __NR_open;
+  msg->cookie               = cookie();
+  msg->open_req.path_length = path_len;
+  msg->open_req.flags       = flags;
+  msg->open_req.mode        = mode;
+  memcpy(msg->pathname, pathname, path_len);
+
+  SysCalls sys;
+  long result;
+  const bool sent =
+      write(sys, processFdPub(), msg, sizeof(buffer)) == (int)sizeof(buffer);
+  if (!sent ||
+      read(sys, threadFdPub(), &result, sizeof(result)) != sizeof(result)) {
+    die("Failed to forward open() request [sandbox]");
+  }
+  Debug::elapsed(timestamp, __NR_open);
+  return result;
+}
+
+// Trusted-process handler for a forwarded open() request.
+//
+// Reads the Open header and the path name from sandboxFd, then
+//  - replies -ENAMETOOLONG if the path does not fit into mem->pathname,
+//  - replies -EACCES unless the file is opened O_RDONLY and
+//    g_policy.allow_file_namespace is set, or
+//  - copies the path into the securely shared memory and asks the trusted
+//    thread to perform the open().
+// Returns true if the system call was passed on, false if it was refused.
+bool Sandbox::process_open(int parentMapsFd, int sandboxFd, int threadFdPub,
+                           int threadFd, SecureMem::Args* mem) {
+  // Read request
+  SysCalls sys;
+  Open open_req;
+  if (read(sys, sandboxFd, &open_req, sizeof(open_req)) != sizeof(open_req)) {
+ read_parm_failed:
+    die("Failed to read parameters for open() [process]");
+  }
+  int rc = -ENAMETOOLONG;
+  if (open_req.path_length >= sizeof(mem->pathname)) {
+    // The path cannot be stored. Drain it from the socket in small pieces so
+    // that the next request starts at the right position, then report the
+    // error.
+    char buf[32];
+    while (open_req.path_length > 0) {
+      size_t len = open_req.path_length > sizeof(buf) ?
+                   sizeof(buf) : open_req.path_length;
+      ssize_t i = read(sys, sandboxFd, buf, len);
+      if (i <= 0) {
+        goto read_parm_failed;
+      }
+      open_req.path_length -= i;
+    }
+    // NOTE(review): this reply is sizeof(int) bytes, whereas sandbox_open()
+    // reads sizeof(long) bytes - confirm that the sizes agree on 64-bit
+    // targets.
+    if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) {
+      die("Failed to return data from open() [process]");
+    }
+    return false;
+  }
+
+  if ((open_req.flags & O_ACCMODE) != O_RDONLY ||
+      !g_policy.allow_file_namespace) {
+    // After locking the mutex, we can no longer abandon the system call. So,
+    // perform checks before clobbering the securely shared memory.
+    char tmp[open_req.path_length];
+    if (read(sys, sandboxFd, tmp, open_req.path_length) !=
+        (ssize_t)open_req.path_length) {
+      goto read_parm_failed;
+    }
+    // "tmp" holds exactly path_length bytes and no terminating NUL, so the
+    // length has to be passed explicitly when building the message.
+    Debug::message(("Denying access to \"" +
+                    std::string(tmp, open_req.path_length) + "\"").c_str());
+    SecureMem::abandonSystemCall(threadFd, -EACCES);
+    return false;
+  }
+
+  SecureMem::lockSystemCall(parentMapsFd, mem);
+  if (read(sys, sandboxFd, mem->pathname, open_req.path_length) !=
+      (ssize_t)open_req.path_length) {
+    goto read_parm_failed;
+  }
+  // path_length < sizeof(mem->pathname) was established above, so there is
+  // room for the terminator.
+  mem->pathname[open_req.path_length] = '\000';
+
+  // TODO(markus): Implement sandboxing policy. For now, we allow read
+  //               access to everything. That's probably not correct.
+  Debug::message(("Allowing access to \"" + std::string(mem->pathname) +
+                  "\"").c_str());
+
+  // Tell trusted thread to open the file. The path is passed as the address
+  // at the same offset from mem->self as mem->pathname has from mem.
+  SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem, __NR_open,
+                            mem->pathname - (char*)mem + (char*)mem->self,
+                            open_req.flags, open_req.mode);
+  return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc
new file mode 100644
index 0000000..0b09457
--- /dev/null
+++ b/sandbox/linux/seccomp/sandbox.cc
@@ -0,0 +1,838 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "library.h"
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+namespace playground {
+
+// Definitions of the static data members of class Sandbox
+int Sandbox::proc_self_maps_ = -1; // -1 until a descriptor has been stored
+enum Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;
+int Sandbox::pid_;
+int Sandbox::processFdPub_;
+int Sandbox::cloneFdPub_;
+Sandbox::SysCalls::kernel_sigaction Sandbox::sa_segv_; // receives the previous SIGSEGV action in setupSignalHandlers()
+Sandbox::ProtectedMap Sandbox::protectedMap_; // consulted by the process_*() handlers to refuse calls on protected ranges
+std::vector<SecureMem::Args*> Sandbox::secureMemPool_;
+
+bool Sandbox::sendFd(int transport, int fd0, int fd1, const void* buf, // Sends up to two file descriptors, plus an optional payload, over "transport"
+ size_t len) { // returns true only if the whole message was sent
+ int fds[2], count = 0;
+ if (fd0 >= 0) { fds[count++] = fd0; } // negative descriptors are skipped
+ if (fd1 >= 0) { fds[count++] = fd1; }
+ if (!count) { // nothing to send
+ return false;
+ }
+ char cmsg_buf[CMSG_SPACE(count*sizeof(int))];
+ memset(cmsg_buf, 0, sizeof(cmsg_buf));
+ struct SysCalls::kernel_iovec iov[2] = { { 0 } };
+ struct SysCalls::kernel_msghdr msg = { 0 };
+ int dummy = 0; // leading int of the payload; getFd() treats a non-zero value here as an error code
+ iov[0].iov_base = &dummy;
+ iov[0].iov_len = sizeof(dummy);
+ if (buf && len > 0) { // the caller's data, if any, follows the leading int
+ iov[1].iov_base = const_cast<void *>(buf);
+ iov[1].iov_len = len;
+ }
+ msg.msg_iov = iov;
+ msg.msg_iovlen = (buf && len > 0) ? 2 : 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = CMSG_LEN(count*sizeof(int));
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS; // the control message carries file descriptors
+ cmsg->cmsg_len = CMSG_LEN(count*sizeof(int));
+ memcpy(CMSG_DATA(cmsg), fds, count*sizeof(int));
+ SysCalls sys;
+ return NOINTR_SYS(sys.sendmsg(transport, &msg, 0)) ==
+ (ssize_t)(sizeof(dummy) + ((buf && len > 0) ? len : 0)); // expected size: leading int plus optional payload
+}
+
+bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) { // Receives up to two file descriptors, plus an optional payload, from "transport"
+ int count = 0; // number of descriptors the caller asked for
+ int *err = NULL; // first non-NULL of fd0/fd1; doubles as storage for the leading int of the payload
+ if (fd0) {
+ count++;
+ err = fd0;
+ *fd0 = -1;
+ }
+ if (fd1) {
+ if (!count++) {
+ err = fd1;
+ }
+ *fd1 = -1;
+ }
+ if (!count) { // neither fd0 nor fd1 was supplied
+ return false;
+ }
+ char cmsg_buf[CMSG_SPACE(count*sizeof(int))];
+ memset(cmsg_buf, 0, sizeof(cmsg_buf));
+ struct SysCalls::kernel_iovec iov[2] = { { 0 } };
+ struct SysCalls::kernel_msghdr msg = { 0 };
+ iov[0].iov_base = err; // the leading int of the payload is received directly into *err
+ iov[0].iov_len = sizeof(int);
+ if (buf && len && *len > 0) { // on entry, *len is the capacity of buf
+ iov[1].iov_base = buf;
+ iov[1].iov_len = *len;
+ }
+ msg.msg_iov = iov;
+ msg.msg_iovlen = (buf && len && *len > 0) ? 2 : 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = CMSG_LEN(count*sizeof(int));
+ SysCalls sys;
+ ssize_t bytes = NOINTR_SYS(sys.recvmsg(transport, &msg, 0));
+ if (len) { // on return, *len is the number of bytes received beyond the leading int
+ *len = bytes > (int)sizeof(int) ?
+ bytes - sizeof(int) : 0;
+ }
+ if (bytes != (ssize_t)(sizeof(int) + ((buf && len && *len > 0) ? *len : 0))){
+ *err = bytes >= 0 ? 0 : -EBADF;
+ return false;
+ }
+ if (*err) {
+ // "err" is the first four bytes of the payload. If these are non-zero,
+ // the sender on the other side of the socketpair sent us an errno value.
+ // We don't expect to get any file handles in this case.
+ return false;
+ }
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+ if ((msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) || // reject truncated or unexpected control data
+ !cmsg ||
+ cmsg->cmsg_level != SOL_SOCKET ||
+ cmsg->cmsg_type != SCM_RIGHTS ||
+ cmsg->cmsg_len != CMSG_LEN(count*sizeof(int))) {
+ *err = -EBADF; // NOTE(review): descriptors that did arrive are not closed on this path - confirm that is acceptable
+ return false;
+ }
+ if (fd1) { *fd1 = ((int *)CMSG_DATA(cmsg))[--count]; } // taken from the back, so fd0 receives the first descriptor
+ if (fd0) { *fd0 = ((int *)CMSG_DATA(cmsg))[--count]; }
+ return true;
+}
+
+// Installs the signal dispositions the sandbox relies on: default handling
+// for SIGCHLD, the handler returned by segv() for SIGSEGV (the previous
+// action is saved in sa_segv_), and both signals unblocked.
+void Sandbox::setupSignalHandlers() {
+  SysCalls sys;
+  struct SysCalls::kernel_sigaction action;
+  memset(&action, 0, sizeof(action));
+
+  // waitpid() can only work if SIGCHLD is set to SIG_DFL.
+  action.sa_handler_ = SIG_DFL;
+  sys.sigaction(SIGCHLD, &action, NULL);
+
+  // The SEGV handler deals with RDTSC instructions, system calls that have
+  // been rewritten to use INT0, sigprocmask() emulation, the creation of
+  // threads, and user-provided SEGV handlers.
+  action.sa_sigaction_ = segv();
+  action.sa_flags      = SA_SIGINFO | SA_NODEFER;
+  sys.sigaction(SIGSEGV, &action, &sa_segv_);
+
+  // Make sure that neither SIGSEGV nor SIGCHLD is blocked.
+  SysCalls::kernel_sigset_t unblock;
+  memset(&unblock, 0x00, sizeof(unblock));
+  unblock.sig[0] |= (1 << (SIGSEGV - 1)) | (1 << (SIGCHLD - 1));
+  sys.sigprocmask(SIG_UNBLOCK, &unblock, 0);
+}
+
+void (*Sandbox::segv())(int signo, SysCalls::siginfo *context, void *unused) {
+ void (*fnc)(int signo, SysCalls::siginfo *context, void *unused);
+ asm volatile(
+ "call 999f\n"
+#if defined(__x86_64__)
+ // Inspect instruction at the point where the segmentation fault
+ // happened. If it is RDTSC, forward the request to the trusted
+ // thread.
+ "mov $-3, %%r14\n" // request for RDTSC
+ "mov 0xB0(%%rsp), %%r15\n" // %rip at time of segmentation fault
+ "cmpw $0x310F, (%%r15)\n" // RDTSC
+ "jz 0f\n"
+ "cmpw $0x010F, (%%r15)\n" // RDTSCP
+ "jnz 8f\n"
+ "cmpb $0xF9, 2(%%r15)\n"
+ "jnz 8f\n"
+ "mov $-4, %%r14\n" // request for RDTSCP
+ "0:"
+#ifndef NDEBUG
+ "lea 100f(%%rip), %%rdi\n"
+ "call playground$debugMessage\n"
+#endif
+ "sub $4, %%rsp\n"
+ "push %%r14\n"
+ "mov %%gs:16, %%edi\n" // fd = threadFdPub
+ "mov %%rsp, %%rsi\n" // buf = %rsp
+ "mov $4, %%edx\n" // len = sizeof(int)
+ "1:mov $1, %%eax\n" // NR_write
+ "syscall\n"
+ "cmp %%rax, %%rdx\n"
+ "jz 5f\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 1b\n"
+ "2:add $12, %%rsp\n"
+ "movq $0, 0x98(%%rsp)\n" // %rax at time of segmentation fault
+ "movq $0, 0x90(%%rsp)\n" // %rdx at time of segmentation fault
+ "cmpw $0x310F, (%%r15)\n" // RDTSC
+ "jz 3f\n"
+ "movq $0, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault
+ "3:addq $2, 0xB0(%%rsp)\n" // %rip at time of segmentation fault
+ "cmpw $0x010F, (%%r15)\n" // RDTSC
+ "jnz 4f\n"
+ "addq $1, 0xB0(%%rsp)\n" // %rip at time of segmentation fault
+ "4:ret\n"
+ "5:mov $12, %%edx\n" // len = 3*sizeof(int)
+ "6:mov $0, %%eax\n" // NR_read
+ "syscall\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 6b\n"
+ "cmp %%rax, %%rdx\n"
+ "jnz 2b\n"
+ "mov 0(%%rsp), %%eax\n"
+ "mov 4(%%rsp), %%edx\n"
+ "mov 8(%%rsp), %%ecx\n"
+ "add $12, %%rsp\n"
+ "mov %%rdx, 0x90(%%rsp)\n" // %rdx at time of segmentation fault
+ "cmpw $0x310F, (%%r15)\n" // RDTSC
+ "jz 7f\n"
+ "mov %%rcx, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault
+ "7:mov %%rax, 0x98(%%rsp)\n" // %rax at time of segmentation fault
+ "jmp 3b\n"
+
+ // If the instruction is INT 0, then this was probably the result
+ // of playground::Library being unable to find a way to safely
+ // rewrite the system call instruction. Retrieve the CPU register
+ // at the time of the segmentation fault and invoke syscallWrapper().
+ "8:cmpw $0x00CD, (%%r15)\n" // INT $0x0
+ "jnz 16f\n"
+#ifndef NDEBUG
+ "lea 200f(%%rip), %%rdi\n"
+ "call playground$debugMessage\n"
+#endif
+ "mov 0x98(%%rsp), %%rax\n" // %rax at time of segmentation fault
+ "mov 0x70(%%rsp), %%rdi\n" // %rdi at time of segmentation fault
+ "mov 0x78(%%rsp), %%rsi\n" // %rsi at time of segmentation fault
+ "mov 0x90(%%rsp), %%rdx\n" // %rdx at time of segmentation fault
+ "mov 0x40(%%rsp), %%r10\n" // %r10 at time of segmentation fault
+ "mov 0x30(%%rsp), %%r8\n" // %r8 at time of segmentation fault
+ "mov 0x38(%%rsp), %%r9\n" // %r9 at time of segmentation fault
+
+ // Handle rt_sigprocmask()
+ "cmp $14, %%rax\n" // NR_rt_sigprocmask
+ "jnz 12f\n"
+ "mov $-22, %%rax\n" // -EINVAL
+ "cmp $8, %%r10\n" // %r10 = sigsetsize (8 bytes = 64 signals)
+ "jl 7b\n"
+ "mov 0x130(%%rsp), %%r10\n" // signal mask at time of segmentation fault
+ "test %%rsi, %%rsi\n" // only set mask, if set is non-NULL
+ "jz 11f\n"
+ "mov 0(%%rsi), %%rsi\n"
+ "cmp $0, %%rdi\n" // %rdi = how (SIG_BLOCK)
+ "jnz 9f\n"
+ "or %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault
+ "jmp 11f\n"
+ "9:cmp $1, %%rdi\n" // %rdi = how (SIG_UNBLOCK)
+ "jnz 10f\n"
+ "xor $-1, %%rsi\n"
+ "and %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault
+ "jmp 11f\n"
+ "10:cmp $2, %%rdi\n" // %rdi = how (SIG_SETMASK)
+ "jnz 7b\n"
+ "mov %%rsi, 0x130(%%rsp)\n" // signal mask at time of segmentation fault
+ "11:xor %%rax, %%rax\n"
+ "test %%rdx, %%rdx\n" // only return old mask, if set is non-NULL
+ "jz 7b\n"
+ "mov %%r10, 0(%%rdx)\n" // old_set
+ "jmp 7b\n"
+
+ // Handle rt_sigreturn()
+ "12:cmp $15, %%rax\n" // NR_rt_sigreturn
+ "jnz 14f\n"
+ "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault
+ "13:syscall\n" // rt_sigreturn() is unrestricted
+ "mov $66, %%edi\n" // rt_sigreturn() should never return
+ "mov $231, %%eax\n" // NR_exit_group
+ "jmp 13b\n"
+
+ // Copy signal frame onto new stack. See clone.cc for details
+ "14:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000
+ "jnz 15f\n"
+ "lea 8(%%rsp), %%rax\n" // retain stack frame upon returning
+ "mov %%rax, 0xA8(%%rsp)\n" // %rsp at time of segmentation fault
+ "jmp 7b\n"
+
+ // Forward system call to syscallWrapper()
+ "15:lea 7b(%%rip), %%rcx\n"
+ "push %%rcx\n"
+ "push 0xB8(%%rsp)\n" // %rip at time of segmentation fault
+ "lea playground$syscallWrapper(%%rip), %%rcx\n"
+ "jmp *%%rcx\n"
+
+ // In order to implement SA_NODEFER, we have to keep track of recursive
+ // calls to SIGSEGV handlers. This means we have to increment a counter
+ // before calling the user's signal handler, and decrement it on
+ // leaving the user's signal handler.
+ // Some signal handlers look at the return address of the signal
+ // stack, and more importantly "gdb" uses the call to rt_sigreturn()
+ // as a magic signature when doing stacktraces. So, we have to use
+ // a little more unusual code to regain control after the user's
+ // signal handler is done. We adjust the return address to point to
+ // non-executable memory. And when we trigger another SEGV we pop the
+ // extraneous signal frame and then call rt_sigreturn().
+ // N.B. We currently do not correctly adjust the SEGV counter if the
+ // user's signal handler exits in a way other than by returning (e.g. by
+ // directly calling rt_sigreturn(), or by calling siglongjmp()).
+ "16:lea 22f(%%rip), %%r14\n"
+ "cmp %%r14, %%r15\n"
+ "jnz 17f\n" // check if returning from user's handler
+ "decl %%gs:0x105C-0xE0\n" // decrement SEGV recursion counter
+ "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault
+ "mov $0xF, %%eax\n" // NR_rt_sigreturn
+ "syscall\n"
+
+ // This was a genuine segmentation fault. Check Sandbox::sa_segv_ for
+ // what we are supposed to do.
+ "17:mov playground$sa_segv@GOTPCREL(%%rip), %%rax\n"
+ "cmp $0, 0(%%rax)\n" // SIG_DFL
+ "jz 18f\n"
+ "cmp $1, 0(%%rax)\n" // SIG_IGN
+ "jnz 19f\n" // can't really ignore synchronous signals
+
+ // Trigger the kernel's default signal disposition. The only way we can
+ // do this from seccomp mode is by blocking the signal and retriggering
+ // it.
+ "18:orb $4, 0x131(%%rsp)\n" // signal mask at time of segmentation fault
+ "ret\n"
+
+ // Check sa_flags:
+ // - We can ignore SA_NOCLDSTOP, SA_NOCLDWAIT, and SA_RESTART as they
+ // do not have any effect for SIGSEGV.
+ // - On x86-64, we can also ignore SA_SIGINFO, as the calling
+ // conventions for sa_handler() are a subset of the conventions for
+ // sa_sigaction().
+ // - We have to always register our signal handler with SA_NODEFER so
+ // that the user's signal handler can make system calls which might
+ // require additional help from our SEGV handler.
+ // - If the user's signal handler wasn't supposed to be SA_NODEFER, then
+ // we emulate this behavior by keeping track of a recursion counter.
+ //
+ // TODO(markus): If/when we add support for sigaltstack(), we have to
+ // handle SA_ONSTACK.
+ "19:cmpl $0, %%gs:0x105C-0xE0\n"// check if we failed inside of SEGV handler
+ "jnz 18b\n" // if so, then terminate program
+ "mov 0(%%rax), %%rbx\n" // sa_segv_.sa_sigaction
+ "mov 8(%%rax), %%rcx\n" // sa_segv_.sa_flags
+ "btl $31, %%ecx\n" // SA_RESETHAND
+ "jnc 20f\n"
+ "movq $0, 0(%%rax)\n" // set handler to SIG_DFL
+ "20:btl $30, %%ecx\n" // SA_NODEFER
+ "jc 21f\n"
+ "mov %%r14, 0(%%rsp)\n" // trigger a SEGV on return, so that we can
+ "incl %%gs:0x105C-0xE0\n" // clean up state; incr. recursion counter
+ "21:jmp *%%rbx\n" // call user's signal handler
+
+
+ // Non-executable version of the restorer function. We use this to
+ // trigger a SEGV upon returning from the user's signal handler, giving
+ // us an ability to clean up prior to returning from the SEGV handler.
+ ".pushsection .data\n" // move code into non-executable section
+ "22:mov $0xF, %%rax\n" // gdb looks for this signature when doing
+ "syscall\n" // backtraces
+ ".popsection\n"
+#elif defined(__i386__)
+ // Inspect instruction at the point where the segmentation fault
+ // happened. If it is RDTSC, forward the request to the trusted
+ // thread.
+ "mov $-3, %%ebx\n" // request for RDTSC
+ "mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault
+ "cmpw $0x310F, (%%ebp)\n" // RDTSC
+ "jz 0f\n"
+ "cmpw $0x010F, (%%ebp)\n" // RDTSCP
+ "jnz 9f\n"
+ "cmpb $0xF9, 2(%%ebp)\n"
+ "jnz 9f\n"
+ "mov $-4, %%ebx\n" // request for RDTSCP
+ "0:"
+#ifndef NDEBUG
+ "lea 100f, %%eax\n"
+ "push %%eax\n"
+ "call playground$debugMessage\n"
+ "sub $4, %%esp\n"
+#else
+ "sub $8, %%esp\n" // allocate buffer for receiving timestamp
+#endif
+ "push %%ebx\n"
+ "mov %%fs:16, %%ebx\n" // fd = threadFdPub
+ "mov %%esp, %%ecx\n" // buf = %esp
+ "mov $4, %%edx\n" // len = sizeof(int)
+ "1:mov %%edx, %%eax\n" // NR_write
+ "int $0x80\n"
+ "cmp %%eax, %%edx\n"
+ "jz 7f\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 1b\n"
+ "2:add $12, %%esp\n" // remove temporary buffer from stack
+ "xor %%eax, %%eax\n"
+ "movl $0, 0xC8(%%esp)\n" // %edx at time of segmentation fault
+ "cmpw $0x310F, (%%ebp)\n" // RDTSC
+ "jz 3f\n"
+ "movl $0, 0xCC(%%esp)\n" // %ecx at time of segmentation fault
+ "3:mov %%eax, 0xD0(%%esp)\n" // %eax at time of segmentation fault
+ "4:mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault
+ "addl $2, 0xDC(%%esp)\n" // %eip at time of segmentation fault
+ "cmpw $0x010F, (%%ebp)\n" // RDTSCP
+ "jnz 5f\n"
+ "addl $1, 0xDC(%%esp)\n" // %eip at time of segmentation fault
+ "5:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger
+ "mov 0x1CC(%%esp), %%eax\n" // push signal number
+ "push %%eax\n"
+ "lea 0x270(%%esp), %%esi\n" // copy siginfo register values
+ "lea 0x4(%%esp), %%edi\n" // into new location
+ "mov $22, %%ecx\n"
+ "cld\n"
+ "rep movsl\n"
+ "mov 0x2C8(%%esp), %%ebx\n" // copy first half of signal mask
+ "mov %%ebx, 0x54(%%esp)\n"
+ "lea 6f, %%esi\n" // copy "magic" restorer function
+ "push %%esi\n" // push restorer function
+ "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers
+ "movb $2, %%cl\n"
+ "rep movsl\n"
+ "ret\n" // return to restorer function
+
+ // The restorer function is sometimes used by gdb as a magic marker to
+ // recognize signal stack frames. Don't change any of the next three
+ // instructions.
+ "6:pop %%eax\n" // remove dummy argument (signo)
+ "mov $119, %%eax\n" // NR_sigreturn
+ "int $0x80\n"
+ "7:mov $12, %%edx\n" // len = 3*sizeof(int)
+ "8:mov $3, %%eax\n" // NR_read
+ "int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 8b\n"
+ "cmp %%eax, %%edx\n"
+ "jnz 2b\n"
+ "pop %%eax\n"
+ "pop %%edx\n"
+ "pop %%ecx\n"
+ "mov %%edx, 0xC8(%%esp)\n" // %edx at time of segmentation fault
+ "cmpw $0x310F, (%%ebp)\n" // RDTSC
+ "jz 3b\n"
+ "mov %%ecx, 0xCC(%%esp)\n" // %ecx at time of segmentation fault
+ "jmp 3b\n"
+
+ // If the instruction is INT 0, then this was probably the result
+ // of playground::Library being unable to find a way to safely
+ // rewrite the system call instruction. Retrieve the CPU register
+ // at the time of the segmentation fault and invoke syscallWrapper().
+ "9:cmpw $0x00CD, (%%ebp)\n" // INT $0x0
+ "jnz 20f\n"
+#ifndef NDEBUG
+ "lea 200f, %%eax\n"
+ "push %%eax\n"
+ "call playground$debugMessage\n"
+ "add $0x4, %%esp\n"
+#endif
+ "mov 0xD0(%%esp), %%eax\n" // %eax at time of segmentation fault
+ "mov 0xC4(%%esp), %%ebx\n" // %ebx at time of segmentation fault
+ "mov 0xCC(%%esp), %%ecx\n" // %ecx at time of segmentation fault
+ "mov 0xC8(%%esp), %%edx\n" // %edx at time of segmentation fault
+ "mov 0xB8(%%esp), %%esi\n" // %esi at time of segmentation fault
+ "mov 0xB4(%%esp), %%edi\n" // %edi at time of segmentation fault
+ "mov 0xB2(%%esp), %%ebp\n" // %ebp at time of segmentation fault
+
+ // Handle sigprocmask() and rt_sigprocmask()
+ "cmp $175, %%eax\n" // NR_rt_sigprocmask
+ "jnz 10f\n"
+ "mov $-22, %%eax\n" // -EINVAL
+ "cmp $8, %%esi\n" // %esi = sigsetsize (8 bytes = 64 signals)
+ "jl 3b\n"
+ "jmp 11f\n"
+ "10:cmp $126, %%eax\n" // NR_sigprocmask
+ "jnz 15f\n"
+ "mov $-22, %%eax\n"
+ "11:mov 0xFC(%%esp), %%edi\n" // signal mask at time of segmentation fault
+ "mov 0x100(%%esp), %%ebp\n"
+ "test %%ecx, %%ecx\n" // only set mask, if set is non-NULL
+ "jz 14f\n"
+ "mov 0(%%ecx), %%esi\n"
+ "mov 4(%%ecx), %%ecx\n"
+ "cmp $0, %%ebx\n" // %ebx = how (SIG_BLOCK)
+ "jnz 12f\n"
+ "or %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault
+ "or %%ecx, 0x100(%%esp)\n"
+ "jmp 14f\n"
+ "12:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK)
+ "jnz 13f\n"
+ "xor $-1, %%esi\n"
+ "xor $-1, %%ecx\n"
+ "and %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault
+ "and %%ecx, 0x100(%%esp)\n"
+ "jmp 14f\n"
+ "13:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK)
+ "jnz 3b\n"
+ "mov %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault
+ "mov %%ecx, 0x100(%%esp)\n"
+ "14:xor %%eax, %%eax\n"
+ "test %%edx, %%edx\n" // only return old mask, if set is non-NULL
+ "jz 3b\n"
+ "mov %%edi, 0(%%edx)\n" // old_set
+ "mov %%ebp, 4(%%edx)\n"
+ "jmp 3b\n"
+
+ // Handle sigreturn() and rt_sigreturn()
+ // See syscall.cc for a discussion on how we can emulate rt_sigreturn()
+ // by calling sigreturn() with a suitably adjusted stack.
+ "15:cmp $119, %%eax\n" // NR_sigreturn
+ "jnz 17f\n"
+ "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault
+ "16:int $0x80\n" // sigreturn() is unrestricted
+ "17:cmp $173, %%eax\n" // NR_rt_sigreturn
+ "jnz 18f\n"
+ "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault
+ "sub $4, %%esp\n" // add fake return address
+ "jmp 4b\n"
+
+ // Copy signal frame onto new stack. In the process, we have to convert
+ // it from an RT signal frame to a legacy signal frame.
+ // See clone.cc for details
+ "18:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000
+ "jnz 19f\n"
+ "lea -0x1C8(%%esp), %%eax\n"// retain stack frame upon returning
+ "mov %%eax, 0xC0(%%esp)\n" // %esp at time of segmentation fault
+ "jmp 3b\n"
+
+ // Forward system call to syscallWrapper()
+ "19:call playground$syscallWrapper\n"
+ "jmp 3b\n"
+
+ // In order to implement SA_NODEFER, we have to keep track of recursive
+ // calls to SIGSEGV handlers. This means we have to increment a counter
+ // before calling the user's signal handler, and decrement it on
+ // leaving the user's signal handler.
+ // Some signal handlers look at the return address of the signal
+ // stack, and more importantly "gdb" uses the call to {,rt_}sigreturn()
+ // as a magic signature when doing stacktraces. So, we have to use
+ // a little more unusual code to regain control after the user's
+ // signal handler is done. We adjust the return address to point to
+ // non-executable memory. And when we trigger another SEGV we pop the
+ // extraneous signal frame and then call sigreturn().
+ // N.B. We currently do not correctly adjust the SEGV counter if the
+ // user's signal handler exits in a way other than by returning (e.g. by
+ // directly calling {,rt_}sigreturn(), or by calling siglongjmp()).
+ "20:lea 30f, %%edi\n" // rt-style restorer function
+ "lea 31f, %%esi\n" // legacy restorer function
+ "cmp %%ebp, %%edi\n" // check if returning from user's handler
+ "jnz 21f\n"
+ "decl %%fs:0x1040-0x58\n" // decrement SEGV recursion counter
+ "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault
+ "jmp 29f\n"
+ "21:cmp %%ebp, %%esi\n" // check if returning from user's handler
+ "jnz 22f\n"
+ "decl %%fs:0x1040-0x58\n" // decrement SEGV recursion counter
+ "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault
+ "jmp 6b\n"
+
+ // This was a genuine segmentation fault. Check Sandbox::sa_segv_ for
+ // what we are supposed to do.
+ "22:lea playground$sa_segv, %%eax\n"
+ "cmp $0, 0(%%eax)\n" // SIG_DFL
+ "jz 23f\n"
+ "cmp $1, 0(%%eax)\n" // SIG_IGN
+ "jnz 24f\n" // can't really ignore synchronous signals
+
+ // Trigger the kernel's default signal disposition. The only way we can
+ // do this from seccomp mode is by blocking the signal and retriggering
+ // it.
+ "23:orb $4, 0xFD(%%esp)\n" // signal mask at time of segmentation fault
+ "jmp 5b\n"
+
+ // Check sa_flags:
+ // - We can ignore SA_NOCLDSTOP, SA_NOCLDWAIT, and SA_RESTART as they
+ // do not have any effect for SIGSEGV.
+ // - We have to always register our signal handler with SA_NODEFER so
+ // that the user's signal handler can make system calls which might
+ // require additional help from our SEGV handler.
+ // - If the user's signal handler wasn't supposed to be SA_NODEFER, then
+ // we emulate this behavior by keeping track of a recursion counter.
+ //
+ // TODO(markus): If/when we add support for sigaltstack(), we have to
+ // handle SA_ONSTACK.
+ "24:cmpl $0, %%fs:0x1040-0x58\n"// check if we failed inside of SEGV handler
+ "jnz 23b\n" // if so, then terminate program
+ "mov 0(%%eax), %%ebx\n" // sa_segv_.sa_sigaction
+ "mov 4(%%eax), %%ecx\n" // sa_segv_.sa_flags
+ "btl $31, %%ecx\n" // SA_RESETHAND
+ "jnc 25f\n"
+ "movl $0, 0(%%eax)\n" // set handler to SIG_DFL
+ "25:btl $30, %%ecx\n" // SA_NODEFER
+ "jc 28f\n"
+ "btl $2, %%ecx\n" // SA_SIGINFO
+ "jnc 26f\n"
+ "mov %%edi, 0(%%esp)\n" // trigger a SEGV on return
+ "incl %%fs:0x1040-0x58\n" // increment recursion counter
+ "jmp *%%ebx\n" // call user's signal handler
+ "26:mov %%esi, 0(%%esp)\n"
+ "incl %%fs:0x1040-0x58\n" // increment recursion counter
+
+ // We always register the signal handler to give us rt-style signal
+ // frames. But if the user asked for legacy signal frames, we must
+ // convert the signal frame prior to calling the user's signal handler.
+ "27:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger
+ "mov 0x1CC(%%esp), %%eax\n" // push signal number
+ "push %%eax\n"
+ "mov 0x1CC(%%esp), %%eax\n" // push restorer function
+ "push %%eax\n"
+ "lea 0x274(%%esp), %%esi\n" // copy siginfo register values
+ "lea 0x8(%%esp), %%edi\n" // into new location
+ "mov $22, %%ecx\n"
+ "cld\n"
+ "rep movsl\n"
+ "mov 0x2CC(%%esp), %%eax\n" // copy first half of signal mask
+ "mov %%eax, 0x58(%%esp)\n"
+ "lea 31f, %%esi\n"
+ "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers
+ "movb $2, %%cl\n"
+ "rep movsl\n"
+ "jmp *%%ebx\n" // call user's signal handler
+ "28:lea 6b, %%eax\n" // set appropriate restorer function
+ "mov %%eax, 0(%%esp)\n"
+ "btl $2, %%ecx\n" // SA_SIGINFO
+ "jnc 27b\n"
+ "lea 29f, %%eax\n"
+ "mov %%eax, 0(%%esp)\n" // set appropriate restorer function
+ "jmp *%%ebx\n" // call user's signal handler
+ "29:pushl $30f\n" // emulate rt_sigreturn()
+ "jmp 5b\n"
+
+ // Non-executable versions of the restorer function. We use these to
+ // trigger a SEGV upon returning from the user's signal handler, giving
+ // us an ability to clean up prior to returning from the SEGV handler.
+ ".pushsection .data\n" // move code into non-executable section
+ "30:mov $173, %%eax\n" // NR_rt_sigreturn
+ "int $0x80\n" // gdb looks for this signature when doing
+ ".byte 0\n" // backtraces
+ "31:pop %%eax\n"
+ "mov $119, %%eax\n" // NR_sigreturn
+ "int $0x80\n"
+ ".popsection\n"
+#else
+#error Unsupported target platform
+#endif
+ ".pushsection \".rodata\"\n"
+#ifndef NDEBUG
+ "100:.asciz \"RDTSC(P): Executing handler\\n\"\n"
+ "200:.asciz \"INT $0x0: Executing handler\\n\"\n"
+#endif
+ ".popsection\n"
+ "999:pop %0\n"
+ : "=g"(fnc)
+ :
+ : "memory"
+#if defined(__x86_64__)
+ , "rsp"
+#elif defined(__i386__)
+ , "esp"
+#endif
+ );
+ return fnc;
+}
+
+// Loads and returns the SecureMem::Args pointer that is stored at a fixed
+// offset from the TLS segment base (%gs on x86-64, %fs on i386). See
+// trusted_thread.cc for the magic offset that gets us from the TLS to the
+// beginning of the secure memory area.
+SecureMem::Args* Sandbox::getSecureMem() {
+#if defined(__x86_64__)
+  SecureMem::Args* secureMem;
+  asm volatile("movq %%gs:-0xE0, %0\n" : "=q"(secureMem));
+  return secureMem;
+#elif defined(__i386__)
+  SecureMem::Args* secureMem;
+  asm volatile("movl %%fs:-0x58, %0\n" : "=r"(secureMem));
+  return secureMem;
+#else
+#error Unsupported target platform
+#endif
+}
+
+// Rewinds the "/proc/self/maps" descriptor to offset zero, hands it to
+// sendFd() together with processFd, and then reads an int-sized reply from
+// processFd. If any of these steps fails, the program is terminated via
+// die().
+void Sandbox::snapshotMemoryMappings(int processFd, int proc_self_maps) {
+  SysCalls sys;
+  int ack;
+  // The steps are evaluated strictly in order; the first failure
+  // short-circuits the remaining ones.
+  const bool ok =
+      sys.lseek(proc_self_maps, 0, SEEK_SET) == 0 &&
+      sendFd(processFd, proc_self_maps, -1, NULL, 0) &&
+      read(sys, processFd, &ack, sizeof(ack)) == sizeof(ack);
+  if (!ok) {
+    die("Cannot access /proc/self/maps");
+  }
+}
+
+// Probes whether the seccomp sandbox can be started on this machine. The
+// probe forks a child which tries to start the sandbox and, on success,
+// writes a single byte to a pipe; the parent treats a successful one-byte
+// read as "available". The outcome is cached in status_, so the probe runs
+// at most once.
+//
+// proc_fd: a file descriptor for "/proc", or a negative value if the caller
+//          did not provide one.
+// Returns non-zero if the sandbox is (or may be) usable, zero if it is known
+// to be unsupported.
+int Sandbox::supportsSeccompSandbox(int proc_fd) {
+  if (status_ != STATUS_UNKNOWN) {
+    return status_ != STATUS_UNSUPPORTED;
+  }
+  int fds[2];
+  SysCalls sys;
+  if (sys.pipe(fds)) {
+    status_ = STATUS_UNSUPPORTED;
+    return 0;
+  }
+  pid_t pid;
+  switch ((pid = sys.fork())) {
+    case -1:
+      // We could not create the child. Release both ends of the pipe so that
+      // a failed probe does not leak file descriptors.
+      NOINTR_SYS(sys.close(fds[0]));
+      NOINTR_SYS(sys.close(fds[1]));
+      status_ = STATUS_UNSUPPORTED;
+      return 0;
+    case 0: {
+      // Child: detach stdin/stdout/stderr so that the probe stays silent.
+      int devnull = sys.open("/dev/null", O_RDWR, 0);
+      if (devnull >= 0) {
+        sys.dup2(devnull, 0);
+        sys.dup2(devnull, 1);
+        sys.dup2(devnull, 2);
+        sys.close(devnull);
+      }
+      if (proc_fd >= 0) {
+        setProcSelfMaps(sys.openat(proc_fd, "self/maps", O_RDONLY, 0));
+      }
+      startSandbox();
+
+      // Getting here means the sandbox started; report success to the parent.
+      write(sys, fds[1], "", 1);
+
+      // Try to tell the trusted thread to shut down the entire process in an
+      // orderly fashion
+      defaultSystemCallHandler(__NR_exit_group, 0, 0, 0, 0, 0, 0);
+
+      // If that did not work (e.g. because the kernel does not know about the
+      // exit_group() system call), make a direct _exit() system call instead.
+      // This system call is unrestricted in seccomp mode, so it will always
+      // succeed. Normally, we don't like it, because unlike exit_group() it
+      // does not terminate any other thread. But since we know that
+      // exit_group() exists in all kernels which support kernel-level threads,
+      // this is OK: we only get here on old kernels, where _exit() suffices.
+      sys._exit(0);
+    }
+    default: {
+      // Parent: close our copy of the write end so that read() observes
+      // end-of-file if the child dies without reporting success.
+      NOINTR_SYS(sys.close(fds[1]));
+      char ch;
+      if (read(sys, fds[0], &ch, 1) != 1) {
+        status_ = STATUS_UNSUPPORTED;
+      } else {
+        status_ = STATUS_AVAILABLE;
+      }
+      int rc;
+      NOINTR_SYS(sys.waitpid(pid, &rc, 0));
+      NOINTR_SYS(sys.close(fds[0]));
+      return status_ != STATUS_UNSUPPORTED;
+    }
+  }
+}
+
+void Sandbox::setProcSelfMaps(int proc_self_maps) { // Remember a caller-provided "/proc/self/maps" fd
+ proc_self_maps_ = proc_self_maps; // startSandbox() later uses this fd and closes it
+}
+
+void Sandbox::startSandbox() { // Sets up and enables the sandbox (no-op once enabled)
+ if (status_ == STATUS_UNSUPPORTED) {
+ die("The seccomp sandbox is not supported on this computer");
+ } else if (status_ == STATUS_ENABLED) {
+ return;
+ }
+
+ SysCalls sys;
+ if (proc_self_maps_ < 0) { // no fd was supplied via setProcSelfMaps(); open it ourselves
+ proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0);
+ if (proc_self_maps_ < 0) {
+ die("Cannot access \"/proc/self/maps\"");
+ }
+ }
+
+ // The pid is unchanged for the entire program, so we can retrieve it once
+ // and store it in a global variable.
+ pid_ = sys.getpid();
+
+ // Block all signals, except for the RDTSC handler
+ setupSignalHandlers();
+
+ // Get socketpairs for talking to the trusted process
+ int pair[4]; // [0],[1] hold the first socketpair; [2],[3] hold the second
+ if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair) ||
+ sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) {
+ die("Failed to create trusted thread");
+ }
+ processFdPub_ = pair[0];
+ cloneFdPub_ = pair[2];
+ SecureMemArgs* secureMem = createTrustedProcess(pair[0], pair[1],
+ pair[2], pair[3]);
+
+ // We find all libraries that have system calls and redirect the system
+ // calls to the sandbox. If we miss any system calls, the application will be
+ // terminated by the kernel's seccomp code. So, from a security point of
+ // view, if this code fails to identify system calls, we are still behaving
+ // correctly.
+ {
+ Maps maps(proc_self_maps_);
+ const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL };
+
+ // Intercept system calls in the VDSO segment (if any). This has to happen
+ // before intercepting system calls in any of the other libraries, as
+ // the main kernel entry point might be inside of the VDSO and we need to
+ // determine its address before we can compare it to jumps from inside
+ // other libraries.
+ for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){
+ Library* library = *iter;
+ if (library->isVDSO() && library->parseElf()) {
+ library->makeWritable(true);
+ library->patchSystemCalls();
+ library->makeWritable(false);
+ break; // stop after the first VDSO mapping that parses successfully
+ }
+ }
+
+ // Intercept system calls in libraries that are known to have them.
+ for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){
+ Library* library = *iter;
+ const char* mapping = iter.name().c_str();
+
+ // Find the actual base name of the mapped library by skipping past any
+ // SPC and forward-slashes. We don't want to accidentally find matches,
+ // because the directory name included part of our well-known lib names.
+ //
+ // Typically, prior to pruning, entries would look something like this:
+ // 08:01 2289011 /lib/libc-2.7.so
+ for (const char *delim = " /"; *delim; ++delim) {
+ const char* skip = strrchr(mapping, *delim);
+ if (skip) {
+ mapping = skip + 1;
+ }
+ }
+
+ for (const char **ptr = libs; *ptr; ptr++) {
+ const char *name = strstr(mapping, *ptr);
+ if (name == mapping) { // the well-known name must be a prefix of the base name
+ char ch = name[strlen(*ptr)];
+ if (ch < 'A' || (ch > 'Z' && ch < 'a') || ch > 'z') { // next char is not an ASCII letter
+ if (library->parseElf()) {
+ library->makeWritable(true);
+ library->patchSystemCalls();
+ library->makeWritable(false);
+ break; // patched; skip the remaining names for this mapping
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Take a snapshot of the current memory mappings. These mappings will be
+ // off-limits to all future mmap(), munmap(), mremap(), and mprotect() calls.
+ snapshotMemoryMappings(processFdPub_, proc_self_maps_);
+ NOINTR_SYS(sys.close(proc_self_maps_));
+ proc_self_maps_ = -1; // the fd has been closed; mark it as unset
+
+ // Creating the trusted thread enables sandboxing
+ createTrustedThread(processFdPub_, cloneFdPub_, secureMem);
+
+ // We can no longer check for sandboxing support at this point, but we also
+ // know for a fact that it is available (as we just turned it on). So update
+ // the status to reflect this information.
+ status_ = STATUS_ENABLED;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/sandbox.h b/sandbox/linux/seccomp/sandbox.h
new file mode 100644
index 0000000..8f49575
--- /dev/null
+++ b/sandbox/linux/seccomp/sandbox.h
@@ -0,0 +1,12 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SANDBOX_H__
+#define SANDBOX_H__
+
+extern "C" int SupportsSeccompSandbox(int proc_fd); // proc_fd: fd for "/proc", or -1; non-zero result means supported
+extern "C" void SeccompSandboxSetProcSelfMaps(int proc_self_maps); // sandbox takes ownership of this open fd
+extern "C" void StartSeccompSandbox(); // main entry point; rewrites system calls and enters seccomp mode
+
+#endif // SANDBOX_H__
diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h
new file mode 100644
index 0000000..3e99a5510
--- /dev/null
+++ b/sandbox/linux/seccomp/sandbox_impl.h
@@ -0,0 +1,715 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SANDBOX_IMPL_H__
+#define SANDBOX_IMPL_H__
+
+#include <asm/ldt.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/prctl.h>
+#include <linux/unistd.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#define NOINTR_SYS(x) /* re-evaluates x while it yields < 0 and sys.my_errno == EINTR; expects "sys" in scope */ \
+ ({ typeof(x) i__; while ((i__ = (x)) < 0 && sys.my_errno == EINTR); i__;})
+
+#ifdef __cplusplus
+#include <map>
+#include <vector>
+#include "sandbox.h"
+#include "securemem.h"
+#include "tls.h"
+
+namespace playground {
+
+class Sandbox {
+ // TODO(markus): restrict access to our private file handles
+ public:
+ enum { kMaxThreads = 100 };
+
+
+ // There are a lot of reasons why the Seccomp sandbox might not be available.
+ // This could be because the kernel does not support Seccomp mode, or it
+ // could be because we fail to successfully rewrite all system call entry
+ // points.
+ // "proc_fd" should be a file descriptor for "/proc", or -1 if not provided
+ // by the caller.
+ static int supportsSeccompSandbox(int proc_fd)
+ asm("SupportsSeccompSandbox");
+
+ // The sandbox needs to be able to access "/proc/self/maps". If this file
+ // is not accessible when "startSandbox()" gets called, the caller can
+ // provide an already opened file descriptor by calling "setProcSelfMaps()".
+ // The sandbox becomes the new owner of this file descriptor and will
+ // eventually close it when "startSandbox()" executes.
+ static void setProcSelfMaps(int proc_self_maps)
+ asm("SeccompSandboxSetProcSelfMaps");
+
+ // This is the main public entry point. It finds all system calls that
+ // need rewriting, sets up the resources needed by the sandbox, and
+ // enters Seccomp mode.
+ static void startSandbox() asm("StartSeccompSandbox");
+
+ private:
+// syscall_table.c has to be implemented in C, as C++ does not support
+// designated initializers for arrays. The only other alternative would be
+// to have a source code generator for this table.
+//
+// We would still like the C source file to include our header file. This
+// requires some define statements to transform C++ specific constructs to
+// something that is palatable to a C compiler.
+#define STATIC static
+#define SecureMemArgs SecureMem::Args
+ // Clone() is special as it has a wrapper in syscall_table.c. The wrapper
+ // adds one extra argument (the pointer to the saved registers) and then
+ // calls playground$sandbox__clone().
+ static long sandbox_clone(int flags, char* stack, int* pid, int* ctid,
+ void* tls, void* wrapper_sp)
+ asm("playground$sandbox__clone")
+ #if defined(__x86_64__)
+ __attribute__((visibility("internal")))
+#endif
+ ;
+#else
+#define STATIC
+#define bool int
+#define SecureMemArgs void
+ // This is the wrapper entry point that is found in the syscall_table.
+ long sandbox_clone(int flags, char* stack, int* pid, int* ctid, void* tls)
+ asm("playground$sandbox_clone");
+#endif
+
+ // Entry points for sandboxed code that is attempting to make system calls
+ STATIC long sandbox_access(const char*, int)
+ asm("playground$sandbox_access");
+ STATIC long sandbox_exit(int status) asm("playground$sandbox_exit");
+ STATIC long sandbox_getpid() asm("playground$sandbox_getpid");
+ #if defined(__NR_getsockopt)
+ STATIC long sandbox_getsockopt(int, int, int, void*, socklen_t*)
+ asm("playground$sandbox_getsockopt");
+ #endif
+ STATIC long sandbox_gettid() asm("playground$sandbox_gettid");
+ STATIC long sandbox_ioctl(int d, int req, void* arg)
+ asm("playground$sandbox_ioctl");
+ #if defined(__NR_ipc)
+ STATIC long sandbox_ipc(unsigned, int, int, int, void*, long)
+ asm("playground$sandbox_ipc");
+ #endif
+ STATIC long sandbox_lstat(const char* path, void* buf)
+ asm("playground$sandbox_lstat");
+ #if defined(__NR_lstat64)
+ STATIC long sandbox_lstat64(const char *path, void* b)
+ asm("playground$sandbox_lstat64");
+ #endif
+ STATIC long sandbox_madvise(void*, size_t, int)
+ asm("playground$sandbox_madvise");
+ STATIC void *sandbox_mmap(void* start, size_t length, int prot, int flags,
+ int fd, off_t offset)
+ asm("playground$sandbox_mmap");
+ STATIC long sandbox_mprotect(const void*, size_t, int)
+ asm("playground$sandbox_mprotect");
+ STATIC long sandbox_munmap(void* start, size_t length)
+ asm("playground$sandbox_munmap");
+ STATIC long sandbox_open(const char*, int, mode_t)
+ asm("playground$sandbox_open");
+ #if defined(__NR_recvfrom)
+ STATIC ssize_t sandbox_recvfrom(int, void*, size_t, int, void*, socklen_t*)
+ asm("playground$sandbox_recvfrom");
+ STATIC ssize_t sandbox_recvmsg(int, struct msghdr*, int)
+ asm("playground$sandbox_recvmsg");
+ #endif
+ #if defined(__NR_rt_sigaction)
+ STATIC long sandbox_rt_sigaction(int, const void*, void*, size_t)
+ asm("playground$sandbox_rt_sigaction");
+ #endif
+ #if defined(__NR_rt_sigprocmask)
+ STATIC long sandbox_rt_sigprocmask(int how, const void*, void*, size_t)
+ asm("playground$sandbox_rt_sigprocmask");
+ #endif
+ #if defined(__NR_sendmsg)
+ STATIC size_t sandbox_sendmsg(int, const struct msghdr*, int)
+ asm("playground$sandbox_sendmsg");
+ STATIC ssize_t sandbox_sendto(int, const void*, size_t, int, const void*,
+ socklen_t)asm("playground$sandbox_sendto");
+ #endif
+ #if defined(__NR_shmat)
+ STATIC void* sandbox_shmat(int, const void*, int)
+ asm("playground$sandbox_shmat");
+ STATIC long sandbox_shmctl(int, int, void*)
+ asm("playground$sandbox_shmctl");
+ STATIC long sandbox_shmdt(const void*) asm("playground$sandbox_shmdt");
+ STATIC long sandbox_shmget(int, size_t, int)
+ asm("playground$sandbox_shmget");
+ #endif
+ #if defined(__NR_setsockopt)
+ STATIC long sandbox_setsockopt(int, int, int, const void*, socklen_t)
+ asm("playground$sandbox_setsockopt");
+ #endif
+ #if defined(__NR_sigaction)
+ STATIC long sandbox_sigaction(int, const void*, void*)
+ asm("playground$sandbox_sigaction");
+ #endif
+ #if defined(__NR_signal)
+ STATIC void* sandbox_signal(int, const void*)
+ asm("playground$sandbox_signal");
+ #endif
+ #if defined(__NR_sigprocmask)
+ STATIC long sandbox_sigprocmask(int how, const void*, void*)
+ asm("playground$sandbox_sigprocmask");
+ #endif
+ #if defined(__NR_socketcall)
+ STATIC long sandbox_socketcall(int call, void* args)
+ asm("playground$sandbox_socketcall");
+ #endif
+ STATIC long sandbox_stat(const char* path, void* buf)
+ asm("playground$sandbox_stat");
+ #if defined(__NR_stat64)
+ STATIC long sandbox_stat64(const char *path, void* b)
+ asm("playground$sandbox_stat64");
+ #endif
+
+ // Functions for system calls that need to be handled in the trusted process
+ // Per-system-call request handlers. Judging by process_sigaction(), the
+ // arguments are (parentMapsFd, sandboxFd, threadFdPub, threadFd, mem) and
+ // the result is true when the request was forwarded to the trusted thread
+ // and false when it was abandoned.
+ // NOTE(review): confirm that the other handlers follow the same contract.
+ // Each asm() label pins the symbol name that the declaration refers to.
+ STATIC bool process_access(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_access");
+ STATIC bool process_clone(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_clone");
+ STATIC bool process_exit(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_exit");
+ #if defined(__NR_getsockopt)
+ STATIC bool process_getsockopt(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_getsockopt");
+ #endif
+ STATIC bool process_ioctl(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_ioctl");
+ #if defined(__NR_ipc)
+ STATIC bool process_ipc(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_ipc");
+ #endif
+ STATIC bool process_madvise(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_madvise");
+ STATIC bool process_mmap(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_mmap");
+ STATIC bool process_mprotect(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_mprotect");
+ STATIC bool process_munmap(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_munmap");
+ STATIC bool process_open(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_open");
+ #if defined(__NR_recvfrom)
+ STATIC bool process_recvfrom(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_recvfrom");
+ STATIC bool process_recvmsg(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_recvmsg");
+ STATIC bool process_sendmsg(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_sendmsg");
+ STATIC bool process_sendto(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_sendto");
+ STATIC bool process_setsockopt(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_setsockopt");
+ #endif
+ #if defined(__NR_shmat)
+ STATIC bool process_shmat(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_shmat");
+ STATIC bool process_shmctl(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_shmctl");
+ STATIC bool process_shmdt(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_shmdt");
+ STATIC bool process_shmget(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_shmget");
+ #endif
+ STATIC bool process_sigaction(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_sigaction");
+ #if defined(__NR_socketcall)
+ STATIC bool process_socketcall(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_socketcall");
+ #endif
+ STATIC bool process_stat(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_stat");
+
+#ifdef __cplusplus
+ // These helper classes are granted access to the non-public members of
+ // the enclosing class.
+ friend class Debug;
+ friend class Library;
+ friend class Maps;
+ friend class Mutex;
+ friend class SecureMem;
+ friend class TLS;
+
+ // Define our own inline system calls. These calls will not be rewritten
+ // to point to the sandboxed wrapper functions. They thus allow us to
+ // make actual system calls (e.g. in the sandbox initialization code, and
+ // in the trusted process).
+ //
+ // The wrappers are generated by including "linux_syscall_support.h" inside
+ // the class body. The SYS_* macros below configure that header; in
+ // particular SYS_ERRNO makes the wrappers report errors through the
+ // per-object "my_errno" member.
+ class SysCalls {
+ public:
+ #define SYS_CPLUSPLUS
+ #define SYS_ERRNO my_errno
+ #define SYS_INLINE inline
+ #define SYS_PREFIX -1
+ // Presumably this is the header's include guard; undefining it lets the
+ // header be processed again here -- TODO confirm.
+ #undef SYS_LINUX_SYSCALL_SUPPORT_H
+ #include "linux_syscall_support.h"
+ // A fresh object starts with no error recorded.
+ SysCalls() : my_errno(0) { }
+ // Error slot named by SYS_ERRNO above.
+ int my_errno;
+ };
+ // MMAP / __NR_MMAP name the mmap variant to use: mmap2 when the platform
+ // defines __NR_mmap2, plain mmap otherwise.
+ #ifdef __NR_mmap2
+ #define MMAP mmap2
+ #define __NR_MMAP __NR_mmap2
+ #else
+ #define MMAP mmap
+ #define __NR_MMAP __NR_mmap
+ #endif
+
+ // Print an error message and terminate the program. Used for fatal errors.
+ static void die(const char *msg = 0) __attribute__((noreturn)) {
+ SysCalls sys;
+ if (msg) {
+ sys.write(2, msg, strlen(msg));
+ sys.write(2, "\n", 1);
+ }
+ for (;;) {
+ sys.exit_group(1);
+ sys._exit(1);
+ }
+ }
+
+ // Reads up to "len" bytes from "fd" into "buf", retrying interrupted calls
+ // and continuing after short reads until "len" bytes have arrived or
+ // end-of-file is reached. Errors are reported through "sys.my_errno"; the
+ // global errno variable is not touched.
+ // Returns the number of bytes read, or a negative value on error. A "len"
+ // that does not fit into ssize_t is rejected with EINVAL.
+ static ssize_t read(SysCalls& sys, int fd, void* buf, size_t len) {
+   if (static_cast<ssize_t>(len) < 0) {
+     sys.my_errno = EINVAL;
+     return -1;
+   }
+   char* const dest = reinterpret_cast<char*>(buf);
+   size_t done = 0;
+   while (done < len) {
+     const ssize_t chunk = NOINTR_SYS(sys.read(fd, dest + done, len - done));
+     if (chunk < 0) {
+       // Propagate the failure; bytes read so far are not reported.
+       return chunk;
+     }
+     if (chunk == 0) {
+       // End of file: return what we have.
+       break;
+     }
+     done += chunk;
+   }
+   return done;
+ }
+
+ // Writes "len" bytes from "buf" to "fd", retrying when the call is
+ // interrupted. Errors are reported through "sys"; the global errno variable
+ // is not touched. Returns whatever the underlying write() returned.
+ static ssize_t write(SysCalls& sys, int fd, const void* buf, size_t len){
+   const ssize_t written = NOINTR_SYS(sys.write(fd, buf, len));
+   return written;
+ }
+
+ // Sends a file handle to another process.
+ // N.B. trusted_thread.cc has an assembly version of this function that
+ // is safe to use without a call stack. If the wire-format is changed,
+ // make sure to update the assembly code.
+ static bool sendFd(int transport, int fd0, int fd1, const void* buf,
+ size_t len);
+
+ // Counterpart of sendFd().
+ // If getFd() fails, it will set the first valid fd slot (e.g. fd0) to
+ // -errno.
+ static bool getFd(int transport, int* fd0, int* fd1, void* buf,
+ size_t* len);
+
+ // Data structures used to forward system calls to the trusted process.
+ // Each struct carries the arguments of the call it is named after and is
+ // declared packed.
+
+ // Arguments for accept().
+ struct Accept {
+ int sockfd;
+ void* addr;
+ socklen_t* addrlen;
+ } __attribute__((packed));
+
+ // Arguments for accept4(); same as Accept plus "flags".
+ struct Accept4 {
+ int sockfd;
+ void* addr;
+ socklen_t* addrlen;
+ int flags;
+ } __attribute__((packed));
+
+ // Arguments for access(). Only the length of the path is stored here;
+ // presumably the path bytes travel separately -- TODO confirm.
+ struct Access {
+ size_t path_length;
+ int mode;
+ } __attribute__((packed));
+
+ // Arguments for bind().
+ struct Bind {
+ int sockfd;
+ void* addr;
+ socklen_t addrlen;
+ } __attribute__((packed));
+
+ // Arguments for clone(), followed by a block of saved registers and a
+ // return address. The set of registers depends on the target architecture;
+ // any other architecture is a compile-time error.
+ // Note that the x86-64 block lists the registers in the opposite order
+ // from the corresponding fields of SecureMem::Args, while the i386 block
+ // uses the same order.
+ struct Clone {
+ int flags;
+ char* stack;
+ int* pid;
+ int* ctid;
+ void* tls;
+ #if defined(__x86_64__)
+ struct {
+ void* r15;
+ void* r14;
+ void* r13;
+ void* r12;
+ void* r11;
+ void* r10;
+ void* r9;
+ void* r8;
+ void* rdi;
+ void* rsi;
+ void* rdx;
+ void* rcx;
+ void* rbx;
+ void* rbp;
+ void* fake_ret;
+ } regs64 __attribute__((packed));
+ #elif defined(__i386__)
+ struct {
+ void* ebp;
+ void* edi;
+ void* esi;
+ void* edx;
+ void* ecx;
+ void* ebx;
+ } regs32 __attribute__((packed));
+ #else
+ #error Unsupported target platform
+ #endif
+ void* ret;
+ } __attribute__((packed));
+
+ // Arguments for connect().
+ struct Connect {
+ int sockfd;
+ void* addr;
+ socklen_t addrlen;
+ } __attribute__((packed));
+
+ // Arguments for getsockname().
+ struct GetSockName {
+ int sockfd;
+ void* name;
+ socklen_t* namelen;
+ } __attribute__((packed));
+
+ // Arguments for getpeername(); same layout as GetSockName.
+ struct GetPeerName {
+ int sockfd;
+ void* name;
+ socklen_t* namelen;
+ } __attribute__((packed));
+
+ // Arguments for getsockopt().
+ struct GetSockOpt {
+ int sockfd;
+ int level;
+ int optname;
+ void* optval;
+ socklen_t* optlen;
+ } __attribute__((packed));
+
+ // Arguments for ioctl().
+ struct IOCtl {
+ int d;
+ int req;
+ void *arg;
+ } __attribute__((packed));
+
+ // Arguments for the multiplexed ipc() call; only available where the
+ // platform defines __NR_ipc.
+ #if defined(__NR_ipc)
+ struct IPC {
+ unsigned call;
+ int first;
+ int second;
+ int third;
+ void* ptr;
+ long fifth;
+ } __attribute__((packed));
+ #endif
+
+ // Arguments for listen().
+ struct Listen {
+ int sockfd;
+ int backlog;
+ } __attribute__((packed));
+
+ // Arguments for madvise().
+ struct MAdvise {
+ const void* start;
+ size_t len;
+ int advice;
+ } __attribute__((packed));
+
+ // Arguments for mmap() (see the MMAP macro for the variant in use).
+ struct MMap {
+ void* start;
+ size_t length;
+ int prot;
+ int flags;
+ int fd;
+ off_t offset;
+ } __attribute__((packed));
+
+ // Arguments for mprotect().
+ // NOTE(review): unlike the neighbouring request structs, this one is not
+ // declared __attribute__((packed)). Confirm whether that is intentional
+ // before changing it, as it affects the size of the forwarded request.
+ struct MProtect {
+ const void* addr;
+ size_t len;
+ int prot;
+ };
+
+ // Arguments for munmap().
+ struct MUnmap {
+ void* start;
+ size_t length;
+ } __attribute__((packed));
+
+ // Arguments for open(). Only the length of the path is stored here;
+ // presumably the path bytes travel separately -- TODO confirm.
+ struct Open {
+ size_t path_length;
+ int flags;
+ mode_t mode;
+ } __attribute__((packed));
+
+ // Arguments for recv().
+ struct Recv {
+ int sockfd;
+ void* buf;
+ size_t len;
+ int flags;
+ } __attribute__((packed));
+
+ // Arguments for recvfrom(); filled in by sandbox_recvfrom().
+ struct RecvFrom {
+ int sockfd;
+ void* buf;
+ size_t len;
+ int flags;
+ void* from;
+ socklen_t *fromlen;
+ } __attribute__((packed));
+
+ // Arguments for recvmsg().
+ struct RecvMsg {
+ int sockfd;
+ struct msghdr* msg;
+ int flags;
+ } __attribute__((packed));
+
+ // Arguments for send().
+ struct Send {
+ int sockfd;
+ const void* buf;
+ size_t len;
+ int flags;
+ } __attribute__((packed));
+
+ // Arguments for sendmsg().
+ struct SendMsg {
+ int sockfd;
+ const struct msghdr* msg;
+ int flags;
+ } __attribute__((packed));
+
+ // Arguments for sendto().
+ struct SendTo {
+ int sockfd;
+ const void* buf;
+ size_t len;
+ int flags;
+ const void* to;
+ socklen_t tolen;
+ } __attribute__((packed));
+
+ // Arguments for setsockopt().
+ struct SetSockOpt {
+ int sockfd;
+ int level;
+ int optname;
+ const void* optval;
+ socklen_t optlen;
+ } __attribute__((packed));
+
+ // SysV shared memory requests; only available where the platform defines
+ // __NR_shmat.
+ #if defined(__NR_shmat)
+ // Arguments for shmat().
+ struct ShmAt {
+ int shmid;
+ const void* shmaddr;
+ int shmflg;
+ } __attribute__((packed));
+
+ // Arguments for shmctl().
+ struct ShmCtl {
+ int shmid;
+ int cmd;
+ void *buf;
+ } __attribute__((packed));
+
+ // Arguments for shmdt().
+ struct ShmDt {
+ const void *shmaddr;
+ } __attribute__((packed));
+
+ // Arguments for shmget().
+ struct ShmGet {
+ int key;
+ size_t size;
+ int shmflg;
+ } __attribute__((packed));
+ #endif
+
+ // Arguments for shutdown().
+ struct ShutDown {
+ int sockfd;
+ int how;
+ } __attribute__((packed));
+
+ // Request shared by sigaction() and rt_sigaction(). "sysnum" records which
+ // of the two system calls was made; process_sigaction() passes it on as
+ // the system call number together with the remaining fields.
+ // sandbox_sigaction() always sets "sigsetsize" to 8.
+ struct SigAction {
+ int sysnum;
+ int signum;
+ const SysCalls::kernel_sigaction* action;
+ const SysCalls::kernel_sigaction* old_action;
+ size_t sigsetsize;
+ } __attribute__((packed));
+
+ // Arguments for socket().
+ struct Socket {
+ int domain;
+ int type;
+ int protocol;
+ } __attribute__((packed));
+
+ // Arguments for socketpair().
+ struct SocketPair {
+ int domain;
+ int type;
+ int protocol;
+ int* pair;
+ } __attribute__((packed));
+
+ // Request for the multiplexed socketcall(); only available where the
+ // platform defines __NR_socketcall. "call" presumably selects the member
+ // of the "args" union that is in use -- TODO confirm.
+ #if defined(__NR_socketcall)
+ struct SocketCall {
+ int call;
+ void* arg_ptr;
+ union {
+ Socket socket;
+ Bind bind;
+ Connect connect;
+ Listen listen;
+ Accept accept;
+ GetSockName getsockname;
+ GetPeerName getpeername;
+ SocketPair socketpair;
+ Send send;
+ Recv recv;
+ SendTo sendto;
+ RecvFrom recvfrom;
+ ShutDown shutdown;
+ SetSockOpt setsockopt;
+ GetSockOpt getsockopt;
+ SendMsg sendmsg;
+ RecvMsg recvmsg;
+ Accept4 accept4;
+ } args;
+ } __attribute__((packed));
+ #endif
+
+ // Request for the stat() family. "sysnum" presumably records which
+ // variant was called -- TODO confirm. Only the length of the path is
+ // stored here.
+ struct Stat {
+ int sysnum;
+ size_t path_length;
+ void* buf;
+ } __attribute__((packed));
+
+ // Thread local data available from each sandboxed thread.
+ // The enumerators are the slot indices passed to TLS::getTLSValue(). Their
+ // order matches the cookie / threadId / threadFdPub fields at the end of
+ // the SecureMem::Args header.
+ enum { TLS_COOKIE, TLS_TID, TLS_THREAD_FD };
+ // Value of the TLS_COOKIE slot; sent along with each forwarded request.
+ static long long cookie() { return TLS::getTLSValue<long long>(TLS_COOKIE); }
+ // Value of the TLS_TID slot.
+ static int tid() { return TLS::getTLSValue<int>(TLS_TID); }
+ // Value of the TLS_THREAD_FD slot; the descriptor replies are read from.
+ static int threadFdPub() { return TLS::getTLSValue<int>(TLS_THREAD_FD); }
+ // Descriptor requests are written to; a plain static, not a TLS slot.
+ static int processFdPub() { return processFdPub_; }
+ // Pointer to the signal mask cached in this thread's secure memory.
+ static kernel_sigset_t* signalMask() { return &getSecureMem()->signalMask; }
+
+ // The SEGV handler knows how to handle RDTSC instructions
+ static void setupSignalHandlers();
+ // Returns a pointer to the SEGV handler function.
+ static void (*segv())(int signo, SysCalls::siginfo *context, void *unused);
+
+ // If no specific handler has been registered for a system call, call this
+ // function which asks the trusted thread to perform the call. This is used
+ // for system calls that are not restricted.
+ // The symbol has internal visibility on x86-64.
+ static void* defaultSystemCallHandler(int syscallNum, void* arg0,
+ void* arg1, void* arg2, void* arg3,
+ void* arg4, void* arg5)
+ asm("playground$defaultSystemCallHandler")
+ #if defined(__x86_64__)
+ __attribute__((visibility("internal")))
+ #endif
+ ;
+
+ // Return the current secure memory structure for this thread.
+ static SecureMem::Args* getSecureMem();
+
+ // Return a secure memory structure that can be used by a newly created
+ // thread.
+ static SecureMem::Args* getNewSecureMem();
+
+ // This function runs in the trusted process at startup and finds all the
+ // memory mappings that existed when the sandbox was first enabled. Going
+ // forward, all these mappings are off-limits for operations such as
+ // mmap(), munmap(), and mprotect().
+ static int initializeProtectedMap(int fd);
+
+ // Helper function that allows the trusted process to get access to
+ // "/proc/self/maps" in the sandbox.
+ static void snapshotMemoryMappings(int processFd, int proc_self_maps);
+
+ // Main loop for the trusted process. Never returns.
+ static void trustedProcess(int parentMapsFd, int processFdPub,
+ int sandboxFd, int cloneFd,
+ SecureMem::Args* secureArena)
+ __attribute__((noreturn));
+
+ // Fork()s off the trusted process.
+ static SecureMem::Args* createTrustedProcess(int processFdPub, int sandboxFd,
+ int cloneFdPub, int cloneFd);
+
+ // Creates the trusted thread for the initial thread, then enables
+ // Seccomp mode.
+ static void createTrustedThread(int processFdPub, int cloneFdPub,
+ SecureMem::Args* secureMem);
+
+ // Presumably a file descriptor for "/proc/self/maps" -- TODO confirm.
+ static int proc_self_maps_;
+ // Current state of the sandbox.
+ static enum SandboxStatus {
+ STATUS_UNKNOWN, STATUS_UNSUPPORTED, STATUS_AVAILABLE, STATUS_ENABLED
+ } status_;
+ static int pid_;
+ // Returned by processFdPub().
+ static int processFdPub_;
+ static int cloneFdPub_;
+
+ // Table describing socketcall() arguments; only declared on i386.
+ #ifdef __i386__
+ struct SocketCallArgInfo;
+ static const struct SocketCallArgInfo socketCallArgInfo[];
+ #endif
+
+ // We always have to intercept SIGSEGV. If the application wants to set its
+ // own SEGV handler, we forward to it whenever necessary.
+ // sandbox_sigaction() and sandbox_rt_sigaction() store the application's
+ // SIGSEGV action here instead of passing it on.
+ static SysCalls::kernel_sigaction sa_segv_ asm("playground$sa_segv");
+
+ // The syscall_mutex_ can only be directly accessed by the trusted process.
+ // It can be accessed by the trusted thread after fork()ing and calling
+ // mprotect(PROT_READ|PROT_WRITE). The mutex is used for system calls that
+ // require passing additional data, and that require the trusted process to
+ // wait until the trusted thread is done processing (e.g. exit(), clone(),
+ // open(), stat())
+ static int syscall_mutex_ asm("playground$syscall_mutex");
+
+ // Available in trusted process, only
+ typedef std::map<void *, long> ProtectedMap;
+ static ProtectedMap protectedMap_;
+ static std::vector<SecureMem::Args*> secureMemPool_;
+};
+
+// Policy settings for the sandbox.
+//
+// If this struct is extended to contain parameters that are read by
+// the trusted thread, we will have to mprotect() it to be read-only when
+// starting the sandbox. However, currently it is read only by the
+// trusted process, and the sandboxed process cannot change the values
+// that the fork()'d trusted process sees.
+struct SandboxPolicy {
+ bool allow_file_namespace; // Allow filename-based system calls.
+};
+
+extern struct SandboxPolicy g_policy;
+
+} // namespace
+
+using playground::Sandbox;
+#endif // __cplusplus
+
+#endif // SANDBOX_IMPL_H__
diff --git a/sandbox/linux/seccomp/seccomp.gyp b/sandbox/linux/seccomp/seccomp.gyp
new file mode 100644
index 0000000..596be21
--- /dev/null
+++ b/sandbox/linux/seccomp/seccomp.gyp
@@ -0,0 +1,93 @@
+# Copyright (c) 2010 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+{
+ 'variables': {
+ 'chromium_code': 1,
+ 'seccomp_intermediate_dir': '<(INTERMEDIATE_DIR)/seccomp-sandbox',
+ },
+ 'targets': [
+ {
+ 'target_name': 'seccomp_sandbox',
+ 'type': 'static_library',
+ # Keep this list sorted. Headers are listed alongside their sources.
+ 'sources': [
+   'access.cc',
+   'allocator.cc',
+   'allocator.h',
+   'clone.cc',
+   'debug.cc',
+   'debug.h',
+   'exit.cc',
+   'getpid.cc',
+   'gettid.cc',
+   'ioctl.cc',
+   'ipc.cc',
+   'library.cc',
+   'library.h',
+   'linux_syscall_support.h',
+   'madvise.cc',
+   'maps.cc',
+   'maps.h',
+   'mmap.cc',
+   'mprotect.cc',
+   'munmap.cc',
+   'mutex.h',
+   'open.cc',
+   'sandbox.cc',
+   'sandbox.h',
+   'sandbox_impl.h',
+   'securemem.cc',
+   'securemem.h',
+   'sigaction.cc',
+   'sigprocmask.cc',
+   'socketcall.cc',
+   'stat.cc',
+   'syscall.cc',
+   'syscall.h',
+   'syscall_table.c',
+   'syscall_table.h',
+   'tls.h',
+   'trusted_process.cc',
+   'trusted_thread.cc',
+   'x86_decode.cc',
+   'x86_decode.h',
+ ],
+ },
+ {
+ 'target_name': 'seccomp_tests',
+ 'type': 'executable',
+ 'sources': [
+ 'tests/test_syscalls.cc',
+ ],
+ 'include_dirs': [
+ '.',
+ '<(seccomp_intermediate_dir)',
+ ],
+ 'dependencies': [
+ 'seccomp_sandbox',
+ ],
+ 'libraries': [
+ '-lpthread',
+ '-lutil', # For openpty()
+ ],
+ 'actions': [
+ {
+ 'action_name': 'make_test_list',
+ 'inputs': [
+ 'tests/list_tests.py',
+ 'tests/test_syscalls.cc',
+ ],
+ 'outputs': ['<(seccomp_intermediate_dir)/test-list.h'],
+ 'action': ['sh', '-c', 'python <(_inputs) > <(_outputs)'],
+ },
+ ],
+ },
+ {
+ 'target_name': 'timestats',
+ 'type': 'executable',
+ 'sources': [
+ 'timestats.cc',
+ ],
+ },
+ ],
+}
diff --git a/sandbox/linux/seccomp/securemem.cc b/sandbox/linux/seccomp/securemem.cc
new file mode 100644
index 0000000..5f07bbe
--- /dev/null
+++ b/sandbox/linux/seccomp/securemem.cc
@@ -0,0 +1,105 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "mutex.h"
+#include "sandbox_impl.h"
+#include "securemem.h"
+
+namespace playground {
+
+// Reports the outcome "err" of a system call that will not be forwarded.
+// The value is widened to pointer size and written to "fd"; a non-zero
+// "err" is also logged. Terminates the process if the write comes up short.
+void SecureMem::abandonSystemCall(int fd, int err) {
+  if (err != 0) {
+    Debug::message("System call failed\n");
+  }
+  void* const result = reinterpret_cast<void *>(err);
+  Sandbox::SysCalls sys;
+  const ssize_t sent = Sandbox::write(sys, fd, &result, sizeof(result));
+  if (sent != sizeof(result)) {
+    Sandbox::die("Failed to send system call");
+  }
+}
+
+// Terminates the trusted process if the sandboxed process appears to be
+// gone.
+//
+// The syscall_mutex_ should not be contended. If it is, either the sandbox
+// is seeing a very unusual load of system calls that it is not optimized
+// for, or, more likely, the sandboxed process terminated while the trusted
+// process was waiting for the mutex. The check rewinds "parentMapsFd" and
+// tries to read a single byte from it: a failed seek or an empty read means
+// the parent is gone.
+// NOTE(review): a read() error other than EINTR is treated as "still
+// alive"; confirm that this is intended.
+void SecureMem::dieIfParentDied(int parentMapsFd) {
+  if (lseek(parentMapsFd, 0, SEEK_SET) != 0) {
+    Sandbox::die();
+  }
+  char probe;
+  int got;
+  do {
+    got = read(parentMapsFd, &probe, 1);
+  } while (got < 0 && errno == EINTR);
+  if (got == 0) {
+    Sandbox::die();
+  }
+}
+
+// Acquires Sandbox::syscall_mutex_ and then bumps mem->sequence once.
+// Each lock attempt is made with an argument of 500 (presumably a timeout
+// -- TODO confirm its unit); after every failed attempt we check whether
+// the sandboxed process is still around and die if it is not.
+void SecureMem::lockSystemCall(int parentMapsFd, Args* mem) {
+ while (!Mutex::lockMutex(&Sandbox::syscall_mutex_, 500)) {
+ dieIfParentDied(parentMapsFd);
+ }
+ // Atomically increment the sequence number. sendSystemCallInternal() skips
+ // its own first increment when called in locked mode.
+ asm volatile(
+ #if defined(__x86_64__)
+ "lock; incq (%0)\n"
+ #elif defined(__i386__)
+ "lock; incl (%0)\n"
+ #else
+ #error Unsupported target platform
+ #endif
+ :
+ : "q"(&mem->sequence)
+ : "memory");
+}
+
+// Publishes a system call request in "mem" and notifies the peer on "fd".
+//
+// The sequence number is atomically incremented once before the request
+// fields are written and once afterwards. In locked mode the first
+// increment has already been done by lockSystemCall(). The call type is -2
+// for locked requests and -1 otherwise. Finally, sizeof(int) bytes of the
+// call type are written to "fd"; a short write is fatal.
+//
+// If "parentMapsFd" is non-negative, this function then waits for
+// syscall_mutex_ to be unlocked, checking after each unsuccessful wait that
+// the sandboxed process is still alive.
+void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentMapsFd,
+ Args* mem, int syscallNum, void* arg1,
+ void* arg2, void* arg3, void* arg4,
+ void* arg5, void* arg6) {
+ if (!locked) {
+ // First sequence bump; marks the start of the update.
+ asm volatile(
+ #if defined(__x86_64__)
+ "lock; incq (%0)\n"
+ #elif defined(__i386__)
+ "lock; incl (%0)\n"
+ #else
+ #error Unsupported target platform
+ #endif
+ :
+ : "q"(&mem->sequence)
+ : "memory");
+ }
+ mem->callType = locked ? -2 : -1;
+ mem->syscallNum = syscallNum;
+ mem->arg1 = arg1;
+ mem->arg2 = arg2;
+ mem->arg3 = arg3;
+ mem->arg4 = arg4;
+ mem->arg5 = arg5;
+ mem->arg6 = arg6;
+ // Second sequence bump; marks the end of the update.
+ asm volatile(
+ #if defined(__x86_64__)
+ "lock; incq (%0)\n"
+ #elif defined(__i386__)
+ "lock; incl (%0)\n"
+ #else
+ #error Unsupported target platform
+ #endif
+ :
+ : "q"(&mem->sequence)
+ : "memory");
+ Sandbox::SysCalls sys;
+ // Note: callType is declared as a long, but only sizeof(int) bytes of it
+ // are sent.
+ if (Sandbox::write(sys, fd, &mem->callType, sizeof(int)) != sizeof(int)) {
+ Sandbox::die("Failed to send system call");
+ }
+ if (parentMapsFd >= 0) {
+ while (!Mutex::waitForUnlock(&Sandbox::syscall_mutex_, 500)) {
+ dieIfParentDied(parentMapsFd);
+ }
+ }
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h
new file mode 100644
index 0000000..91283db
--- /dev/null
+++ b/sandbox/linux/seccomp/securemem.h
@@ -0,0 +1,205 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SECURE_MEM_H__
+#define SECURE_MEM_H__
+
+#include <stdlib.h>
+#include "linux_syscall_support.h"
+
+namespace playground {
+
+// Helpers and data layout for the per-thread "secure memory" through which
+// system call requests are passed to the trusted thread.
+class SecureMem {
+ public:
+ // Each thread is associated with two memory pages (i.e. 8192 bytes). This
+ // memory is fully accessible by the trusted process, but in the trusted
+ // thread and the sandboxed thread, the first page is only mapped PROT_READ,
+ // and the second one is PROT_READ|PROT_WRITE.
+ //
+ // The first page can be modified by the trusted process and this is the
+ // main mechanism how it communicates with the trusted thread. After each
+ // update, it updates the "sequence" number. The trusted process must
+ // check the "sequence" number has the expected value, and only then can
+ // it trust the data in this page.
+ // NOTE(review): since the trusted process is the writer of this page, the
+ // previous sentence presumably means the trusted thread -- confirm.
+ //
+ // The nested unions pad the header to 512 bytes and each of the two pages
+ // to 4096 bytes.
+ typedef struct Args {
+ union {
+ struct {
+ union {
+ struct {
+ struct Args* self;
+ long sequence;
+ long callType;
+ long syscallNum;
+ void* arg1;
+ void* arg2;
+ void* arg3;
+ void* arg4;
+ void* arg5;
+ void* arg6;
+
+ // Used by clone() to allow return from the syscall wrapper.
+ void* ret;
+ #if defined(__x86_64__)
+ void* rbp;
+ void* rbx;
+ void* rcx;
+ void* rdx;
+ void* rsi;
+ void* rdi;
+ void* r8;
+ void* r9;
+ void* r10;
+ void* r11;
+ void* r12;
+ void* r13;
+ void* r14;
+ void* r15;
+ #elif defined(__i386__)
+ void* ebp;
+ void* edi;
+ void* esi;
+ void* edx;
+ void* ecx;
+ void* ebx;
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Used by clone() to set up data for the new thread.
+ struct Args* newSecureMem;
+ int processFdPub;
+ int cloneFdPub;
+
+ // Set to non-zero, if in debugging mode
+ int allowAllSystemCalls;
+
+ // The most recent SysV SHM identifier returned by
+ // shmget(IPC_PRIVATE)
+ int shmId;
+
+ // The following entries make up the sandboxed thread's TLS
+ long long cookie;
+ long long threadId;
+ long long threadFdPub;
+ } __attribute__((packed));
+ char header[512];
+ };
+ // Used for calls such as open() and stat().
+ char pathname[4096 - 512];
+ } __attribute__((packed));
+ char securePage[4096];
+ };
+ union {
+ struct {
+ // This scratch space is used by the trusted thread to read parameters
+ // for unrestricted system calls.
+ int tmpSyscallNum;
+ void* tmpArg1;
+ void* tmpArg2;
+ void* tmpArg3;
+ void* tmpArg4;
+ void* tmpArg5;
+ void* tmpArg6;
+ void* tmpReturnValue;
+
+ // Scratch space used to return the result of a rdtsc instruction
+ int rdtscpEax;
+ int rdtscpEdx;
+ int rdtscpEcx;
+
+ // We often have long sequences of calls to gettimeofday(). This is
+ // needlessly expensive. Coalesce them into a single call.
+ int lastSyscallNum;
+ int gettimeofdayCounter;
+
+ // For debugging purposes, we want to be able to log messages. This can
+ // result in additional system calls. Make sure that we don't trigger
+ // logging of those recursive calls.
+ int recursionLevel;
+
+ // Computing the signal mask is expensive. Keep a cached copy.
+ kernel_sigset_t signalMask;
+
+ // Keep track of whether we are in a SEGV handler
+ int inSegvHandler;
+ } __attribute__((packed));
+ char scratchPage[4096];
+ };
+ } __attribute__((packed)) Args;
+
+ // Allows the trusted process to check whether the parent process still
+ // exists. If it doesn't, kill the trusted process.
+ static void dieIfParentDied(int parentProc);
+
+ // The trusted process received a system call that it intends to deny.
+ // "err" is the result that is written to "fd".
+ static void abandonSystemCall(int fd, int err);
+
+ // Acquires the syscall_mutex_ prior to making changes to the parameters in
+ // the secure memory page. Used by calls such as exit(), clone(), open(),
+ // socketcall(), and stat().
+ // After locking the mutex, it is no longer valid to abandon the system
+ // call!
+ static void lockSystemCall(int parentProc, Args* mem);
+
+ // Sends a system call to the trusted thread. If "locked" is true, the
+ // caller must first call lockSystemCall() and must also provide
+ // "parentProc". In locked mode, sendSystemCall() won't return until the
+ // trusted thread has completed processing.
+ // Use sparingly as it serializes the operation of the trusted process.
+ //
+ // The overloads below accept zero to six system call arguments of
+ // arbitrary type. Each argument is converted with a C-style cast to
+ // void*; arguments that are not supplied default to 0.
+ static void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum);
+ }
+ template<class T1> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1);
+ }
+ template<class T1, class T2> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2);
+ }
+ template<class T1, class T2, class T3> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2, T3 arg3) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2, (void*)arg3);
+ }
+ template<class T1, class T2, class T3, class T4> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4);
+ }
+ template<class T1, class T2, class T3, class T4, class T5> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4,
+ T5 arg5) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4,
+ (void*)arg5);
+ }
+ template<class T1, class T2, class T3, class T4, class T5, class T6> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4,
+ T5 arg5, T6 arg6) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4,
+ (void*)arg5, (void*)arg6);
+ }
+
+ private:
+ // Common implementation behind all sendSystemCall() overloads.
+ static void sendSystemCallInternal(int fd, bool locked, int parentProc,
+ Args* mem, int syscallNum, void* arg1 = 0,
+ void* arg2 = 0, void* arg3 = 0,
+ void* arg4 = 0, void* arg5 = 0,
+ void* arg6 = 0);
+
+} // namespace
+
+#endif // SECURE_MEM_H__
diff --git a/sandbox/linux/seccomp/sigaction.cc b/sandbox/linux/seccomp/sigaction.cc
new file mode 100644
index 0000000..162416d
--- /dev/null
+++ b/sandbox/linux/seccomp/sigaction.cc
@@ -0,0 +1,177 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// TODO(markus): We currently instrument the restorer functions with calls to
+// the syscallWrapper(). This prevents gdb from properly
+// creating backtraces of code that is running in signal
+// handlers. We might instead want to always override the
+// restorer with a function that contains the "magic" signature
+// but that is not executable. The SEGV handler can detect this
+// and then invoke the appropriate restorer.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+#if defined(__NR_sigaction)
+// Sandboxed replacement for the old-style sigaction() system call.
+//
+// Requests for SIGSEGV never reach the kernel, because the sandbox needs
+// its own SEGV handler to stay installed. Instead, the application's action
+// is kept in sa_segv_ and the previously recorded one is handed back.
+// Requests for any other signal are forwarded to the trusted process.
+//
+//   signum  signal number
+//   a_      new action (kernel_old_sigaction), or NULL
+//   oa_     receives the previous action (kernel_old_sigaction), or NULL
+//
+// Returns 0 for SIGSEGV, otherwise the result reported for the forwarded
+// call.
+long Sandbox::sandbox_sigaction(int signum, const void* a_, void* oa_) {
+  const SysCalls::kernel_old_sigaction* action =
+      reinterpret_cast<const SysCalls::kernel_old_sigaction*>(a_);
+  SysCalls::kernel_old_sigaction* old_action =
+      reinterpret_cast<SysCalls::kernel_old_sigaction*>(oa_);
+
+  long rc = 0;
+  long long tm;
+  Debug::syscall(&tm, __NR_sigaction, "Executing handler");
+  if (signum == SIGSEGV) {
+    // "action" and "old_action" are allowed to point at the same object.
+    // Take a snapshot of the current state and consume the new action
+    // before writing anything back, so that neither overwrites the other.
+    const SysCalls::kernel_sigaction previous = sa_segv_;
+    if (action) {
+      sa_segv_.sa_handler_ = action->sa_handler_;
+      // The old-style mask only covers the first word. Clear the whole
+      // mask first so that no stale bits survive in the remaining words.
+      memset(&sa_segv_.sa_mask, 0, sizeof(sa_segv_.sa_mask));
+      sa_segv_.sa_mask.sig[0] = action->sa_mask;
+      sa_segv_.sa_flags = action->sa_flags;
+      sa_segv_.sa_restorer = action->sa_restorer;
+    }
+    if (old_action) {
+      old_action->sa_handler_ = previous.sa_handler_;
+      old_action->sa_mask = previous.sa_mask.sig[0];
+      old_action->sa_flags = previous.sa_flags;
+      old_action->sa_restorer = previous.sa_restorer;
+    }
+  } else {
+    struct {
+      int sysnum;
+      long long cookie;
+      SigAction sigaction_req;
+    } __attribute__((packed)) request;
+    request.sysnum = __NR_sigaction;
+    request.cookie = cookie();
+    request.sigaction_req.sysnum = __NR_sigaction;
+    request.sigaction_req.signum = signum;
+    request.sigaction_req.action =
+        reinterpret_cast<const SysCalls::kernel_sigaction *>(action);
+    request.sigaction_req.old_action =
+        reinterpret_cast<const SysCalls::kernel_sigaction *>(old_action);
+    request.sigaction_req.sigsetsize = 8;
+
+    SysCalls sys;
+    if (write(sys, processFdPub(), &request, sizeof(request)) !=
+        sizeof(request) ||
+        read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+      die("Failed to forward sigaction() request [sandbox]");
+    }
+  }
+  Debug::elapsed(tm, __NR_sigaction);
+  return rc;
+}
+#endif
+
+#if defined(__NR_rt_sigaction)
+#define min(a,b) ({ typeof(a) a_=(a); typeof(b) b_=(b); a_ < b_ ? a_ : b_; })
+#define max(a,b) ({ typeof(a) a_=(a); typeof(b) b_=(b); a_ > b_ ? a_ : b_; })
+
+// Sandboxed replacement for the rt_sigaction() system call.
+//
+// Requests for SIGSEGV never reach the kernel, because the sandbox needs
+// its own SEGV handler to stay installed. Instead, the application's action
+// is kept in sa_segv_ and the previously recorded one is handed back.
+// Requests for any other signal are forwarded to the trusted process.
+//
+//   signum      signal number
+//   a_          new action (kernel_sigaction), or NULL
+//   oa_         receives the previous action (kernel_sigaction), or NULL
+//   sigsetsize  size in bytes of the caller's signal mask
+//
+// Returns 0 for SIGSEGV, otherwise the result reported for the forwarded
+// call.
+long Sandbox::sandbox_rt_sigaction(int signum, const void* a_, void* oa_,
+                                   size_t sigsetsize) {
+  const SysCalls::kernel_sigaction* action =
+      reinterpret_cast<const SysCalls::kernel_sigaction*>(a_);
+  SysCalls::kernel_sigaction* old_action =
+      reinterpret_cast<SysCalls::kernel_sigaction*>(oa_);
+
+  long rc = 0;
+  long long tm;
+  Debug::syscall(&tm, __NR_rt_sigaction, "Executing handler");
+  if (signum == SIGSEGV) {
+    // The caller's structure may be shorter or longer than ours, depending
+    // on the size of its signal mask.
+    const size_t ourSize = sizeof(sa_segv_);
+    const size_t maskSize = sizeof(sa_segv_.sa_mask);
+    const size_t theirSize = offsetof(SysCalls::kernel_sigaction, sa_mask) +
+                             sigsetsize;
+    const size_t copySize = min(ourSize, theirSize);
+
+    // "action" and "old_action" are allowed to point at the same object.
+    // Take a snapshot of the current state and consume the new action
+    // before writing anything back, so that neither overwrites the other.
+    const SysCalls::kernel_sigaction previous = sa_segv_;
+    if (action) {
+      memcpy(&sa_segv_, action, copySize);
+      // If the caller's mask is shorter than ours, clear the part that was
+      // not copied. The sizes are unsigned, so compare before subtracting.
+      if (sigsetsize < maskSize) {
+        memset(reinterpret_cast<char*>(&sa_segv_.sa_mask) + sigsetsize, 0,
+               maskSize - sigsetsize);
+      }
+    }
+    if (old_action) {
+      memcpy(old_action, &previous, copySize);
+      // If the caller's structure is longer than ours, zero the excess.
+      if (theirSize > ourSize) {
+        memset(old_action + 1, 0, theirSize - ourSize);
+      }
+    }
+  } else {
+    struct {
+      int sysnum;
+      long long cookie;
+      SigAction sigaction_req;
+    } __attribute__((packed)) request;
+    request.sysnum = __NR_rt_sigaction;
+    request.cookie = cookie();
+    request.sigaction_req.sysnum = __NR_rt_sigaction;
+    request.sigaction_req.signum = signum;
+    request.sigaction_req.action = action;
+    request.sigaction_req.old_action = old_action;
+    request.sigaction_req.sigsetsize = sigsetsize;
+
+    SysCalls sys;
+    if (write(sys, processFdPub(), &request, sizeof(request)) !=
+        sizeof(request) ||
+        read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+      die("Failed to forward rt_sigaction() request [sandbox]");
+    }
+  }
+  Debug::elapsed(tm, __NR_rt_sigaction);
+  return rc;
+}
+#endif
+
+#if defined(__NR_signal)
+// Sandboxed replacement for the signal() system call, implemented on top of
+// sandbox_sigaction(). The handler is installed with
+// SA_NODEFER | SA_RESETHAND | SA_RESTORER and an empty mask.
+// Returns the previous handler, or the negative result of
+// sandbox_sigaction() cast to a pointer if that call failed.
+void* Sandbox::sandbox_signal(int signum, const void* handler) {
+ struct kernel_old_sigaction sa, osa;
+ sa.sa_handler_ = reinterpret_cast<void (*)(int)>(handler);
+ sa.sa_flags = SA_NODEFER | SA_RESETHAND | SA_RESTORER;
+ sa.sa_mask = 0;
+ // Build the restorer in place: take the address of the stub at label 0 and
+ // jump over it to label 1, so that the stub itself is not executed here.
+ // The stub pops one word off the stack and then issues system call 119
+ // through "int $0x80".
+ asm volatile(
+ "lea 0f, %0\n"
+ "jmp 1f\n"
+ "0:pop %%eax\n"
+ "mov $119, %%eax\n" // __NR_sigreturn
+ "int $0x80\n"
+ "1:\n"
+ : "=r"(sa.sa_restorer));
+ long rc = sandbox_sigaction(signum, &sa, &osa);
+ if (rc < 0) {
+ return (void *)rc;
+ }
+ return reinterpret_cast<void *>(osa.sa_handler_);
+}
+#endif
+
+// Handles a forwarded sigaction()/rt_sigaction() request.
+//
+// sigaction() has to be intercepted so that calls for SIGSEGV can be
+// rewritten. Not doing so would have no security implication, but the
+// sandbox's own SEGV handler could accidentally get removed and the program
+// would stop running correctly. The rewriting itself is done in
+// sandbox_{,rt_}sigaction(). The call still bounces through the trusted
+// process, as that is the only way system calls can be instrumented. This
+// is somewhat needlessly complicated, but sigaction() is not performance
+// critical, and doing it this way is easier than extending the format of
+// the syscall_table to deal with this special case.
+//
+// Returns true if the call was passed on to the trusted thread, false if it
+// was abandoned.
+bool Sandbox::process_sigaction(int parentMapsFd, int sandboxFd,
+                                int threadFdPub, int threadFd,
+                                SecureMem::Args* mem) {
+  // Fetch the request parameters.
+  SigAction req;
+  SysCalls sys;
+  const ssize_t got = read(sys, sandboxFd, &req, sizeof(req));
+  if (got != sizeof(req)) {
+    die("Failed to read parameters for sigaction() [process]");
+  }
+
+  if (req.signum != SIGSEGV) {
+    SecureMem::sendSystemCall(threadFdPub, false, -1, mem, req.sysnum,
+                              req.signum, req.action, req.old_action,
+                              req.sigsetsize);
+    return true;
+  }
+
+  // SIGSEGV should have been dealt with before the request got here, so
+  // something went wrong when intercepting the system call. This is not a
+  // security problem, but letting the call pass makes no sense either.
+  SecureMem::abandonSystemCall(threadFd, -EINVAL);
+  return false;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/sigprocmask.cc b/sandbox/linux/seccomp/sigprocmask.cc
new file mode 100644
index 0000000..9ff2922
--- /dev/null
+++ b/sandbox/linux/seccomp/sigprocmask.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+// If the sandboxed process tries to mask SIGSEGV, there is a good chance
+// the process will eventually get terminated. If this is really ever a
+// problem, we can hide the fact that SIGSEGV is unmasked. But I don't think
+// we really need this. Masking of synchronous signals is rarely necessary.
+
+#if defined(__NR_sigprocmask)
+// Sandboxed replacement for the old-style sigprocmask() system call.
+//
+//   how      SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK
+//   set      new mask (a single unsigned long), or NULL
+//   old_set  receives the previous mask, or NULL
+//
+// Returns the result of the system call. On success the cached signal mask
+// in this thread's secure memory is updated as well.
+long Sandbox::sandbox_sigprocmask(int how, const void* set, void* old_set) {
+  long long tm;
+  Debug::syscall(&tm, __NR_sigprocmask, "Executing handler");
+
+  // Access the signal mask by triggering a SEGV and modifying the signal state
+  // prior to calling rt_sigreturn().
+  long res = -ENOSYS;
+  #if defined(__x86_64__)
+  #error x86-64 does not support sigprocmask(); use rt_sigprocmask() instead
+  #elif defined(__i386__)
+  asm volatile(
+      "push %%ebx\n"
+      "movl %2, %%ebx\n"
+      "int $0\n"
+      "pop %%ebx\n"
+      : "=a"(res)
+      : "0"(__NR_sigprocmask), "ri"((long)how),
+        "c"((long)set), "d"((long)old_set)
+      : "esp", "memory");
+  #else
+  #error Unsupported target platform
+  #endif
+
+  // Update our shadow signal mask, so that we can copy it upon creation of
+  // new threads.
+  if (res == 0 && set != NULL) {
+    // The old-style call only carries one word of signal bits. Read exactly
+    // that much from the caller and leave the rest of the shadow mask alone.
+    const unsigned long bits = *reinterpret_cast<const unsigned long *>(set);
+    SecureMem::Args* args = getSecureMem();
+    switch (how) {
+      case SIG_BLOCK:
+        args->signalMask.sig[0] |= bits;
+        break;
+      case SIG_UNBLOCK:
+        args->signalMask.sig[0] &= ~bits;
+        break;
+      case SIG_SETMASK:
+        args->signalMask.sig[0] = bits;
+        break;
+      default:
+        break;
+    }
+  }
+
+  Debug::elapsed(tm, __NR_sigprocmask);
+
+  return res;
+}
+#endif
+
+#if defined(__NR_rt_sigprocmask)
+// Sandboxed replacement for the rt_sigprocmask() system call.
+//
+//   how      SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK
+//   set      new mask, or NULL
+//   old_set  receives the previous mask, or NULL
+//   bytes    size in bytes of the masks
+//
+// Returns the result of the system call. On success, and if the caller's
+// mask is at least 8 bytes long, the cached signal mask in this thread's
+// secure memory is updated as well.
+long Sandbox::sandbox_rt_sigprocmask(int how, const void* set, void* old_set,
+ size_t bytes) {
+ long long tm;
+ Debug::syscall(&tm, __NR_rt_sigprocmask, "Executing handler");
+
+ // Access the signal mask by triggering a SEGV and modifying the signal state
+ // prior to calling rt_sigreturn().
+ long res = -ENOSYS;
+ #if defined(__x86_64__)
+ // Operand 5 ("bytes") is moved into r10 by hand; the remaining arguments
+ // are placed in rdi, rsi and rdx through register constraints.
+ asm volatile(
+ "movq %5, %%r10\n"
+ "int $0\n"
+ : "=a"(res)
+ : "0"(__NR_rt_sigprocmask), "D"((long)how),
+ "S"((long)set), "d"((long)old_set), "r"((long)bytes)
+ : "r10", "r11", "rcx", "memory");
+ #elif defined(__i386__)
+ // ebx is saved and restored around the call; "how" is moved into it by
+ // hand.
+ asm volatile(
+ "push %%ebx\n"
+ "movl %2, %%ebx\n"
+ "int $0\n"
+ "pop %%ebx\n"
+ : "=a"(res)
+ : "0"(__NR_rt_sigprocmask), "ri"((long)how),
+ "c"((long)set), "d"((long)old_set), "S"((long)bytes)
+ : "esp", "memory");
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Update our shadow signal mask, so that we can copy it upon creation of
+ // new threads.
+ // Both masks are accessed as a single 64 bit value, hence the check that
+ // the caller supplied at least 8 bytes.
+ if (res == 0 && set != NULL && bytes >= 8) {
+ SecureMem::Args* args = getSecureMem();
+ switch (how) {
+ case SIG_BLOCK:
+ *(unsigned long long *)&args->signalMask |= *(unsigned long long *)set;
+ break;
+ case SIG_UNBLOCK:
+ *(unsigned long long *)&args->signalMask &= ~*(unsigned long long *)set;
+ break;
+ case SIG_SETMASK:
+ *(unsigned long long *)&args->signalMask = *(unsigned long long *)set;
+ break;
+ default:
+ break;
+ }
+ }
+
+ Debug::elapsed(tm, __NR_rt_sigprocmask);
+
+ return res;
+}
+#endif
+
+} // namespace
diff --git a/sandbox/linux/seccomp/socketcall.cc b/sandbox/linux/seccomp/socketcall.cc
new file mode 100644
index 0000000..c7b2015
--- /dev/null
+++ b/sandbox/linux/seccomp/socketcall.cc
@@ -0,0 +1,1039 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+#if defined(__NR_socket)
+
+// Sandbox handler for recvfrom().
+//
+// A call without a sender buffer and without flags is served locally through
+// read(), where a failure is reported as -my_errno. Every other call is
+// serialized, written to the trusted process, and the reply read back from
+// the trusted thread is returned unchanged.
+ssize_t Sandbox::sandbox_recvfrom(int sockfd, void* buf, size_t len, int flags,
+                                  void* from, socklen_t* fromlen) {
+  long long tm;
+  Debug::syscall(&tm, __NR_recvfrom, "Executing handler");
+
+  SysCalls sys;
+  if (from == NULL && flags == 0) {
+    // Fast path: identical to read(), which is unrestricted in seccomp mode.
+    Debug::message("Replaced recv() with call to read()");
+    const ssize_t received = sys.read(sockfd, buf, len);
+    Debug::elapsed(tm, __NR_recvfrom);
+    return received < 0 ? -sys.my_errno : received;
+  }
+
+  // Wire format: system call number, cookie, then the call's arguments.
+  struct {
+    int       sysnum;
+    long long cookie;
+    RecvFrom  recvfrom_req;
+  } __attribute__((packed)) req;
+  req.sysnum               = __NR_recvfrom;
+  req.cookie               = cookie();
+  req.recvfrom_req.sockfd  = sockfd;
+  req.recvfrom_req.buf     = buf;
+  req.recvfrom_req.len     = len;
+  req.recvfrom_req.flags   = flags;
+  req.recvfrom_req.from    = from;
+  req.recvfrom_req.fromlen = fromlen;
+
+  // The reply is only read if the whole request could be written.
+  long reply;
+  const bool sent =
+      write(sys, processFdPub(), &req, sizeof(req)) == sizeof(req);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward recvfrom() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_recvfrom);
+  return static_cast<ssize_t>(reply);
+}
+
+// Sandbox handler for recvmsg().
+//
+// The call is always forwarded: it is never reduced to recvfrom(), recv() or
+// read(), because the caller may rely on msg->msg_flags being filled in.
+// Returns the reply read back from the trusted thread.
+ssize_t Sandbox::sandbox_recvmsg(int sockfd, struct msghdr* msg, int flags) {
+  long long tm;
+  Debug::syscall(&tm, __NR_recvmsg, "Executing handler");
+
+  // Wire format: system call number, cookie, then the call's arguments.
+  struct {
+    int       sysnum;
+    long long cookie;
+    RecvMsg   recvmsg_req;
+  } __attribute__((packed)) req;
+  req.sysnum             = __NR_recvmsg;
+  req.cookie             = cookie();
+  req.recvmsg_req.sockfd = sockfd;
+  req.recvmsg_req.msg    = msg;
+  req.recvmsg_req.flags  = flags;
+
+  // The reply is only read if the whole request could be written.
+  long reply;
+  SysCalls sys;
+  const bool sent =
+      write(sys, processFdPub(), &req, sizeof(req)) == sizeof(req);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward recvmsg() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_recvmsg);
+  return static_cast<ssize_t>(reply);
+}
+
+// Sandbox handler for sendmsg().
+//
+// A message with exactly one iovec entry and no control data is handed to
+// sandbox_sendto(). Every other message is serialized as
+//   [sysnum][cookie][SendMsg][msghdr][msg_name bytes][msg_control bytes]
+// and written to the trusted process in a single write(); the reply is then
+// read back from the trusted thread and returned.
+size_t Sandbox::sandbox_sendmsg(int sockfd, const struct msghdr* msg,
+                                int flags) {
+  long long tm;
+  Debug::syscall(&tm, __NR_sendmsg, "Executing handler");
+
+  if (msg->msg_iovlen == 1 && msg->msg_controllen == 0) {
+    // sendmsg() can sometimes be simplified as sendto(). The payload is the
+    // memory described by the single iovec entry, so its base and length are
+    // passed on -- not the iovec array and its element count.
+    const struct iovec* iov = msg->msg_iov;
+    ssize_t sent = sandbox_sendto(sockfd, iov->iov_base, iov->iov_len, flags,
+                                  msg->msg_name, msg->msg_namelen);
+    // Close the timing record like every other exit path of this handler.
+    Debug::elapsed(tm, __NR_sendmsg);
+    return sent;
+  }
+
+  struct Request {
+    int           sysnum;
+    long long     cookie;
+    SendMsg       sendmsg_req;
+    struct msghdr msg;
+  } __attribute__((packed));
+  char data[sizeof(struct Request) + msg->msg_namelen + msg->msg_controllen];
+  struct Request *request = reinterpret_cast<struct Request *>(data);
+  request->sysnum             = __NR_sendmsg;
+  request->cookie             = cookie();
+  request->sendmsg_req.sockfd = sockfd;
+  request->sendmsg_req.msg    = msg;
+  request->sendmsg_req.flags  = flags;
+  request->msg                = *msg;
+
+  // Append the (optional) destination address, directly followed by the
+  // (optional) control data.
+  char *payload = reinterpret_cast<char *>(request + 1);
+  memcpy(payload, msg->msg_name, msg->msg_namelen);
+  memcpy(payload + msg->msg_namelen, msg->msg_control, msg->msg_controllen);
+
+  long rc;
+  SysCalls sys;
+  if (write(sys, processFdPub(), data, sizeof(data)) !=
+      (ssize_t)sizeof(data) ||
+      read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+    die("Failed to forward sendmsg() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_sendmsg);
+  return static_cast<ssize_t>(rc);
+}
+
+// Sandbox handler for sendto().
+//
+// A call without a recipient and without flags is served locally through
+// write(), where a failure is reported as -my_errno. Every other call is
+// serialized, written to the trusted process, and the reply read back from
+// the trusted thread is returned unchanged.
+ssize_t Sandbox::sandbox_sendto(int sockfd, const void* buf, size_t len,
+                                int flags, const void* to, socklen_t tolen) {
+  long long tm;
+  Debug::syscall(&tm, __NR_sendto, "Executing handler");
+
+  SysCalls sys;
+  if (to == NULL && flags == 0) {
+    // Fast path: identical to write(), which is unrestricted in seccomp mode.
+    Debug::message("Replaced sendto() with call to write()");
+    const ssize_t written = sys.write(sockfd, buf, len);
+    Debug::elapsed(tm, __NR_sendto);
+    return written < 0 ? -sys.my_errno : written;
+  }
+
+  // Wire format: system call number, cookie, then the call's arguments.
+  struct {
+    int       sysnum;
+    long long cookie;
+    SendTo    sendto_req;
+  } __attribute__((packed)) req;
+  req.sysnum            = __NR_sendto;
+  req.cookie            = cookie();
+  req.sendto_req.sockfd = sockfd;
+  req.sendto_req.buf    = buf;
+  req.sendto_req.len    = len;
+  req.sendto_req.flags  = flags;
+  req.sendto_req.to     = to;
+  req.sendto_req.tolen  = tolen;
+
+  // The reply is only read if the whole request could be written.
+  long reply;
+  const bool sent =
+      write(sys, processFdPub(), &req, sizeof(req)) == sizeof(req);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward sendto() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_sendto);
+  return static_cast<ssize_t>(reply);
+}
+
+// Sandbox handler for setsockopt().
+//
+// The arguments are serialized and written to the trusted process; the
+// reply read back from the trusted thread is returned unchanged.
+long Sandbox::sandbox_setsockopt(int sockfd, int level, int optname,
+                                 const void* optval, socklen_t optlen) {
+  long long tm;
+  Debug::syscall(&tm, __NR_setsockopt, "Executing handler");
+
+  // Wire format: system call number, cookie, then the call's arguments.
+  struct {
+    int        sysnum;
+    long long  cookie;
+    SetSockOpt setsockopt_req;
+  } __attribute__((packed)) req;
+  req.sysnum                 = __NR_setsockopt;
+  req.cookie                 = cookie();
+  req.setsockopt_req.sockfd  = sockfd;
+  req.setsockopt_req.level   = level;
+  req.setsockopt_req.optname = optname;
+  req.setsockopt_req.optval  = optval;
+  req.setsockopt_req.optlen  = optlen;
+
+  // The reply is only read if the whole request could be written.
+  long reply;
+  SysCalls sys;
+  const bool sent =
+      write(sys, processFdPub(), &req, sizeof(req)) == sizeof(req);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward setsockopt() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_setsockopt);
+  return reply;
+}
+
+// Sandbox handler for getsockopt().
+//
+// The arguments are serialized and written to the trusted process; the
+// reply read back from the trusted thread is returned unchanged.
+long Sandbox::sandbox_getsockopt(int sockfd, int level, int optname,
+                                 void* optval, socklen_t* optlen) {
+  long long tm;
+  Debug::syscall(&tm, __NR_getsockopt, "Executing handler");
+
+  // Wire format: system call number, cookie, then the call's arguments.
+  struct {
+    int        sysnum;
+    long long  cookie;
+    GetSockOpt getsockopt_req;
+  } __attribute__((packed)) req;
+  req.sysnum                 = __NR_getsockopt;
+  req.cookie                 = cookie();
+  req.getsockopt_req.sockfd  = sockfd;
+  req.getsockopt_req.level   = level;
+  req.getsockopt_req.optname = optname;
+  req.getsockopt_req.optval  = optval;
+  req.getsockopt_req.optlen  = optlen;
+
+  // The reply is only read if the whole request could be written.
+  long reply;
+  SysCalls sys;
+  const bool sent =
+      write(sys, processFdPub(), &req, sizeof(req)) == sizeof(req);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward getsockopt() request [sandbox]");
+  }
+  Debug::elapsed(tm, __NR_getsockopt);
+  return reply;
+}
+
+// Trusted-process side of recvfrom().
+//
+// Reads the serialized RecvFrom arguments from sandboxFd. A request carrying
+// a flag outside the accepted set is abandoned with -EINVAL and false is
+// returned; otherwise the call is passed on as an unlocked system call and
+// true is returned.
+bool Sandbox::process_recvfrom(int parentMapsFd, int sandboxFd,
+                               int threadFdPub, int threadFd,
+                               SecureMem::Args* mem) {
+  // Read request
+  RecvFrom req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &req, sizeof(req)) != sizeof(req)) {
+    die("Failed to read parameters for recvfrom() [process]");
+  }
+
+  // Any flag outside this set is unsupported and causes a denial.
+  const int kAcceptedFlags =
+      MSG_DONTWAIT | MSG_OOB | MSG_PEEK | MSG_TRUNC | MSG_WAITALL;
+  if (req.flags & ~kAcceptedFlags) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  // While we do not anticipate any particular need to receive data on
+  // unconnected sockets, there is no particular risk in doing so.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_recvfrom,
+                            req.sockfd, req.buf, req.len, req.flags,
+                            req.from, req.fromlen);
+  return true;
+}
+
+// Trusted-process side of recvmsg().
+//
+// Reads the serialized RecvMsg arguments from sandboxFd. A request carrying
+// a flag outside the accepted set is abandoned with -EINVAL and false is
+// returned; otherwise the call is passed on as an unlocked system call and
+// true is returned.
+bool Sandbox::process_recvmsg(int parentMapsFd, int sandboxFd, int threadFdPub,
+                              int threadFd, SecureMem::Args* mem) {
+  // Read request
+  RecvMsg req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &req, sizeof(req)) != sizeof(req)) {
+    die("Failed to read parameters for recvmsg() [process]");
+  }
+
+  // Any flag outside this set is unsupported and causes a denial.
+  const int kAcceptedFlags =
+      MSG_DONTWAIT | MSG_OOB | MSG_PEEK | MSG_TRUNC | MSG_WAITALL;
+  if (req.flags & ~kAcceptedFlags) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  // Receiving messages is generally not security critical.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_recvmsg,
+                            req.sockfd, req.msg, req.flags);
+  return true;
+}
+
+// Trusted-process side of sendmsg().
+//
+// Reads the SendMsg arguments and the msghdr from sandboxFd, followed by
+// msg_namelen + msg_controllen bytes of payload. The request is abandoned
+// with -EINVAL (and false is returned) if
+//   - msghdr plus payload do not fit into mem->pathname,
+//   - a destination address is present (msg_namelen != 0),
+//   - a flag outside the accepted set is given, or
+//   - the control data is malformed or holds anything but SOL_SOCKET /
+//     SCM_RIGHTS messages.
+// Otherwise the msghdr and its payload are copied into the secure memory
+// and the call is issued as a locked system call; true is returned.
+// Payload sizes above 4096 bytes terminate the process via die().
+bool Sandbox::process_sendmsg(int parentMapsFd, int sandboxFd, int threadFdPub,
+                              int threadFd, SecureMem::Args* mem) {
+  // Read request
+  struct {
+    SendMsg       sendmsg_req;
+    struct msghdr msg;
+  } __attribute__((packed)) data;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &data, sizeof(data)) != sizeof(data)) {
+    die("Failed to read parameters for sendmsg() [process]");
+  }
+
+  // Bound the sizes before they are used to dimension a stack buffer.
+  if (data.msg.msg_namelen > 4096 || data.msg.msg_controllen > 4096) {
+    die("Unexpected size for socketcall() payload [process]");
+  }
+  char extra[data.msg.msg_namelen + data.msg.msg_controllen];
+  if (read(sys, sandboxFd, &extra, sizeof(extra)) != (ssize_t)sizeof(extra)) {
+    die("Failed to read parameters for sendmsg() [process]");
+  }
+  if (sizeof(struct msghdr) + sizeof(extra) > sizeof(mem->pathname)) {
+    goto deny;
+  }
+
+  if (data.msg.msg_namelen ||
+      (data.sendmsg_req.flags &
+       ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB))) {
+ deny:
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  // The trusted process receives file handles when a new untrusted thread
+  // gets created. We have security checks in place that prevent any
+  // critical information from being tampered with during thread creation.
+  // But if we disallowed passing of file handles, this would add an extra
+  // hurdle for an attacker.
+  // Unfortunately, for now, this is not possible as Chrome's
+  // base::SendRecvMsg() needs the ability to pass file handles.
+  if (data.msg.msg_controllen) {
+    // Walk our private copy of the control data, not the caller's buffer.
+    data.msg.msg_control = extra + data.msg.msg_namelen;
+    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&data.msg);
+    if (cmsg == NULL) {
+      // msg_controllen is non-zero but too small to hold even one header.
+      // The request is malformed; refuse it instead of dereferencing NULL.
+      goto deny;
+    }
+    do {
+      if (cmsg->cmsg_level != SOL_SOCKET ||
+          cmsg->cmsg_type  != SCM_RIGHTS) {
+        goto deny;
+      }
+    } while ((cmsg = CMSG_NXTHDR(&data.msg, cmsg)) != NULL);
+  }
+
+  // This must be a locked system call, because we have to ensure that the
+  // untrusted code does not tamper with the msghdr after we have examined it.
+  SecureMem::lockSystemCall(parentMapsFd, mem);
+  if (sizeof(extra) > 0) {
+    // Layout inside mem->pathname: [msghdr][name bytes][control bytes].
+    if (data.msg.msg_namelen > 0) {
+      data.msg.msg_name = mem->pathname + sizeof(struct msghdr);
+    }
+    if (data.msg.msg_controllen > 0) {
+      data.msg.msg_control = mem->pathname + sizeof(struct msghdr) +
+                             data.msg.msg_namelen;
+    }
+    memcpy(mem->pathname + sizeof(struct msghdr), extra, sizeof(extra));
+  }
+  memcpy(mem->pathname, &data.msg, sizeof(struct msghdr));
+  // The msghdr address is expressed relative to mem->self rather than to
+  // this process's own mapping of the secure memory.
+  SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem,
+                            __NR_sendmsg, data.sendmsg_req.sockfd,
+                            mem->pathname - (char*)mem + (char*)mem->self,
+                            data.sendmsg_req.flags);
+  return true;
+}
+
+// Trusted-process side of sendto().
+//
+// Reads the serialized SendTo arguments from sandboxFd. The request is
+// abandoned with -EINVAL (and false is returned) when it names a destination
+// address or carries a flag outside the accepted set. Otherwise the call is
+// passed on as an unlocked system call and true is returned.
+bool Sandbox::process_sendto(int parentMapsFd, int sandboxFd, int threadFdPub,
+                             int threadFd, SecureMem::Args* mem) {
+  // Read request
+  SendTo req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &req, sizeof(req)) != sizeof(req)) {
+    die("Failed to read parameters for sendto() [process]");
+  }
+
+  // The sandbox does not allow sending to arbitrary addresses, and any flag
+  // outside the accepted set is unsupported. Either condition denies the call.
+  const int kAcceptedFlags = MSG_CONFIRM | MSG_DONTWAIT | MSG_EOR |
+                             MSG_MORE | MSG_NOSIGNAL | MSG_OOB;
+  if (req.to || (req.flags & ~kAcceptedFlags)) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  // Sending data on a connected socket is similar to calling write().
+  // Allow it.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_sendto,
+                            req.sockfd, req.buf, req.len, req.flags,
+                            req.to, req.tolen);
+  return true;
+}
+
+// Trusted-process side of setsockopt().
+//
+// Reads the serialized SetSockOpt arguments from sandboxFd. Only the
+// SOL_SOCKET and IPPROTO_TCP options listed below are passed on (as an
+// unlocked system call, returning true); everything else is abandoned with
+// -EINVAL and false is returned.
+bool Sandbox::process_setsockopt(int parentMapsFd, int sandboxFd,
+                                 int threadFdPub, int threadFd,
+                                 SecureMem::Args* mem) {
+  // Read request
+  SetSockOpt req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &req, sizeof(req)) != sizeof(req)) {
+    die("Failed to read parameters for setsockopt() [process]");
+  }
+
+  // Decide first, act afterwards: look the (level, optname) pair up in the
+  // whitelist.
+  bool permitted = false;
+  switch (req.level) {
+    case SOL_SOCKET:
+      switch (req.optname) {
+        case SO_KEEPALIVE:
+        case SO_LINGER:
+        case SO_OOBINLINE:
+        case SO_RCVBUF:
+        case SO_RCVLOWAT:
+        case SO_SNDLOWAT:
+        case SO_RCVTIMEO:
+        case SO_SNDTIMEO:
+        case SO_REUSEADDR:
+        case SO_SNDBUF:
+        case SO_TIMESTAMP:
+          permitted = true;
+          break;
+        default:
+          break;
+      }
+      break;
+    case IPPROTO_TCP:
+      switch (req.optname) {
+        case TCP_CORK:
+        case TCP_DEFER_ACCEPT:
+        case TCP_INFO:
+        case TCP_KEEPCNT:
+        case TCP_KEEPIDLE:
+        case TCP_KEEPINTVL:
+        case TCP_LINGER2:
+        case TCP_MAXSEG:
+        case TCP_NODELAY:
+        case TCP_QUICKACK:
+        case TCP_SYNCNT:
+        case TCP_WINDOW_CLAMP:
+          permitted = true;
+          break;
+        default:
+          break;
+      }
+      break;
+    default:
+      break;
+  }
+
+  if (!permitted) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_setsockopt,
+                            req.sockfd, req.level, req.optname,
+                            req.optval, req.optlen);
+  return true;
+}
+
+// Trusted-process side of getsockopt().
+//
+// Reads the serialized GetSockOpt arguments from sandboxFd. Only the
+// SOL_SOCKET and IPPROTO_TCP options listed below are passed on (as an
+// unlocked system call, returning true); everything else is abandoned with
+// -EINVAL and false is returned.
+bool Sandbox::process_getsockopt(int parentMapsFd, int sandboxFd,
+                                 int threadFdPub, int threadFd,
+                                 SecureMem::Args* mem) {
+  // Read request
+  GetSockOpt req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &req, sizeof(req)) != sizeof(req)) {
+    die("Failed to read parameters for getsockopt() [process]");
+  }
+
+  // Decide first, act afterwards: look the (level, optname) pair up in the
+  // whitelist.
+  bool permitted = false;
+  switch (req.level) {
+    case SOL_SOCKET:
+      switch (req.optname) {
+        case SO_ACCEPTCONN:
+        case SO_ERROR:
+        case SO_KEEPALIVE:
+        case SO_LINGER:
+        case SO_OOBINLINE:
+        case SO_RCVBUF:
+        case SO_RCVLOWAT:
+        case SO_SNDLOWAT:
+        case SO_RCVTIMEO:
+        case SO_SNDTIMEO:
+        case SO_REUSEADDR:
+        case SO_SNDBUF:
+        case SO_TIMESTAMP:
+        case SO_TYPE:
+          permitted = true;
+          break;
+        default:
+          break;
+      }
+      break;
+    case IPPROTO_TCP:
+      switch (req.optname) {
+        case TCP_CORK:
+        case TCP_DEFER_ACCEPT:
+        case TCP_INFO:
+        case TCP_KEEPCNT:
+        case TCP_KEEPIDLE:
+        case TCP_KEEPINTVL:
+        case TCP_LINGER2:
+        case TCP_MAXSEG:
+        case TCP_NODELAY:
+        case TCP_QUICKACK:
+        case TCP_SYNCNT:
+        case TCP_WINDOW_CLAMP:
+          permitted = true;
+          break;
+        default:
+          break;
+      }
+      break;
+    default:
+      break;
+  }
+
+  if (!permitted) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_getsockopt,
+                            req.sockfd, req.level, req.optname,
+                            req.optval, req.optlen);
+  return true;
+}
+
+#endif
+#if defined(__NR_socketcall)
+
+// Operation codes accepted as the "call" argument of socketcall(). The
+// values are also used as indices into socketCallArgInfo[], so they must
+// stay contiguous and in this order.
+enum {
+  SYS_SOCKET      =  1,
+  SYS_BIND        =  2,
+  SYS_CONNECT     =  3,
+  SYS_LISTEN      =  4,
+  SYS_ACCEPT      =  5,
+  SYS_GETSOCKNAME =  6,
+  SYS_GETPEERNAME =  7,
+  SYS_SOCKETPAIR  =  8,
+  SYS_SEND        =  9,
+  SYS_RECV        = 10,
+  SYS_SENDTO      = 11,
+  SYS_RECVFROM    = 12,
+  SYS_SHUTDOWN    = 13,
+  SYS_SETSOCKOPT  = 14,
+  SYS_GETSOCKOPT  = 15,
+  SYS_SENDMSG     = 16,
+  SYS_RECVMSG     = 17,
+  SYS_ACCEPT4     = 18   // highest opcode accepted by the range checks
+};
+
+// Describes the argument block of one socketcall() operation.
+struct Sandbox::SocketCallArgInfo {
+  size_t len;        // size of the operation's argument struct, in bytes
+  off_t  addrOff;    // offset of the member pointing at extra data, else 0
+  off_t  lengthOff;  // offset of the member holding its length, else 0
+};
+
+// One entry per operation code, indexed by the SYS_* value; entry 0 is a
+// placeholder because the codes start at 1. A zero lengthOff marks an
+// operation without generic extra payload.
+const struct Sandbox::SocketCallArgInfo Sandbox::socketCallArgInfo[] = {
+  #define STRUCT(s) reinterpret_cast<SocketCall *>(0)->args.s
+  #define SIZE(s) sizeof(STRUCT(s))
+  #define OFF(s, f) offsetof(typeof STRUCT(s), f)
+  /*  0 (unused)        */ { 0, 0, 0 },
+  /*  1 SYS_SOCKET      */ { SIZE(socket), 0, 0 },
+  /*  2 SYS_BIND        */ { SIZE(bind), OFF(bind, addr), OFF(bind, addrlen) },
+  /*  3 SYS_CONNECT     */ { SIZE(connect), OFF(connect, addr),
+                             OFF(connect, addrlen) },
+  /*  4 SYS_LISTEN      */ { SIZE(listen), 0, 0 },
+  /*  5 SYS_ACCEPT      */ { SIZE(accept), 0, 0 },
+  /*  6 SYS_GETSOCKNAME */ { SIZE(getsockname), 0, 0 },
+  /*  7 SYS_GETPEERNAME */ { SIZE(getpeername), 0, 0 },
+  /*  8 SYS_SOCKETPAIR  */ { SIZE(socketpair), 0, 0 },
+  /*  9 SYS_SEND        */ { SIZE(send), 0, 0 },
+  /* 10 SYS_RECV        */ { SIZE(recv), 0, 0 },
+  /* 11 SYS_SENDTO      */ { SIZE(sendto), OFF(sendto, to),
+                             OFF(sendto, tolen) },
+  /* 12 SYS_RECVFROM    */ { SIZE(recvfrom), 0, 0 },
+  /* 13 SYS_SHUTDOWN    */ { SIZE(shutdown), 0, 0 },
+  /* 14 SYS_SETSOCKOPT  */ { SIZE(setsockopt), OFF(setsockopt, optval),
+                             OFF(setsockopt, optlen) },
+  /* 15 SYS_GETSOCKOPT  */ { SIZE(getsockopt), 0, 0 },
+  /* 16 SYS_SENDMSG     */ { SIZE(sendmsg), 0, 0 },
+  /* 17 SYS_RECVMSG     */ { SIZE(recvmsg), 0, 0 },
+  /* 18 SYS_ACCEPT4     */ { SIZE(accept4), 0, 0 }
+  #undef STRUCT
+  #undef SIZE
+  #undef OFF
+};
+
+// Sandbox handler for the multiplexed socketcall() system call.
+//
+//   call - operation code (SYS_SOCKET .. SYS_ACCEPT4); anything else yields
+//          -ENOSYS without contacting the trusted process.
+//   args - pointer to the operation's argument block, as passed by the
+//          caller.
+//
+// send()/sendto()/sendmsg() and recv()/recvfrom() are reduced to plain
+// write()/read() where the code below finds them equivalent; those calls
+// return the byte count or -my_errno. Everything else is serialized (header,
+// argument block, optional extra payload), written to the trusted process in
+// one write(), and the reply read from the trusted thread is returned.
+long Sandbox::sandbox_socketcall(int call, void* args) {
+ long long tm;
+ Debug::syscall(&tm, __NR_socketcall, "Executing handler", call);
+
+ // When demultiplexing socketcall(), only accept calls that have a valid
+ // "call" opcode.
+ if (call < SYS_SOCKET || call > SYS_ACCEPT4) {
+ Debug::elapsed(tm, __NR_socketcall, call);
+ return -ENOSYS;
+ }
+
+ // Some type of calls include a pointer to an address or name, which cannot
+ // be accessed by the trusted process, as it lives in a separate address
+ // space. For these calls, append the extra data to the serialized request.
+ // This requires some copying of data, as we have to make sure there is
+ // only a single atomic call to write().
+ socklen_t numExtraData = 0;
+ const void* extraDataAddr = NULL;
+ if (socketCallArgInfo[call].lengthOff) {
+ memcpy(&numExtraData,
+ reinterpret_cast<char *>(args) + socketCallArgInfo[call].lengthOff,
+ sizeof(socklen_t));
+ // NOTE(review): this is the address of the pointer member inside "args",
+ // not the value of that pointer. The memcpy() near the end of this function
+ // therefore copies numExtraData bytes starting at that member rather than
+ // the data it points to -- looks unintended; confirm.
+ extraDataAddr = reinterpret_cast<char *>(args) +
+ socketCallArgInfo[call].addrOff;
+ }
+
+ // sendmsg() and recvmsg() have more complicated requirements for computing
+ // the amount of extra data that needs to be sent to the trusted process.
+ if (call == SYS_SENDMSG) {
+ SendMsg *sendmsg_args = reinterpret_cast<SendMsg *>(args);
+ // NOTE(review): the shortcut test here is "!msg_control", whereas
+ // sandbox_sendmsg() tests "msg_controllen == 0" -- confirm which is meant.
+ if (sendmsg_args->msg->msg_iovlen == 1 &&
+ !sendmsg_args->msg->msg_control) {
+ // Further down in the code, this sendmsg() call will be simplified to
+ // a sendto() call. Make sure we already compute the correct value for
+ // numExtraData, as it is needed when we allocate "data[]" on the stack.
+ numExtraData = sendmsg_args->msg->msg_namelen;
+ extraDataAddr = sendmsg_args->msg->msg_name;
+ } else {
+ // sendmsg() needs to include some of the extra data so that we can
+ // inspect it in process_socketcall()
+ numExtraData = sizeof(*sendmsg_args->msg) +
+ sendmsg_args->msg->msg_namelen +
+ sendmsg_args->msg->msg_controllen;
+ extraDataAddr = NULL;
+ }
+ }
+ if (call == SYS_RECVMSG) {
+ RecvMsg *recvmsg_args = reinterpret_cast<RecvMsg *>(args);
+ numExtraData = sizeof(*recvmsg_args->msg);
+ extraDataAddr = recvmsg_args->msg;
+ }
+
+ // Set up storage for the request header and copy the data from "args"
+ // into it.
+ struct Request {
+ int sysnum;
+ long long cookie;
+ SocketCall socketcall_req;
+ } __attribute__((packed)) *request;
+ // Variable-length stack buffer: fixed header plus the extra payload.
+ char data[sizeof(struct Request) + numExtraData];
+ request = reinterpret_cast<struct Request *>(data);
+ memcpy(&request->socketcall_req.args, args, socketCallArgInfo[call].len);
+
+ // Simplify send(), sendto() and sendmsg(), if there are simpler equivalent
+ // calls. This allows us to occasionally replace them with calls to write(),
+ // which don't have to be forwarded to the trusted process.
+ SysCalls sys;
+ if (call == SYS_SENDMSG &&
+ request->socketcall_req.args.sendmsg.msg->msg_iovlen == 1 &&
+ !request->socketcall_req.args.sendmsg.msg->msg_control) {
+ // Ordering of these assignments is important, as we are reshuffling
+ // fields inside of a union.
+ // (Each sendmsg member is read before the sendto member that overlays it
+ // is written; "buf" goes last because it replaces the "msg" pointer.)
+ call = SYS_SENDTO;
+ request->socketcall_req.args.sendto.flags =
+ request->socketcall_req.args.sendmsg.flags;
+ request->socketcall_req.args.sendto.to =
+ request->socketcall_req.args.sendmsg.msg->msg_name;
+ request->socketcall_req.args.sendto.tolen =
+ request->socketcall_req.args.sendmsg.msg->msg_namelen;
+ request->socketcall_req.args.sendto.len =
+ request->socketcall_req.args.sendmsg.msg->msg_iov->iov_len;
+ request->socketcall_req.args.sendto.buf =
+ request->socketcall_req.args.sendmsg.msg->msg_iov->iov_base;
+ }
+ if (call == SYS_SENDTO && !request->socketcall_req.args.sendto.to) {
+ // sendto() with a NULL address is the same as send()
+ call = SYS_SEND;
+ numExtraData = 0;
+ }
+ if (call == SYS_SEND && !request->socketcall_req.args.send.flags) {
+ // send() with no flags is the same as write(), which is unrestricted
+ // in seccomp mode.
+ Debug::message("Replaced socketcall() with call to write()");
+ ssize_t rc = sys.write(request->socketcall_req.args.send.sockfd,
+ request->socketcall_req.args.send.buf,
+ request->socketcall_req.args.send.len);
+ if (rc < 0) {
+ Debug::elapsed(tm, __NR_socketcall, call);
+ return -sys.my_errno;
+ } else {
+ Debug::elapsed(tm, __NR_socketcall, call);
+ return rc;
+ }
+ }
+
+ // Simplify recv(), and recvfrom(), if there are simpler equivalent calls.
+ // This allows us to occasionally replace them with calls to read(), which
+ // don't have to be forwarded to the trusted process.
+ // We cannot simplify recvmsg() to recvfrom(), recv() or read(), as we do
+ // not know whether the caller needs us to set msg->msg_flags.
+ if (call == SYS_RECVFROM && !request->socketcall_req.args.recvfrom.from) {
+ // recvfrom() with a NULL address buffer is the same as recv()
+ call = SYS_RECV;
+ }
+ if (call == SYS_RECV && !request->socketcall_req.args.recv.flags) {
+ // recv() with no flags is the same as read(), which is unrestricted
+ // in seccomp mode.
+ Debug::message("Replaced socketcall() with call to read()");
+ ssize_t rc = sys.read(request->socketcall_req.args.recv.sockfd,
+ request->socketcall_req.args.recv.buf,
+ request->socketcall_req.args.recv.len);
+ if (rc < 0) {
+ Debug::elapsed(tm, __NR_socketcall, call);
+ return -sys.my_errno;
+ } else {
+ Debug::elapsed(tm, __NR_socketcall, call);
+ return rc;
+ }
+ }
+
+ // Fill in the rest of the request header.
+ request->sysnum = __NR_socketcall;
+ request->cookie = cookie();
+ request->socketcall_req.call = call;
+ request->socketcall_req.arg_ptr = args;
+ // Zero the tail of the args union that the (possibly rewritten) call does
+ // not use.
+ int padding = sizeof(request->socketcall_req.args) -
+ socketCallArgInfo[call].len;
+ if (padding > 0) {
+ memset((char *)(&request->socketcall_req.args + 1) - padding, 0, padding);
+ }
+ if (call == SYS_SENDMSG) {
+ // for sendmsg() we include the (optional) destination address, and the
+ // (optional) control data in the payload.
+ // The nested memcpy() calls append, in order: the msghdr itself, the
+ // msg_name bytes and the msg_control bytes.
+ SendMsg *sendmsg_args = reinterpret_cast<SendMsg *>(args);
+ memcpy(reinterpret_cast<char *>(
+ memcpy(reinterpret_cast<char *>(
+ memcpy(request + 1, sendmsg_args->msg, sizeof(*sendmsg_args->msg))) +
+ sizeof(*sendmsg_args->msg),
+ sendmsg_args->msg->msg_name, sendmsg_args->msg->msg_namelen)) +
+ sendmsg_args->msg->msg_namelen,
+ sendmsg_args->msg->msg_control, sendmsg_args->msg->msg_controllen);
+ } else if (extraDataAddr) {
+ memcpy(request + 1, extraDataAddr, numExtraData);
+ }
+
+ // Send request to trusted process and collect response from trusted thread.
+ long rc;
+ // numExtraData may have been reset to 0 above, so the length is recomputed
+ // instead of using sizeof(data).
+ ssize_t len = sizeof(struct Request) + numExtraData;
+ if (write(sys, processFdPub(), data, len) != len ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward socketcall() request [sandbox]");
+ }
+ Debug::elapsed(tm, __NR_socketcall, call);
+ return rc;
+}
+
+// Trusted-process side of socketcall().
+//
+// Reads the SocketCall request from sandboxFd, followed by an optional extra
+// payload and -- for sendmsg() -- the name/control bytes. Depending on the
+// operation the call is then
+//   - denied: abandoned with -EINVAL, returns false;
+//   - accepted "simple": executed on the caller's own argument block as an
+//     unlocked system call, returns true;
+//   - accepted "complex": the inspected arguments are copied into the secure
+//     memory and executed as a locked system call, returns true.
+// An unknown opcode, an oversized payload or a short read terminates the
+// process via die().
+bool Sandbox::process_socketcall(int parentMapsFd, int sandboxFd,
+                                 int threadFdPub, int threadFd,
+                                 SecureMem::Args* mem) {
+  // Read request
+  SocketCall socketcall_req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &socketcall_req, sizeof(socketcall_req)) !=
+      sizeof(socketcall_req)) {
+    die("Failed to read parameters for socketcall() [process]");
+  }
+
+  // sandbox_socketcall() should never send us an unexpected "call" opcode.
+  // If it did, something went very wrong and we better terminate the process.
+  if (socketcall_req.call < SYS_SOCKET || socketcall_req.call > SYS_ACCEPT4) {
+    die("Unexpected socketcall() [process]");
+  }
+
+  // Check if this particular operation carries an extra payload.
+  // NOTE(review): lengthOff is an offset inside the per-call argument struct
+  // (see OFF() in socketCallArgInfo[]) but is applied to the start of
+  // socketcall_req here; that only matches if "args" is the first member of
+  // SocketCall -- TODO confirm against the struct definition.
+  socklen_t numExtraData = 0;
+  if (socketCallArgInfo[socketcall_req.call].lengthOff) {
+    memcpy(&numExtraData,
+           reinterpret_cast<char *>(&socketcall_req) +
+           socketCallArgInfo[socketcall_req.call].lengthOff,
+           sizeof(socklen_t));
+  } else if (socketcall_req.call == SYS_SENDMSG) {
+    numExtraData = sizeof(*socketcall_req.args.sendmsg.msg);
+  } else if (socketcall_req.call == SYS_RECVMSG) {
+    numExtraData = sizeof(*socketcall_req.args.recvmsg.msg);
+  }
+
+  // Verify that the length for the payload is reasonable. We don't want to
+  // blow up our stack, and excessive (or negative) buffer sizes are almost
+  // certainly a bug.
+  if (numExtraData > 4096) {
+    die("Unexpected size for socketcall() payload [process]");
+  }
+
+  // Read the extra payload, if any.
+  char extra[numExtraData];
+  if (numExtraData) {
+    if (read(sys, sandboxFd, extra, numExtraData) != (ssize_t)numExtraData) {
+      die("Failed to read socketcall() payload [process]");
+    }
+  }
+
+  // sendmsg() has another level of indirection and can carry even more payload
+  ssize_t numSendmsgExtra = 0;
+  if (socketcall_req.call == SYS_SENDMSG) {
+    struct msghdr* msg = reinterpret_cast<struct msghdr*>(extra);
+    if (msg->msg_namelen > 4096 || msg->msg_controllen > 4096) {
+      die("Unexpected size for socketcall() payload [process]");
+    }
+    numSendmsgExtra = msg->msg_namelen + msg->msg_controllen;
+  }
+  char sendmsgExtra[numSendmsgExtra];
+  if (numSendmsgExtra) {
+    if (read(sys, sandboxFd, sendmsgExtra, numSendmsgExtra) !=
+        numSendmsgExtra) {
+      die("Failed to read socketcall() payload [process]");
+    }
+  }
+
+  // Result reported to the caller whenever a request is denied.
+  int rc = -EINVAL;
+  switch (socketcall_req.call) {
+    case SYS_SOCKET:
+      // The sandbox does not allow creation of any new sockets.
+      goto deny;
+    case SYS_BIND:
+      // The sandbox does not allow binding an address to a socket.
+      goto deny;
+    case SYS_CONNECT:
+      // The sandbox does not allow connecting a socket.
+      goto deny;
+    case SYS_LISTEN:
+      // The sandbox does not allow a socket to enter listening state.
+      goto deny;
+    case SYS_ACCEPT4:
+    case SYS_ACCEPT:
+      // If the sandbox obtained a socket that is already in the listening
+      // state (e.g. because somebody sent it a suitable file descriptor), it
+      // is permissible to call accept().
+
+    accept_simple:
+      // None of the parameters need to be checked, so it is OK to refer
+      // to the parameter block created by the untrusted code.
+      SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_socketcall,
+                                socketcall_req.call, socketcall_req.arg_ptr);
+      return true;
+    case SYS_GETSOCKNAME:
+    case SYS_GETPEERNAME:
+      // Querying the local and the remote name is not considered security
+      // sensitive for the purposes of the sandbox.
+      goto accept_simple;
+    case SYS_SOCKETPAIR:
+      // Socket pairs are connected to each other and not considered
+      // security sensitive.
+      goto accept_simple;
+    case SYS_SENDTO:
+      if (socketcall_req.args.sendto.to) {
+        // The sandbox does not allow sending to arbitrary addresses.
+        goto deny;
+      }
+      // Fall through
+    case SYS_SEND:
+      if (socketcall_req.args.send.flags &
+          ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB)) {
+        // Unsupported flag encountered. Deny the call.
+        goto deny;
+      }
+      // Sending data on a connected socket is similar to calling write().
+      // Allow it.
+
+    accept_complex:
+      // The parameter block contains potentially security critical information
+      // that should not be tampered with after it has been inspected. Copy it
+      // into the write-protected securely shared memory before telling the
+      // trusted thread to execute the socket call.
+      SecureMem::lockSystemCall(parentMapsFd, mem);
+      memcpy(mem->pathname, &socketcall_req.args, sizeof(socketcall_req.args));
+      SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem,
+                                __NR_socketcall, socketcall_req.call,
+                                mem->pathname - (char*)mem + (char*)mem->self);
+      return true;
+    case SYS_RECVFROM:
+      // While we do not anticipate any particular need to receive data on
+      // unconnected sockets, there is no particular risk in doing so.
+      // Fall through
+    case SYS_RECV:
+      if (socketcall_req.args.recv.flags &
+          ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) {
+        // Unsupported flag encountered. Deny the call.
+        goto deny;
+      }
+      // Receiving data on a connected socket is similar to calling read().
+      // Allow it.
+      goto accept_complex;
+    case SYS_SHUTDOWN:
+      // Shutting down a socket is always OK.
+      goto accept_simple;
+    case SYS_SETSOCKOPT:
+      // Only whitelisted (level, optname) pairs may be set.
+      switch (socketcall_req.args.setsockopt.level) {
+        case SOL_SOCKET:
+          switch (socketcall_req.args.setsockopt.optname) {
+            case SO_KEEPALIVE:
+            case SO_LINGER:
+            case SO_OOBINLINE:
+            case SO_RCVBUF:
+            case SO_RCVLOWAT:
+            case SO_SNDLOWAT:
+            case SO_RCVTIMEO:
+            case SO_SNDTIMEO:
+            case SO_REUSEADDR:
+            case SO_SNDBUF:
+            case SO_TIMESTAMP:
+              goto accept_complex;
+            default:
+              break;
+          }
+          break;
+        case IPPROTO_TCP:
+          switch (socketcall_req.args.setsockopt.optname) {
+            case TCP_CORK:
+            case TCP_DEFER_ACCEPT:
+            case TCP_INFO:
+            case TCP_KEEPCNT:
+            case TCP_KEEPIDLE:
+            case TCP_KEEPINTVL:
+            case TCP_LINGER2:
+            case TCP_MAXSEG:
+            case TCP_NODELAY:
+            case TCP_QUICKACK:
+            case TCP_SYNCNT:
+            case TCP_WINDOW_CLAMP:
+              goto accept_complex;
+            default:
+              break;
+          }
+          break;
+        default:
+          break;
+      }
+      goto deny;
+    case SYS_GETSOCKOPT:
+      // Only whitelisted (level, optname) pairs may be queried.
+      switch (socketcall_req.args.getsockopt.level) {
+        case SOL_SOCKET:
+          switch (socketcall_req.args.getsockopt.optname) {
+            case SO_ACCEPTCONN:
+            case SO_ERROR:
+            case SO_KEEPALIVE:
+            case SO_LINGER:
+            case SO_OOBINLINE:
+            case SO_RCVBUF:
+            case SO_RCVLOWAT:
+            case SO_SNDLOWAT:
+            case SO_RCVTIMEO:
+            case SO_SNDTIMEO:
+            case SO_REUSEADDR:
+            case SO_SNDBUF:
+            case SO_TIMESTAMP:
+            case SO_TYPE:
+              goto accept_complex;
+            default:
+              break;
+          }
+          break;
+        case IPPROTO_TCP:
+          switch (socketcall_req.args.getsockopt.optname) {
+            case TCP_CORK:
+            case TCP_DEFER_ACCEPT:
+            case TCP_INFO:
+            case TCP_KEEPCNT:
+            case TCP_KEEPIDLE:
+            case TCP_KEEPINTVL:
+            case TCP_LINGER2:
+            case TCP_MAXSEG:
+            case TCP_NODELAY:
+            case TCP_QUICKACK:
+            case TCP_SYNCNT:
+            case TCP_WINDOW_CLAMP:
+              goto accept_complex;
+            default:
+              break;
+          }
+          break;
+        default:
+          break;
+      }
+      goto deny;
+    case SYS_SENDMSG: {
+      struct msghdr* msg = reinterpret_cast<struct msghdr*>(extra);
+
+      // Arguments, msghdr and its payload must all fit into the secure
+      // memory's pathname buffer.
+      if (sizeof(socketcall_req.args) + sizeof(*msg) + numSendmsgExtra >
+          sizeof(mem->pathname)) {
+        goto deny;
+      }
+
+      if (msg->msg_namelen ||
+          (socketcall_req.args.sendmsg.flags &
+           ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB))){
+        goto deny;
+      }
+
+      // The trusted process receives file handles when a new untrusted thread
+      // gets created. We have security checks in place that prevent any
+      // critical information from being tampered with during thread creation.
+      // But if we disallowed passing of file handles, this would add an extra
+      // hurdle for an attacker.
+      // Unfortunately, for now, this is not possible as Chrome's
+      // base::SendRecvMsg() needs the ability to pass file handles.
+      if (msg->msg_controllen) {
+        // Walk our private copy of the control data, not the caller's buffer.
+        msg->msg_control = sendmsgExtra + msg->msg_namelen;
+        struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+        if (cmsg == NULL) {
+          // msg_controllen is non-zero but too small to hold even one
+          // header. The request is malformed; refuse it instead of
+          // dereferencing NULL.
+          goto deny;
+        }
+        do {
+          if (cmsg->cmsg_level != SOL_SOCKET ||
+              cmsg->cmsg_type  != SCM_RIGHTS) {
+            goto deny;
+          }
+        } while ((cmsg = CMSG_NXTHDR(msg, cmsg)) != NULL);
+      }
+
+      // This must be a locked system call, because we have to ensure that
+      // the untrusted code does not tamper with the msghdr after we have
+      // examined it.
+      // Layout inside mem->pathname: [args][msghdr][name bytes][control
+      // bytes]; pointers are expressed relative to mem->self.
+      SecureMem::lockSystemCall(parentMapsFd, mem);
+      socketcall_req.args.sendmsg.msg =
+          reinterpret_cast<struct msghdr*>(mem->pathname +
+                                           sizeof(socketcall_req.args) -
+                                           (char*)mem + (char*)mem->self);
+      memcpy(mem->pathname, &socketcall_req.args, sizeof(socketcall_req.args));
+      if (numSendmsgExtra) {
+        if (msg->msg_namelen > 0) {
+          msg->msg_name = const_cast<struct msghdr*>(
+              socketcall_req.args.sendmsg.msg) + 1;
+        }
+        if (msg->msg_controllen > 0) {
+          msg->msg_control = (char *)(
+              socketcall_req.args.sendmsg.msg + 1) + msg->msg_namelen;
+        }
+        memcpy(mem->pathname + sizeof(socketcall_req.args) + sizeof(*msg),
+               sendmsgExtra, numSendmsgExtra);
+      }
+      memcpy(mem->pathname + sizeof(socketcall_req.args), msg, sizeof(*msg));
+      SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem,
+                                __NR_socketcall, socketcall_req.call,
+                                mem->pathname - (char*)mem + (char*)mem->self);
+      return true;
+    }
+    case SYS_RECVMSG:
+      // Receiving messages is generally not security critical.
+      if (socketcall_req.args.recvmsg.flags &
+          ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) {
+        goto deny;
+      }
+      goto accept_complex;
+    default:
+    deny:
+      SecureMem::abandonSystemCall(threadFd, rc);
+      return false;
+  }
+}
+
+#endif
+
+} // namespace
diff --git a/sandbox/linux/seccomp/stat.cc b/sandbox/linux/seccomp/stat.cc
new file mode 100644
index 0000000..cdf7e4c
--- /dev/null
+++ b/sandbox/linux/seccomp/stat.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+ // Sandbox-side handler for stat(). Packs the path and the caller's result
+ // buffer pointer into a request, writes it to processFdPub(), then waits on
+ // threadFdPub() for the result value, which is returned to the caller.
+ // Calls die() if the request cannot be sent or the reply cannot be read.
+ long Sandbox::sandbox_stat(const char *path, void *buf) {
+   long long startTime;
+   Debug::syscall(&startTime, __NR_stat, "Executing handler");
+
+   // Wire format: fixed header immediately followed by the path bytes. The
+   // path is sent without a terminating NUL; its length travels in stat_req.
+   struct Request {
+     int       sysnum;
+     long long cookie;
+     Stat      stat_req;
+     char      pathname[0];
+   } __attribute__((packed));
+
+   const size_t pathLen = strlen(path);
+   char packet[sizeof(struct Request) + pathLen];
+   struct Request *req = reinterpret_cast<struct Request*>(packet);
+   memcpy(req->pathname, path, pathLen);
+   req->stat_req.buf         = buf;
+   req->stat_req.path_length = pathLen;
+   req->stat_req.sysnum      = __NR_stat;
+   req->cookie               = cookie();
+   req->sysnum               = __NR_stat;
+
+   SysCalls sys;
+   long result;
+   const bool sent =
+       write(sys, processFdPub(), req, sizeof(packet)) == (int)sizeof(packet);
+   if (!sent ||
+       read(sys, threadFdPub(), &result, sizeof(result)) != sizeof(result)) {
+     die("Failed to forward stat() request [sandbox]");
+   }
+   Debug::elapsed(startTime, __NR_stat);
+   return result;
+ }
+
+ // Sandbox-side handler for lstat(). Packs the path and the caller's result
+ // buffer pointer into a request, writes it to processFdPub(), then waits on
+ // threadFdPub() for the result value, which is returned to the caller.
+ // Calls die() if the request cannot be sent or the reply cannot be read.
+ long Sandbox::sandbox_lstat(const char *path, void *buf) {
+   long long startTime;
+   Debug::syscall(&startTime, __NR_lstat, "Executing handler");
+
+   // Wire format: fixed header immediately followed by the path bytes. The
+   // path is sent without a terminating NUL; its length travels in stat_req.
+   struct Request {
+     int       sysnum;
+     long long cookie;
+     Stat      stat_req;
+     char      pathname[0];
+   } __attribute__((packed));
+
+   const size_t pathLen = strlen(path);
+   char packet[sizeof(struct Request) + pathLen];
+   struct Request *req = reinterpret_cast<struct Request*>(packet);
+   memcpy(req->pathname, path, pathLen);
+   req->stat_req.buf         = buf;
+   req->stat_req.path_length = pathLen;
+   req->stat_req.sysnum      = __NR_lstat;
+   req->cookie               = cookie();
+   req->sysnum               = __NR_lstat;
+
+   SysCalls sys;
+   long result;
+   const bool sent =
+       write(sys, processFdPub(), req, sizeof(packet)) == (int)sizeof(packet);
+   if (!sent ||
+       read(sys, threadFdPub(), &result, sizeof(result)) != sizeof(result)) {
+     die("Failed to forward lstat() request [sandbox]");
+   }
+   Debug::elapsed(startTime, __NR_lstat);
+   return result;
+ }
+
+#if defined(__NR_stat64)
+ // Sandbox-side handler for stat64(). Packs the path and the caller's result
+ // buffer pointer into a request, writes it to processFdPub(), then waits on
+ // threadFdPub() for the result value, which is returned to the caller.
+ // Calls die() if the request cannot be sent or the reply cannot be read.
+ long Sandbox::sandbox_stat64(const char *path, void *buf) {
+   long long startTime;
+   Debug::syscall(&startTime, __NR_stat64, "Executing handler");
+
+   // Wire format: fixed header immediately followed by the path bytes. The
+   // path is sent without a terminating NUL; its length travels in stat_req.
+   struct Request {
+     int       sysnum;
+     long long cookie;
+     Stat      stat_req;
+     char      pathname[0];
+   } __attribute__((packed));
+
+   const size_t pathLen = strlen(path);
+   char packet[sizeof(struct Request) + pathLen];
+   struct Request *req = reinterpret_cast<struct Request*>(packet);
+   memcpy(req->pathname, path, pathLen);
+   req->stat_req.buf         = buf;
+   req->stat_req.path_length = pathLen;
+   req->stat_req.sysnum      = __NR_stat64;
+   req->cookie               = cookie();
+   req->sysnum               = __NR_stat64;
+
+   SysCalls sys;
+   long result;
+   const bool sent =
+       write(sys, processFdPub(), req, sizeof(packet)) == (int)sizeof(packet);
+   if (!sent ||
+       read(sys, threadFdPub(), &result, sizeof(result)) != sizeof(result)) {
+     die("Failed to forward stat64() request [sandbox]");
+   }
+   Debug::elapsed(startTime, __NR_stat64);
+   return result;
+ }
+
+ // Sandbox-side handler for lstat64(). Packs the path and the caller's result
+ // buffer pointer into a request, writes it to processFdPub(), then waits on
+ // threadFdPub() for the result value, which is returned to the caller.
+ // Calls die() if the request cannot be sent or the reply cannot be read.
+ long Sandbox::sandbox_lstat64(const char *path, void *buf) {
+   long long startTime;
+   Debug::syscall(&startTime, __NR_lstat64, "Executing handler");
+
+   // Wire format: fixed header immediately followed by the path bytes. The
+   // path is sent without a terminating NUL; its length travels in stat_req.
+   struct Request {
+     int       sysnum;
+     long long cookie;
+     Stat      stat_req;
+     char      pathname[0];
+   } __attribute__((packed));
+
+   const size_t pathLen = strlen(path);
+   char packet[sizeof(struct Request) + pathLen];
+   struct Request *req = reinterpret_cast<struct Request*>(packet);
+   memcpy(req->pathname, path, pathLen);
+   req->stat_req.buf         = buf;
+   req->stat_req.path_length = pathLen;
+   req->stat_req.sysnum      = __NR_lstat64;
+   req->cookie               = cookie();
+   req->sysnum               = __NR_lstat64;
+
+   SysCalls sys;
+   long result;
+   const bool sent =
+       write(sys, processFdPub(), req, sizeof(packet)) == (int)sizeof(packet);
+   if (!sent ||
+       read(sys, threadFdPub(), &result, sizeof(result)) != sizeof(result)) {
+     die("Failed to forward lstat64() request [sandbox]");
+   }
+   Debug::elapsed(startTime, __NR_lstat64);
+   return result;
+ }
+#endif
+
+ // Trusted-process side of stat()/lstat()/stat64()/lstat64().
+ //
+ // Reads a Stat request followed by path_length path bytes from sandboxFd and
+ // then either rejects the call or forwards it through the secure memory
+ // page.
+ //
+ //   parentMapsFd, mem - passed to SecureMem::lockSystemCall() and
+ //                       SecureMem::sendSystemCall(); the path is placed in
+ //                       mem->pathname.
+ //   sandboxFd         - descriptor the request is read from.
+ //   threadFdPub       - passed to SecureMem::sendSystemCall().
+ //   threadFd          - descriptor on which a rejection is reported.
+ //
+ // Returns true if the call was handed to SecureMem::sendSystemCall(), false
+ // if it was rejected (an error code has then already been sent). Calls die()
+ // on short reads/writes or on an unexpected system call number.
+ bool Sandbox::process_stat(int parentMapsFd, int sandboxFd, int threadFdPub,
+                            int threadFd, SecureMem::Args* mem) {
+   // Read request
+   SysCalls sys;
+   Stat stat_req;
+   if (read(sys, sandboxFd, &stat_req, sizeof(stat_req)) != sizeof(stat_req)) {
+  read_parm_failed:
+     die("Failed to read parameters for stat() [process]");
+   }
+   int rc = -ENAMETOOLONG;
+   if (stat_req.path_length >= (int)sizeof(mem->pathname)) {
+     // The path (plus its NUL) does not fit into mem->pathname. Drain the
+     // path bytes from the socket so that the stream stays in sync, then
+     // report -ENAMETOOLONG.
+     char buf[32];
+     while (stat_req.path_length > 0) {
+       size_t len = stat_req.path_length > sizeof(buf) ?
+                    sizeof(buf) : stat_req.path_length;
+       ssize_t i = read(sys, sandboxFd, buf, len);
+       if (i <= 0) {
+         goto read_parm_failed;
+       }
+       stat_req.path_length -= i;
+     }
+     if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) {
+       die("Failed to return data from stat() [process]");
+     }
+     return false;
+   }
+   if (stat_req.sysnum != __NR_stat && stat_req.sysnum != __NR_lstat
+       #ifdef __NR_stat64
+       && stat_req.sysnum != __NR_stat64
+       #endif
+       #ifdef __NR_lstat64
+       && stat_req.sysnum != __NR_lstat64
+       #endif
+       ) {
+     die("Corrupted stat() request");
+   }
+
+   if (!g_policy.allow_file_namespace) {
+     // After locking the mutex, we can no longer abandon the system call. So,
+     // perform checks before clobbering the securely shared memory.
+     //
+     // The sender transmits exactly path_length bytes without a terminating
+     // NUL, so reserve one extra byte and terminate the buffer ourselves
+     // before handing it to std::string. The extra byte also avoids a
+     // zero-length array for an empty path.
+     char tmp[stat_req.path_length + 1];
+     if (read(sys, sandboxFd, tmp, stat_req.path_length) !=
+         (ssize_t)stat_req.path_length) {
+       goto read_parm_failed;
+     }
+     tmp[stat_req.path_length] = '\000';
+     Debug::message(("Denying access to \"" + std::string(tmp) + "\"").c_str());
+     SecureMem::abandonSystemCall(threadFd, -EACCES);
+     return false;
+   }
+
+   SecureMem::lockSystemCall(parentMapsFd, mem);
+   if (read(sys, sandboxFd, mem->pathname, stat_req.path_length) !=
+       (ssize_t)stat_req.path_length) {
+     goto read_parm_failed;
+   }
+   // path_length < sizeof(mem->pathname) was established above, so the
+   // terminator fits.
+   mem->pathname[stat_req.path_length] = '\000';
+
+   // TODO(markus): Implement sandboxing policy
+   Debug::message(("Allowing access to \"" + std::string(mem->pathname) +
+                   "\"").c_str());
+
+   // Tell trusted thread to stat the file. The path argument is expressed
+   // relative to mem->self rather than to our own mapping of the page.
+   SecureMem::sendSystemCall(threadFdPub, true, parentMapsFd, mem,
+                             stat_req.sysnum,
+                             mem->pathname - (char*)mem + (char*)mem->self,
+                             stat_req.buf);
+   return true;
+ }
+
+} // namespace
diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc
new file mode 100644
index 0000000..681fec9
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall.cc
@@ -0,0 +1,380 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+namespace playground {
+
+// TODO(markus): change this into a function that returns the address of the assembly code. If that isn't possible for sandbox_clone, then move that function into a *.S file
+ // File-scope assembly that defines two symbols in .text:
+ //   playground$sandbox_clone  - trampoline that passes the current stack
+ //                               pointer to playground$sandbox__clone as an
+ //                               additional argument.
+ //   playground$syscallWrapper - wrapper called by the untrusted code in place
+ //                               of a system call; dispatches through
+ //                               playground$syscallTable or falls back to
+ //                               playground$defaultSystemCallHandler.
+ // Separate implementations are provided for x86-64 and i386.
+ asm(
+ ".pushsection .text, \"ax\", @progbits\n"
+
+ // This is the special wrapper for the clone() system call. The code
+ // relies on the stack layout of the system call wrapper (c.f. below). It
+ // passes the stack pointer as an additional argument to sandbox__clone(),
+ // so that upon starting the child, register values can be restored and
+ // the child can start executing at the correct IP, instead of trying to
+ // run in the trusted thread.
+ "playground$sandbox_clone:"
+ ".globl playground$sandbox_clone\n"
+ ".type playground$sandbox_clone, @function\n"
+ #if defined(__x86_64__)
+ // Skip the 8 byte return address into the system call wrapper. The
+ // following bytes are the saved register values that we need to restore
+ // upon return from clone() in the new thread.
+ "lea 8(%rsp), %r9\n"
+ "jmp playground$sandbox__clone\n"
+ #elif defined(__i386__)
+ // As i386 passes function arguments on the stack, we need to skip a few
+ // more values before we can get to the saved registers.
+ "lea 28(%esp), %eax\n"
+ "mov %eax, 24(%esp)\n"
+ "jmp playground$sandbox__clone\n"
+ #else
+ #error Unsupported target platform
+ #endif
+ ".size playground$sandbox_clone, .-playground$sandbox_clone\n"
+
+
+ // This is the wrapper which is called by the untrusted code, trying to
+ // make a system call.
+ "playground$syscallWrapper:"
+ ".internal playground$syscallWrapper\n"
+ ".globl playground$syscallWrapper\n"
+ ".type playground$syscallWrapper, @function\n"
+ #if defined(__x86_64__)
+ // Check for rt_sigreturn(). It needs to be handled specially.
+ "cmp $15, %rax\n" // NR_rt_sigreturn
+ "jnz 1f\n"
+ "add $0x90, %rsp\n" // pop return addresses and red zone
+ "0:syscall\n" // rt_sigreturn() is unrestricted
+ "mov $66, %edi\n" // rt_sigreturn() should never return
+ "mov $231, %eax\n" // NR_exit_group
+ "jmp 0b\n"
+
+ // Save all registers
+ "1:push %rbp\n"
+ "mov %rsp, %rbp\n"
+ "push %rbx\n"
+ "push %rcx\n"
+ "push %rdx\n"
+ "push %rsi\n"
+ "push %rdi\n"
+ "push %r8\n"
+ "push %r9\n"
+ "push %r10\n"
+ "push %r11\n"
+ "push %r12\n"
+ "push %r13\n"
+ "push %r14\n"
+ "push %r15\n"
+
+ // Convert from syscall calling conventions to C calling conventions.
+ // System calls have a subtly different register ordering than the user-
+ // space x86-64 ABI.
+ "mov %r10, %rcx\n"
+
+ // Check range of system call
+ "cmp playground$maxSyscall(%rip), %eax\n"
+ "ja 3f\n"
+
+ // Retrieve function call from system call table (c.f. syscall_table.c).
+ // We have three different types of entries; zero for denied system calls,
+ // that should be handled by the defaultSystemCallHandler(); one
+ // (UNRESTRICTED_SYSCALL) for unrestricted system calls that need to be
+ // forwarded to the trusted thread; and function pointers to specific
+ // handler functions.
+ // Each table entry holds two pointers, so on x86-64 the index is scaled
+ // by 16 and the handler is the first field of the entry.
+ "mov %rax, %r10\n"
+ "shl $4, %r10\n"
+ "lea playground$syscallTable(%rip), %r11\n"
+ "add %r11, %r10\n"
+ "mov 0(%r10), %r10\n"
+
+ // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
+ // jump to fallback handler.
+ "cmp $1, %r10\n"
+ "jbe 3f\n"
+ "call *%r10\n"
+ "2:"
+
+ // Restore CPU registers, except for %rax which was set by the system call.
+ "pop %r15\n"
+ "pop %r14\n"
+ "pop %r13\n"
+ "pop %r12\n"
+ "pop %r11\n"
+ "pop %r10\n"
+ "pop %r9\n"
+ "pop %r8\n"
+ "pop %rdi\n"
+ "pop %rsi\n"
+ "pop %rdx\n"
+ "pop %rcx\n"
+ "pop %rbx\n"
+ "pop %rbp\n"
+
+ // Remove fake return address. This is added in the patching code in
+ // library.cc and it makes stack traces a little cleaner.
+ "add $8, %rsp\n"
+
+ // Return to caller
+ "ret\n"
+
+ "3:"
+ // If we end up calling a specific handler, we don't need to know the
+ // system call number. However, in the generic case, we do. Shift
+ // registers so that the system call number becomes visible as the
+ // first function argument.
+ "push %r9\n"
+ "mov %r8, %r9\n"
+ "mov %rcx, %r8\n"
+ "mov %rdx, %rcx\n"
+ "mov %rsi, %rdx\n"
+ "mov %rdi, %rsi\n"
+ "mov %rax, %rdi\n"
+
+ // Call default handler.
+ "call playground$defaultSystemCallHandler\n"
+ "pop %r9\n"
+ "jmp 2b\n"
+ #elif defined(__i386__)
+ "cmp $119, %eax\n" // NR_sigreturn
+ "jnz 1f\n"
+ "add $0x4, %esp\n" // pop return address
+ "0:int $0x80\n" // sigreturn() is unrestricted
+ "mov $66, %ebx\n" // sigreturn() should never return
+ // NOTE(review): %eax receives the value 66 here, whereas the trailing
+ // comment names NR_exit; confirm the intended system call number.
+ "mov %ebx, %eax\n" // NR_exit
+ "jmp 0b\n"
+ "1:cmp $173, %eax\n" // NR_rt_sigreturn
+ "jnz 3f\n"
+
+ // Convert rt_sigframe into sigframe, allowing us to call sigreturn().
+ // This is possible since the first part of signal stack frames have
+ // stayed very stable since the earliest kernel versions. While never
+ // officially documented, lots of user space applications rely on this
+ // part of the ABI, and kernel developers have been careful to maintain
+ // backwards compatibility.
+ // In general, the rt_sigframe includes a lot of extra information that
+ // the signal handler can look at. Most notably, this means a complete
+ // siginfo record.
+ // Fortunately though, the kernel doesn't look at any of this extra data
+ // when returning from a signal handler. So, we can safely convert an
+ // rt_sigframe to a legacy sigframe, discarding the extra data in the
+ // process. Interestingly, the legacy signal frame is actually larger than
+ // the rt signal frame, as it includes a lot more padding.
+ "sub $0x1C8, %esp\n" // a legacy signal stack is much larger
+ "mov 0x1CC(%esp), %eax\n" // push signal number
+ "push %eax\n"
+ "lea 0x270(%esp), %esi\n" // copy siginfo register values
+ "lea 0x4(%esp), %edi\n" // into new location
+ "mov $0x16, %ecx\n"
+ "cld\n"
+ "rep movsl\n"
+ "mov 0x2C8(%esp), %ebx\n" // copy first half of signal mask
+ "mov %ebx, 0x54(%esp)\n"
+ "lea 2f, %esi\n"
+ "push %esi\n" // push restorer function
+ "lea 0x2D4(%esp), %edi\n" // patch up retcode magic numbers
+ "movb $2, %cl\n"
+ "rep movsl\n"
+ "ret\n" // return to restorer function
+ "2:pop %eax\n" // remove dummy argument (signo)
+ "mov $119, %eax\n" // NR_sigreturn (119, as compared against above)
+ "int $0x80\n"
+
+
+ // Preserve all registers
+ "3:push %ebx\n"
+ "push %ecx\n"
+ "push %edx\n"
+ "push %esi\n"
+ "push %edi\n"
+ "push %ebp\n"
+
+ // Convert from syscall calling conventions to C calling conventions
+ "push %ebp\n"
+ "push %edi\n"
+ "push %esi\n"
+ "push %edx\n"
+ "push %ecx\n"
+ "push %ebx\n"
+ "push %eax\n"
+
+ // Check range of system call
+ "cmp playground$maxSyscall, %eax\n"
+ "ja 9f\n"
+
+ // We often have long sequences of calls to gettimeofday(). This is
+ // needlessly expensive. Coalesce them into a single call.
+ //
+ // We keep track of state in TLS storage that we can access through
+ // the %fs segment register. See trusted_thread.cc for the exact
+ // memory layout.
+ //
+ // TODO(markus): maybe, we should proactively call gettimeofday() and
+ // clock_gettime(), whenever we talk to the trusted thread?
+ // or maybe, if we have recently seen requests to compute
+ // the time. There might be a repeated pattern of those.
+ "cmp $78, %eax\n" // __NR_gettimeofday
+ "jnz 6f\n"
+ "cmp %eax, %fs:0x102C-0x58\n" // last system call
+ "jnz 4f\n"
+
+ // This system call and the last system call prior to this one both are
+ // calls to gettimeofday(). Try to avoid making the new call and just
+ // return the same result as in the previous call.
+ // Just in case the caller is spinning on the result from gettimeofday(),
+ // every so often, call the actual system call.
+ "decl %fs:0x1030-0x58\n" // countdown calls to gettimeofday()
+ "jz 4f\n"
+
+ // Atomically read the 64bit word representing last-known timestamp and
+ // return it to the caller. On x86-32 this is a little more complicated and
+ // requires the use of the cmpxchg8b instruction.
+ "mov %ebx, %eax\n"
+ "mov %ecx, %edx\n"
+ "lock; cmpxchg8b 100f\n"
+ "mov %eax, 0(%ebx)\n"
+ "mov %edx, 4(%ebx)\n"
+ "xor %eax, %eax\n"
+ "add $28, %esp\n"
+ "jmp 8f\n"
+
+ // This is a call to gettimeofday(), but we don't have a valid cached
+ // result, yet.
+ "4:mov %eax, %fs:0x102C-0x58\n" // remember syscall number
+ "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations
+ "call playground$defaultSystemCallHandler\n"
+
+ // Returned from gettimeofday(). Remember return value, in case the
+ // application calls us again right away.
+ // Again, this has to happen atomically and requires cmpxchg8b.
+ "mov 4(%ebx), %ecx\n"
+ "mov 0(%ebx), %ebx\n"
+ "mov 100f, %eax\n"
+ "mov 101f, %edx\n"
+ "5:lock; cmpxchg8b 100f\n"
+ "jnz 5b\n"
+ "xor %eax, %eax\n"
+ "jmp 10f\n"
+
+ // Remember the number of the last system call made. We deliberately do
+ // not remember calls to gettid(), as we have often seen long sequences
+ // of calls to just gettimeofday() and gettid(). In that situation, we
+ // would still like to coalesce the gettimeofday() calls.
+ "6:cmp $224, %eax\n" // __NR_gettid
+ "jz 7f\n"
+ "mov %eax, %fs:0x102C-0x58\n" // remember syscall number
+
+ // Retrieve function call from system call table (c.f. syscall_table.c).
+ // We have three different types of entries; zero for denied system calls,
+ // that should be handled by the defaultSystemCallHandler(); one
+ // (UNRESTRICTED_SYSCALL) for unrestricted system calls that need to be
+ // forwarded to the trusted thread; and function pointers to specific
+ // handler functions.
+ // Each table entry holds two pointers, so on i386 the index is scaled
+ // by 8 and the handler is the first field of the entry.
+ "7:shl $3, %eax\n"
+ "lea playground$syscallTable, %ebx\n"
+ "add %ebx, %eax\n"
+ "mov 0(%eax), %eax\n"
+
+ // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
+ // jump to fallback handler.
+ "cmp $1, %eax\n"
+ "jbe 9f\n"
+ "add $4, %esp\n"
+ "call *%eax\n"
+ "add $24, %esp\n"
+
+ // Restore CPU registers, except for %eax which was set by the system call.
+ "8:pop %ebp\n"
+ "pop %edi\n"
+ "pop %esi\n"
+ "pop %edx\n"
+ "pop %ecx\n"
+ "pop %ebx\n"
+
+ // Return to caller
+ "ret\n"
+
+ // Call default handler.
+ "9:call playground$defaultSystemCallHandler\n"
+ "10:add $28, %esp\n"
+ "jmp 8b\n"
+
+ // 8-byte aligned storage (labels 100 and 101) for the cached 64bit
+ // gettimeofday() result that is accessed with cmpxchg8b above.
+ ".pushsection \".bss\"\n"
+ ".balign 8\n"
+"100:.byte 0, 0, 0, 0\n"
+"101:.byte 0, 0, 0, 0\n"
+ ".popsection\n"
+
+ #else
+ #error Unsupported target platform
+ #endif
+ ".size playground$syscallWrapper, .-playground$syscallWrapper\n"
+ ".popsection\n"
+ );
+
+
+ // Fallback handler for system calls that have no dedicated handler function.
+ //
+ // read() and write() are executed directly. Calls whose table entry is
+ // UNRESTRICTED_SYSCALL are forwarded, together with their six arguments, over
+ // threadFdPub() and the reply is returned verbatim. Everything else yields
+ // -ENOSYS, unless debugging is enabled, in which case the call is forwarded
+ // as well.
+ void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,
+                                         void* arg2, void* arg3, void* arg4,
+                                         void* arg5) {
+   // TODO(markus): The following comment is currently not true, we do intercept these system calls. Try to fix that.
+
+   // We try to avoid intercepting read(), and write(), as these system calls
+   // are not restricted in Seccomp mode. But depending on the exact
+   // instruction sequence in libc, we might not be able to reliably
+   // filter out these system calls at the time when we instrument the code.
+   SysCalls sys;
+   long long tm;
+
+   if (syscallNum == __NR_read || syscallNum == __NR_write) {
+     Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
+     long result;
+     if (syscallNum == __NR_read) {
+       result = sys.read((long)arg0, arg1, (size_t)arg2);
+     } else {
+       result = sys.write((long)arg0, arg1, (size_t)arg2);
+     }
+     // Report failures as a negative errno value.
+     if (result < 0) {
+       result = -sys.my_errno;
+     }
+     Debug::elapsed(tm, syscallNum);
+     return (void *)result;
+   }
+
+   // In debug mode, prevent stderr from being closed
+   if (Debug::isEnabled() && syscallNum == __NR_close && arg0 == (void *)2) {
+     return 0;
+   }
+
+   const bool unrestricted =
+       (unsigned)syscallNum <= maxSyscall &&
+       syscallTable[syscallNum].handler == UNRESTRICTED_SYSCALL;
+   if (unrestricted) {
+     Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
+   } else if (Debug::isEnabled()) {
+     Debug::syscall(&tm, syscallNum,
+                    "In production mode, this call would be disallowed");
+   } else {
+     return (void *)-ENOSYS;
+   }
+
+   // Forward the system call number and its arguments, then wait for the
+   // result on the same descriptor.
+   struct {
+     int   sysnum;
+     void* unrestricted_req[6];
+   } __attribute__((packed)) request = {
+     syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } };
+
+   const int threadFd = threadFdPub();
+   void* reply;
+   if (write(sys, threadFd, &request, sizeof(request)) != sizeof(request) ||
+       read(sys, threadFd, &reply, sizeof(reply)) != sizeof(reply)) {
+     die("Failed to forward unrestricted system call");
+   }
+   Debug::elapsed(tm, syscallNum);
+   return reply;
+ }
+
+} // namespace
diff --git a/sandbox/linux/seccomp/syscall.h b/sandbox/linux/seccomp/syscall.h
new file mode 100644
index 0000000..1315e12
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SYSCALL_H__
+#define SYSCALL_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ // Declaration of the system call wrapper. The symbol is bound to the
+ // assembler name "playground$syscallWrapper"; on x86-64 it is additionally
+ // given internal visibility.
+ void syscallWrapper() asm("playground$syscallWrapper")
+ #if defined(__x86_64__)
+ __attribute__((visibility("internal")))
+ #endif
+ ;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // SYSCALL_H__
diff --git a/sandbox/linux/seccomp/syscall_table.c b/sandbox/linux/seccomp/syscall_table.c
new file mode 100644
index 0000000..c9dd7a4
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall_table.c
@@ -0,0 +1,153 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <asm/unistd.h>
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+#if defined(__x86_64__)
+#ifndef __NR_set_robust_list
+#define __NR_set_robust_list 273
+#endif
+#ifndef __NR_accept4
+#define __NR_accept4 288
+#endif
+#elif defined(__i386__)
+#ifndef __NR_set_robust_list
+#define __NR_set_robust_list 311
+#endif
+#else
+#error Unsupported target platform
+#endif
+
+// TODO(markus): This is an incredibly dirty hack to make the syscallTable
+// live in r/o memory.
+// Unfortunately, gcc doesn't give us a clean option to do
+// this. Ultimately, we should probably write some code that
+// parses /usr/include/asm/unistd*.h and generates a *.S file.
+// But we then need to figure out how to integrate this code
+// with our build system.
+
+ // Dispatch table indexed by system call number. Each entry is
+ // { handler, trustedProcess }:
+ //   handler        - UNRESTRICTED_SYSCALL, or the address of a sandbox_*()
+ //                    handler function.
+ //   trustedProcess - the matching process_*() function, or 0 if there is
+ //                    none.
+ // System call numbers that are not listed are zero-initialized by the
+ // designated-initializer syntax.
+ // NOTE(review): the section name below ends in "\n#"; presumably this
+ // comments out the section flags the compiler would otherwise append (see
+ // the TODO about forcing the table into read-only memory) - confirm.
+ const struct SyscallTable syscallTable[] __attribute__((
+ section(".rodata, \"a\", @progbits\n#"))) ={
+
+ #if defined(__NR_accept)
+ [ __NR_accept ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_accept4 ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_access ] = { (void*)&sandbox_access, process_access },
+ [ __NR_brk ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_clock_gettime ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_clone ] = { (void*)&sandbox_clone, process_clone },
+ [ __NR_close ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_dup ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_dup2 ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_epoll_create ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_epoll_ctl ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_epoll_wait ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_exit ] = { (void*)&sandbox_exit, process_exit },
+ [ __NR_exit_group ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_fcntl ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_fcntl64)
+ [ __NR_fcntl64 ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_fstat ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_fstat64)
+ [ __NR_fstat64 ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_futex ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_getdents ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_getdents64 ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_getpeername)
+ [ __NR_getpeername ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_getpid ] = { (void*)&sandbox_getpid, 0 },
+ #if defined(__NR_getsockname)
+ [ __NR_getsockname ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_getsockopt ] = { (void*)&sandbox_getsockopt,process_getsockopt },
+ #endif
+ [ __NR_gettid ] = { (void*)&sandbox_gettid, 0 },
+ [ __NR_gettimeofday ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_ioctl ] = { (void*)&sandbox_ioctl, process_ioctl },
+ #if defined(__NR_ipc)
+ [ __NR_ipc ] = { (void*)&sandbox_ipc, process_ipc },
+ #endif
+ #if defined(__NR__llseek)
+ [ __NR__llseek ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_lseek ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_lstat ] = { (void*)&sandbox_lstat, process_stat },
+ #if defined(__NR_lstat64)
+ [ __NR_lstat64 ] = { (void*)&sandbox_lstat64, process_stat },
+ #endif
+ [ __NR_madvise ] = { (void*)&sandbox_madvise, process_madvise },
+ // mmap2 takes precedence over mmap where it exists; both share one entry.
+ #if defined(__NR_mmap2)
+ [ __NR_mmap2 ] =
+ #else
+ [ __NR_mmap ] =
+ #endif
+ { (void*)&sandbox_mmap, process_mmap },
+ [ __NR_mprotect ] = { (void*)&sandbox_mprotect, process_mprotect },
+ [ __NR_munmap ] = { (void*)&sandbox_munmap, process_munmap },
+ [ __NR_open ] = { (void*)&sandbox_open, process_open },
+ [ __NR_pipe ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_poll ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_recvfrom)
+ [ __NR_recvfrom ] = { (void*)&sandbox_recvfrom, process_recvfrom },
+ [ __NR_recvmsg ] = { (void*)&sandbox_recvmsg, process_recvmsg },
+ #endif
+ #if defined(__NR_rt_sigaction)
+ [ __NR_rt_sigaction ] = { (void*)&sandbox_rt_sigaction,process_sigaction},
+ #endif
+ #if defined(__NR_rt_sigprocmask)
+ [ __NR_rt_sigprocmask ] = { (void*)&sandbox_rt_sigprocmask, 0 },
+ #endif
+ #if defined(__NR_sendmsg)
+ [ __NR_sendmsg ] = { (void*)&sandbox_sendmsg, process_sendmsg },
+ [ __NR_sendto ] = { (void*)&sandbox_sendto, process_sendto },
+ #endif
+ [ __NR_set_robust_list ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_setsockopt)
+ [ __NR_setsockopt ] = { (void*)&sandbox_setsockopt,process_setsockopt },
+ #endif
+ #if defined(__NR_shmat)
+ [ __NR_shmat ] = { (void*)&sandbox_shmat, process_shmat },
+ [ __NR_shmctl ] = { (void*)&sandbox_shmctl, process_shmctl },
+ [ __NR_shmdt ] = { (void*)&sandbox_shmdt, process_shmdt },
+ [ __NR_shmget ] = { (void*)&sandbox_shmget, process_shmget },
+ #endif
+ #if defined(__NR_shutdown)
+ [ __NR_shutdown ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ #if defined(__NR_sigaction)
+ [ __NR_sigaction ] = { (void*)&sandbox_sigaction,process_sigaction },
+ #endif
+ #if defined(__NR_signal)
+ [ __NR_signal ] = { (void*)&sandbox_signal, process_sigaction },
+ #endif
+ #if defined(__NR_sigprocmask)
+ [ __NR_sigprocmask ] = { (void*)&sandbox_sigprocmask, 0 },
+ #endif
+ #if defined(__NR_socketpair)
+ [ __NR_socketpair ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ #if defined(__NR_socketcall)
+ [ __NR_socketcall ] = { (void*)&sandbox_socketcall,process_socketcall },
+ #endif
+ [ __NR_stat ] = { (void*)&sandbox_stat, process_stat },
+ #if defined(__NR_stat64)
+ [ __NR_stat64 ] = { (void*)&sandbox_stat64, process_stat },
+ #endif
+ [ __NR_time ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_uname ] = { UNRESTRICTED_SYSCALL, 0 },
+ };
+ // Highest system call number that has a slot in syscallTable. Users compare
+ // inclusively against this value ("sysnum <= maxSyscall" in C++,
+ // "cmp maxSyscall, %eax; ja" in the assembly wrapper) before indexing the
+ // table, so it must be the last valid index (element count minus one) and
+ // not the element count; otherwise the entry one past the end of the table
+ // would be read.
+ const unsigned maxSyscall __attribute__((section(".rodata"))) =
+     sizeof(syscallTable)/sizeof(struct SyscallTable) - 1;
+
+ // One 4096-byte, 4096-byte-aligned block of ints placed in .rodata and
+ // exported under the assembler name "playground$syscall_mutex" (internal
+ // visibility on x86-64). Only the first word is non-zero; it is initialized
+ // to 0x80000000.
+ // NOTE(review): presumably this is the mutex page used to serialize locked
+ // system calls - confirm against its users.
+ const int syscall_mutex_[4096/sizeof(int)] asm("playground$syscall_mutex")
+ __attribute__((section(".rodata"),aligned(4096)
+ #if defined(__x86_64__)
+ ,visibility("internal")
+ #endif
+ )) = { 0x80000000 };
diff --git a/sandbox/linux/seccomp/syscall_table.h b/sandbox/linux/seccomp/syscall_table.h
new file mode 100644
index 0000000..5bd6791
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall_table.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SYSCALL_TABLE_H__
+#define SYSCALL_TABLE_H__
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+#include "securemem.h"
+extern "C" {
+namespace playground {
+#define SecureMemArgs SecureMem::Args
+#else
+#define SecureMemArgs void
+#define bool int
+#endif
+ #define UNRESTRICTED_SYSCALL ((void *)1)
+
+ // One entry of the system call dispatch table.
+ struct SyscallTable {
+ // Either 0, UNRESTRICTED_SYSCALL, or the address of a handler function.
+ void *handler;
+ // Function registered for this system call on the trusted-process side,
+ // or 0 if there is none. When compiled as C, SecureMemArgs is void and
+ // bool is int.
+ bool (*trustedProcess)(int parentMapsFd, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMemArgs* mem);
+ };
+ // The dispatch table itself, indexed by system call number. Bound to the
+ // assembler name "playground$syscallTable" so that hand-written assembly can
+ // refer to it; internal visibility on x86-64.
+ extern const struct SyscallTable syscallTable[]
+ asm("playground$syscallTable")
+ #if defined(__x86_64__)
+ __attribute__((visibility("internal")))
+ #endif
+ ;
+ // Bound against which system call numbers are range-checked before
+ // syscallTable is indexed. Bound to the assembler name
+ // "playground$maxSyscall"; internal visibility on x86-64.
+ extern const unsigned maxSyscall
+ asm("playground$maxSyscall")
+ #if defined(__x86_64__)
+ __attribute__((visibility("internal")))
+ #endif
+ ;
+#ifdef __cplusplus
+} // namespace
+}
+#endif
+
+#endif // SYSCALL_TABLE_H__
diff --git a/sandbox/linux/seccomp/tests/list_tests.py b/sandbox/linux/seccomp/tests/list_tests.py
new file mode 100644
index 0000000..011a52e
--- /dev/null
+++ b/sandbox/linux/seccomp/tests/list_tests.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2010 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import re
+import sys
+
+
+def get_tests(filename):
+ for line in open(filename):
+ match = re.match(r"TEST\((\w+)\)", line)
+ if match is not None:
+ yield match.group(1)
+
+
+def main(args):
+ for name in get_tests(args[0]):
+ print ' { "%s", %s },' % (name, name)
+
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
diff --git a/sandbox/linux/seccomp/tests/test_syscalls.cc b/sandbox/linux/seccomp/tests/test_syscalls.cc
new file mode 100644
index 0000000..3e6acd5
--- /dev/null
+++ b/sandbox/linux/seccomp/tests/test_syscalls.cc
@@ -0,0 +1,758 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <assert.h>
+#include <dirent.h>
+#include <pthread.h>
+#include <pty.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "sandbox_impl.h"
+
+#ifdef DEBUG
+#define MSG(fmt, ...) printf(fmt, ##__VA_ARGS__)
+#else
+#define MSG(fmt, ...) do { } while (0)
+#endif
+
+int g_intended_status_fd = -1;
+
+// Announces, ahead of time, the wait() status that this test subprocess
+// expects to end with.  |val| is a signal number when |is_signal| is true
+// and an exit code otherwise.
+void intend_exit_status(int val, bool is_signal) {
+  val = is_signal ? W_EXITCODE(0, val) : W_EXITCODE(val, 0);
+  if (g_intended_status_fd == -1) {
+    // This prints in cases where we run one test without forking
+    printf("Intending to exit with status %i...\n", val);
+    return;
+  }
+  // Hand the expected status to whoever holds the other end of the FD.
+  int sent = write(g_intended_status_fd, &val, sizeof(val));
+  assert(sent == sizeof(val));
+}
+
+
+// This is basically a marker to grep for.
+#define TEST(name) void name()
+
+TEST(test_dup) {
+  StartSeccompSandbox();
+  // dup() is a simple syscall marked as UNRESTRICTED_SYSCALL, so it is
+  // expected to succeed from inside the sandbox.
+  const int fd = dup(1);
+  assert(fd >= 0);
+  const int rc = close(fd);
+  assert(rc == 0);
+}
+
+TEST(test_segfault) {
+  StartSeccompSandbox();
+  // A genuine fault must still kill the process cleanly even though the
+  // sandbox has installed its own SIGSEGV handler.
+  intend_exit_status(SIGSEGV, true);
+  asm("hlt");
+}
+
+TEST(test_exit) {
+  StartSeccompSandbox();
+  // Leave with a distinctive exit code, after announcing it to the runner.
+  intend_exit_status(123, false);
+  _exit(123);
+}
+
+// Returns the number of entries found in /proc/self/fd.  The result is
+// three larger than the real number of open FDs because it also counts
+// ".", "..", and the FD for the /proc/self/fd directory itself.  That is
+// harmless: callers only compare two counts against each other.
+static int count_fds() {
+  DIR *dir = opendir("/proc/self/fd");
+  assert(dir != NULL);
+  int count = 0;
+  for (struct dirent *d = readdir(dir); d != NULL; d = readdir(dir)) {
+    count++;
+  }
+  int rc = closedir(dir);
+  assert(rc == 0);
+  return count;
+}
+
+// Thread body for test_thread: stores 123 through |x| (an int *) and
+// returns 456 so the parent can verify both the write and the join value.
+static void *thread_func(void *x) {
+  *(int *) x = 123;
+  MSG("In new thread\n");
+  return (void *) 456;
+}
+
+// Creates and joins a pthread inside the sandbox, and checks that doing so
+// neither loses the thread's results nor leaks file descriptors.
+TEST(test_thread) {
+  playground::g_policy.allow_file_namespace = true;  // To allow count_fds()
+  StartSeccompSandbox();
+  int fd_count1 = count_fds();
+  pthread_t tid;
+  int x = 999;
+  void *result;
+  // Check both pthread calls: a failed create would otherwise lead to
+  // joining an uninitialised thread handle.
+  int rc = pthread_create(&tid, NULL, thread_func, &x);
+  assert(rc == 0);
+  MSG("Waiting for thread\n");
+  rc = pthread_join(tid, &result);
+  assert(rc == 0);
+  assert(result == (void *) 456);
+  assert(x == 123);
+  // Check that the process has not leaked FDs.
+  int fd_count2 = count_fds();
+  assert(fd_count2 == fd_count1);
+}
+
+// Entry point of the thread created by test_clone: stores 124 through |x|
+// (an int *) and then terminates only this thread.
+static int clone_func(void *x) {
+  *(int *) x = 124;
+  MSG("In thread\n");
+  // Exit explicitly: on x86-64, returning from this function calls the
+  // __NR_exit_group syscall instead of __NR_exit.
+  syscall(__NR_exit, 100);
+  // Not reached.
+  return 200;
+}
+
+#if defined(__i386__)
+// Returns the current contents of the %gs segment register.
+static int get_gs() {
+  int selector;
+  asm volatile("mov %%gs, %0" : "=r"(selector));
+  return selector;
+}
+#endif
+
+// Returns the pointer stored at offset 0 of the thread's TLS segment
+// (%fs on x86-64, %gs on i386).
+static void *get_tls_base() {
+  void *tls_ptr;
+#if defined(__x86_64__)
+  asm volatile("mov %%fs:0, %0" : "=r"(tls_ptr));
+#elif defined(__i386__)
+  asm volatile("mov %%gs:0, %0" : "=r"(tls_ptr));
+#else
+#error Unsupported target platform
+#endif
+  return tls_ptr;
+}
+
+// Creates a thread with a raw clone() call inside the sandbox, waits for it
+// to exit, and checks its side effect and that no FDs were leaked.
+TEST(test_clone) {
+  playground::g_policy.allow_file_namespace = true;  // To allow count_fds()
+  StartSeccompSandbox();
+  int fd_count1 = count_fds();
+  int stack_size = 0x1000;
+  char *stack = (char *) malloc(stack_size);
+  assert(stack != NULL);
+  int flags = CLONE_VM | CLONE_FS | CLONE_FILES |
+              CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+              CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID;
+  int tid = -1;
+  int x = 999;
+
+  // The sandbox requires us to pass CLONE_SETTLS.  Pass settings that
+  // are enough to copy the parent thread's TLS setup.  This allows us
+  // to invoke libc in the child thread.
+#if defined(__x86_64__)
+  void *tls = get_tls_base();
+#elif defined(__i386__)
+  struct user_desc tls_desc, *tls = &tls_desc;
+  tls_desc.entry_number = get_gs() >> 3;
+  tls_desc.base_addr = (long) get_tls_base();
+  tls_desc.limit = 0xfffff;
+  tls_desc.seg_32bit = 1;
+  tls_desc.contents = 0;
+  tls_desc.read_exec_only = 0;
+  tls_desc.limit_in_pages = 1;
+  tls_desc.seg_not_present = 0;
+  tls_desc.useable = 1;
+#else
+#error Unsupported target platform
+#endif
+
+  int rc = clone(clone_func, (void *) (stack + stack_size), flags, &x,
+                 &tid, tls, &tid);
+  assert(rc > 0);
+  // CLONE_PARENT_SETTID stored the child's TID in |tid|, and
+  // CLONE_CHILD_CLEARTID zeroes it and wakes the futex when the child exits.
+  while (tid == rc) {
+    syscall(__NR_futex, &tid, FUTEX_WAIT, rc, NULL);
+  }
+  assert(tid == 0);
+  assert(x == 124);
+  // The child has exited, so its stack is no longer in use.
+  free(stack);
+  // Check that the process has not leaked FDs.
+  int fd_count2 = count_fds();
+  assert(fd_count2 == fd_count1);
+}
+
+// Thread entry point for a clone() call that is expected to be refused;
+// reaching it means the sandbox let the call through.
+static int uncalled_clone_func(void *x) {
+  printf("In thread func, which shouldn't happen\n");
+  return 1;
+}
+
+TEST(test_clone_disallowed_flags) {
+  StartSeccompSandbox();
+  const int stack_size = 4096;
+  char *stack = (char *) malloc(stack_size);
+  assert(stack != NULL);
+  // Leave out CLONE_SETTLS, CLONE_PARENT_SETTID and CLONE_CHILD_CLEARTID;
+  // the sandbox does not allow clone() without them.
+  const int flags = CLONE_VM | CLONE_FS | CLONE_FILES |
+                    CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM;
+  void *stack_top = (void *) (stack + stack_size);
+  int rc = clone(uncalled_clone_func, stack_top, flags, NULL, NULL, NULL,
+                 NULL);
+  assert(rc == -1);
+  assert(errno == EPERM);
+}
+
+// Thread body for test_fp_regs: stores the low 32 bits of %xmm0 into an
+// int and logs it.  |x| is unused.
+static void *fp_thread(void *x) {
+  int val;
+  asm("movss %%xmm0, %0" : "=m"(val));
+  MSG("val=%i\n", val);
+  return NULL;
+}
+
+// Loads a value into %xmm0 and then creates a thread that reads %xmm0,
+// checking that SSE register use around thread creation works in the
+// sandbox.
+TEST(test_fp_regs) {
+  StartSeccompSandbox();
+  int val = 1234;
+  // |val| is read by the instruction, so it must be an input operand, and
+  // %xmm0 must be listed as clobbered.  "volatile" keeps the compiler from
+  // dropping the asm, which has no outputs.
+  asm volatile("movss %0, %%xmm0" : : "m"(val) : "xmm0");
+  pthread_t tid;
+  pthread_create(&tid, NULL, fp_thread, NULL);
+  pthread_join(tid, NULL);
+  MSG("thread done OK\n");
+}
+
+// Executes "rdtsc" and returns the 64-bit counter value (%edx:%eax).
+static long long read_tsc() {
+  // Both registers start out as all-ones, as before.  They are declared
+  // read-write ("+") because rdtsc overwrites them; modifying input-only
+  // operands is not permitted in extended asm.
+  unsigned int lo = -1;
+  unsigned int hi = -1;
+  asm volatile("rdtsc" : "+a"(lo), "+d"(hi));
+  return (long long) (((unsigned long long) hi << 32) | lo);
+}
+
+TEST(test_rdtsc) {
+  StartSeccompSandbox();
+  // The value is irrelevant; this only checks that the instruction can be
+  // executed inside the sandbox.
+  (void) read_tsc();
+}
+
+TEST(test_getpid) {
+  // Record the PID as seen before the sandbox is switched on.
+  const int pid1 = getpid();
+  StartSeccompSandbox();
+  // The libc wrapper must still report the same PID...
+  const int pid2 = getpid();
+  assert(pid1 == pid2);
+  // ...and so must the raw system call, which bypasses any caching that
+  // glibc's getpid() wrapper might do.
+  const int pid3 = syscall(__NR_getpid);
+  assert(pid1 == pid3);
+}
+
+TEST(test_gettid) {
+  // The syscall is issued directly because glibc doesn't provide a
+  // gettid() wrapper.
+  const int tid1 = syscall(__NR_gettid);
+  assert(tid1 > 0);
+  StartSeccompSandbox();
+  // The thread ID must be unchanged once the sandbox is active.
+  const int tid2 = syscall(__NR_gettid);
+  assert(tid1 == tid2);
+}
+
+// Maps 0x1000 bytes of private, anonymous, read-only memory and returns
+// its address; asserts that the mapping succeeded.
+static void *map_something() {
+  const int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+  void *addr = mmap(NULL, 0x1000, PROT_READ, map_flags, -1, 0);
+  assert(addr != MAP_FAILED);
+  return addr;
+}
+
+TEST(test_mmap_disallows_remapping) {
+  // Create the mapping while still outside the sandbox.
+  void *addr = map_something();
+  StartSeccompSandbox();
+  // Replacing a mapping that existed before the sandbox was enabled must be
+  // refused.
+  const int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED;
+  void *result = mmap(addr, 0x1000, PROT_READ, map_flags, -1, 0);
+  assert(result == MAP_FAILED);
+  assert(errno == EINVAL);
+}
+
+TEST(test_mmap_disallows_low_address) {
+  StartSeccompSandbox();
+  // A fixed mapping at address zero must be refused: mapping pages at low
+  // addresses helps with exploiting buggy kernels.
+  const int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED;
+  void *result = mmap(NULL, 0x1000, PROT_READ, map_flags, -1, 0);
+  assert(result == MAP_FAILED);
+  assert(errno == EINVAL);
+}
+
+TEST(test_munmap_allowed) {
+  StartSeccompSandbox();
+  // A mapping created inside the sandbox may be removed again.
+  void *addr = map_something();
+  const int result = munmap(addr, 0x1000);
+  assert(result == 0);
+}
+
+TEST(test_munmap_disallowed) {
+  // The mapping is created before the sandbox starts...
+  void *addr = map_something();
+  StartSeccompSandbox();
+  // ...so removing it from inside the sandbox must fail.
+  const int result = munmap(addr, 0x1000);
+  assert(result == -1);
+  assert(errno == EINVAL);
+}
+
+TEST(test_mprotect_allowed) {
+  StartSeccompSandbox();
+  // Protection may be changed on a mapping created inside the sandbox.
+  void *addr = map_something();
+  const int result = mprotect(addr, 0x1000, PROT_READ | PROT_WRITE);
+  assert(result == 0);
+}
+
+TEST(test_mprotect_disallowed) {
+  // The mapping is created before the sandbox starts...
+  void *addr = map_something();
+  StartSeccompSandbox();
+  // ...so changing its protection from inside the sandbox must fail.
+  const int result = mprotect(addr, 0x1000, PROT_READ | PROT_WRITE);
+  assert(result == -1);
+  assert(errno == EINVAL);
+}
+
+// Opens a new pseudo-terminal pair and returns the FD of its tty (slave)
+// side.  The master FD is not closed here.
+static int get_tty_fd() {
+  int master_fd;
+  int tty_fd;
+  int rc = openpty(&master_fd, &tty_fd, NULL, NULL, NULL);
+  assert(rc == 0);
+  return tty_fd;
+}
+
+TEST(test_ioctl_tiocgwinsz_allowed) {
+  const int tty_fd = get_tty_fd();
+  StartSeccompSandbox();
+  // Asking for the terminal width and height is permitted.
+  int size[2];
+  const int result = ioctl(tty_fd, TIOCGWINSZ, size);
+  assert(result == 0);
+}
+
+TEST(test_ioctl_disallowed) {
+  const int tty_fd = get_tty_fd();
+  StartSeccompSandbox();
+  // TIOCSTI inserts a character into the tty's input queue, which provides
+  // a way to send commands to an interactive shell, so it must be refused.
+  char c = 'x';
+  const int result = ioctl(tty_fd, TIOCSTI, &c);
+  assert(result == -1);
+  assert(errno == EINVAL);
+}
+
+TEST(test_socket) {
+  StartSeccompSandbox();
+  // Creating a socket is not possible inside the sandbox.
+  const int fd = socket(AF_UNIX, SOCK_STREAM, 0);
+  assert(fd == -1);
+  // TODO: Make it consistent between i386 and x86-64.
+  assert(errno == EINVAL || errno == ENOSYS);
+}
+
+TEST(test_open_disabled) {
+  StartSeccompSandbox();
+  // Without the file namespace policy, open() is denied.
+  int fd = open("/dev/null", O_RDONLY);
+  assert(fd == -1);
+  assert(errno == EACCES);
+
+  // Setting the policy flag after the sandbox has started has no effect:
+  // open() is still denied.
+  playground::g_policy.allow_file_namespace = true;
+  fd = open("/dev/null", O_RDONLY);
+  assert(fd == -1);
+  assert(errno == EACCES);
+}
+
+TEST(test_open_enabled) {
+  playground::g_policy.allow_file_namespace = true;
+  StartSeccompSandbox();
+  // With the policy set up front, opening for reading works...
+  int fd = open("/dev/null", O_RDONLY);
+  assert(fd >= 0);
+  int rc = close(fd);
+  assert(rc == 0);
+  // ...but opening for writing is still denied.
+  fd = open("/dev/null", O_WRONLY);
+  assert(fd == -1);
+  assert(errno == EACCES);
+}
+
+TEST(test_access_disabled) {
+  StartSeccompSandbox();
+  // Without the file namespace policy, access() is denied.
+  const int rc = access("/dev/null", R_OK);
+  assert(rc == -1);
+  assert(errno == EACCES);
+}
+
+TEST(test_access_enabled) {
+  playground::g_policy.allow_file_namespace = true;
+  StartSeccompSandbox();
+  // An existing path is reported as readable.
+  int rc = access("/dev/null", R_OK);
+  assert(rc == 0);
+  // A missing path yields the ordinary ENOENT error.
+  rc = access("path-that-does-not-exist", R_OK);
+  assert(rc == -1);
+  assert(errno == ENOENT);
+}
+
+TEST(test_stat_disabled) {
+  StartSeccompSandbox();
+  // Without the file namespace policy, stat() is denied.
+  struct stat st;
+  const int rc = stat("/dev/null", &st);
+  assert(rc == -1);
+  assert(errno == EACCES);
+}
+
+TEST(test_stat_enabled) {
+  playground::g_policy.allow_file_namespace = true;
+  StartSeccompSandbox();
+  struct stat st;
+  // An existing path can be examined.
+  int rc = stat("/dev/null", &st);
+  assert(rc == 0);
+  // A missing path yields the ordinary ENOENT error.
+  rc = stat("path-that-does-not-exist", &st);
+  assert(rc == -1);
+  assert(errno == ENOENT);
+}
+
+static int g_value;
+
+// signal()-style handler: sets g_value to 300 so the test can tell that
+// the handler ran.
+static void signal_handler(int sig) {
+  g_value = 300;
+  MSG("In signal handler\n");
+}
+
+// SA_SIGINFO-style handler: sets g_value to 300 so the test can tell that
+// the handler ran.  The siginfo and context arguments are unused.
+static void sigaction_handler(int sig, siginfo_t *a, void *b) {
+  g_value = 300;
+  MSG("In sigaction handler\n");
+}
+
+static void (*g_sig_handler_ptr)(int sig, void *addr) asm("g_sig_handler_ptr");
+
+// Handler reached through g_sig_handler_ptr: sets g_value to 300 and logs
+// the signal number and address it was given.
+static void non_fatal_sig_handler(int sig, void *addr) {
+  g_value = 300;
+  MSG("Caught signal %d at %p\n", sig, addr);
+}
+
+// Handler reached through g_sig_handler_ptr that, on its first entry,
+// raises the same kind of fault again from inside the handler.
+static void fatal_sig_handler(int sig, void *addr) {
+  // Only the first entry re-faults.  Being entered again means the handler
+  // was probably installed with SA_NODEFER, and execution should continue.
+  if (g_value++) {
+    return;
+  }
+  MSG("Caught signal %d at %p\n", sig, addr);
+  // Faulting while already in the handler should terminate the program if
+  // the signal is marked as a deferred signal.
+  if (sig != SIGSEGV) {
+    asm volatile("int3");
+  } else {
+    asm volatile("hlt");
+  }
+}
+/* Returns the address of a small assembly trampoline for use as a signal
+ * handler.  The trampoline reads the instruction pointer that was
+ * interrupted, steps the saved instruction pointer past a faulting "hlt"
+ * (opcode 0xF4) so execution can resume after it, and then transfers
+ * control to the function stored in g_sig_handler_ptr, passing that
+ * original instruction pointer.  The stack offsets below are hard-coded
+ * per architecture. */
+static void (*generic_signal_handler(void))
+ (int signo, siginfo_t *info, void *context) {
+ void (*hdl)(int, siginfo_t *, void *);
+ asm volatile(
+ "lea 0f, %0\n"  // hdl = address of the trampoline at label 0
+ "jmp 999f\n"  // Skip over the trampoline body; it only runs as a handler
+ "0:\n"
+
+#if defined(__x86_64__)
+ "mov 0xB0(%%rsp), %%rsi\n" // Pass original %rip to signal handler
+ "cmpb $0xF4, 0(%%rsi)\n" // hlt
+ "jnz 1f\n"
+ "addq $1, 0xB0(%%rsp)\n" // Adjust %eip past failing instruction
+ "1:jmp *g_sig_handler_ptr\n" // Call actual signal handler
+#elif defined(__i386__)
+ // TODO(markus): We currently don't guarantee that signal handlers always
+ // have the correct "magic" restorer function. If we fix
+ // this, we should add a test for it (both for SEGV and
+ // non-SEGV).
+ "cmpw $0, 0xA(%%esp)\n"
+ "lea 0x40(%%esp), %%eax\n" // %eip at time of exception
+ "jz 1f\n"
+ "add $0x9C, %%eax\n" // %eip at time of exception
+ "1:mov 0(%%eax), %%ecx\n"
+ "cmpb $0xF4, 0(%%ecx)\n" // hlt
+ "jnz 2f\n"
+ "addl $1, 0(%%eax)\n" // Adjust %eip past failing instruction
+ "2:push %%ecx\n" // Pass original %eip to signal handler
+ "mov 8(%%esp), %%eax\n"
+ "push %%eax\n" // Pass signal number to signal handler
+ "call *g_sig_handler_ptr\n" // Call actual signal handler
+ "pop %%eax\n"
+ "pop %%ecx\n"
+ "ret\n"
+#else
+#error Unsupported target platform
+#endif
+
+"999:\n"
+ : "=r"(hdl));
+ return hdl;
+}
+
+TEST(test_signal_handler) {
+  // Installing a handler must work before the sandbox is enabled...
+  sighandler_t result = signal(SIGTRAP, signal_handler);
+  assert(result != SIG_ERR);
+
+  StartSeccompSandbox();
+
+  // ...and again from inside it.
+  result = signal(SIGTRAP, signal_handler);
+  assert(result != SIG_ERR);
+
+  // Raise SIGTRAP; the handler changes g_value from 200 to 300.
+  g_value = 200;
+  asm("int3");
+  assert(g_value == 300);
+}
+
+TEST(test_sigaction_handler) {
+  struct sigaction act;
+  act.sa_sigaction = sigaction_handler;
+  sigemptyset(&act.sa_mask);
+  act.sa_flags = SA_SIGINFO;
+
+  // Installing the handler must work before the sandbox is enabled...
+  int rc = sigaction(SIGTRAP, &act, NULL);
+  assert(rc == 0);
+
+  StartSeccompSandbox();
+
+  // ...and again from inside it.
+  rc = sigaction(SIGTRAP, &act, NULL);
+  assert(rc == 0);
+
+  // Raise SIGTRAP; the handler changes g_value from 200 to 300.
+  g_value = 200;
+  asm("int3");
+  assert(g_value == 300);
+}
+
+TEST(test_blocked_signal) {
+  sighandler_t result = signal(SIGTRAP, signal_handler);
+  assert(result != SIG_ERR);
+  StartSeccompSandbox();
+
+  // Start from a full set so that a stale value cannot make the first
+  // check pass: SIGTRAP should not be blocked initially.
+  sigset_t sigs;
+  sigfillset(&sigs);
+  int rc = sigprocmask(0, NULL, &sigs);
+  assert(rc == 0);
+  assert(!sigismember(&sigs, SIGTRAP));
+
+  // Block SIGTRAP.
+  sigemptyset(&sigs);
+  sigaddset(&sigs, SIGTRAP);
+  rc = sigprocmask(SIG_BLOCK, &sigs, NULL);
+  assert(rc == 0);
+
+  // Reading the mask back must now show SIGTRAP as blocked.
+  sigemptyset(&sigs);
+  rc = sigprocmask(0, NULL, &sigs);
+  assert(rc == 0);
+  assert(sigismember(&sigs, SIGTRAP));
+
+  // With the signal blocked the handler cannot run, so the trap is
+  // expected to kill the process.
+  intend_exit_status(SIGTRAP, true);
+  asm("int3");
+}
+
+TEST(test_sigaltstack) {
+  // sigaltstack() is not supported by the sandbox yet, so all that can be
+  // tested is that the call reports an error.
+  StartSeccompSandbox();
+  stack_t st;
+  st.ss_flags = 0;
+  st.ss_size = 0x4000;
+  st.ss_sp = malloc(st.ss_size);
+  assert(st.ss_sp != NULL);
+  int rc = sigaltstack(&st, NULL);
+  assert(rc == -1);
+  assert(errno == ENOSYS);
+}
+
+TEST(test_sa_flags) {
+  StartSeccompSandbox();
+  // Every combination of SA_NODEFER and SA_SIGINFO is exercised.
+  const int flags[4] = { 0, SA_NODEFER, SA_SIGINFO, SA_SIGINFO | SA_NODEFER };
+  for (int i = 0; i < 4; ++i) {
+    struct sigaction sa;
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_sigaction = generic_signal_handler();
+    g_sig_handler_ptr = non_fatal_sig_handler;
+    sa.sa_flags = flags[i];
+
+    // SEGV: the handler must run (200 -> 300) and execution must resume.
+    g_value = 200;
+    sigaction(SIGSEGV, &sa, NULL);
+    asm volatile("hlt");
+    assert(g_value == 300);
+
+    // Same check for a signal other than SEGV.
+    g_value = 200;
+    sigaction(SIGTRAP, &sa, NULL);
+    asm volatile("int3");
+    assert(g_value == 300);
+  }
+}
+
+TEST(test_segv_defer) {
+  StartSeccompSandbox();
+  struct sigaction sa;
+  memset(&sa, 0, sizeof(sa));
+  g_sig_handler_ptr = fatal_sig_handler;
+  sa.sa_sigaction = generic_signal_handler();
+
+  // With SA_NODEFER the nested SEGV raised inside the handler is delivered,
+  // so execution should continue.
+  sa.sa_flags = SA_NODEFER;
+  sigaction(SIGSEGV, &sa, NULL);
+  g_value = 0;
+  asm volatile("hlt");
+
+  // Without SA_NODEFER the nested SEGV is deferred, which should terminate
+  // the program.
+  sa.sa_flags = 0;
+  sigaction(SIGSEGV, &sa, NULL);
+  g_value = 0;
+  intend_exit_status(SIGSEGV, true);
+  asm volatile("hlt");
+}
+
+TEST(test_trap_defer) {
+  StartSeccompSandbox();
+  struct sigaction sa;
+  memset(&sa, 0, sizeof(sa));
+  g_sig_handler_ptr = fatal_sig_handler;
+  sa.sa_sigaction = generic_signal_handler();
+
+  // With SA_NODEFER the nested TRAP raised inside the handler is delivered,
+  // so execution should continue.
+  sa.sa_flags = SA_NODEFER;
+  sigaction(SIGTRAP, &sa, NULL);
+  g_value = 0;
+  asm volatile("int3");
+
+  // Without SA_NODEFER the nested TRAP is deferred, which should terminate
+  // the program.
+  sa.sa_flags = 0;
+  sigaction(SIGTRAP, &sa, NULL);
+  g_value = 0;
+  intend_exit_status(SIGTRAP, true);
+  asm volatile("int3");
+}
+
+TEST(test_segv_resethand) {
+  StartSeccompSandbox();
+  struct sigaction sa;
+  memset(&sa, 0, sizeof(sa));
+  sa.sa_flags = SA_RESETHAND;
+  g_sig_handler_ptr = non_fatal_sig_handler;
+  sa.sa_sigaction = generic_signal_handler();
+  sigaction(SIGSEGV, &sa, NULL);
+
+  // First fault: the handler is invoked and execution should continue.
+  asm volatile("hlt");
+
+  // Second fault: SA_RESETHAND was requested, so this one should terminate
+  // the program.
+  intend_exit_status(SIGSEGV, true);
+  asm volatile("hlt");
+}
+
+TEST(test_trap_resethand) {
+  StartSeccompSandbox();
+  struct sigaction sa;
+  memset(&sa, 0, sizeof(sa));
+  sa.sa_flags = SA_RESETHAND;
+  g_sig_handler_ptr = non_fatal_sig_handler;
+  sa.sa_sigaction = generic_signal_handler();
+  sigaction(SIGTRAP, &sa, NULL);
+
+  // First trap: the handler is invoked and execution should continue.
+  asm volatile("int3");
+
+  // Second trap: SA_RESETHAND was requested, so this one should terminate
+  // the program.
+  intend_exit_status(SIGTRAP, true);
+  asm volatile("int3");
+}
+
+struct testcase {  // One runnable test: a name and an entry point.
+ const char *test_name;  // Printed by the runner and matched in run_test_by_name().
+ void (*test_func)();  // Function that executes the test.
+};
+
+struct testcase all_tests[] = {  // Table of all tests, ended by a NULL entry.
+#include "test-list.h"  // NOTE(review): presumably generated by tests/list_tests.py -- confirm.
+ { NULL, NULL },  // Sentinel: iteration stops at test_name == NULL.
+};
+
+// Runs |test| in a forked child and compares the wait() status the child
+// actually ended with against the status it announced through
+// intend_exit_status().  Returns 0 if they match and 1 otherwise.
+static int run_test_forked(struct testcase *test) {
+  printf("** %s\n", test->test_name);
+  int pipe_fds[2];
+  int rc = pipe(pipe_fds);
+  assert(rc == 0);
+  int pid = fork();
+  // Without a child there is nothing to wait for.
+  assert(pid >= 0);
+  if (pid == 0) {
+    rc = close(pipe_fds[0]);
+    assert(rc == 0);
+    g_intended_status_fd = pipe_fds[1];
+
+    test->test_func();
+    // The test returned normally, so a clean exit is what is intended.
+    intend_exit_status(0, false);
+    _exit(0);
+  }
+  rc = close(pipe_fds[1]);
+  assert(rc == 0);
+
+  int intended_status;
+  int got = read(pipe_fds[0], &intended_status, sizeof(intended_status));
+  bool got_intended_status = got == sizeof(intended_status);
+  if (!got_intended_status) {
+    printf("Test runner: Did not receive intended status\n");
+  }
+  // Close the read end as well, so the runner does not accumulate one open
+  // FD per test.
+  rc = close(pipe_fds[0]);
+  assert(rc == 0);
+
+  int status;
+  int pid2 = waitpid(pid, &status, 0);
+  assert(pid2 == pid);
+  if (!got_intended_status) {
+    printf("Test returned exit status %i\n", status);
+    return 1;
+  }
+  // Whether a core file was written is irrelevant to the comparison.
+  if ((status & ~WCOREFLAG) != intended_status) {
+    printf("Test failed with exit status %i, expected %i\n",
+           status, intended_status);
+    return 1;
+  }
+  return 0;
+}
+
+// Runs the test called |name| in the current process (no fork).  Returns 0
+// once the test function has returned, or 1 if no such test exists.
+static int run_test_by_name(const char *name) {
+  struct testcase *test = all_tests;
+  // Advance to the matching entry, or to the NULL sentinel.
+  while (test->test_name != NULL && strcmp(name, test->test_name) != 0) {
+    test++;
+  }
+  if (test->test_name == NULL) {
+    fprintf(stderr, "Test '%s' not found\n", name);
+    return 1;
+  }
+  printf("Running test %s...\n", name);
+  test->test_func();
+  printf("OK\n");
+  return 0;
+}
+
+// With no arguments, runs every test in its own forked child; with one
+// argument, runs just that test in-process.
+int main(int argc, char **argv) {
+  setvbuf(stdout, NULL, _IONBF, 0);
+  setvbuf(stderr, NULL, _IONBF, 0);
+
+  if (argc > 2) {
+    // TODO: run multiple tests.
+    fprintf(stderr, "Too many arguments\n");
+    return 1;
+  }
+  if (argc == 2) {
+    // Run one test without forking, to aid debugging.
+    return run_test_by_name(argv[1]);
+  }
+
+  // Run all tests.
+  int failures = 0;
+  for (struct testcase *test = all_tests; test->test_name != NULL; test++) {
+    failures += run_test_forked(test);
+  }
+  if (failures != 0) {
+    printf("%i FAILURE(S)\n", failures);
+    return 1;
+  }
+  printf("OK\n");
+  return 0;
+}
diff --git a/sandbox/linux/seccomp/timestats.cc b/sandbox/linux/seccomp/timestats.cc
new file mode 100644
index 0000000..5d9b66a
--- /dev/null
+++ b/sandbox/linux/seccomp/timestats.cc
@@ -0,0 +1,191 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Helper program to analyze the time that Chrome's renderers spend in system
+// calls. Start Chrome like this:
+//
+// SECCOMP_SANDBOX_DEBUGGING=1 chrome --enable-seccomp-sandbox 2>&1 | timestats
+//
+// The program prints CPU time (0-100%) spent within system calls. This gives
+// a general idea of where it is worthwhile to spend effort optimizing Chrome.
+//
+// Caveats:
+// - there currently is no way to estimate what the overhead is for running
+// inside of the sandbox vs. running without a sandbox.
+// - we currently use a very simple heuristic to decide whether a system call
+// is blocking or not. Blocking system calls should not be included in the
+// computations. But it is quite possible for the numbers to be somewhat
+// wrong, because the heuristic failed.
+// - in order to collect this data, we have to turn on sandbox debugging.
+// There is a measurable performance penalty to doing so. Production numbers
+// are strictly better than the numbers reported by this tool.
+#include <set>
+#include <vector>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+
+static const int kAvgWindowSizeMs = 500;
+static const int kPeakWindowSizeMs = 2*1000;
+
+// Class containing information on a single system call. Most notably, it
+// contains the time when the system call happened, and the time that it
+// took to complete.
+class Datum {
+  friend class Data;
+ public:
+  // Records a system call |name| that took |ms| milliseconds, stamped with
+  // the current wall-clock time in milliseconds.
+  // NOTE(review): only the pointer |name| is stored, not a copy of the
+  // string, so it is only meaningful while the caller's buffer is intact.
+  Datum(const char* name, double ms)
+      : name_(name),
+        ms_(ms) {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    timestamp_ = tv.tv_sec*1000.0 + tv.tv_usec/1000.0;
+  }
+  virtual ~Datum() { }
+
+  // Returns the time, in milliseconds, from |b|'s timestamp to this one.
+  // Const-qualified so it can also be applied to const objects.
+  double operator-(const Datum& b) const {
+    return timestamp_ - b.timestamp_;
+  }
+
+ protected:
+  const char* name_;
+  double ms_;
+  double timestamp_;
+};
+
+// Class containing data on the most recent system calls. It maintains
+// sliding averages for total CPU time used, and it also maintains a peak
+// CPU usage. The peak usage is usually updated slower than the average
+// usage, as that makes it easier to inspect visually.
+class Data {
+ public:
+  Data() { }
+  virtual ~Data() { }
+
+  // Records a system call |name| that took |ms| milliseconds, then redraws
+  // the one-line usage display on stdout.
+  void addData(const char* name, double ms) {
+    average_.push_back(Datum(name, ms));
+    peak_.push_back(Datum(name, ms));
+
+    // Prune entries outside of the window
+    prune(&average_, kAvgWindowSizeMs);
+    prune(&peak_, kPeakWindowSizeMs);
+
+    // Add the total usage of all system calls inside of the window
+    std::vector<Datum>::iterator iter;
+    double total = 0;
+    for (iter = average_.begin(); iter != average_.end(); ++iter) {
+      total += iter->ms_;
+    }
+
+    // Compute the peak CPU usage during the last window, by sliding an
+    // average-sized window over the longer peak window.
+    double peak = 0;
+    double max = 0;
+    std::vector<Datum>::iterator tail = peak_.begin();
+    for (iter = tail; iter != peak_.end(); ++iter) {
+      while (*iter - *tail > kAvgWindowSizeMs) {
+        peak -= tail->ms_;
+        ++tail;
+      }
+      peak += iter->ms_;
+      if (peak > max) {
+        max = peak;
+      }
+    }
+
+    // Convert both figures to a percentage of the averaging window
+    total *= 100.0/kAvgWindowSizeMs;
+    max *= 100.0/kAvgWindowSizeMs;
+    display(total, max);
+  }
+
+ private:
+  // Number of columns in the displayed line, not counting the final '\r'.
+  enum { kLineWidth = 80 };
+
+  // Drops the leading entries of |window| that are more than |windowMs|
+  // older than its newest entry.  |window| must not be empty.
+  static void prune(std::vector<Datum>* window, int windowMs) {
+    std::vector<Datum>::iterator iter = window->begin();
+    while (window->back() - *iter > windowMs) {
+      ++iter;
+    }
+    window->erase(window->begin(), iter);
+  }
+
+  // Converts |percent| into a number of bar cells, rounded and limited to
+  // the range [0, space] so that the bar can never leave the line.
+  static int cells(double percent, int space) {
+    double n = (percent * space + 50)/100;
+    if (!(n > 0)) {
+      return 0;
+    }
+    return n > space ? space : static_cast<int>(n);
+  }
+
+  // Prints the average and peak CPU usage, followed by a bar that animates
+  // both values, as one kLineWidth-column line ending in '\r'.
+  static void display(double total, double max) {
+    char buf[kLineWidth + 1];  // The line plus the trailing '\r'
+    int len = snprintf(buf, kLineWidth, "%6.2f%% (peak=%6.2f%%) ", total, max);
+    if (len < 0) {
+      len = 0;
+    } else if (len > kLineWidth - 1) {
+      len = kLineWidth - 1;  // Truncated; leave room for the '|' marker
+    }
+
+    // Animate the actual usage, displaying both average and peak values
+    int space = kLineWidth - len - 1;
+    int mark = cells(total, space);
+    int bar = cells(max, space);
+    for (int i = 0; i < mark; ++i) {
+      buf[len++] = '*';
+    }
+    if (mark == bar) {
+      if (bar) {
+        len--;  // The peak marker replaces the last '*'
+      }
+    } else {
+      for (int i = 0; i < bar - mark - 1; ++i) {
+        buf[len++] = ' ';
+      }
+    }
+    buf[len++] = '|';
+    while (len < kLineWidth) {
+      buf[len++] = ' ';
+    }
+    buf[len++] = '\r';
+    fwrite(buf, len, 1, stdout);
+    fflush(stdout);
+  }
+
+  std::vector<Datum> average_;
+  std::vector<Datum> peak_;
+};
+static Data data;
+
+
+// Reads sandbox debugging output from stdin, picks out the lines that
+// report a system call's elapsed time, and feeds them to |data|.
+int main(int argc, char *argv[]) {
+  char buf[80];
+  bool expensive = false;
+  while (fgets(buf, sizeof(buf), stdin)) {
+    // Allow longer delays for expensive system calls
+    if (strstr(buf, "This is an expensive system call")) {
+      expensive = true;
+      continue;
+    }
+
+    // If this string doesn't match, then it must be some other type of
+    // message. Just ignore it.
+    // It is quite likely that we will regularly encounter debug messages
+    // that either should be parsed by a completely different tool, or
+    // messages that were intended for humans to read.
+    const char elapsed[] = "Elapsed time: ";
+    char* ms_string = strstr(buf, elapsed);
+    if (!ms_string) {
+      continue;
+    }
+
+    // Parse the elapsed time that follows the marker.  strtod() leaves
+    // |endptr| at the position it started from when it could not convert
+    // anything, so the result has to be compared against |number|.
+    char* number = ms_string + sizeof(elapsed) - 1;
+    char* endptr;
+    double ms = strtod(number, &endptr);
+    char* colon = strchr(buf, ':');
+    if (endptr == number || !colon) {
+      continue;
+    }
+
+    // Filter out system calls that were probably just blocking
+    // TODO(markus): automatically compute the cut-off for blocking calls
+    if (!expensive && ms > 0.05) {
+      continue;
+    }
+    expensive = false;
+
+    // Extract the name of the system call
+    *colon = '\000';
+
+    // Add the data point and update the display
+    data.addData(buf, ms);
+  }
+  puts("");
+  return 0;
+}
diff --git a/sandbox/linux/seccomp/tls.h b/sandbox/linux/seccomp/tls.h
new file mode 100644
index 0000000..7ec5a28
--- /dev/null
+++ b/sandbox/linux/seccomp/tls.h
@@ -0,0 +1,155 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TLS_H__
+#define TLS_H__
+
+#include <asm/ldt.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+namespace playground {
+
+// Minimal thread-local storage kept in a private 4096-byte page that is
+// addressed through a segment register (%gs on x86-64, %fs on i386).  The
+// page is treated as 4096/8 slots of eight bytes each.
+class TLS {
+ private:
+  class SysCalls {
+   public:
+    #define SYS_CPLUSPLUS
+    #define SYS_ERRNO     my_errno
+    #define SYS_INLINE    inline
+    #define SYS_PREFIX    -1
+    #undef  SYS_LINUX_SYSCALL_SUPPORT_H
+    #include "linux_syscall_support.h"
+    SysCalls() : my_errno(0) { }
+    int my_errno;
+  };
+
+ public:
+  // Maps a fresh page and points the segment register at it.  Returns the
+  // page's address, or NULL on failure; a page that could not be installed
+  // is unmapped again rather than leaked.
+  static void *allocateTLS() {
+    SysCalls sys;
+    #if defined(__x86_64__)
+    void *addr = sys.mmap(0, 4096, PROT_READ|PROT_WRITE,
+                          MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+    if (addr == MAP_FAILED) {
+      return NULL;
+    }
+    if (sys.arch_prctl(ARCH_SET_GS, addr) < 0) {
+      sys.munmap(addr, 4096);
+      return NULL;
+    }
+    #elif defined(__i386__)
+    void *addr = sys.mmap2(0, 4096, PROT_READ|PROT_WRITE,
+                           MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+    if (addr == MAP_FAILED) {
+      return NULL;
+    }
+    struct user_desc u;
+    u.entry_number    = (typeof u.entry_number)-1;
+    u.base_addr       = (int)addr;
+    u.limit           = 0xfffff;
+    u.seg_32bit       = 1;
+    u.contents        = 0;
+    u.read_exec_only  = 0;
+    u.limit_in_pages  = 1;
+    u.seg_not_present = 0;
+    u.useable         = 1;
+    if (sys.set_thread_area(&u) < 0) {
+      sys.munmap(addr, 4096);
+      return NULL;
+    }
+    asm volatile(
+        "movw %w0, %%fs"
+        :
+        : "q"(8*u.entry_number+3));
+    #else
+    #error Unsupported target platform
+    #endif
+    return addr;
+  }
+
+  // Unmaps the page that the segment register currently refers to.  Does
+  // nothing if the page's address cannot be determined.
+  static void freeTLS() {
+    SysCalls sys;
+    void *addr = NULL;
+    #if defined(__x86_64__)
+    if (sys.arch_prctl(ARCH_GET_GS, &addr) < 0 || !addr) {
+      return;
+    }
+    #elif defined(__i386__)
+    // get_thread_area() reads the entry named by entry_number, so recover
+    // the index from %fs, which allocateTLS() loaded with 8*entry_number+3.
+    unsigned selector;
+    asm volatile("mov %%fs, %0" : "=r"(selector));
+    struct user_desc u;
+    u.entry_number = (selector & 0xffff) >> 3;
+    if (sys.get_thread_area(&u) < 0) {
+      return;
+    }
+    addr = (void *)u.base_addr;
+    #else
+    #error Unsupported target platform
+    #endif
+    sys.munmap(addr, 4096);
+  }
+
+  // Stores |val| in slot |idx|.  Returns false, without storing anything,
+  // if |idx| is outside [0, 4096/8).
+  template<class T> static inline bool setTLSValue(int idx, T val) {
+    #if defined(__x86_64__)
+    if (idx < 0 || idx >= 4096/8) {
+      return false;
+    }
+    asm volatile(
+        "movq %0, %%gs:(%1)\n"
+        :
+        : "q"((void *)val), "q"(8ll * idx));
+    #elif defined(__i386__)
+    if (idx < 0 || idx >= 4096/8) {
+      return false;
+    }
+    if (sizeof(T) == 8) {
+      // Eight-byte values are written as two 32-bit halves, low half first.
+      asm volatile(
+          "movl %0, %%fs:(%1)\n"
+          :
+          : "r"((unsigned)val), "r"(8 * idx));
+      asm volatile(
+          "movl %0, %%fs:(%1)\n"
+          :
+          : "r"((unsigned)((unsigned long long)val >> 32)), "r"(8 * idx + 4));
+    } else {
+      asm volatile(
+          "movl %0, %%fs:(%1)\n"
+          :
+          : "r"(val), "r"(8 * idx));
+    }
+    #else
+    #error Unsupported target platform
+    #endif
+    return true;
+  }
+
+  // Returns the value held in slot |idx|, or 0 if |idx| is outside
+  // [0, 4096/8).
+  template<class T> static inline T getTLSValue(int idx) {
+    #if defined(__x86_64__)
+    long long rc;
+    if (idx < 0 || idx >= 4096/8) {
+      return 0;
+    }
+    asm volatile(
+        "movq %%gs:(%1), %0\n"
+        : "=q"(rc)
+        : "q"(8ll * idx));
+    return (T)rc;
+    #elif defined(__i386__)
+    if (idx < 0 || idx >= 4096/8) {
+      return 0;
+    }
+    if (sizeof(T) == 8) {
+      // Eight-byte values are read back as two 32-bit halves.
+      unsigned lo, hi;
+      asm volatile(
+          "movl %%fs:(%1), %0\n"
+          : "=r"(lo)
+          : "r"(8 * idx));
+      asm volatile(
+          "movl %%fs:(%1), %0\n"
+          : "=r"(hi)
+          : "r"(8 * idx + 4));
+      return (T)((unsigned long long)lo + ((unsigned long long)hi << 32));
+    } else {
+      long rc;
+      asm volatile(
+          "movl %%fs:(%1), %0\n"
+          : "=r"(rc)
+          : "r"(8 * idx));
+      return (T)rc;
+    }
+    #else
+    #error Unsupported target platform
+    #endif
+  }
+
+};
+
+} // namespace
+#endif
diff --git a/sandbox/linux/seccomp/trusted_process.cc b/sandbox/linux/seccomp/trusted_process.cc
new file mode 100644
index 0000000..5c62b0f
--- /dev/null
+++ b/sandbox/linux/seccomp/trusted_process.cc
@@ -0,0 +1,268 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <dirent.h>
+#include <map>
+
+#include "debug.h"
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+namespace playground {
+
+// Definition of the global sandbox policy object.
+// NOTE(review): neither the fields of SandboxPolicy nor the code that reads
+// g_policy are visible in this file; see the declaring header for details.
+struct SandboxPolicy g_policy;
+
+// Per-thread record kept by the trusted process. Records are stored in a map
+// that is keyed by the cookie assigned to the thread.
+struct Thread {
+ // First descriptor received from the new thread through getFd(); handed to
+ // the system call handlers and closed when the thread exits.
+ int fdPub;
+ // Second descriptor received through getFd(); the trusted process writes
+ // its "data is ready" byte to it and closes it when the thread exits.
+ int fd;
+ // Secure memory entry that has been assigned to the thread.
+ SecureMem::Args* mem;
+};
+
+// Takes the most recently added entry out of secureMemPool_, clears its
+// scratch page, and returns it. Returns NULL when the pool has no entries
+// left.
+SecureMem::Args* Sandbox::getNewSecureMem() {
+ if (secureMemPool_.empty()) {
+ return NULL;
+ }
+ SecureMem::Args* const recycled = secureMemPool_.back();
+ secureMemPool_.pop_back();
+ // The scratch page may still hold data from a previous user of this entry.
+ memset(recycled->scratchPage, 0, sizeof(recycled->scratchPage));
+ return recycled;
+}
+
+// Main loop of the trusted process.
+//
+// Registers each new thread that announces itself on "cloneFd", then reads
+// system call requests from "sandboxFd" and dispatches them to the handlers
+// listed in syscallTable. Malformed or unexpected input ends in die(); the
+// request loop itself has no exit.
+//
+// parentMapsFd: passed through unchanged to every system call handler.
+// processFdPub: not referenced anywhere in this function body.
+// sandboxFd: descriptor that request headers are read from; also passed
+// to the handlers.
+// cloneFd: descriptor on which a new thread delivers its two file
+// descriptors and its identification data.
+// secureArena: array of secure memory entries. Entry 0 is used for the
+// initial thread; the following kMaxThreads-1 entries are put
+// into secureMemPool_.
+void Sandbox::trustedProcess(int parentMapsFd, int processFdPub, int sandboxFd,
+ int cloneFd, SecureMem::Args* secureArena) {
+ // The trusted process doesn't have access to TLS. Zero out the segment
+ // registers so that we can later test that we are in the trusted process.
+ #if defined(__x86_64__)
+ asm volatile("mov %0, %%gs\n" : : "r"(0));
+ #elif defined(__i386__)
+ asm volatile("mov %0, %%fs\n" : : "r"(0));
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Known threads, keyed by the cookie that was assigned at registration.
+ std::map<long long, struct Thread> threads;
+ SysCalls sys;
+ long long cookie = 0;
+
+ // The very first entry in the secure memory arena has been assigned to the
+ // initial thread. The remaining entries are available for allocation.
+ SecureMem::Args* startAddress = secureArena;
+ SecureMem::Args* nextThread = startAddress;
+ for (int i = 0; i < kMaxThreads-1; i++) {
+ secureMemPool_.push_back(++startAddress);
+ }
+
+ // Entered once for the initial thread, and again (by way of the goto at the
+ // bottom of the dispatch loop) after every successfully handled clone().
+newThreadCreated:
+ // Receive information from newly created thread
+ // Each thread is registered under a fresh cookie; operator[] creates the
+ // map entry, which is then cleared before use.
+ Thread *newThread = &threads[++cookie];
+ memset(newThread, 0, sizeof(Thread));
+ struct {
+ SecureMem::Args* self;
+ int tid;
+ int fdPub;
+ } __attribute__((packed)) data;
+
+ size_t dataLen = sizeof(data);
+ if (!getFd(cloneFd, &newThread->fdPub, &newThread->fd, &data, &dataLen) ||
+ dataLen != sizeof(data)) {
+ // We get here either because the sandbox got corrupted, or because our
+ // parent process has terminated.
+ // NOTE(review): presumably getFd() leaves fdPub at zero and sets dataLen
+ // to zero when the peer has merely gone away, so that only the corrupted
+ // case prints a message -- confirm against the implementation of getFd().
+ if (newThread->fdPub || dataLen) {
+ die("Failed to receive new thread information");
+ }
+ die();
+ }
+ if (data.self != nextThread) {
+ // The only potentially security critical information received from the
+ // newly created thread is "self". The "tid" is for informational purposes
+ // (and for use in the new thread's TLS), and "fdPub" is uncritical as all
+ // file descriptors are considered untrusted.
+ // Thus, we only use "self" for a sanity check, but don't actually trust
+ // it beyond that.
+ die("Received corrupted thread information");
+ }
+ newThread->mem = nextThread;
+
+ // Set up TLS area and let thread know that the data is now ready
+ nextThread->cookie = cookie;
+ nextThread->threadId = data.tid;
+ nextThread->threadFdPub = data.fdPub;
+ // A single byte serves as the notification; the result of the write is not
+ // checked.
+ write(sys, newThread->fd, "", 1);
+
+ // Dispatch system calls that have been forwarded from the trusted thread(s).
+ for (;;) {
+ struct {
+ unsigned int sysnum;
+ long long cookie;
+ } __attribute__((packed)) header;
+
+ int rc;
+ if ((rc = read(sys, sandboxFd, &header, sizeof(header))) !=sizeof(header)){
+ // A result of zero exits without a message; any other short or failed
+ // read is reported.
+ if (rc) {
+ die("Failed to read system call number and thread id");
+ }
+ die();
+ }
+ // The cookie in the header identifies the requesting thread.
+ std::map<long long, struct Thread>::iterator iter =
+ threads.find(header.cookie);
+ if (iter == threads.end()) {
+ die("Received request from unknown thread");
+ }
+ struct Thread* currentThread = &iter->second;
+ // Only system calls that have a trusted-process handler in syscallTable
+ // are acceptable here.
+ if (header.sysnum > maxSyscall ||
+ !syscallTable[header.sysnum].trustedProcess) {
+ die("Trusted process encountered unexpected system call");
+ }
+
+ // Dispatch system call to handler function. Treat both exit() and clone()
+ // specially.
+ // A clone() for which the handler returned true is followed by registering
+ // the new thread; it is expected to identify itself with the secure memory
+ // entry that is stored in the caller's "newSecureMem" field.
+ // The exit() branch is evaluated whenever the clone() case did not apply;
+ // it does not depend on the handler's return value.
+ if (syscallTable[header.sysnum].trustedProcess(parentMapsFd,
+ sandboxFd,
+ currentThread->fdPub,
+ currentThread->fd,
+ currentThread->mem) &&
+ header.sysnum == __NR_clone) {
+ nextThread = currentThread->mem->newSecureMem;
+ goto newThreadCreated;
+ } else if (header.sysnum == __NR_exit) {
+ NOINTR_SYS(sys.close(iter->second.fdPub));
+ NOINTR_SYS(sys.close(iter->second.fd));
+ // Remember the secure memory entry before erase() invalidates
+ // "currentThread", then return the entry to the pool.
+ SecureMem::Args* secureMem = currentThread->mem;
+ threads.erase(iter);
+ secureMemPool_.push_back(secureMem);
+ }
+ }
+}
+
+// Records the memory mappings that exist before the sandbox takes effect.
+//
+// A descriptor for the maps file is received over "fd" by means of getFd().
+// Every mapping listed in it is entered into protectedMap_ as
+// start address -> length. When parsing is complete, the number of the maps
+// descriptor is written back to "fd" as a completion signal.
+//
+// fd: descriptor used both for receiving the maps descriptor and for
+// sending the completion signal.
+// Returns the received maps descriptor. Every failure ends in die().
+int Sandbox::initializeProtectedMap(int fd) {
+ int mapsFd;
+ if (!getFd(fd, &mapsFd, NULL, NULL, NULL)) {
+ maps_failure:
+ die("Cannot access /proc/self/maps");
+ }
+
+ // Read the memory mappings as they were before the sandbox takes effect.
+ // These mappings cannot be changed by the sandboxed process.
+ char line[80];
+ FILE *fp = fdopen(mapsFd, "r");
+ if (fp == NULL) {
+ // Without a stream there is nothing to parse, and passing NULL to fgets()
+ // would crash instead of reporting the problem.
+ goto maps_failure;
+ }
+ // NOTE(review): "fp" is never passed to fclose(). That looks deliberate, as
+ // fclose() would also close mapsFd, which is returned to the caller.
+
+ // Lines can be longer than the buffer. "truncated" is set while the chunks
+ // being read are continuations of a line whose beginning has already been
+ // parsed; those chunks are skipped.
+ for (bool truncated = false;;) {
+ if (fgets(line, sizeof(line), fp) == NULL) {
+ // Stop at the end of the file and on any error other than EINTR.
+ if (feof(fp) || errno != EINTR) {
+ break;
+ }
+ continue;
+ }
+ if (!truncated) {
+ // Each line is expected to begin with "<start>-<stop> " in hexadecimal.
+ unsigned long start, stop;
+ char *ptr = line;
+ errno = 0;
+ start = strtoul(ptr, &ptr, 16);
+ if (errno || *ptr++ != '-') {
+ parse_failure:
+ die("Failed to parse /proc/self/maps");
+ }
+ stop = strtoul(ptr, &ptr, 16);
+ if (errno || *ptr++ != ' ') {
+ goto parse_failure;
+ }
+ protectedMap_[reinterpret_cast<void *>(start)] = stop - start;
+ }
+ truncated = strchr(line, '\n') == NULL;
+ }
+
+ // Prevent low address memory allocations. Some buggy kernels allow those
+ if (protectedMap_[0] < (64 << 10)) {
+ protectedMap_[0] = 64 << 10;
+ }
+
+ // Let the sandbox know that we are done parsing the memory map.
+ SysCalls sys;
+ if (write(sys, fd, &mapsFd, sizeof(mapsFd)) != sizeof(mapsFd)) {
+ goto maps_failure;
+ }
+
+ return mapsFd;
+}
+
+// Forks the trusted process and sets up the memory that is shared with it.
+//
+// Before forking, a shared anonymous arena of 8192 bytes per thread
+// (kMaxThreads entries) is mapped, and the page holding syscall_mutex_ is
+// replaced by a shared anonymous mapping.
+//
+// The child closes all descriptors above 2 other than sandboxFd and cloneFd,
+// initializes the arena entries, and then runs initializeProtectedMap()
+// followed by trustedProcess().
+//
+// The parent, which continues as the untrusted side, removes all access to
+// the arena and to the mutex page and closes its copy of sandboxFd.
+//
+// processFdPub: passed through to trustedProcess().
+// sandboxFd: used by the child for talking to the sandbox; closed in the
+// parent.
+// cloneFdPub: not referenced anywhere in this function body.
+// cloneFd: kept open in the child and passed to trustedProcess().
+// Returns the address of the secure memory arena. Failures end in die().
+SecureMem::Args* Sandbox::createTrustedProcess(int processFdPub, int sandboxFd,
+ int cloneFdPub, int cloneFd) {
+ // Allocate memory that will be used by an arena for storing the secure
+ // memory. While we allow this memory area to be empty at times (e.g. when
+ // not all threads are in use), we make sure that it never gets overwritten
+ // by user-allocated memory. This happens in initializeProtectedMap() and
+ // snapshotMemoryMappings().
+ SecureMem::Args* secureArena = reinterpret_cast<SecureMem::Args*>(
+ mmap(NULL, 8192*kMaxThreads, PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_ANONYMOUS, -1, 0));
+ if (secureArena == MAP_FAILED) {
+ die("Failed to allocate secure memory arena");
+ }
+
+ // Set up the mutex to be accessible from the trusted process and from
+ // children of the trusted thread(s)
+ if (mmap(&syscall_mutex_, 4096, PROT_READ|PROT_WRITE,
+ MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, -1, 0) != &syscall_mutex_) {
+ die("Failed to initialize secure mutex");
+ }
+ syscall_mutex_ = 0x80000000;
+
+
+ // Create a trusted process that can evaluate system call parameters and
+ // decide whether a system call should execute. This process runs outside of
+ // the seccomp sandbox. It communicates with the sandbox'd process through
+ // a socketpair() and through securely shared memory.
+ pid_t pid = fork();
+ if (pid < 0) {
+ die("Failed to create trusted process");
+ }
+ if (!pid) {
+ // Close all file handles except for sandboxFd, cloneFd, and stdio
+ DIR *dir = opendir("/proc/self/fd");
+ if (dir == 0) {
+ // If we don't know the list of our open file handles, just try closing
+ // all valid ones.
+ for (int fd = sysconf(_SC_OPEN_MAX); --fd > 2; ) {
+ if (fd != sandboxFd && fd != cloneFd) {
+ close(fd);
+ }
+ }
+ } else {
+ // If available, it is much more efficient to just close the file
+ // handles that show up in /proc/self/fd/
+ struct dirent de, *res;
+ while (!readdir_r(dir, &de, &res) && res) {
+ // Skip entries such as "." and "..", which do not name a descriptor.
+ if (res->d_name[0] < '0')
+ continue;
+ int fd = atoi(res->d_name);
+ // The descriptor that backs "dir" must stay open until closedir().
+ if (fd > 2 &&
+ fd != sandboxFd && fd != cloneFd && fd != dirfd(dir)) {
+ close(fd);
+ }
+ }
+ closedir(dir);
+ }
+
+ // Initialize secure memory used for threads
+ for (int i = 0; i < kMaxThreads; i++) {
+ SecureMem::Args* args = secureArena + i;
+ args->self = args;
+ #ifndef NDEBUG
+ args->allowAllSystemCalls= Debug::isEnabled();
+ #endif
+ }
+
+ int parentMapsFd = initializeProtectedMap(sandboxFd);
+ trustedProcess(parentMapsFd, processFdPub, sandboxFd,
+ cloneFd, secureArena);
+ die();
+ }
+
+ // We are still in the untrusted code. Deny access to restricted resources.
+ // If either call fails, untrusted code would retain access to memory that
+ // the trusted side relies on. Continuing would silently weaken the sandbox,
+ // so this is treated as a fatal error.
+ if (mprotect(secureArena, 8192*kMaxThreads, PROT_NONE) != 0 ||
+ mprotect(&syscall_mutex_, 4096, PROT_NONE) != 0) {
+ die("Failed to protect secure memory");
+ }
+ close(sandboxFd);
+
+ return secureArena;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc
new file mode 100644
index 0000000..6d6a3f5
--- /dev/null
+++ b/sandbox/linux/seccomp/trusted_thread.cc
@@ -0,0 +1,1483 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+namespace playground {
+
+void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
+ SecureMem::Args* secureMem) {
+ SecureMem::Args args = { { { { { 0 } } } } };
+ args.self = &args;
+ args.newSecureMem = secureMem;
+ args.processFdPub = processFdPub;
+ args.cloneFdPub = cloneFdPub;
+#if defined(__x86_64__)
+ asm volatile(
+ "push %%rbx\n"
+ "push %%rbp\n"
+ "mov %0, %%rbp\n" // %rbp = args
+ "xor %%rbx, %%rbx\n" // initial sequence number
+ "lea 999f(%%rip), %%r15\n" // continue in same thread
+
+ // Signal handlers are process-wide. This means that for security
+ // reasons, we cannot allow that the trusted thread ever executes any
+ // signal handlers.
+ // We prevent the execution of signal handlers by setting a signal
+ // mask that blocks all signals. In addition, we make sure that the
+ // stack pointer is invalid.
+ // We cannot reset the signal mask until after we have enabled
+ // Seccomp mode. Our sigprocmask() wrapper would normally do this by
+ // raising a signal, modifying the signal mask in the kernel-generated
+ // signal frame, and then calling sigreturn(). This presents a bit of
+ // a Catch-22, as all signals are masked and we can therefore not
+ // raise any signal that would allow us to generate the signal stack
+ // frame.
+ // Instead, we have to create the signal stack frame prior to entering
+ // Seccomp mode. This incidentally also helps us to restore the
+ // signal mask to the same value that it had prior to entering the
+ // sandbox.
+ // The signal wrapper for clone() is the second entry point into this
+ // code (by means of sending an IPC to its trusted thread). It goes
+ // through the same steps of creating a signal stack frame on the
+ // newly created thread's stacks prior to cloning. See clone.cc for
+ // details.
+ "mov $56+0xF000, %%eax\n" // __NR_clone + 0xF000
+ "mov %%rsp, %%rcx\n"
+ "int $0\n" // push a signal stack frame (see clone.cc)
+ "mov %%rcx, 0xA0(%%rsp)\n" // pop stack upon call to sigreturn()
+ "mov %%rsp, %%r9\n"
+ "mov $2, %%rdi\n" // how = SIG_SETMASK
+ "pushq $-1\n"
+ "mov %%rsp, %%rsi\n" // set = full mask
+ "xor %%rdx, %%rdx\n" // old_set = NULL
+ "mov $8, %%r10\n" // mask all 64 signals
+ "mov $14, %%eax\n" // NR_rt_sigprocmask
+ "syscall\n"
+ "xor %%rsp, %%rsp\n" // invalidate the stack in all trusted code
+ "jmp 20f\n" // create trusted thread
+
+ // TODO(markus): Coalesce the read() operations by reading into a bigger
+ // buffer.
+
+ // Parameters:
+ // *%fs: secure memory region
+ // the page following this one contains the scratch space
+ // %r13: thread's side of threadFd
+ // %r15: processFdPub
+
+ // Local variables:
+ // %rbx: sequence number for trusted calls
+
+ // Temporary variables:
+ // %r8: child stack
+ // %r9: system call number, child stack
+ // %rbp: secure memory of previous thread
+
+ // Layout of secure shared memory region (c.f. securemem.h):
+ // 0x00: pointer to the secure shared memory region (i.e. self)
+ // 0x08: sequence number; must match %rbx
+ // 0x10: call type; must match %eax, iff %eax == -1 || %eax == -2
+ // 0x18: system call number; passed to syscall in %rax
+ // 0x20: first argument; passed to syscall in %rdi
+ // 0x28: second argument; passed to syscall in %rsi
+ // 0x30: third argument; passed to syscall in %rdx
+ // 0x38: fourth argument; passed to syscall in %r10
+ // 0x40: fifth argument; passed to syscall in %r8
+ // 0x48: sixth argument; passed to syscall in %r9
+ // 0x50: stored return address for clone() system call
+ // 0x58: stored %rbp value for clone() system call
+ // 0x60: stored %rbx value for clone() system call
+ // 0x68: stored %rcx value for clone() system call
+ // 0x70: stored %rdx value for clone() system call
+ // 0x78: stored %rsi value for clone() system call
+ // 0x80: stored %rdi value for clone() system call
+ // 0x88: stored %r8 value for clone() system call
+ // 0x90: stored %r9 value for clone() system call
+ // 0x98: stored %r10 value for clone() system call
+ // 0xA0: stored %r11 value for clone() system call
+ // 0xA8: stored %r12 value for clone() system call
+ // 0xB0: stored %r13 value for clone() system call
+ // 0xB8: stored %r14 value for clone() system call
+ // 0xC0: stored %r15 value for clone() system call
+ // 0xC8: new shared memory for clone()
+ // 0xD0: processFdPub for talking to trusted process
+ // 0xD4: cloneFdPub for talking to trusted process
+ // 0xD8: set to non-zero, if in debugging mode
+ // 0xDC: most recent SHM id returned by shmget(IPC_PRIVATE)
+ // 0xE0: cookie assigned to us by the trusted process (TLS_COOKIE)
+ // 0xE8: thread id (TLS_TID)
+ // 0xF0: threadFdPub (TLS_THREAD_FD)
+ // 0x200-0x1000: securely passed verified file name(s)
+
+ // Layout of (untrusted) scratch space:
+ // 0x00: syscall number; passed in %rax
+ // 0x04: first argument; passed in %rdi
+ // 0x0C: second argument; passed in %rsi
+ // 0x14: third argument; passed in %rdx
+ // 0x1C: fourth argument; passed in %r10
+ // 0x24: fifth argument; passed in %r8
+ // 0x2C: sixth argument; passed in %r9
+ // 0x34: return value
+ // 0x3C: RDTSCP result (%eax)
+ // 0x40: RDTSCP result (%edx)
+ // 0x44: RDTSCP result (%ecx)
+ // 0x48: last system call (not used on x86-64)
+ // 0x4C: number of consecutive calls to a time fnc (not used on x86-64)
+ // 0x50: nesting level of system calls (for debugging purposes only)
+ // 0x54: signal mask
+ // 0x5C: in SEGV handler
+
+ // We use the %fs register for accessing the secure read-only page, and
+ // the untrusted scratch space immediately following it. The segment
+ // register and the local descriptor table is set up by passing
+ // appropriate arguments to clone().
+
+ "0:xor %%rsp, %%rsp\n"
+ "mov $2, %%ebx\n" // %rbx = initial sequence number
+
+ // Read request from untrusted thread, or from trusted process. In either
+ // case, the data that we read has to be considered untrusted.
+ // read(threadFd, &scratch, 4)
+ "1:xor %%rax, %%rax\n" // NR_read
+ "mov %%r13, %%rdi\n" // fd = threadFd
+ "mov %%fs:0x0, %%rsi\n" // secure_mem
+ "add $0x1000, %%rsi\n" // buf = &scratch
+ "mov $4, %%edx\n" // len = 4
+ "2:syscall\n"
+ "cmp $-4, %%rax\n" // EINTR
+ "jz 2b\n"
+ "cmp %%rdx, %%rax\n"
+ "jnz 25f\n" // exit process
+
+ // Retrieve system call number. It is crucial that we only dereference
+ // %fs:0x1000 exactly once. Afterwards, memory becomes untrusted and
+ // we must use the value that we have read the first time.
+ "mov 0(%%rsi), %%eax\n"
+
+ // If syscall number is -1, execute an unlocked system call from the
+ // secure memory area
+ "cmp $-1, %%eax\n"
+ "jnz 5f\n"
+ "3:cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+ "cmp %%fs:0x10, %%eax\n"
+ "jne 25f\n" // exit process
+ "mov %%fs:0x18, %%rax\n"
+ "mov %%fs:0x20, %%rdi\n"
+ "mov %%fs:0x28, %%rsi\n"
+ "mov %%fs:0x30, %%rdx\n"
+ "mov %%fs:0x38, %%r10\n"
+ "mov %%fs:0x40, %%r8\n"
+ "mov %%fs:0x48, %%r9\n"
+ "cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+ "add $2, %%rbx\n"
+
+ // shmget() gets some special treatment. Whenever we return from this
+ // system call, we remember the most recently returned SysV shm id.
+ "cmp $29, %%eax\n" // NR_shmget
+ "jnz 4f\n"
+ "syscall\n"
+ "mov %%rax, %%r8\n"
+ "mov $56, %%eax\n" // NR_clone
+ "mov $17, %%edi\n" // flags = SIGCHLD
+ "mov $1, %%esi\n" // stack = 1
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "js 25f\n" // exit process
+ "mov %%rax, %%rdi\n"
+ "jnz 8f\n" // wait for child, then return result
+ "mov %%fs:0x0, %%rdi\n" // start = secure_mem
+ "mov $4096, %%esi\n" // len = 4096
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "mov $10, %%eax\n" // NR_mprotect
+ "syscall\n"
+ "mov %%r8d, 0xDC(%%rdi)\n" // set most recently returned SysV shm id
+ "xor %%rdi, %%rdi\n"
+
+ // When debugging messages are enabled, warn about expensive system calls
+ #ifndef NDEBUG
+ "cmpw $0, %%fs:0xD8\n" // debug mode
+ "jz 27f\n"
+ "mov $1, %%eax\n" // NR_write
+ "mov $2, %%edi\n" // fd = stderr
+ "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call"
+ "mov $102f-101f, %%edx\n" // len = strlen(msg)
+ "syscall\n"
+ "xor %%rdi, %%rdi\n"
+ #endif
+
+ "jmp 27f\n" // exit program, no message
+ "4:syscall\n"
+ "jmp 15f\n" // return result
+
+ // If syscall number is -2, execute locked system call from the
+ // secure memory area
+ "5:jg 12f\n"
+ "cmp $-2, %%eax\n"
+ "jnz 9f\n"
+ "cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+ "cmp %%eax, %%fs:0x10\n"
+ "jne 25f\n" // exit process
+
+ // When debugging messages are enabled, warn about expensive system calls
+ #ifndef NDEBUG
+ "cmpw $0, %%fs:0xD8\n" // debug mode
+ "jz 6f\n"
+ "mov $1, %%eax\n" // NR_write
+ "mov $2, %%edi\n" // fd = stderr
+ "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call"
+ "mov $102f-101f, %%edx\n" // len = strlen(msg)
+ "syscall\n"
+ "6:"
+ #endif
+
+ "mov %%fs:0x18, %%rax\n"
+ "mov %%fs:0x20, %%rdi\n"
+ "mov %%fs:0x28, %%rsi\n"
+ "mov %%fs:0x30, %%rdx\n"
+ "mov %%fs:0x38, %%r10\n"
+ "mov %%fs:0x40, %%r8\n"
+ "mov %%fs:0x48, %%r9\n"
+ "cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+
+ // clone() has unusual calling conventions and must be handled specially
+ "cmp $56, %%rax\n" // NR_clone
+ "jz 19f\n"
+
+ // exit() terminates trusted thread
+ "cmp $60, %%eax\n" // NR_exit
+ "jz 18f\n"
+
+ // Perform requested system call
+ "syscall\n"
+
+ // Unlock mutex
+ "7:cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+ "add $2, %%rbx\n"
+ "mov %%rax, %%r8\n"
+ "mov $56, %%eax\n" // NR_clone
+ "mov $17, %%rdi\n" // flags = SIGCHLD
+ "mov $1, %%rsi\n" // stack = 1
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "js 25f\n" // exit process
+ "jz 22f\n" // unlock and exit
+ "mov %%rax, %%rdi\n"
+ "8:xor %%rsi, %%rsi\n"
+ "xor %%rdx, %%rdx\n"
+ "xor %%r10, %%r10\n"
+ "mov $61, %%eax\n" // NR_wait4
+ "syscall\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 8b\n"
+ "mov %%r8, %%rax\n"
+ "jmp 15f\n" // return result
+
+ // If syscall number is -3, read the time stamp counter
+ "9:cmp $-3, %%eax\n"
+ "jnz 10f\n"
+ "rdtsc\n" // sets %edx:%eax
+ "xor %%rcx, %%rcx\n"
+ "jmp 11f\n"
+ "10:cmp $-4, %%eax\n"
+ "jnz 12f\n"
+ "rdtscp\n" // sets %edx:%eax and %ecx
+ "11:add $0x3C, %%rsi\n"
+ "mov %%eax, 0(%%rsi)\n"
+ "mov %%edx, 4(%%rsi)\n"
+ "mov %%ecx, 8(%%rsi)\n"
+ "mov $12, %%edx\n"
+ "jmp 16f\n" // return result
+
+ // Check in syscallTable whether this system call is unrestricted
+ "12:mov %%rax, %%r9\n"
+ #ifndef NDEBUG
+ "cmpw $0, %%fs:0xD8\n" // debug mode
+ "jnz 13f\n"
+ #endif
+ "cmp playground$maxSyscall(%%rip), %%eax\n"
+ "ja 25f\n" // exit process
+ "shl $4, %%rax\n"
+ "lea playground$syscallTable(%%rip), %%rdi\n"
+ "add %%rdi, %%rax\n"
+ "mov 0(%%rax), %%rax\n"
+ "cmp $1, %%rax\n"
+ "jne 25f\n" // exit process
+
+ // Default behavior for unrestricted system calls is to just execute
+ // them. Read the remaining arguments first.
+ "13:mov %%rsi, %%r8\n"
+ "xor %%rax, %%rax\n" // NR_read
+ "mov %%r13, %%rdi\n" // fd = threadFd
+ "add $4, %%rsi\n" // buf = &scratch + 4
+ "mov $48, %%edx\n" // len = 6*sizeof(void *)
+ "14:syscall\n"
+ "cmp $-4, %%rax\n" // EINTR
+ "jz 14b\n"
+ "cmp %%rdx, %%rax\n"
+ "jnz 25f\n" // exit process
+ "mov %%r9, %%rax\n"
+ "mov 0x04(%%r8), %%rdi\n"
+ "mov 0x0C(%%r8), %%rsi\n"
+ "mov 0x14(%%r8), %%rdx\n"
+ "mov 0x1C(%%r8), %%r10\n"
+ "mov 0x2C(%%r8), %%r9\n"
+ "mov 0x24(%%r8), %%r8\n"
+ "cmp $231, %%rax\n" // NR_exit_group
+ "jz 27f\n" // exit program, no message
+ "syscall\n"
+
+ // Return result of system call to sandboxed thread
+ "15:mov %%fs:0x0, %%rsi\n" // secure_mem
+ "add $0x1034, %%rsi\n" // buf = &scratch + 52
+ "mov %%rax, (%%rsi)\n"
+ "mov $8, %%edx\n" // len = 8
+ "16:mov %%r13, %%rdi\n" // fd = threadFd
+ "mov $1, %%eax\n" // NR_write
+ "17:syscall\n"
+ "cmp %%rdx, %%rax\n"
+ "jz 1b\n"
+ "cmp $-4, %%rax\n" // EINTR
+ "jz 17b\n"
+ "jmp 25f\n" // exit process
+
+ // NR_exit:
+ // Exit trusted thread after cleaning up resources
+ "18:mov %%fs:0x0, %%rsi\n" // secure_mem
+ "mov 0xF0(%%rsi), %%rdi\n" // fd = threadFdPub
+ "mov $3, %%eax\n" // NR_close
+ "syscall\n"
+ "mov %%rsi, %%rdi\n" // start = secure_mem
+ "mov $8192, %%esi\n" // length = 8192
+ "xor %%rdx, %%rdx\n" // prot = PROT_NONE
+ "mov $10, %%eax\n" // NR_mprotect
+ "syscall\n"
+ "mov %%r13, %%rdi\n" // fd = threadFd
+ "mov $3, %%eax\n" // NR_close
+ "syscall\n"
+ "mov $56, %%eax\n" // NR_clone
+ "mov $17, %%rdi\n" // flags = SIGCHLD
+ "mov $1, %%rsi\n" // stack = 1
+ "syscall\n"
+ "mov %%rax, %%rdi\n"
+ "test %%rax, %%rax\n"
+ "js 27f\n" // exit process
+ "jne 21f\n" // reap helper, exit thread
+ "jmp 22f\n" // unlock mutex
+
+ // NR_clone:
+ // Original trusted thread calls clone() to create new nascent
+ // thread. This thread is (typically) fully privileged and shares all
+ // resources with the caller (i.e. the previous trusted thread),
+ // and by extension it shares all resources with the sandbox'd
+ // threads.
+ "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem
+ "mov %%rsi, %%r15\n" // remember child stack
+ "mov $1, %%rsi\n" // stack = 1
+ "syscall\n" // calls NR_clone
+ "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values
+ "jae 7b\n" // unlock mutex, return result
+ "add $2, %%rbx\n"
+ "test %%rax, %%rax\n"
+ "jne 15b\n" // return result
+
+ // In nascent thread, now.
+ "sub $2, %%rbx\n"
+
+ // We want to maintain an invalid %%rsp whenever we access untrusted
+ // memory. This ensures that even if an attacker can trick us into
+ // triggering a SIGSEGV, we will never successfully execute a signal
+ // handler.
+ // Signal handlers are inherently dangerous, as an attacker could trick
+ // us into returning to the wrong address by adjusting the signal stack
+ // right before the handler returns.
+ // N.B. While POSIX is curiously silent about this, it appears that on
+ // Linux, alternate signal stacks are a per-thread property. That is
+ // good. It means that this security mechanism works, even if the
+ // sandboxed thread manages to set up an alternate signal stack.
+ //
+ // TODO(markus): We currently do not support emulating calls to
+ // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc
+ // for a discussion on how to fix this, if this ever becomes necessary.
+ "mov %%r15, %%r9\n" // %r9 = child_stack
+ "xor %%r15, %%r15\n" // Request to return from clone() when done
+
+ // Get thread id of nascent thread
+ "20:mov $186, %%eax\n" // NR_gettid
+ "syscall\n"
+ "mov %%rax, %%r14\n"
+
+ // Nascent thread creates socketpair() for sending requests to
+ // trusted thread.
+ // We can create the filehandles on the child's stack. Filehandles are
+ // always treated as untrusted.
+ // socketpair(AF_UNIX, SOCK_STREAM, 0, fds)
+ "sub $0x10, %%r9\n"
+ "mov %%r15, 8(%%r9)\n" // preserve return address on child stack
+ "mov $53, %%eax\n" // NR_socketpair
+ "mov $1, %%edi\n" // domain = AF_UNIX
+ "mov $1, %%esi\n" // type = SOCK_STREAM
+ "xor %%rdx, %%rdx\n" // protocol = 0
+ "mov %%r9, %%r10\n" // sv = child_stack
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "jz 28f\n"
+
+ // If things went wrong, we don't have an (easy) way of signaling
+ // the parent. For our purposes, it is sufficient to fail with a
+ // fatal error.
+ "jmp 25f\n" // exit process
+ "21:xor %%rsi, %%rsi\n"
+ "xor %%rdx, %%rdx\n"
+ "xor %%r10, %%r10\n"
+ "mov $61, %%eax\n" // NR_wait4
+ "syscall\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 21b\n"
+ "jmp 23f\n" // exit thread (no message)
+ "22:lea playground$syscall_mutex(%%rip), %%rdi\n"
+ "mov $4096, %%esi\n"
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "mov $10, %%eax\n" // NR_mprotect
+ "syscall\n"
+ "lock; addl $0x80000000, (%%rdi)\n"
+ "jz 23f\n" // exit thread
+ "mov $1, %%edx\n"
+ "mov %%rdx, %%rsi\n" // FUTEX_WAKE
+ "mov $202, %%eax\n" // NR_futex
+ "syscall\n"
+ "23:mov $60, %%eax\n" // NR_exit
+ "mov $1, %%edi\n" // status = 1
+ "24:syscall\n"
+ "25:mov $1, %%eax\n" // NR_write
+ "mov $2, %%edi\n" // fd = stderr
+ "lea 100f(%%rip), %%rsi\n" // "Sandbox violation detected"
+ "mov $101f-100f, %%edx\n" // len = strlen(msg)
+ "syscall\n"
+ "26:mov $1, %%edi\n"
+ "27:mov $231, %%eax\n" // NR_exit_group
+ "jmp 24b\n"
+
+ // The first page is mapped read-only for use as securely shared memory
+ "28:mov 0xC8(%%rbp), %%r12\n" // %r12 = secure shared memory
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
+ "mov $10, %%eax\n" // NR_mprotect
+ "mov %%r12, %%rdi\n" // addr = secure_mem
+ "mov $4096, %%esi\n" // len = 4096
+ "mov $1, %%edx\n" // prot = PROT_READ
+ "syscall\n"
+
+ // The second page is used as scratch space by the trusted thread.
+ // Make it writable.
+ "mov $10, %%eax\n" // NR_mprotect
+ "add $4096, %%rdi\n" // addr = secure_mem + 4096
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "syscall\n"
+
+ // Call clone() to create new trusted thread().
+ // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+ // CLONE_SYSVSEM|CLONE_UNTRACED|CLONE_SETTLS, stack, NULL, NULL,
+ // tls)
+ "mov 4(%%r9), %%r13d\n" // %r13 = threadFd (on child's stack)
+ "mov $56, %%eax\n" // NR_clone
+ "mov $0x8D0F00, %%edi\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR|TLS
+ "mov $1, %%rsi\n" // stack = 1
+ "mov %%r12, %%r8\n" // tls = new_secure_mem
+ "mov 0xD0(%%rbp), %%r15d\n" // %r15 = processFdPub
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "js 25b\n" // exit process
+ "jz 0b\n" // invoke trustedThreadFnc()
+
+ // Copy the caller's signal mask
+ "mov 0x1054(%%rbp), %%rax\n"
+ "mov %%rax, 0x1054(%%r12)\n"
+
+ // Done creating trusted thread. We can now get ready to return to caller
+ "mov %%r9, %%r8\n" // %r8 = child_stack
+ "mov 0(%%r9), %%r9d\n" // %r9 = threadFdPub
+
+ // Set up thread local storage with information on how to talk to
+ // trusted thread and trusted process.
+ "lea 0xE0(%%r12), %%rsi\n" // args = &secure_mem.TLS;
+ "mov $158, %%eax\n" // NR_arch_prctl
+ "mov $0x1001, %%edi\n" // option = ARCH_SET_GS
+ "syscall\n"
+ "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values
+ "jae 25b\n" // exit process
+
+ // Check whether this is the initial thread, or a newly created one.
+ // At startup we run the same code as when we create a new thread. At
+ // the very top of this function, you will find that we push 999(%rip)
+ // on the stack. That is the signal that we should return on the same
+ // stack rather than return to where clone was called.
+ "mov 8(%%r8), %%r15\n"
+ "add $0x10, %%r8\n"
+ "test %%r15, %%r15\n"
+ "jne 29f\n"
+
+ // Returning from clone() into the newly created thread is special. We
+ // cannot unroll the stack, as we just set up a new stack for this
+ // thread. We have to explicitly restore CPU registers to the values
+ // that they had when the program originally called clone().
+ // We patch the register values in the signal stack frame so that we
+ // can ask sigreturn() to restore all registers for us.
+ "sub $0x8, %%r8\n"
+ "mov 0x50(%%rbp), %%rax\n"
+ "mov %%rax, 0x00(%%r8)\n" // return address
+ "xor %%rax, %%rax\n"
+ "mov %%rax, 0x98(%%r8)\n" // %rax = 0
+ "mov 0x58(%%rbp), %%rax\n"
+ "mov %%rax, 0x80(%%r8)\n" // %rbp
+ "mov 0x60(%%rbp), %%rax\n"
+ "mov %%rax, 0x88(%%r8)\n" // %rbx
+ "mov 0x68(%%rbp), %%rax\n"
+ "mov %%rax, 0xA0(%%r8)\n" // %rcx
+ "mov 0x70(%%rbp), %%rax\n"
+ "mov %%rax, 0x90(%%r8)\n" // %rdx
+ "mov 0x78(%%rbp), %%rax\n"
+ "mov %%rax, 0x78(%%r8)\n" // %rsi
+ "mov 0x80(%%rbp), %%rax\n"
+ "mov %%rax, 0x70(%%r8)\n" // %rdi
+ "mov 0x88(%%rbp), %%rax\n"
+ "mov %%rax, 0x30(%%r8)\n" // %r8
+ "mov 0x90(%%rbp), %%rax\n"
+ "mov %%rax, 0x38(%%r8)\n" // %r9
+ "mov 0x98(%%rbp), %%rax\n"
+ "mov %%rax, 0x40(%%r8)\n" // %r10
+ "mov 0xA0(%%rbp), %%rax\n"
+ "mov %%rax, 0x48(%%r8)\n" // %r11
+ "mov 0xA8(%%rbp), %%rax\n"
+ "mov %%rax, 0x50(%%r8)\n" // %r12
+ "mov 0xB0(%%rbp), %%rax\n"
+ "mov %%rax, 0x58(%%r8)\n" // %r13
+ "mov 0xB8(%%rbp), %%rax\n"
+ "mov %%rax, 0x60(%%r8)\n" // %r14
+ "mov 0xC0(%%rbp), %%rax\n"
+ "mov %%rax, 0x68(%%r8)\n" // %r15
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
+
+ // Nascent thread launches a helper that doesn't share any of our
+ // resources, except for pages mapped as MAP_SHARED.
+ // clone(SIGCHLD, stack=1)
+ "29:mov $56, %%eax\n" // NR_clone
+ "mov $17, %%rdi\n" // flags = SIGCHLD
+ "mov $1, %%rsi\n" // stack = 1
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "js 25b\n" // exit process
+ "jne 31f\n"
+
+ // Use sendmsg() to send to the trusted process the file handles for
+ // communicating with the new trusted thread. We also send the address
+ // of the secure memory area (for sanity checks) and the thread id.
+ "mov 0xD4(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub()
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
+
+ // 0x00 msg:
+ // 0x00 msg_name ($0)
+ // 0x08 msg_namelen ($0)
+ // 0x10 msg_iov (%r8 + 0x44)
+ // 0x18 msg_iovlen ($1)
+ // 0x20 msg_control (%r8 + 0x54)
+ // 0x28 msg_controllen ($0x18)
+ // 0x30 data:
+ // 0x30 msg_flags/err ($0)
+ // 0x34 secure_mem (%r12)
+ // 0x3C threadId (%r14d)
+ // 0x40 threadFdPub (%r9d)
+ // 0x44 iov:
+ // 0x44 iov_base (%r8 + 0x30)
+ // 0x4C iov_len ($0x14)
+ // 0x54 cmsg:
+ // 0x54 cmsg_len ($0x18)
+ // 0x5C cmsg_level ($1, SOL_SOCKET)
+ // 0x60 cmsg_type ($1, SCM_RIGHTS)
+ // 0x64 threadFdPub (%r9d)
+ // 0x68 threadFd (%r13d)
+ // 0x6C
+ "sub $0x6C, %%r8\n"
+ "xor %%rdx, %%rdx\n" // flags = 0
+ "mov %%rdx, 0x00(%%r8)\n" // msg_name
+ "mov %%edx, 0x08(%%r8)\n" // msg_namelen
+ "mov %%edx, 0x30(%%r8)\n" // msg_flags
+ "mov $1, %%r11d\n"
+ "mov %%r11, 0x18(%%r8)\n" // msg_iovlen
+ "mov %%r11d, 0x5C(%%r8)\n" // cmsg_level
+ "mov %%r11d, 0x60(%%r8)\n" // cmsg_type
+ "lea 0x30(%%r8), %%r11\n"
+ "mov %%r11, 0x44(%%r8)\n" // iov_base
+ "add $0x14, %%r11\n"
+ "mov %%r11, 0x10(%%r8)\n" // msg_iov
+ "add $0x10, %%r11\n"
+ "mov %%r11, 0x20(%%r8)\n" // msg_control
+ "mov $0x14, %%r11d\n"
+ "mov %%r11, 0x4C(%%r8)\n" // iov_len
+ "add $4, %%r11d\n"
+ "mov %%r11, 0x28(%%r8)\n" // msg_controllen
+ "mov %%r11, 0x54(%%r8)\n" // cmsg_len
+ "mov %%r12, 0x34(%%r8)\n" // secure_mem
+ "mov %%r14d, 0x3C(%%r8)\n" // threadId
+ "mov %%r9d, 0x40(%%r8)\n" // threadFdPub
+ "mov %%r9d, 0x64(%%r8)\n" // threadFdPub
+ "mov %%r13d, 0x68(%%r8)\n" // threadFd
+ "mov $46, %%eax\n" // NR_sendmsg
+ "mov %%r8, %%rsi\n" // msg
+ "syscall\n"
+
+ // Release syscall_mutex_. This signals the trusted process that
+ // it can write into the original thread's secure memory again.
+ "mov $10, %%eax\n" // NR_mprotect
+ "lea playground$syscall_mutex(%%rip), %%rdi\n"
+ "mov $4096, %%esi\n"
+ "mov $3, %%edx\n" // PROT_READ | PROT_WRITE
+ "syscall\n"
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
+ "lock; addl $0x80000000, (%%rdi)\n"
+ "jz 30f\n" // exit process (no error message)
+ "mov $1, %%edx\n"
+ "mov %%rdx, %%rsi\n" // FUTEX_WAKE
+ "mov $202, %%eax\n" // NR_futex
+ "syscall\n"
+ "30:xor %%rdi, %%rdi\n"
+ "jmp 27b\n" // exit process (no error message)
+
+ // Reap helper
+ "31:mov %%rax, %%rdi\n"
+ "32:lea -4(%%r8), %%rsi\n"
+ "xor %%rdx, %%rdx\n"
+ "xor %%r10, %%r10\n"
+ "mov $61, %%eax\n" // NR_wait4
+ "syscall\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 32b\n"
+ "mov -4(%%r8), %%eax\n"
+ "test %%rax, %%rax\n"
+ "jnz 26b\n" // exit process (no error message)
+
+ // Release privileges by entering seccomp mode.
+ "mov $157, %%eax\n" // NR_prctl
+ "mov $22, %%edi\n" // PR_SET_SECCOMP
+ "mov $1, %%esi\n"
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "jnz 25b\n" // exit process
+
+ // We can finally start using the stack. Signal handlers no longer pose
+ // a threat to us.
+ "mov %%r8, %%rsp\n"
+
+ // Back in the newly created sandboxed thread, wait for trusted process
+ // to receive request. It is possible for an attacker to make us
+ // continue even before the trusted process is done. This is OK. It'll
+ // result in us putting stale values into the new thread's TLS. But that
+ // data is considered untrusted anyway.
+ "push %%rax\n"
+ "mov $1, %%edx\n" // len = 1
+ "mov %%rsp, %%rsi\n" // buf = %rsp
+ "mov %%r9, %%rdi\n" // fd = threadFdPub
+ "33:xor %%rax, %%rax\n" // NR_read
+ "syscall\n"
+ "cmp $-4, %%rax\n" // EINTR
+ "jz 33b\n"
+ "cmp %%rdx, %%rax\n"
+ "jne 25b\n" // exit process
+ "pop %%rax\n"
+
+ // Return to caller. We are in the new thread, now.
+ "test %%r15, %%r15\n"
+ "jnz 34f\n" // Returning to createTrustedThread()
+
+ // Returning to the place where clone() had been called. We rely on
+ // using rt_sigreturn() for restoring our registers. The caller already
+ // created a signal stack frame, and we patched the register values
+ // with the ones that were in effect prior to calling sandbox_clone().
+ "pop %%r15\n"
+ "34:mov %%r15, 0xA8(%%rsp)\n" // compute new %rip
+ "mov $15, %%eax\n" // NR_rt_sigreturn
+ "syscall\n"
+
+ ".pushsection \".rodata\"\n"
+ "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n"
+ "101:.ascii \"WARNING! This is an expensive system call\\n\"\n"
+ "102:\n"
+ ".popsection\n"
+
+ "999:pop %%rbp\n"
+ "pop %%rbx\n"
+ :
+ : "g"(&args)
+ : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
+ "r13", "r14", "r15", "rsp", "memory"
+#elif defined(__i386__)
+ struct user_desc u;
+ u.entry_number = (typeof u.entry_number)-1;
+ u.base_addr = 0;
+ u.limit = 0xfffff;
+ u.seg_32bit = 1;
+ u.contents = 0;
+ u.read_exec_only = 0;
+ u.limit_in_pages = 1;
+ u.seg_not_present = 0;
+ u.useable = 1;
+ SysCalls sys;
+ if (sys.set_thread_area(&u) < 0) {
+ die("Cannot set up thread local storage");
+ }
+ asm volatile("movw %w0, %%fs"
+ :
+ : "q"(8*u.entry_number+3));
+ asm volatile(
+ "push %%ebx\n"
+ "push %%ebp\n"
+
+ // Signal handlers are process-wide. This means that for security
+ // reasons, we cannot allow that the trusted thread ever executes any
+ // signal handlers.
+ // We prevent the execution of signal handlers by setting a signal
+ // mask that blocks all signals. In addition, we make sure that the
+ // stack pointer is invalid.
+ // We cannot reset the signal mask until after we have enabled
+ // Seccomp mode. Our sigprocmask() wrapper would normally do this by
+ // raising a signal, modifying the signal mask in the kernel-generated
+ // signal frame, and then calling sigreturn(). This presents a bit of
+ // a Catch-22, as all signals are masked and we can therefore not
+ // raise any signal that would allow us to generate the signal stack
+ // frame.
+ // Instead, we have to create the signal stack frame prior to entering
+ // Seccomp mode. This incidentally also helps us to restore the
+ // signal mask to the same value that it had prior to entering the
+ // sandbox.
+ // The signal wrapper for clone() is the second entry point into this
+ // code (by means of sending an IPC to its trusted thread). It goes
+ // through the same steps of creating a signal stack frame on the
+ // newly created thread's stacks prior to cloning. See clone.cc for
+ // details.
+ "mov %0, %%edi\n" // create signal stack before accessing MMX
+ "mov $120+0xF000, %%eax\n" // __NR_clone + 0xF000
+ "mov %%esp, %%ebp\n"
+ "int $0\n" // push a signal stack frame (see clone.cc)
+ "mov %%ebp, 0x1C(%%esp)\n" // pop stack upon call to sigreturn()
+ "mov %%esp, %%ebp\n"
+ "mov $2, %%ebx\n" // how = SIG_SETMASK
+ "pushl $-1\n"
+ "pushl $-1\n"
+ "mov %%esp, %%ecx\n" // set = full mask
+ "xor %%edx, %%edx\n" // old_set = NULL
+ "mov $8, %%esi\n" // mask all 64 signals
+ "mov $175, %%eax\n" // NR_rt_sigprocmask
+ "int $0x80\n"
+ "mov $126, %%eax\n" // NR_sigprocmask
+ "int $0x80\n"
+ "xor %%esp, %%esp\n" // invalidate the stack in all trusted code
+ "movd %%edi, %%mm6\n" // %mm6 = args
+ "lea 999f, %%edi\n" // continue in same thread
+ "movd %%edi, %%mm3\n"
+ "xor %%edi, %%edi\n" // initial sequence number
+ "movd %%edi, %%mm2\n"
+ "jmp 20f\n" // create trusted thread
+
+ // TODO(markus): Coalesce the read() operations by reading into a bigger
+ // buffer.
+
+ // Parameters:
+ // %mm0: thread's side of threadFd
+ // %mm1: processFdPub
+ // %mm3: return address after creation of new trusted thread
+ // %mm5: secure memory region
+ // the page following this one contains the scratch space
+
+ // Local variables:
+ // %mm2: sequence number for trusted calls
+ // %mm4: thread id
+
+ // Temporary variables:
+ // %ebp: system call number
+ // %mm6: secure memory of previous thread
+ // %mm7: temporary variable for spilling data
+
+ // Layout of secure shared memory region (c.f. securemem.h):
+ // 0x00: pointer to the secure shared memory region (i.e. self)
+ // 0x04: sequence number; must match %mm2
+ // 0x08: call type; must match %eax, iff %eax == -1 || %eax == -2
+ // 0x0C: system call number; passed to syscall in %eax
+ // 0x10: first argument; passed to syscall in %ebx
+ // 0x14: second argument; passed to syscall in %ecx
+ // 0x18: third argument; passed to syscall in %edx
+ // 0x1C: fourth argument; passed to syscall in %esi
+ // 0x20: fifth argument; passed to syscall in %edi
+ // 0x24: sixth argument; passed to syscall in %ebp
+ // 0x28: stored return address for clone() system call
+ // 0x2C: stored %ebp value for clone() system call
+ // 0x30: stored %edi value for clone() system call
+ // 0x34: stored %esi value for clone() system call
+ // 0x38: stored %edx value for clone() system call
+ // 0x3C: stored %ecx value for clone() system call
+ // 0x40: stored %ebx value for clone() system call
+ // 0x44: new shared memory for clone()
+ // 0x48: processFdPub for talking to trusted process
+ // 0x4C: cloneFdPub for talking to trusted process
+ // 0x50: set to non-zero, if in debugging mode
+ // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE)
+ // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE)
+ // 0x60: thread id (TLS_TID)
+ // 0x68: threadFdPub (TLS_THREAD_FD)
+ // 0x200-0x1000: securely passed verified file name(s)
+
+ // Layout of (untrusted) scratch space:
+ // 0x00: syscall number; passed in %eax
+ // 0x04: first argument; passed in %ebx
+ // 0x08: second argument; passed in %ecx
+ // 0x0C: third argument; passed in %edx
+ // 0x10: fourth argument; passed in %esi
+ // 0x14: fifth argument; passed in %edi
+ // 0x18: sixth argument; passed in %ebp
+ // 0x1C: return value
+ // 0x20: RDTSCP result (%eax)
+ // 0x24: RDTSCP result (%edx)
+ // 0x28: RDTSCP result (%ecx)
+ // 0x2C: last system call (updated in syscall.cc)
+ // 0x30: number of consecutive calls to a time fnc. (e.g. gettimeofday)
+ // 0x34: nesting level of system calls (for debugging purposes only)
+ // 0x38: signal mask
+ // 0x40: in SEGV handler
+
+ "0:xor %%esp, %%esp\n"
+ "mov $2, %%eax\n" // %mm2 = initial sequence number
+ "movd %%eax, %%mm2\n"
+
+ // Read request from untrusted thread, or from trusted process. In either
+ // case, the data that we read has to be considered untrusted.
+ // read(threadFd, &scratch, 4)
+ "1:mov $3, %%eax\n" // NR_read
+ "movd %%mm0, %%ebx\n" // fd = threadFd
+ "movd %%mm5, %%ecx\n" // secure_mem
+ "add $0x1000, %%ecx\n" // buf = &scratch
+ "mov $4, %%edx\n" // len = 4
+ "2:int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 2b\n"
+ "cmp %%edx, %%eax\n"
+ "jnz 25f\n" // exit process
+
+ // Retrieve system call number. It is crucial that we only dereference
+ // 0x1000(%mm5) exactly once. Afterwards, memory becomes untrusted and
+ // we must use the value that we have read the first time.
+ "mov 0(%%ecx), %%eax\n"
+
+ // If syscall number is -1, execute an unlocked system call from the
+ // secure memory area
+ "cmp $-1, %%eax\n"
+ "jnz 5f\n"
+ "3:movd %%mm2, %%ebp\n"
+ "cmp %%ebp, 0x4-0x1000(%%ecx)\n"
+ "jne 25f\n" // exit process
+ "cmp 0x08-0x1000(%%ecx), %%eax\n"
+ "jne 25f\n" // exit process
+ "mov 0x0C-0x1000(%%ecx), %%eax\n"
+ "mov 0x10-0x1000(%%ecx), %%ebx\n"
+ "mov 0x18-0x1000(%%ecx), %%edx\n"
+ "mov 0x1C-0x1000(%%ecx), %%esi\n"
+ "mov 0x20-0x1000(%%ecx), %%edi\n"
+ "mov 0x24-0x1000(%%ecx), %%ebp\n"
+ "mov 0x14-0x1000(%%ecx), %%ecx\n"
+ "movd %%edi, %%mm4\n"
+ "movd %%ebp, %%mm7\n"
+ "movd %%mm2, %%ebp\n"
+ "movd %%mm5, %%edi\n"
+ "cmp %%ebp, 4(%%edi)\n"
+ "jne 25f\n" // exit process
+ "add $2, %%ebp\n"
+ "movd %%ebp, %%mm2\n"
+ "movd %%mm4, %%edi\n"
+ "movd %%mm7, %%ebp\n"
+
+ // shmget() gets some special treatment. Whenever we return from this
+ // system call, we remember the most recently returned SysV shm id.
+ "cmp $117, %%eax\n" // NR_ipc
+ "jnz 4f\n"
+ "cmp $23, %%ebx\n" // shmget()
+ "jnz 4f\n"
+ "int $0x80\n"
+ "mov %%eax, %%ebp\n"
+ "mov $120, %%eax\n" // NR_clone
+ "mov $17, %%ebx\n" // flags = SIGCHLD
+ "mov $1, %%ecx\n" // stack = 1
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "js 25f\n" // exit process
+ "mov %%eax, %%ebx\n"
+ "jnz 8f\n" // wait for child, then return result
+ "movd %%mm5, %%ebx\n" // start = secure_mem
+ "mov $4096, %%ecx\n" // len = 4096
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "mov $125, %%eax\n" // NR_mprotect
+ "int $0x80\n"
+ "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id
+ "xor %%ebx, %%ebx\n"
+
+ // When debugging messages are enabled, warn about expensive system calls
+ #ifndef NDEBUG
+ "movd %%mm5, %%ecx\n"
+ "cmpw $0, 0x50(%%ecx)\n" // debug mode
+ "jz 27f\n"
+ "mov $4, %%eax\n" // NR_write
+ "mov $2, %%ebx\n" // fd = stderr
+ "lea 101f, %%ecx\n" // "This is an expensive system call"
+ "mov $102f-101f, %%edx\n" // len = strlen(msg)
+ "int $0x80\n"
+ "xor %%ebx, %%ebx\n"
+ #endif
+
+ "jmp 27f\n" // exit program, no message
+ "4:int $0x80\n"
+ "jmp 15f\n" // return result
+
+ // If syscall number is -2, execute locked system call from the
+ // secure memory area
+ "5:jg 12f\n"
+ "cmp $-2, %%eax\n"
+ "jnz 9f\n"
+ "movd %%mm2, %%ebp\n"
+ "cmp %%ebp, 0x4-0x1000(%%ecx)\n"
+ "jne 25f\n" // exit process
+ "cmp %%eax, 0x8-0x1000(%%ecx)\n"
+ "jne 25f\n" // exit process
+
+ // When debugging messages are enabled, warn about expensive system calls
+ #ifndef NDEBUG
+ "cmpw $0, 0x50-0x1000(%%ecx)\n"
+ "jz 6f\n" // debug mode
+ "mov %%ecx, %%ebp\n"
+ "mov $4, %%eax\n" // NR_write
+ "mov $2, %%ebx\n" // fd = stderr
+ "lea 101f, %%ecx\n" // "This is an expensive system call"
+ "mov $102f-101f, %%edx\n" // len = strlen(msg)
+ "int $0x80\n"
+ "mov %%ebp, %%ecx\n"
+ "6:"
+ #endif
+
+ "mov 0x0C-0x1000(%%ecx), %%eax\n"
+ "mov 0x10-0x1000(%%ecx), %%ebx\n"
+ "mov 0x18-0x1000(%%ecx), %%edx\n"
+ "mov 0x1C-0x1000(%%ecx), %%esi\n"
+ "mov 0x20-0x1000(%%ecx), %%edi\n"
+ "mov 0x24-0x1000(%%ecx), %%ebp\n"
+ "mov 0x14-0x1000(%%ecx), %%ecx\n"
+ "movd %%edi, %%mm4\n"
+ "movd %%ebp, %%mm7\n"
+ "movd %%mm2, %%ebp\n"
+ "movd %%mm5, %%edi\n"
+ "cmp %%ebp, 4(%%edi)\n"
+ "jne 25f\n" // exit process
+
+ // clone() has unusual calling conventions and must be handled specially
+ "cmp $120, %%eax\n" // NR_clone
+ "jz 19f\n"
+
+ // exit() terminates trusted thread
+ "cmp $1, %%eax\n" // NR_exit
+ "jz 18f\n"
+
+ // Perform requested system call
+ "movd %%mm4, %%edi\n"
+ "movd %%mm7, %%ebp\n"
+ "int $0x80\n"
+
+ // Unlock mutex
+ "7:movd %%mm2, %%ebp\n"
+ "movd %%mm5, %%edi\n"
+ "cmp %%ebp, 4(%%edi)\n"
+ "jne 25f\n" // exit process
+ "add $2, %%ebp\n"
+ "movd %%ebp, %%mm2\n"
+ "mov %%eax, %%ebp\n"
+ "mov $120, %%eax\n" // NR_clone
+ "mov $17, %%ebx\n" // flags = SIGCHLD
+ "mov $1, %%ecx\n" // stack = 1
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "js 25f\n" // exit process
+ "jz 22f\n" // unlock and exit
+ "mov %%eax, %%ebx\n"
+ "8:xor %%ecx, %%ecx\n"
+ "xor %%edx, %%edx\n"
+ "mov $7, %%eax\n" // NR_waitpid
+ "int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 8b\n"
+ "mov %%ebp, %%eax\n"
+ "jmp 15f\n" // return result
+
+ // If syscall number is -3, read the time stamp counter
+ "9:cmp $-3, %%eax\n"
+ "jnz 10f\n"
+ "rdtsc\n" // sets %edx:%eax
+ "xor %%ecx, %%ecx\n"
+ "jmp 11f\n"
+ "10:cmp $-4, %%eax\n"
+ "jnz 12f\n"
+ "rdtscp\n" // sets %edx:%eax and %ecx
+ "11:movd %%mm5, %%ebx\n"
+ "add $0x1020, %%ebx\n"
+ "mov %%eax, 0(%%ebx)\n"
+ "mov %%edx, 4(%%ebx)\n"
+ "mov %%ecx, 8(%%ebx)\n"
+ "mov %%ebx, %%ecx\n"
+ "mov $12, %%edx\n"
+ "jmp 16f\n" // return result
+
+ // Check in syscallTable whether this system call is unrestricted
+ "12:mov %%eax, %%ebp\n"
+ #ifndef NDEBUG
+ "cmpw $0, 0x50-0x1000(%%ecx)\n"
+ "jnz 13f\n" // debug mode
+ #endif
+ "cmp playground$maxSyscall, %%eax\n"
+ "ja 25f\n" // exit process
+ "shl $3, %%eax\n"
+ "add $playground$syscallTable, %%eax\n"
+ "mov 0(%%eax), %%eax\n"
+ "cmp $1, %%eax\n"
+ "jne 25f\n" // exit process
+
+ // Default behavior for unrestricted system calls is to just execute
+ // them. Read the remaining arguments first.
+ "13:mov $3, %%eax\n" // NR_read
+ "movd %%mm0, %%ebx\n" // fd = threadFd
+ "add $4, %%ecx\n" // buf = &scratch + 4
+ "mov $24, %%edx\n" // len = 6*sizeof(void *)
+ "14:int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 14b\n"
+ "cmp %%edx, %%eax\n"
+ "jnz 25f\n" // exit process
+ "mov %%ebp, %%eax\n"
+ "mov 0x00(%%ecx), %%ebx\n"
+ "mov 0x08(%%ecx), %%edx\n"
+ "mov 0x0C(%%ecx), %%esi\n"
+ "mov 0x10(%%ecx), %%edi\n"
+ "mov 0x14(%%ecx), %%ebp\n"
+ "mov 0x04(%%ecx), %%ecx\n"
+ "cmp $252, %%eax\n" // NR_exit_group
+ "jz 27f\n" // exit program, no message
+ "int $0x80\n"
+
+ // Return result of system call to sandboxed thread
+ "15:movd %%mm5, %%ecx\n" // secure_mem
+ "add $0x101C, %%ecx\n" // buf = &scratch + 28
+ "mov %%eax, (%%ecx)\n"
+ "mov $4, %%edx\n" // len = 4
+ "16:movd %%mm0, %%ebx\n" // fd = threadFd
+ "mov $4, %%eax\n" // NR_write
+ "17:int $0x80\n"
+ "cmp %%edx, %%eax\n"
+ "jz 1b\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 17b\n"
+ "jmp 25f\n" // exit process
+
+ // NR_exit:
+ // Exit trusted thread after cleaning up resources
+ "18:mov %%edi, %%ecx\n" // secure_mem
+ "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub
+ "mov $6, %%eax\n" // NR_close
+ "int $0x80\n"
+ "mov %%ecx, %%ebx\n" // start = secure_mem
+ "mov $8192, %%ecx\n" // length = 8192
+ "xor %%edx, %%edx\n" // prot = PROT_NONE
+ "mov $125, %%eax\n" // NR_mprotect
+ "int $0x80\n"
+ "movd %%mm0, %%ebx\n" // fd = threadFd
+ "mov $6, %%eax\n" // NR_close
+ "int $0x80\n"
+ "mov $120, %%eax\n" // NR_clone
+ "mov $17, %%ebx\n" // flags = SIGCHLD
+ "mov $1, %%ecx\n" // stack = 1
+ "int $0x80\n"
+ "mov %%eax, %%ebx\n"
+ "test %%eax, %%eax\n"
+ "js 25f\n" // exit process
+ "jne 21f\n" // reap helper, exit thread
+ "jmp 22f\n" // unlock mutex
+
+ // NR_clone:
+ // Original trusted thread calls clone() to create new nascent
+ // thread. This thread is (typically) fully privileged and shares all
+ // resources with the caller (i.e. the previous trusted thread),
+ // and by extension it shares all resources with the sandbox'd
+ // threads.
+ "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem
+ "movd %%mm4, %%edi\n" // child_tidptr
+ "mov %%ecx, %%ebp\n" // remember child stack
+ "mov $1, %%ecx\n" // stack = 1
+ "int $0x80\n" // calls NR_clone
+ "cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values
+ "jae 7b\n" // unlock mutex, return result
+ "movd %%mm2, %%edi\n"
+ "add $2, %%edi\n"
+ "movd %%edi, %%mm2\n"
+ "test %%eax, %%eax\n"
+ "jne 15b\n" // return result
+
+ // In nascent thread, now.
+ "sub $2, %%edi\n"
+ "movd %%edi, %%mm2\n"
+
+ // We want to maintain an invalid %esp whenever we access untrusted
+ // memory. This ensures that even if an attacker can trick us into
+ // triggering a SIGSEGV, we will never successfully execute a signal
+ // handler.
+ // Signal handlers are inherently dangerous, as an attacker could trick
+ // us into returning to the wrong address by adjusting the signal stack
+ // right before the handler returns.
+ // N.B. While POSIX is curiously silent about this, it appears that on
+ // Linux, alternate signal stacks are a per-thread property. That is
+ // good. It means that this security mechanism works, even if the
+ // sandboxed thread manages to set up an alternate signal stack.
+ //
+ // TODO(markus): We currently do not support emulating calls to
+ // sys_clone() with a zero (i.e. copy) stack parameter. See clone.cc
+ // for a discussion on how to fix this, if this ever becomes necessary.
+ "movd %%eax, %%mm3\n" // Request to return from clone() when done
+
+ // Get thread id of nascent thread
+ "20:mov $224, %%eax\n" // NR_gettid
+ "int $0x80\n"
+ "movd %%eax, %%mm4\n"
+
+ // Nascent thread creates socketpair() for sending requests to
+ // trusted thread.
+ // We can create the filehandles on the child's stack. Filehandles are
+ // always treated as untrusted.
+ // socketpair(AF_UNIX, SOCK_STREAM, 0, fds)
+ "mov $102, %%eax\n" // NR_socketcall
+ "mov $8, %%ebx\n" // socketpair
+ "sub $8, %%ebp\n" // sv = child_stack
+ "mov %%ebp, -0x04(%%ebp)\n"
+ "movl $0, -0x08(%%ebp)\n" // protocol = 0
+ "movl $1, -0x0C(%%ebp)\n" // type = SOCK_STREAM
+ "movl $1, -0x10(%%ebp)\n" // domain = AF_UNIX
+ "lea -0x10(%%ebp), %%ecx\n"
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "jz 28f\n"
+
+ // If things went wrong, we don't have an (easy) way of signaling
+ // the parent. For our purposes, it is sufficient to fail with a
+ // fatal error.
+ "jmp 25f\n" // exit process
+ "21:xor %%ecx, %%ecx\n"
+ "xor %%edx, %%edx\n"
+ "mov $7, %%eax\n" // NR_waitpid
+ "int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 21b\n"
+ "jmp 23f\n" // exit thread (no message)
+ "22:lea playground$syscall_mutex, %%ebx\n"
+ "mov $4096, %%ecx\n"
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "mov $125, %%eax\n" // NR_mprotect
+ "int $0x80\n"
+ "lock; addl $0x80000000, (%%ebx)\n"
+ "jz 23f\n" // exit thread
+ "mov $1, %%edx\n"
+ "mov %%edx, %%ecx\n" // FUTEX_WAKE
+ "mov $240, %%eax\n" // NR_futex
+ "int $0x80\n"
+ "23:mov $1, %%eax\n" // NR_exit
+ "mov $1, %%ebx\n" // status = 1
+ "24:int $0x80\n"
+ "25:mov $4, %%eax\n" // NR_write
+ "mov $2, %%ebx\n" // fd = stderr
+ "lea 100f, %%ecx\n" // "Sandbox violation detected"
+ "mov $101f-100f, %%edx\n" // len = strlen(msg)
+ "int $0x80\n"
+ "26:mov $1, %%ebx\n"
+ "27:mov $252, %%eax\n" // NR_exit_group
+ "jmp 24b\n"
+
+ // The first page is mapped read-only for use as securely shared memory
+ "28:movd %%mm6, %%edi\n" // %edi = old_shared_mem
+ "mov 0x44(%%edi), %%ebx\n" // addr = secure_mem
+ "movd %%ebx, %%mm5\n" // %mm5 = secure_mem
+ "movd %%mm2, %%esi\n"
+ "cmp %%esi, 4(%%edi)\n"
+ "jne 25b\n" // exit process
+ "mov $125, %%eax\n" // NR_mprotect
+ "mov $4096, %%ecx\n" // len = 4096
+ "mov $1, %%edx\n" // prot = PROT_READ
+ "int $0x80\n"
+
+ // The second page is used as scratch space by the trusted thread.
+ // Make it writable.
+ "mov $125, %%eax\n" // NR_mprotect
+ "add $4096, %%ebx\n" // addr = secure_mem + 4096
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "int $0x80\n"
+
+ // Call clone() to create the new trusted thread.
+ // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+ // CLONE_SYSVSEM|CLONE_UNTRACED, stack, NULL, NULL, NULL)
+ "mov 4(%%ebp), %%eax\n" // threadFd (on child's stack)
+ "movd %%eax, %%mm0\n" // %mm0 = threadFd
+ "mov $120, %%eax\n" // NR_clone
+ "mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR
+ "mov $1, %%ecx\n" // stack = 1
+ "movd 0x48(%%edi), %%mm1\n" // %mm1 = processFdPub
+ "cmp %%esi, 4(%%edi)\n"
+ "jne 25b\n" // exit process
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "js 25b\n" // exit process
+ "jz 0b\n" // invoke trustedThreadFnc()
+
+ // Set up thread local storage
+ "mov $0x51, %%eax\n" // seg_32bit, limit_in_pages, useable
+ "mov %%eax, -0x04(%%ebp)\n"
+ "mov $0xFFFFF, %%eax\n" // limit
+ "mov %%eax, -0x08(%%ebp)\n"
+ "movd %%mm5, %%eax\n"
+ "add $0x58, %%eax\n"
+ "mov %%eax, -0x0C(%%ebp)\n" // base_addr = &secure_mem.TLS
+ "mov %%fs, %%eax\n"
+ "shr $3, %%eax\n"
+ "mov %%eax, -0x10(%%ebp)\n" // entry_number
+ "mov $243, %%eax\n" // NR_set_thread_area
+ "lea -0x10(%%ebp), %%ebx\n"
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "jnz 25b\n" // exit process
+
+ // Copy the caller's signal mask
+ "movd %%mm5, %%edx\n"
+ "mov 0x1038(%%edi), %%eax\n"
+ "mov %%eax, 0x1038(%%edx)\n"
+ "mov 0x103C(%%edi), %%eax\n"
+ "mov %%eax, 0x103C(%%edx)\n"
+
+ // Done creating trusted thread. We can now get ready to return to caller
+ "mov 0(%%ebp), %%esi\n" // %esi = threadFdPub
+ "add $8, %%ebp\n"
+
+ // Check whether this is the initial thread, or a newly created one.
+ // At startup we run the same code as when we create a new thread. At
+ // the very top of this function, you will find that we store 999f
+ // in %%mm3. That is the signal that we should return on the same
+ // stack rather than return to where clone was called.
+ "movd %%mm3, %%eax\n"
+ "movd %%mm2, %%edx\n"
+ "test %%eax, %%eax\n"
+ "jne 29f\n"
+
+ // Returning from clone() into the newly created thread is special. We
+ // cannot unroll the stack, as we just set up a new stack for this
+ // thread. We have to explicitly restore CPU registers to the values
+ // that they had when the program originally called clone().
+ // We patch the register values in the signal stack frame so that we
+ // can ask sigreturn() to restore all registers for us.
+ "sub $0x4, %%ebp\n"
+ "mov 0x28(%%edi), %%eax\n"
+ "mov %%eax, 0x00(%%ebp)\n" // return address
+ "xor %%eax, %%eax\n"
+ "mov %%eax, 0x30(%%ebp)\n" // %eax = 0
+ "mov 0x2C(%%edi), %%eax\n"
+ "mov %%eax, 0x1C(%%ebp)\n" // %ebp
+ "mov 0x30(%%edi), %%eax\n"
+ "mov %%eax, 0x14(%%ebp)\n" // %edi
+ "mov 0x34(%%edi), %%eax\n"
+ "mov %%eax, 0x18(%%ebp)\n" // %esi
+ "mov 0x38(%%edi), %%eax\n"
+ "mov %%eax, 0x28(%%ebp)\n" // %edx
+ "mov 0x3C(%%edi), %%eax\n"
+ "mov %%eax, 0x2C(%%ebp)\n" // %ecx
+ "mov 0x40(%%edi), %%eax\n"
+ "mov %%eax, 0x24(%%ebp)\n" // %ebx
+ "cmp %%edx, 4(%%edi)\n"
+ "jne 25b\n" // exit process
+
+ // Nascent thread launches a helper that doesn't share any of our
+ // resources, except for pages mapped as MAP_SHARED.
+ // clone(SIGCHLD, stack=1)
+ "29:mov $120, %%eax\n" // NR_clone
+ "mov $17, %%ebx\n" // flags = SIGCHLD
+ "mov $1, %%ecx\n" // stack = 1
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "js 25b\n" // exit process
+ "jne 31f\n"
+
+ // Use sendmsg() to send to the trusted process the file handles for
+ // communicating with the new trusted thread. We also send the address
+ // of the secure memory area (for sanity checks) and the thread id.
+ "cmp %%edx, 4(%%edi)\n"
+ "jne 25b\n" // exit process
+
+ // 0x00 socketcall:
+ // 0x00 socket (0x4C(%edi))
+ // 0x04 msg (%ecx + 0x0C)
+ // 0x08 flags ($0)
+ // 0x0C msg:
+ // 0x0C msg_name ($0)
+ // 0x10 msg_namelen ($0)
+ // 0x14 msg_iov (%ecx + 0x34)
+ // 0x18 msg_iovlen ($1)
+ // 0x1C msg_control (%ecx + 0x3C)
+ // 0x20 msg_controllen ($0x14)
+ // 0x24 data:
+ // 0x24 msg_flags/err ($0)
+ // 0x28 secure_mem (%mm5)
+ // 0x2C threadId (%mm4)
+ // 0x30 threadFdPub (%esi)
+ // 0x34 iov:
+ // 0x34 iov_base (%ecx + 0x24)
+ // 0x38 iov_len ($0x10)
+ // 0x3C cmsg:
+ // 0x3C cmsg_len ($0x14)
+ // 0x40 cmsg_level ($1, SOL_SOCKET)
+ // 0x44 cmsg_type ($1, SCM_RIGHTS)
+ // 0x48 threadFdPub (%esi)
+ // 0x4C threadFd (%mm0)
+ // 0x50
+ "lea -0x50(%%ebp), %%ecx\n"
+ "xor %%eax, %%eax\n"
+ "mov %%eax, 0x08(%%ecx)\n" // flags
+ "mov %%eax, 0x0C(%%ecx)\n" // msg_name
+ "mov %%eax, 0x10(%%ecx)\n" // msg_namelen
+ "mov %%eax, 0x24(%%ecx)\n" // msg_flags
+ "inc %%eax\n"
+ "mov %%eax, 0x18(%%ecx)\n" // msg_iovlen
+ "mov %%eax, 0x40(%%ecx)\n" // cmsg_level
+ "mov %%eax, 0x44(%%ecx)\n" // cmsg_type
+ "movl $0x10, 0x38(%%ecx)\n" // iov_len
+ "mov $0x14, %%eax\n"
+ "mov %%eax, 0x20(%%ecx)\n" // msg_controllen
+ "mov %%eax, 0x3C(%%ecx)\n" // cmsg_len
+ "mov 0x4C(%%edi), %%eax\n" // cloneFdPub
+ "mov %%eax, 0x00(%%ecx)\n" // socket
+ "lea 0x0C(%%ecx), %%eax\n"
+ "mov %%eax, 0x04(%%ecx)\n" // msg
+ "add $0x18, %%eax\n"
+ "mov %%eax, 0x34(%%ecx)\n" // iov_base
+ "add $0x10, %%eax\n"
+ "mov %%eax, 0x14(%%ecx)\n" // msg_iov
+ "add $8, %%eax\n"
+ "mov %%eax, 0x1C(%%ecx)\n" // msg_control
+ "mov %%esi, 0x30(%%ecx)\n" // threadFdPub
+ "mov %%esi, 0x48(%%ecx)\n" // threadFdPub
+ "movd %%mm5, %%eax\n"
+ "mov %%eax, 0x28(%%ecx)\n" // secure_mem
+ "movd %%mm4, %%eax\n"
+ "mov %%eax, 0x2C(%%ecx)\n" // threadId
+ "movd %%mm0, %%eax\n"
+ "mov %%eax, 0x4C(%%ecx)\n" // threadFd
+ "mov $16, %%ebx\n" // sendmsg()
+ "mov $102, %%eax\n" // NR_socketcall
+ "int $0x80\n"
+
+ // Release syscall_mutex_. This signals the trusted process that
+ // it can write into the original thread's secure memory again.
+ "mov $125, %%eax\n" // NR_mprotect
+ "lea playground$syscall_mutex, %%ebx\n"
+ "mov $4096, %%ecx\n"
+ "mov $3, %%edx\n" // PROT_READ | PROT_WRITE
+ "int $0x80\n"
+ "movd %%mm2, %%edx\n"
+ "cmp %%edx, 0x4(%%edi)\n"
+ "jnz 25b\n" // exit process
+ "lock; addl $0x80000000, (%%ebx)\n"
+ "jz 30f\n" // exit process (no error message)
+ "mov $1, %%edx\n"
+ "mov %%edx, %%ecx\n" // FUTEX_WAKE
+ "mov $240, %%eax\n" // NR_futex
+ "int $0x80\n"
+ "30:xor %%ebx, %%ebx\n"
+ "jmp 27b\n" // exit process (no error message)
+
+ // Reap helper
+ "31:mov %%eax, %%ebx\n"
+ "32:lea -4(%%ebp), %%ecx\n"
+ "xor %%edx, %%edx\n"
+ "mov $7, %%eax\n" // NR_waitpid
+ "int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 32b\n"
+ "mov -4(%%ebp), %%eax\n"
+ "test %%eax, %%eax\n"
+ "jnz 26b\n" // exit process (no error message)
+
+ // Release privileges by entering seccomp mode.
+ "33:mov $172, %%eax\n" // NR_prctl
+ "mov $22, %%ebx\n" // PR_SET_SECCOMP
+ "mov $1, %%ecx\n"
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "jnz 25b\n" // exit process
+
+ // We can finally start using the stack. Signal handlers no longer pose
+ // a threat to us.
+ "mov %%ebp, %%esp\n"
+
+ // Back in the newly created sandboxed thread, wait for trusted process
+ // to receive request. It is possible for an attacker to make us
+ // continue even before the trusted process is done. This is OK. It'll
+ // result in us putting stale values into the new thread's TLS. But that
+ // data is considered untrusted anyway.
+ "push %%eax\n"
+ "mov $1, %%edx\n" // len = 1
+ "mov %%esp, %%ecx\n" // buf = %esp
+ "mov %%esi, %%ebx\n" // fd = threadFdPub
+ "34:mov $3, %%eax\n" // NR_read
+ "int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 34b\n"
+ "cmp %%edx, %%eax\n"
+ "jne 25b\n" // exit process
+ "pop %%eax\n"
+
+ // Return to caller. We are in the new thread, now.
+ "movd %%mm3, %%ebx\n"
+ "test %%ebx, %%ebx\n"
+ "jnz 35f\n" // Returning to createTrustedThread()
+
+ // Returning to the place where clone() had been called. We rely on
+ // using sigreturn() for restoring our registers. The caller already
+ // created a signal stack frame, and we patched the register values
+ // with the ones that were in effect prior to calling sandbox_clone().
+ "pop %%ebx\n"
+ "35:mov %%ebx, 0x38(%%esp)\n" // compute new %eip
+ "mov $119, %%eax\n" // NR_sigreturn
+ "int $0x80\n"
+
+ ".pushsection \".rodata\"\n"
+ "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n"
+ "101:.ascii \"WARNING! This is an expensive system call\\n\"\n"
+ "102:\n"
+ ".popsection\n"
+
+ "999:pop %%ebp\n"
+ "pop %%ebx\n"
+ :
+ : "g"(&args)
+ : "eax", "ecx", "edx", "edi", "esi", "esp", "memory"
+#else
+#error Unsupported target platform
+#endif
+);
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/x86_decode.cc b/sandbox/linux/seccomp/x86_decode.cc
new file mode 100644
index 0000000..1b55139
--- /dev/null
+++ b/sandbox/linux/seccomp/x86_decode.cc
@@ -0,0 +1,310 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "x86_decode.h"
+
+namespace playground {
+
+#if defined(__x86_64__) || defined(__i386__)
+// Decodes the single x86 instruction starting at *ip and advances *ip to the
+// first byte following it.
+//
+// Parameters:
+//   ip          in/out: address of the instruction to decode; updated to
+//               point just past the decoded instruction.
+//   is64bit     true to decode as 64-bit code: bytes 0x40-0x4F are treated
+//               as REX prefixes and addresses default to 8 bytes.
+//   has_prefix  optional out: whether at least one prefix byte (legacy or
+//               REX) preceded the opcode.
+//   rex_ptr     optional out: address of the last REX prefix byte seen, or 0
+//               if there was none.
+//   mod_rm_ptr  optional out: address of the ModRM byte, or 0 if none.
+//   sib_ptr     optional out: address of the SIB byte, or 0 if none.
+//   is_group    optional out: whether the opcode is a "group" opcode, i.e.
+//               the reg field of its ModRM byte selects the operation.
+//
+// Returns the opcode: the opcode byte itself for one-byte opcodes, or
+// 0x0F00 | <second byte> for two-byte (0x0F-escaped) opcodes.
+//
+// Opcodes that the tables do not describe (type 0) are not decoded any
+// further: only the opcode byte(s) are consumed before returning.
+unsigned short next_inst(const char **ip, bool is64bit, bool *has_prefix,
+                         char **rex_ptr, char **mod_rm_ptr, char **sib_ptr,
+                         bool *is_group) {
+  // Bit layout of the entries in opcode_types[] and group_table[]. A zero
+  // entry marks an opcode that is not decoded. Bits 2..4 (MODE_MASK) select
+  // what follows the opcode/ModRM: an immediate of operand width (IMM; a
+  // single byte if BYTE_OP is also set), a one-byte immediate (IMM_BYTE), or
+  // an absolute address of address width (MEM_ABS). If GROUP is set, the
+  // low seven bits are the index of the group's first entry in
+  // group_table[] instead.
+  enum {
+    BYTE_OP     = (1<<1),    // 0x02
+    IMM         = (1<<2),    // 0x04
+    IMM_BYTE    = (2<<2),    // 0x08
+    MEM_ABS     = (3<<2),    // 0x0C
+    MODE_MASK   = (7<<2),    // 0x1C
+    MOD_RM      = (1<<5),    // 0x20
+    STACK       = (1<<6),    // 0x40
+    GROUP       = (1<<7),    // 0x80
+    GROUP_MASK  = 0x7F,
+  };
+
+  // Entries 0..255 describe one-byte opcodes, entries 256..511 describe the
+  // two-byte opcodes that follow the 0x0F escape byte.
+  static const unsigned char opcode_types[512] = {
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x00 - 0x07
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x00, // 0x08 - 0x0F
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x10 - 0x17
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x18 - 0x1F
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x20 - 0x27
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x28 - 0x2F
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x30 - 0x37
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x38 - 0x3F
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x40 - 0x47
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x48 - 0x4F
+    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0x50 - 0x57
+    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0x58 - 0x5F
+    0x01, 0x01, 0x21, 0x21, 0x00, 0x00, 0x00, 0x00, // 0x60 - 0x67
+    0x45, 0x25, 0x49, 0x29, 0x03, 0x01, 0x03, 0x01, // 0x68 - 0x6F
+    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0x70 - 0x77
+    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0x78 - 0x7F
+    0x27, 0x25, 0x27, 0x29, 0x23, 0x21, 0x23, 0x21, // 0x80 - 0x87
+    0x23, 0x21, 0x23, 0x21, 0x21, 0x21, 0x21, 0x80, // 0x88 - 0x8F
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x90 - 0x97
+    0x01, 0x01, 0x05, 0x01, 0x41, 0x41, 0x01, 0x01, // 0x98 - 0x9F
+    0x0F, 0x0D, 0x0F, 0x0D, 0x03, 0x01, 0x03, 0x01, // 0xA0 - 0xA7
+    0x09, 0x05, 0x03, 0x01, 0x03, 0x01, 0x03, 0x01, // 0xA8 - 0xAF
+    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // 0xB0 - 0xB7
+    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0xB8 - 0xBF
+    0x27, 0x29, 0x01, 0x01, 0x21, 0x21, 0x27, 0x25, // 0xC0 - 0xC7
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x01, // 0xC8 - 0xCF
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x09, 0x01, 0x01, // 0xD0 - 0xD7
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xD8 - 0xDF
+    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0xE0 - 0xE7
+    0x05, 0x05, 0x05, 0x09, 0x03, 0x01, 0x03, 0x01, // 0xE8 - 0xEF
+    0x00, 0x01, 0x00, 0x00, 0x01, 0x01, 0x88, 0x90, // 0xF0 - 0xF7
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x98, 0xA0, // 0xF8 - 0xFF
+    0x00, 0xA8, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, // 0xF00 - 0xF07
+    0x01, 0x01, 0x00, 0x01, 0x00, 0x21, 0x01, 0x00, // 0xF08 - 0xF0F
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF10 - 0xF17
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF18 - 0xF1F
+    0x21, 0x21, 0x21, 0x21, 0x00, 0x00, 0x00, 0x00, // 0xF20 - 0xF27
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF28 - 0xF2F
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, // 0xF30 - 0xF37
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF38 - 0xF3F
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF40 - 0xF47
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF48 - 0xF4F
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF50 - 0xF57
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF58 - 0xF5F
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF60 - 0xF67
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF68 - 0xF6F
+    0x21, 0x00, 0x00, 0x00, 0x21, 0x21, 0x21, 0x00, // 0xF70 - 0xF77
+    0x21, 0x21, 0x00, 0x00, 0x21, 0x21, 0x21, 0x21, // 0xF78 - 0xF7F
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xF80 - 0xF87
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xF88 - 0xF8F
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF90 - 0xF97
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF98 - 0xF9F
+    0x01, 0x01, 0x01, 0x21, 0x29, 0x21, 0x00, 0x00, // 0xFA0 - 0xFA7
+    0x01, 0x01, 0x01, 0x21, 0x29, 0x21, 0x21, 0x21, // 0xFA8 - 0xFAF
+    0x23, 0x21, 0x00, 0x21, 0x00, 0x00, 0x23, 0x21, // 0xFB0 - 0xFB7
+    0x21, 0x00, 0x29, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFB8 - 0xFBF
+    0x21, 0x21, 0x00, 0x21, 0x00, 0x00, 0x00, 0x21, // 0xFC0 - 0xFC7
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xFC8 - 0xFCF
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFD0 - 0xFD7
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFD8 - 0xFDF
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFE0 - 0xFE7
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFE8 - 0xFEF
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xFF0 - 0xFF7
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xFF8 - 0xFFF
+  };
+
+  // Eight entries per group, indexed by the reg field of the ModRM byte.
+  static const unsigned char group_table[56] = {
+    0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Group 1A
+    0x27, 0x27, 0x23, 0x23, 0x23, 0x23, 0x23, 0x23, // Group 3 (Byte)
+    0x25, 0x25, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // Group 3
+    0x23, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Group 4
+    0x21, 0x21, 0x61, 0x21, 0x61, 0x21, 0x61, 0x00, // Group 5
+    0x00, 0x00, 0x21, 0x21, 0x21, 0x00, 0x21, 0x23, // Group 7
+    0x21, 0x00, 0x00, 0x21, 0x21, 0x00, 0x21, 0x00, // Group 7 (Alternate)
+  };
+
+  const unsigned char *insn_ptr = reinterpret_cast<const unsigned char *>(*ip);
+  int operand_width = 4;
+  int address_width = 4;
+  if (is64bit) {
+    address_width = 8;
+  }
+  unsigned char byte, rex = 0;
+  bool found_prefix = false;
+  if (rex_ptr) {
+    *rex_ptr = 0;
+  }
+  if (mod_rm_ptr) {
+    *mod_rm_ptr = 0;
+  }
+  if (sib_ptr) {
+    *sib_ptr = 0;
+  }
+
+  // Consume prefix bytes. The loop is left with "byte" holding the first
+  // opcode byte and insn_ptr pointing at the byte after it.
+  for (;; ++insn_ptr) {
+    switch (byte = *insn_ptr) {
+      case 0x66: // Operand width prefix
+        operand_width ^= 6;                  // toggles between 4 and 2
+        break;
+      case 0x67: // Address width prefix
+        address_width ^= is64bit ? 12 : 6;   // 8 <-> 4, or 4 <-> 2
+        break;
+      case 0x26: // Segment selector prefixes
+      case 0x2e:
+      case 0x36:
+      case 0x3e:
+      case 0x64:
+      case 0x65:
+      case 0xF0: // LOCK / REPNE / REP prefixes
+      case 0xF2:
+      case 0xF3:
+        break;
+      case 0x40: case 0x41: case 0x42: case 0x43: // 64 bit REX prefixes
+      case 0x44: case 0x45: case 0x46: case 0x47:
+      case 0x48: case 0x49: case 0x4A: case 0x4B:
+      case 0x4C: case 0x4D: case 0x4E: case 0x4F:
+        if (is64bit) {
+          if (rex_ptr) {
+            *rex_ptr = (char *)insn_ptr;
+          }
+          rex = byte;
+          found_prefix = true;
+          continue;
+        }
+        // In 32-bit code these bytes are ordinary opcodes.
+        // fall through
+      default:
+        ++insn_ptr;
+        goto no_more_prefixes;
+    }
+    // A REX prefix only takes effect if it immediately precedes the opcode,
+    // so any legacy prefix that follows it cancels it.
+    rex = 0;
+    found_prefix = true;
+  }
+no_more_prefixes:
+  if (has_prefix) {
+    *has_prefix = found_prefix;
+  }
+  if (rex & REX_W) {
+    operand_width = 8;
+  }
+  unsigned char type;
+  unsigned short insn = byte;
+  unsigned int idx = 0;
+  if (byte == 0x0F) {
+    // Two-byte opcode: fetch the second byte and switch to the upper half
+    // of opcode_types[].
+    byte = *insn_ptr++;
+    insn = (insn << 8) | byte;
+    idx = 256;
+  }
+  type = opcode_types[idx + byte];
+  bool found_group = false;
+  unsigned char mod_rm = 0;
+  if (type & GROUP) {
+    // The reg field of the ModRM byte selects the actual operation. Only
+    // peek at the ModRM byte here; it is consumed below if the selected
+    // group entry has MOD_RM set.
+    found_group = true;
+    mod_rm = *insn_ptr;
+    if (mod_rm_ptr) {
+      *mod_rm_ptr = (char *)insn_ptr;
+    }
+    unsigned char group = (type & GROUP_MASK) + ((mod_rm >> 3) & 0x7);
+    if ((type & GROUP_MASK) == 40 && (mod_rm >> 6) == 3) {
+      // Group 7 uses the alternate table row for register operands.
+      group += 8;
+    }
+    type = group_table[group];
+  }
+
+  // We know that we still don't decode some of the more obscure
+  // instructions (type == 0), but for all practical purposes that doesn't
+  // matter. Compilers are unlikely to output them, and even if we encounter
+  // hand-coded assembly, we will soon synchronize to the instruction
+  // stream again.
+  if (type) {
+    if (is64bit && (type & STACK)) {
+      operand_width = 8;
+    }
+    if (type & MOD_RM) {
+      if (mod_rm_ptr) {
+        *mod_rm_ptr = (char *)insn_ptr;
+      }
+      mod_rm = *insn_ptr++;
+      int mod = (mod_rm >> 6) & 0x3;
+      int rm  = 8*(rex & REX_B) + (mod_rm & 0x7);
+      if (mod != 3) {
+        if (address_width == 2) {
+          // 16-bit addressing: no SIB byte, displacements are 0/1/2 bytes.
+          switch (mod) {
+            case 0:
+              if (rm != 6 /* disp16 only, takes the place of [BP] */) {
+                break;
+              }
+              // fall through
+            case 2:
+              insn_ptr++;
+              // fall through
+            case 1:
+              insn_ptr++;
+              break;
+          }
+        } else {
+          if ((rm & 0x7) == 4) {
+            // r/m == 100b announces a SIB byte.
+            if (sib_ptr) {
+              *sib_ptr = (char *)insn_ptr;
+            }
+            unsigned char sib = *insn_ptr++;
+            if (!mod && (sib & 0x7) == 5 /* BP */) {
+              insn_ptr += 4;
+            }
+          }
+          switch (mod) {
+            case 0:
+              // r/m == 101b with mod == 0 means "disp32 only" (RIP-relative
+              // in 64-bit code). REX.B is not decoded for this special
+              // case, so only the low three bits may be compared; otherwise
+              // an instruction carrying REX.B would lose its displacement.
+              if ((rm & 0x7) != 5 /* BP */) {
+                break;
+              }
+              // fall through
+            case 2:
+              insn_ptr += 3;
+              // fall through
+            case 1:
+              insn_ptr++;
+              break;
+          }
+        }
+      }
+    }
+    // Operands that the type tables cannot express.
+    switch (insn) {
+      case 0xC8: // ENTER
+        insn_ptr++;
+        // fall through
+      case 0x9A: // CALL (far)
+      case 0xC2: // RET (near)
+      case 0xCA: // LRET
+      case 0xEA: // JMP (far)
+        insn_ptr += 2;
+        break;
+      case 0xF80: case 0xF81: case 0xF82: case 0xF83: // Jcc (rel)
+      case 0xF84: case 0xF85: case 0xF86: case 0xF87:
+      case 0xF88: case 0xF89: case 0xF8A: case 0xF8B:
+      case 0xF8C: case 0xF8D: case 0xF8E: case 0xF8F:
+        // The displacement never grows beyond 32 bits, even when REX.W has
+        // raised the operand width to 8.
+        insn_ptr += (operand_width == 8) ? 4 : operand_width;
+        break;
+    }
+    switch (type & MODE_MASK) {
+      case IMM:
+        if (!(type & BYTE_OP)) {
+          switch (insn) {
+            case 0xB8: case 0xB9: case 0xBA: case 0xBB:
+            case 0xBC: case 0xBD: case 0xBE: case 0xBF:
+              // MOV reg, imm is the one instruction whose immediate is a
+              // full 64 bits when the operand width is 8.
+              insn_ptr += operand_width;
+              break;
+            default:
+              // All other immediates are capped at 32 bits.
+              insn_ptr += (operand_width == 8) ? 4 : operand_width;
+              break;
+          }
+          break;
+        }
+        // BYTE_OP: the immediate is a single byte.
+        // fall through
+      case IMM_BYTE:
+        insn_ptr++;
+        break;
+      case MEM_ABS:
+        insn_ptr += address_width;
+        break;
+    }
+  }
+  if (is_group) {
+    *is_group = found_group;
+  }
+  *ip = reinterpret_cast<const char *>(insn_ptr);
+  return insn;
+}
+#endif
+
+} // namespace
diff --git a/sandbox/linux/seccomp/x86_decode.h b/sandbox/linux/seccomp/x86_decode.h
new file mode 100644
index 0000000..68f0ab5
--- /dev/null
+++ b/sandbox/linux/seccomp/x86_decode.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef X86_DECODE_H__
+#define X86_DECODE_H__
+namespace playground {
+enum {  // Bit masks for the low four bits of an x86-64 REX prefix (0x40-0x4F)
+ REX_B = 0x01,  // folded into the ModRM r/m register number by next_inst()
+ REX_X = 0x02,  // REX.X; architecturally extends the SIB index field
+ REX_R = 0x04,  // REX.R; architecturally extends the ModRM reg field
+ REX_W = 0x08   // selects an 8 byte operand width in next_inst()
+};
+// Decodes the instruction at *ip, advances *ip past it, returns its opcode.
+unsigned short next_inst(const char **ip, bool is64bit, bool *has_prefix = 0,  // has_prefix: any prefix byte seen
+ char **rex_ptr = 0, char **mod_rm_ptr = 0,  // optional: addresses of the REX / ModRM bytes, 0 if absent
+ char **sib_ptr = 0, bool *is_group = 0);  // optional: address of the SIB byte; whether it is a group opcode
+} // namespace playground
+#endif // X86_DECODE_H__