summaryrefslogtreecommitdiffstats
path: root/sandbox/linux
diff options
context:
space:
mode:
authormarkus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-08-11 21:46:07 +0000
committermarkus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-08-11 21:46:07 +0000
commit0fb2bd939380e4d46bad10eb597bff4980ca7db2 (patch)
tree79d017b24dfb4d91059b856da7b8ad43764d76e6 /sandbox/linux
parent135b165d2bca7a9a7302eb4f771dc713c8100edb (diff)
downloadchromium_src-0fb2bd939380e4d46bad10eb597bff4980ca7db2.zip
chromium_src-0fb2bd939380e4d46bad10eb597bff4980ca7db2.tar.gz
chromium_src-0fb2bd939380e4d46bad10eb597bff4980ca7db2.tar.bz2
Initial version of the Seccomp sandbox. Imported from http://code.google.com/p/seccompsandbox/
Make the seccomp sandbox dependent on the --enable-seccomp-sandbox flag. Review URL: http://codereview.chromium.org/165310. git-svn-id: svn://svn.chromium.org/chrome/trunk/src@23087 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'sandbox/linux')
-rw-r--r--sandbox/linux/seccomp/access.cc77
-rw-r--r--sandbox/linux/seccomp/clone.cc111
-rw-r--r--sandbox/linux/seccomp/debug.cc225
-rw-r--r--sandbox/linux/seccomp/debug.h58
-rw-r--r--sandbox/linux/seccomp/exit.cc32
-rw-r--r--sandbox/linux/seccomp/getpid.cc11
-rw-r--r--sandbox/linux/seccomp/gettid.cc11
-rw-r--r--sandbox/linux/seccomp/ioctl.cc52
-rw-r--r--sandbox/linux/seccomp/ipc.cc337
-rw-r--r--sandbox/linux/seccomp/library.cc1360
-rw-r--r--sandbox/linux/seccomp/library.h164
-rw-r--r--sandbox/linux/seccomp/linux_syscall_support.h3173
-rw-r--r--sandbox/linux/seccomp/madvise.cc75
-rw-r--r--sandbox/linux/seccomp/maps.cc330
-rw-r--r--sandbox/linux/seccomp/maps.h105
-rw-r--r--sandbox/linux/seccomp/mmap.cc69
-rw-r--r--sandbox/linux/seccomp/mprotect.cc66
-rw-r--r--sandbox/linux/seccomp/munmap.cc64
-rw-r--r--sandbox/linux/seccomp/mutex.h149
-rw-r--r--sandbox/linux/seccomp/open.cc92
-rw-r--r--sandbox/linux/seccomp/sandbox.cc421
-rw-r--r--sandbox/linux/seccomp/sandbox.h6
-rw-r--r--sandbox/linux/seccomp/sandbox_impl.h621
-rw-r--r--sandbox/linux/seccomp/securemem.cc97
-rw-r--r--sandbox/linux/seccomp/securemem.h179
-rw-r--r--sandbox/linux/seccomp/socketcall.cc1013
-rw-r--r--sandbox/linux/seccomp/stat.cc110
-rw-r--r--sandbox/linux/seccomp/syscall.cc258
-rw-r--r--sandbox/linux/seccomp/syscall.h14
-rw-r--r--sandbox/linux/seccomp/syscall_table.c118
-rw-r--r--sandbox/linux/seccomp/syscall_table.h30
-rw-r--r--sandbox/linux/seccomp/tls.h151
-rw-r--r--sandbox/linux/seccomp/trusted_process.cc258
-rw-r--r--sandbox/linux/seccomp/trusted_thread.cc1207
-rw-r--r--sandbox/linux/seccomp/x86_decode.cc306
-rw-r--r--sandbox/linux/seccomp/x86_decode.h15
36 files changed, 11365 insertions, 0 deletions
diff --git a/sandbox/linux/seccomp/access.cc b/sandbox/linux/seccomp/access.cc
new file mode 100644
index 0000000..0a0d0e5
--- /dev/null
+++ b/sandbox/linux/seccomp/access.cc
@@ -0,0 +1,77 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_access(const char *pathname, int mode) {
+ Debug::syscall(__NR_access, "Executing handler");
+ size_t len = strlen(pathname);
+ struct Request {
+ int sysnum;
+ long long cookie;
+ Access access_req;
+ char pathname[0];
+ } __attribute__((packed)) *request;
+ char data[sizeof(struct Request) + len];
+ request = reinterpret_cast<struct Request*>(data);
+ request->sysnum = __NR_access;
+ request->cookie = cookie();
+ request->access_req.path_length = len;
+ request->access_req.mode = mode;
+ memcpy(request->pathname, pathname, len);
+
+ long rc;
+ SysCalls sys;
+ if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward access() request [sandbox]");
+ }
+ return static_cast<int>(rc);
+}
+
+bool Sandbox::process_access(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read the fixed-size request (path length and mode) from the sandbox.
+ SysCalls sys;
+ Access access_req;
+ if (read(sys, sandboxFd, &access_req, sizeof(access_req)) !=
+ sizeof(access_req)) {
+ read_parm_failed:
+ die("Failed to read parameters for access() [process]");
+ }
+ long rc = -ENAMETOOLONG; // "long", because sandbox_access() reads a long
+ if (access_req.path_length >= sizeof(mem->pathname)) {
+ char buf[32]; // scratch space for draining the oversized path
+ while (access_req.path_length > 0) {
+ size_t len = access_req.path_length > sizeof(buf) ?
+ sizeof(buf) : access_req.path_length;
+ ssize_t i = read(sys, sandboxFd, buf, len);
+ if (i <= 0) {
+ goto read_parm_failed;
+ }
+ access_req.path_length -= i;
+ }
+ if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to return data from access() [process]");
+ }
+ return false;
+ }
+ SecureMem::lockSystemCall(parentProc, mem);
+ if (read(sys, sandboxFd, mem->pathname, access_req.path_length) !=
+ (ssize_t)access_req.path_length) {
+ goto read_parm_failed;
+ }
+ mem->pathname[access_req.path_length] = '\000';
+
+ // TODO(markus): Implement sandboxing policy
+ Debug::message(("Allowing access to \"" + std::string(mem->pathname) +
+ "\"").c_str());
+
+ // Tell trusted thread to access the file.
+ SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, __NR_access,
+ mem->pathname - (char*)mem + (char*)mem->self,
+ access_req.mode);
+ return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc
new file mode 100644
index 0000000..109e5c6
--- /dev/null
+++ b/sandbox/linux/seccomp/clone.cc
@@ -0,0 +1,111 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_clone(int flags, void* stack, int* pid, int* ctid,
+ void* tls, void *wrapper_sp) {
+ Debug::syscall(__NR_clone, "Executing handler");
+ struct {
+ int sysnum;
+ long long cookie;
+ Clone clone_req;
+ } __attribute__((packed)) request;
+ request.sysnum = __NR_clone;
+ request.cookie = cookie();
+ request.clone_req.flags = flags;
+ request.clone_req.stack = stack;
+ request.clone_req.pid = pid;
+ request.clone_req.ctid = ctid;
+ request.clone_req.tls = tls;
+
+ // Copy the registers that syscallWrapper() saved at wrapper_sp; they are
+ // restored in the new thread before it returns from the wrapped system call.
+ // NOTE(review): the extra sizeof(void *) presumably fills clone_req.ret.
+ #if defined(__x86_64__)
+ memcpy(&request.clone_req.regs64, wrapper_sp,
+ sizeof(request.clone_req.regs64) + sizeof(void *));
+ #elif defined(__i386__)
+ memcpy(&request.clone_req.regs32, wrapper_sp,
+ sizeof(request.clone_req.regs32) + sizeof(void *));
+ #else
+ #error Unsupported target platform
+ #endif
+
+ long rc;
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward clone() request [sandbox]");
+ }
+ return static_cast<int>(rc);
+}
+
+bool Sandbox::process_clone(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ Clone clone_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &clone_req, sizeof(clone_req)) !=sizeof(clone_req)){
+ die("Failed to read parameters for clone() [process]");
+ }
+
+ // TODO(markus): add policy restricting parameters for clone
+ if ((clone_req.flags & ~CLONE_DETACHED) != (CLONE_VM|CLONE_FS|CLONE_FILES|
+ CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|
+ CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID)) {
+ SecureMem::abandonSystemCall(threadFd, -EPERM);
+ return false;
+ } else {
+ SecureMem::Args* newMem = getSecureMem();
+ if (!newMem) {
+ SecureMem::abandonSystemCall(threadFd, -ENOMEM);
+ return false;
+ } else {
+ // clone() has unusual semantics. We don't want to return back into the
+ // trusted thread, but instead we need to continue execution at the IP
+ // where we got called initially.
+ SecureMem::lockSystemCall(parentProc, mem);
+ mem->ret = clone_req.ret;
+ #if defined(__x86_64__)
+ mem->rbp = clone_req.regs64.rbp;
+ mem->rbx = clone_req.regs64.rbx;
+ mem->rcx = clone_req.regs64.rcx;
+ mem->rdx = clone_req.regs64.rdx;
+ mem->rsi = clone_req.regs64.rsi;
+ mem->rdi = clone_req.regs64.rdi;
+ mem->r8 = clone_req.regs64.r8;
+ mem->r9 = clone_req.regs64.r9;
+ mem->r10 = clone_req.regs64.r10;
+ mem->r11 = clone_req.regs64.r11;
+ mem->r12 = clone_req.regs64.r12;
+ mem->r13 = clone_req.regs64.r13;
+ mem->r14 = clone_req.regs64.r14;
+ mem->r15 = clone_req.regs64.r15;
+ #elif defined(__i386__)
+ mem->ret2 = clone_req.regs32.ret2;
+ mem->ebp = clone_req.regs32.ebp;
+ mem->edi = clone_req.regs32.edi;
+ mem->esi = clone_req.regs32.esi;
+ mem->edx = clone_req.regs32.edx;
+ mem->ecx = clone_req.regs32.ecx;
+ mem->ebx = clone_req.regs32.ebx;
+ #else
+ #error Unsupported target platform
+ #endif
+ newMem->sequence = 0;
+ newMem->shmId = -1;
+ mem->newSecureMem = newMem;
+ mem->processFdPub = processFdPub_;
+ mem->cloneFdPub = cloneFdPub_;
+
+ SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, __NR_clone,
+ clone_req.flags, clone_req.stack,
+ clone_req.pid, clone_req.ctid, clone_req.tls);
+ return true;
+ }
+ }
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/debug.cc b/sandbox/linux/seccomp/debug.cc
new file mode 100644
index 0000000..b4f30a4
--- /dev/null
+++ b/sandbox/linux/seccomp/debug.cc
@@ -0,0 +1,225 @@
+#ifndef NDEBUG
+
+#include "debug.h"
+
+namespace playground {
+
+bool Debug::enabled_;
+int Debug::numSyscallNames_;
+const char **Debug::syscallNames_;
+std::map<int, std::string> Debug::syscallNamesMap_;
+
+Debug Debug::debug_;
+
+Debug::Debug() {
+ // Logging is disabled by default, but can be turned on by setting an
+ // appropriate environment variable. Initialize this code from a global
+ // constructor, so that it runs before the sandbox is turned on.
+ enabled_ = !!getenv("SECCOMP_SANDBOX_DEBUGGING");
+
+ // Read names of system calls from header files, if available. Symbolic
+ // names make debugging so much nicer.
+ if (enabled_) {
+ static const char *filenames[] = {
+ #if __WORDSIZE == 64
+ "/usr/include/asm/unistd_64.h",
+ #elif __WORDSIZE == 32
+ "/usr/include/asm/unistd_32.h",
+ #endif
+ "/usr/include/asm/unistd.h",
+ NULL };
+ numSyscallNames_ = 0;
+ for (const char **fn = filenames; *fn; ++fn) {
+ FILE *fp = fopen(*fn, "r");
+ if (fp) {
+ std::string baseName;
+ int baseNum = -1;
+ char buf[80];
+ while (fgets(buf, sizeof(buf), fp)) {
+ // Check if the line starts with "#define"
+ static const char* whitespace = " \t\r\n";
+ char *token, *save;
+ token = strtok_r(buf, whitespace, &save);
+ if (token && !strcmp(token, "#define")) {
+
+ // Only parse identifiers that start with "__NR_"
+ token = strtok_r(NULL, whitespace, &save);
+ if (token) {
+ if (strncmp(token, "__NR_", 5)) {
+ continue;
+ }
+ std::string syscallName(token + 5);
+
+ // Parse the value of the symbol. Try to be forgiving in what
+ // we accept, as the file format might change over time.
+ token = strtok_r(NULL, "\r\n", &save);
+ if (token) {
+ // Some values are defined relative to previous values, we
+ // detect these examples by finding an earlier symbol name
+ // followed by a '+' plus character.
+ bool isRelative = false;
+ char *base = strstr(token, baseName.c_str());
+ if (baseNum >= 0 && base) {
+ base += baseName.length();
+ while (*base == ' ' || *base == '\t') {
+ ++base;
+ }
+ if (*base == '+') {
+ isRelative = true;
+ token = base;
+ }
+ }
+
+ // Skip non-digit characters, but never run past the end of the line.
+ while (*token && (*token < '0' || *token > '9')) {
+ token++;
+ }
+
+ // If we now have a valid datum, enter it into our map.
+ if (*token) {
+ int sysnum = atoi(token);
+
+ // Deal with symbols that are defined relative to earlier
+ // ones.
+ if (isRelative) {
+ sysnum += baseNum;
+ } else {
+ baseNum = sysnum;
+ baseName = syscallName;
+ }
+
+ // Keep track of the highest syscall number that we know
+ // about.
+ if (sysnum >= numSyscallNames_) {
+ numSyscallNames_ = sysnum + 1;
+ }
+
+ syscallNamesMap_[sysnum] = syscallName;
+ }
+ }
+ }
+ }
+ }
+ fclose(fp);
+ break;
+ }
+ }
+ if (numSyscallNames_) {
+ // We cannot make system calls at the time, when we are looking up
+ // the names. So, copy them into a data structure that can be
+ // accessed without having to allocate memory (i.e. no more STL).
+ syscallNames_ = reinterpret_cast<const char **>(
+ calloc(sizeof(char *), numSyscallNames_));
+ for (std::map<int, std::string>::const_iterator iter =
+ syscallNamesMap_.begin();
+ iter != syscallNamesMap_.end();
+ ++iter) {
+ syscallNames_[iter->first] = iter->second.c_str();
+ }
+ }
+ }
+}
+
+void Debug::message(const char* msg) {
+ if (enabled_) {
+ Sandbox::SysCalls sys;
+ size_t len = strlen(msg);
+ if (len && msg[len-1] != '\n') {
+ // Write operations should be atomic, so that we don't interleave
+ // messages from multiple threads. Append a newline, if it is not
+ // already there.
+ char copy[len + 1];
+ memcpy(copy, msg, len);
+ copy[len] = '\n';
+ Sandbox::write(sys, 2, copy, len + 1);
+ } else {
+ Sandbox::write(sys, 2, msg, len);
+ }
+ }
+}
+
+void Debug::syscall(int sysnum, const char* msg, int call) {
+ // This function gets called from the system call wrapper. Avoid calling
+ // any library functions that themselves need system calls.
+ if (enabled_) {
+ const char *sysname = NULL;
+ if (sysnum >= 0 && sysnum < numSyscallNames_) {
+ sysname = syscallNames_[sysnum];
+ }
+ char unnamed[40] = "Unnamed syscall #";
+ if (!sysname) {
+ itoa(strrchr(sysname = unnamed, '\000'), sysnum);
+ }
+ #if defined(__NR_socketcall) || defined(__NR_ipc)
+ char extra[40];
+ *extra = '\000';
+ #if defined(__NR_socketcall)
+ if (sysnum == __NR_socketcall) {
+ static const char* socketcall_name[] = {
+ 0, "socket", "bind", "connect", "listen", "accept", "getsockname",
+ "getpeername", "socketpair", "send", "recv", "sendto","recvfrom",
+ "shutdown", "setsockopt", "getsockopt", "sendmsg", "recvmsg",
+ "accept4"
+ };
+ if (call >= 1 && call < (int)(sizeof(socketcall_name)/sizeof(char *))) {
+ strcat(strcpy(extra, " "), socketcall_name[call]);
+ } else {
+ itoa(strcpy(extra, " #") + 2, call);
+ }
+ }
+ #endif
+ #if defined(__NR_ipc)
+ if (sysnum == __NR_ipc) {
+ static const char* ipc_name[] = {
+ 0, "semop", "semget", "semctl", "semtimedop", 0, 0, 0, 0, 0, 0,
+ "msgsnd", "msgrcv", "msgget", "msgctl", 0, 0, 0, 0, 0, 0,
+ "shmat", "shmdt", "shmget", "shmctl" };
+ if (call >= 1 && call < (int)(sizeof(ipc_name)/sizeof(char *)) &&
+ ipc_name[call]) {
+ strcat(strcpy(extra, " "), ipc_name[call]);
+ } else {
+ itoa(strcpy(extra, " #") + 2, call);
+ }
+ }
+ #endif
+ #else
+ static const char *extra = "";
+ #endif
+ char buf[strlen(sysname) + strlen(extra) + (msg ? strlen(msg) : 0) + 4];
+ strcat(strcat(strcat(strcat(strcpy(buf, sysname), extra), ": "),
+ msg ? msg : ""), "\n");
+ message(buf);
+ }
+}
+
+char* Debug::itoa(char* s, int n) {
+ // Remember return value
+ char *ret = s;
+ // Work on the magnitude as unsigned, so that negating INT_MIN is defined.
+ unsigned int u = n;
+ if (n < 0) {
+ *s++ = '-';
+ u = 0u - u;
+ }
+
+ // Convert to decimal (in reverse order)
+ char *start = s;
+ do {
+ *s++ = '0' + (u % 10);
+ u /= 10;
+ } while (u);
+ *s-- = '\000';
+
+ // Reverse order of digits
+ while (start < s) {
+ char ch = *s;
+ *s-- = *start;
+ *start++ = ch;
+ }
+
+ return ret;
+}
+
+} // namespace
+
+#endif // NDEBUG
diff --git a/sandbox/linux/seccomp/debug.h b/sandbox/linux/seccomp/debug.h
new file mode 100644
index 0000000..728c55c
--- /dev/null
+++ b/sandbox/linux/seccomp/debug.h
@@ -0,0 +1,58 @@
+#ifndef DEBUG_H__
+#define DEBUG_H__
+
+#include <map>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <string.h>
+
+#include "sandbox_impl.h"
+
+namespace playground {
+
+class Debug {
+ public:
+ // If debugging is enabled, write a message to stderr.
+ static void message(const char* msg)
+ #ifndef NDEBUG
+ asm("playground$debugMessage");
+ #else
+ { }
+ #endif
+
+ // If debugging is enabled, write the name of the syscall and an optional
+ // message to stderr.
+ static void syscall(int sysnum, const char* msg, int call = -1)
+ #ifndef NDEBUG
+ ;
+ #else
+ { }
+ #endif
+
+ // Check whether debugging is enabled.
+ static bool isEnabled() {
+ #ifndef NDEBUG
+ return enabled_;
+ #else
+ return false;
+ #endif
+ }
+
+ private:
+ #ifndef NDEBUG
+ Debug();
+ static char* itoa(char* s, int n);
+
+ static Debug debug_;
+
+ static bool enabled_;
+ static int numSyscallNames_;
+ static const char **syscallNames_;
+ static std::map<int, std::string> syscallNamesMap_;
+ #endif
+};
+
+} // namespace
+
+#endif // DEBUG_H__
diff --git a/sandbox/linux/seccomp/exit.cc b/sandbox/linux/seccomp/exit.cc
new file mode 100644
index 0000000..23ebc55
--- /dev/null
+++ b/sandbox/linux/seccomp/exit.cc
@@ -0,0 +1,32 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_exit(int status) {
+ Debug::syscall(__NR_exit, "Executing handler");
+ struct {
+ int sysnum;
+ long long cookie;
+ } __attribute__((packed)) request;
+ request.sysnum = __NR_exit;
+ request.cookie = cookie();
+
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request)) {
+ die("Failed to forward exit() request [sandbox]");
+ }
+ for (;;) {
+ sys._exit(status);
+ }
+}
+
+bool Sandbox::process_exit(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ SecureMem::lockSystemCall(parentProc, mem);
+ SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, __NR_exit, 0);
+ return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/getpid.cc b/sandbox/linux/seccomp/getpid.cc
new file mode 100644
index 0000000..5eb32b8
--- /dev/null
+++ b/sandbox/linux/seccomp/getpid.cc
@@ -0,0 +1,11 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_getpid() {
+ Debug::syscall(__NR_getpid, "Executing handler");
+ return pid_;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/gettid.cc b/sandbox/linux/seccomp/gettid.cc
new file mode 100644
index 0000000..5414510
--- /dev/null
+++ b/sandbox/linux/seccomp/gettid.cc
@@ -0,0 +1,11 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_gettid() {
+ Debug::syscall(__NR_gettid, "Executing handler");
+ return tid();
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/ioctl.cc b/sandbox/linux/seccomp/ioctl.cc
new file mode 100644
index 0000000..ac630a7
--- /dev/null
+++ b/sandbox/linux/seccomp/ioctl.cc
@@ -0,0 +1,52 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_ioctl(int d, int req, void *arg) {
+ Debug::syscall(__NR_ioctl, "Executing handler");
+ struct {
+ int sysnum;
+ long long cookie;
+ IOCtl ioctl_req;
+ } __attribute__((packed)) request;
+ request.sysnum = __NR_ioctl;
+ request.cookie = cookie();
+ request.ioctl_req.d = d;
+ request.ioctl_req.req = req;
+ request.ioctl_req.arg = arg;
+
+ long rc;
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward ioctl() request [sandbox]");
+ }
+ return static_cast<int>(rc);
+}
+
+bool Sandbox::process_ioctl(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ IOCtl ioctl_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &ioctl_req, sizeof(ioctl_req)) !=sizeof(ioctl_req)){
+ die("Failed to read parameters for ioctl() [process]");
+ }
+ int rc = -EINVAL;
+ switch (ioctl_req.req) {
+ case TCGETS:
+ case TIOCGWINSZ:
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_ioctl,
+ ioctl_req.d, ioctl_req.req, ioctl_req.arg);
+ return true;
+ default:
+ std::cerr << "Unsupported ioctl: 0x" << std::hex << ioctl_req.req <<
+ std::endl;
+ SecureMem::abandonSystemCall(threadFd, rc);
+ return false;
+ }
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/ipc.cc b/sandbox/linux/seccomp/ipc.cc
new file mode 100644
index 0000000..f3ad9a2
--- /dev/null
+++ b/sandbox/linux/seccomp/ipc.cc
@@ -0,0 +1,337 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+#ifndef IPC_PRIVATE
+#define IPC_PRIVATE 0
+#endif
+#ifndef IPC_RMID
+#define IPC_RMID 0
+#endif
+#ifndef IPC_64
+#define IPC_64 256
+#endif
+
+#if defined(__NR_shmget)
+void* Sandbox::sandbox_shmat(int shmid, const void* shmaddr, int shmflg) {
+ Debug::syscall(__NR_shmat, "Executing handler");
+
+ struct {
+ int sysnum;
+ long long cookie;
+ ShmAt shmat_req;
+ } __attribute__((packed)) request;
+ request.sysnum = __NR_shmat;
+ request.cookie = cookie();
+ request.shmat_req.shmid = shmid;
+ request.shmat_req.shmaddr = shmaddr;
+ request.shmat_req.shmflg = shmflg;
+
+ long rc;
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward shmat() request [sandbox]");
+ }
+ return reinterpret_cast<void *>(rc);
+}
+
+int Sandbox::sandbox_shmctl(int shmid, int cmd, void* buf) {
+ Debug::syscall(__NR_shmctl, "Executing handler");
+
+ struct {
+ int sysnum;
+ long long cookie;
+ ShmCtl shmctl_req;
+ } __attribute__((packed)) request;
+ request.sysnum = __NR_shmctl;
+ request.cookie = cookie();
+ request.shmctl_req.shmid = shmid;
+ request.shmctl_req.cmd = cmd;
+ request.shmctl_req.buf = buf;
+
+ long rc;
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward shmctl() request [sandbox]");
+ }
+ return static_cast<int>(rc);
+}
+
+int Sandbox::sandbox_shmdt(const void* shmaddr) {
+ Debug::syscall(__NR_shmdt, "Executing handler");
+
+ struct {
+ int sysnum;
+ long long cookie;
+ ShmDt shmdt_req;
+ } __attribute__((packed)) request;
+ request.sysnum = __NR_shmdt;
+ request.cookie = cookie();
+ request.shmdt_req.shmaddr = shmaddr;
+
+ long rc;
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward shmdt() request [sandbox]");
+ }
+ return static_cast<int>(rc);
+}
+
+int Sandbox::sandbox_shmget(int key, size_t size, int shmflg) {
+ Debug::syscall(__NR_shmget, "Executing handler");
+
+ struct {
+ int sysnum;
+ long long cookie;
+ ShmGet shmget_req;
+ } __attribute__((packed)) request;
+ request.sysnum = __NR_shmget;
+ request.cookie = cookie();
+ request.shmget_req.key = key;
+ request.shmget_req.size = size;
+ request.shmget_req.shmflg = shmflg;
+
+ long rc;
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward shmget() request [sandbox]");
+ }
+ return static_cast<int>(rc);
+}
+
+bool Sandbox::process_shmat(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ ShmAt shmat_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &shmat_req, sizeof(shmat_req)) !=
+ sizeof(shmat_req)) {
+ die("Failed to read parameters for shmat() [process]");
+ }
+
+ // We only allow attaching to the shm identifier that was returned by
+ // the most recent call to shmget(IPC_PRIVATE)
+ if (shmat_req.shmaddr || shmat_req.shmflg || shmat_req.shmid != mem->shmId) {
+ mem->shmId = -1;
+ SecureMem::abandonSystemCall(threadFd, -EINVAL);
+ return false;
+ }
+
+ mem->shmId = -1;
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+ __NR_shmat, shmat_req.shmid, shmat_req.shmaddr,
+ shmat_req.shmflg);
+ return true;
+}
+
+bool Sandbox::process_shmctl(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ ShmCtl shmctl_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &shmctl_req, sizeof(shmctl_req)) !=
+ sizeof(shmctl_req)) {
+ die("Failed to read parameters for shmctl() [process]");
+ }
+
+ // The only shmctl() operation that we need to support is removal. This
+ // operation is generally safe.
+ if ((shmctl_req.cmd & ~(IPC_64 | IPC_RMID)) || shmctl_req.buf) {
+ mem->shmId = -1;
+ SecureMem::abandonSystemCall(threadFd, -EINVAL);
+ return false;
+ }
+
+ mem->shmId = -1;
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+ __NR_shmctl, shmctl_req.shmid, shmctl_req.cmd,
+ shmctl_req.buf);
+ return true;
+}
+
+bool Sandbox::process_shmdt(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ ShmDt shmdt_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &shmdt_req, sizeof(shmdt_req)) !=
+ sizeof(shmdt_req)) {
+ die("Failed to read parameters for shmdt() [process]");
+ }
+
+ // Detaching shared memory segments is generally safe, but just in case
+ // of a kernel bug, we make sure that the address does not fall into any
+ // of the reserved memory regions.
+ ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+ (void *)shmdt_req.shmaddr);
+ if (iter != protectedMap_.begin()) {
+ --iter; // also check the region that starts below shmaddr
+ }
+ for (; iter != protectedMap_.end() && iter->first <= shmdt_req.shmaddr;
+ ++iter){
+ if (shmdt_req.shmaddr < reinterpret_cast<void *>(
+ reinterpret_cast<char *>(iter->first) + iter->second) &&
+ shmdt_req.shmaddr >= iter->first) {
+ mem->shmId = -1;
+ SecureMem::abandonSystemCall(threadFd, -EINVAL);
+ return false;
+ }
+ }
+
+ mem->shmId = -1;
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+ __NR_shmdt, shmdt_req.shmaddr);
+ return true;
+}
+
+bool Sandbox::process_shmget(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ ShmGet shmget_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &shmget_req, sizeof(shmget_req)) !=
+ sizeof(shmget_req)) {
+ die("Failed to read parameters for shmget() [process]");
+ }
+
+ // We do not want to allow the sandboxed application to access arbitrary
+ // shared memory regions. We only allow it to access regions that it
+ // created itself.
+ if (shmget_req.key != IPC_PRIVATE || shmget_req.shmflg & ~0777) {
+ mem->shmId = -1;
+ SecureMem::abandonSystemCall(threadFd, -EINVAL);
+ return false;
+ }
+
+ mem->shmId = -1;
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+ __NR_shmget, shmget_req.key, shmget_req.size,
+ shmget_req.shmflg);
+ return true;
+}
+#endif
+
+#if defined(__NR_ipc)
+#ifndef SHMAT
+#define SHMAT 21
+#endif
+#ifndef SHMDT
+#define SHMDT 22
+#endif
+#ifndef SHMGET
+#define SHMGET 23
+#endif
+#ifndef SHMCTL
+#define SHMCTL 24
+#endif
+
+int Sandbox::sandbox_ipc(unsigned call, int first, int second, int third,
+ void* ptr, long fifth) {
+ Debug::syscall(__NR_ipc, "Executing handler", call);
+ struct {
+ int sysnum;
+ long long cookie;
+ IPC ipc_req;
+ } __attribute__((packed)) request;
+ request.sysnum = __NR_ipc;
+ request.cookie = cookie();
+ request.ipc_req.call = call;
+ request.ipc_req.first = first;
+ request.ipc_req.second = second;
+ request.ipc_req.third = third;
+ request.ipc_req.ptr = ptr;
+ request.ipc_req.fifth = fifth;
+
+ long rc;
+ SysCalls sys;
+ if (write(sys, processFdPub(), &request, sizeof(request)) !=
+ sizeof(request) ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward ipc() request [sandbox]");
+ }
+ return static_cast<int>(rc);
+}
+
+bool Sandbox::process_ipc(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ IPC ipc_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &ipc_req, sizeof(ipc_req)) != sizeof(ipc_req)) {
+ die("Failed to read parameters for ipc() [process]");
+ }
+
+ // We do not support all of the SysV IPC calls. In fact, we only support
+ // the minimum feature set necessary for Chrome's renderers to share memory
+ // with the X server.
+ switch (ipc_req.call) {
+ case SHMAT: {
+ // We only allow attaching to the shm identifier that was returned by
+ // the most recent call to shmget(IPC_PRIVATE)
+ if (ipc_req.ptr || ipc_req.second || ipc_req.first != mem->shmId) {
+ goto deny;
+ }
+ accept: // shared by every permitted call (reached via goto)
+ mem->shmId = -1;
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem,
+ __NR_ipc, ipc_req.call, ipc_req.first,
+ ipc_req.second, ipc_req.third, ipc_req.ptr,
+ ipc_req.fifth);
+ return true;
+ }
+ case SHMCTL:
+ // The only shmctl() operation that we need to support is removal. This
+ // operation is generally safe.
+ if ((ipc_req.second & ~(IPC_64 | IPC_RMID)) || ipc_req.ptr) {
+ goto deny;
+ } else {
+ goto accept;
+ }
+ case SHMDT: {
+ // Detaching shared memory segments is generally safe, but just in case
+ // of a kernel bug, we make sure that the address does not fall into any
+ // of the reserved memory regions.
+ ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+ (void *)ipc_req.ptr);
+ if (iter != protectedMap_.begin()) {
+ --iter; // also check the region that starts below ipc_req.ptr
+ }
+ for (; iter != protectedMap_.end() && iter->first <=ipc_req.ptr; ++iter){
+ if (ipc_req.ptr < reinterpret_cast<void *>(
+ reinterpret_cast<char *>(iter->first) + iter->second) &&
+ ipc_req.ptr >= iter->first) {
+ goto deny;
+ }
+ }
+ goto accept;
+ }
+ case SHMGET:
+ // We do not want to allow the sandboxed application to access arbitrary
+ // shared memory regions. We only allow it to access regions that it
+ // created itself.
+ if (ipc_req.first != IPC_PRIVATE || ipc_req.third & ~0777) {
+ goto deny;
+ } else {
+ goto accept;
+ }
+ default:
+ // Other than SysV shared memory, we do not actually need to support any
+ // other SysV IPC calls.
+ deny: // every rejected call fails with -EINVAL
+ mem->shmId = -1;
+ SecureMem::abandonSystemCall(threadFd, -EINVAL);
+ return false;
+ }
+}
+#endif
+
+} // namespace
diff --git a/sandbox/linux/seccomp/library.cc b/sandbox/linux/seccomp/library.cc
new file mode 100644
index 0000000..a6c406e
--- /dev/null
+++ b/sandbox/linux/seccomp/library.cc
@@ -0,0 +1,1360 @@
+#define XOPEN_SOURCE 500
+#include <algorithm>
+#include <elf.h>
+#include <errno.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <iostream>
+#include <linux/unistd.h>
+#include <set>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "debug.h"
+#include "library.h"
+#include "sandbox_impl.h"
+#include "syscall.h"
+#include "syscall_table.h"
+#include "x86_decode.h"
+
+// Word-size neutral ELF aliases. Every Elf_* type and ELF_* macro below maps
+// to the 64-bit ELF definition on x86-64 and to the 32-bit definition on
+// x86-32, so that the rest of this file can be written once for both targets.
+// Note that Elf_Rel aliases Elf64_Rela on x86-64 (matching ".rela.plt"), but
+// Elf32_Rel on x86-32 (matching ".rel.plt"). Any other target is rejected at
+// compile time.
+#if defined(__x86_64__)
+typedef Elf64_Phdr Elf_Phdr;
+typedef Elf64_Rela Elf_Rel;
+
+typedef Elf64_Half Elf_Half;
+typedef Elf64_Word Elf_Word;
+typedef Elf64_Sword Elf_Sword;
+typedef Elf64_Xword Elf_Xword;
+typedef Elf64_Sxword Elf_Sxword;
+typedef Elf64_Off Elf_Off;
+typedef Elf64_Section Elf_Section;
+typedef Elf64_Versym Elf_Versym;
+
+#define ELF_ST_BIND ELF64_ST_BIND
+#define ELF_ST_TYPE ELF64_ST_TYPE
+#define ELF_ST_INFO ELF64_ST_INFO
+#define ELF_R_SYM ELF64_R_SYM
+#define ELF_R_TYPE ELF64_R_TYPE
+#define ELF_R_INFO ELF64_R_INFO
+
+#define ELF_REL_PLT ".rela.plt"
+#define ELF_JUMP_SLOT R_X86_64_JUMP_SLOT
+#elif defined(__i386__)
+typedef Elf32_Phdr Elf_Phdr;
+typedef Elf32_Rel Elf_Rel;
+
+typedef Elf32_Half Elf_Half;
+typedef Elf32_Word Elf_Word;
+typedef Elf32_Sword Elf_Sword;
+typedef Elf32_Xword Elf_Xword;
+typedef Elf32_Sxword Elf_Sxword;
+typedef Elf32_Off Elf_Off;
+typedef Elf32_Section Elf_Section;
+typedef Elf32_Versym Elf_Versym;
+
+#define ELF_ST_BIND ELF32_ST_BIND
+#define ELF_ST_TYPE ELF32_ST_TYPE
+#define ELF_ST_INFO ELF32_ST_INFO
+#define ELF_R_SYM ELF32_R_SYM
+#define ELF_R_TYPE ELF32_R_TYPE
+#define ELF_R_INFO ELF32_R_INFO
+
+#define ELF_REL_PLT ".rel.plt"
+#define ELF_JUMP_SLOT R_386_JMP_SLOT
+#else
+#error Unsupported target platform
+#endif
+
+namespace playground {
+
+// Storage for the class-level pointers to the kernel-provided entry points.
+// They are read by patchSystemCallsInFunction() and patchVDSO() below; the
+// code that assigns them is not part of this section of the file.
+char* Library::__kernel_vsyscall;
+char* Library::__kernel_sigreturn;
+char* Library::__kernel_rt_sigreturn;
+
+// Copies up to "len" bytes from "src" to "dst" without risking a segmentation
+// fault when part of the source range is not readable.
+//
+// Data inside the first memory range of the VDSO is copied directly (clamped
+// to the end of that range). Everything else is pushed through a socketpair:
+// write() reports EFAULT for unreadable memory instead of raising a signal.
+//
+// Returns "dst" on success; short copies are possible if the end of a mapping
+// is reached. Returns NULL if nothing could be copied or if the helper
+// socket could not be created or used.
+char* Library::getBytes(char* dst, const char* src, ssize_t len) {
+  // Some kernels don't allow accessing the VDSO from write()
+  if (isVDSO_ &&
+      src >= memory_ranges_.begin()->second.start &&
+      src <= memory_ranges_.begin()->second.stop) {
+    ssize_t max =
+        reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) - src;
+    if (len > max) {
+      len = max;
+    }
+    // A negative length would be converted to a huge size_t by memcpy().
+    if (len > 0) {
+      memcpy(dst, src, len);
+    }
+    return dst;
+  }
+
+  // Read up to "len" bytes from "src" and copy them to "dst". Short
+  // copies are possible, if we are at the end of a mapping. Returns
+  // NULL, if the operation failed completely.
+  static int helper_socket[2];
+  Sandbox::SysCalls sys;
+  if (!helper_socket[0] && !helper_socket[1]) {
+    // Copy data through a socketpair, as this allows us to access it
+    // without incurring a segmentation fault.
+    if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, helper_socket)) {
+      // Without a socketpair both descriptors are still zero, and the loop
+      // below would write to and read from file descriptor 0. Fail instead;
+      // the next call tries to create the socketpair again.
+      helper_socket[0] = helper_socket[1] = 0;
+      return NULL;
+    }
+  }
+  char* ptr = dst;
+  // Chunk size: whole pages (up to the next page boundary) until the first
+  // EFAULT, single bytes afterwards to find the exact end of the mapping.
+  int inc = 4096;
+  while (len > 0) {
+    ssize_t l = inc == 1 ? inc : 4096 - (reinterpret_cast<long>(src) & 0xFFF);
+    if (l > len) {
+      l = len;
+    }
+    l = NOINTR_SYS(sys.write(helper_socket[0], src, l));
+    if (l == -1) {
+      if (sys.my_errno == EFAULT) {
+        if (inc == 1) {
+          if (ptr == dst) {
+            return NULL;
+          }
+          break;
+        }
+        inc = 1;
+        continue;
+      } else {
+        return NULL;
+      }
+    }
+    // Retry on EINTR, just like the write() above. Giving up here would
+    // leave the bytes that were just written queued in the socket, where
+    // they would be picked up by a later call.
+    l = NOINTR_SYS(sys.read(helper_socket[1], ptr, l));
+    if (l <= 0) {
+      return NULL;
+    }
+    ptr += l;
+    src += l;
+    len -= l;
+  }
+  return dst;
+}
+
+// Reads "len" bytes located at "offset" within this library into "buf".
+//
+// The offset is resolved against memory_ranges_. If the request does not fit
+// into the range that was found, and this object holds exactly one range
+// starting at offset zero (not the VDSO, no maps_), the mapping is grown with
+// mremap() so that the requested data becomes available.
+//
+// Returns "buf" on success. On failure the buffer is zero-filled and NULL is
+// returned.
+char *Library::get(Elf_Addr offset, char *buf, size_t len) {
+  if (!valid_) {
+    memset(buf, 0, len);
+    return NULL;
+  }
+  RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset);
+  if (iter == memory_ranges_.end()) {
+    memset(buf, 0, len);
+    return NULL;
+  }
+  offset -= iter->first;
+  const size_t size = reinterpret_cast<char *>(iter->second.stop) -
+                      reinterpret_cast<char *>(iter->second.start);
+  // "size - len" is an unsigned computation. Test "len > size" first, as the
+  // subtraction would otherwise wrap around and accept requests that are
+  // larger than the entire range.
+  if (len > size || offset > size - len) {
+    bool expanded = false;
+    if (!maps_ && memory_ranges_.size() == 1 &&
+        !memory_ranges_.begin()->first && !isVDSO_) {
+      // We are in the child and have exactly one mapping covering the whole
+      // library. We are trying to read data past the end of what is currently
+      // mapped. Check if we can expand the memory mapping to recover the
+      // needed data
+      Sandbox::SysCalls sys;
+      long new_size = (offset + len + 4095) & ~4095;
+      void *new_start = sys.mremap(iter->second.start, size, new_size,
+                                   MREMAP_MAYMOVE);
+      if (new_start != MAP_FAILED) {
+        memory_ranges_.clear();
+        memory_ranges_.insert(std::make_pair(0,
+            Range(new_start, reinterpret_cast<void *>(
+                      reinterpret_cast<char *>(new_start) + new_size),
+                  PROT_READ)));
+        // clear() invalidated the old iterator; point it at the new range.
+        iter = memory_ranges_.begin();
+        expanded = true;
+      }
+    }
+    if (!expanded) {
+      memset(buf, 0, len);
+      return NULL;
+    }
+  }
+  char *src = reinterpret_cast<char *>(iter->second.start) + offset;
+  memset(buf, 0, len);
+  if (!getBytes(buf, src, len)) {
+    return NULL;
+  }
+  return buf;
+}
+
+// Reads the NUL-terminated string located at "offset" within this library.
+//
+// At most sizeof(buf) - 1 (4095) bytes are fetched through getBytes(); the
+// result is everything up to the first NUL byte in that buffer. An empty
+// string is returned if the library is not valid, if no memory range is
+// found for the offset, or if the first byte read is NUL.
+std::string Library::get(Elf_Addr offset) {
+ if (!valid_) {
+ return "";
+ }
+ RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset);
+ if (iter == memory_ranges_.end()) {
+ return "";
+ }
+ offset -= iter->first;
+ size_t size = reinterpret_cast<char *>(iter->second.stop) -
+ reinterpret_cast<char *>(iter->second.start);
+ // NOTE(review): "size - 4096" is unsigned and wraps around if a range is
+ // ever smaller than 4096 bytes -- confirm that ranges are always at least
+ // one page long.
+ if (offset > size - 4096) {
+ if (!maps_ && memory_ranges_.size() == 1 &&
+ !memory_ranges_.begin()->first && !isVDSO_) {
+ // We are in the child and have exactly one mapping covering the whole
+ // library. We are trying to read data past the end of what is currently
+ // mapped. Check if we can expand the memory mapping to recover the
+ // needed data. We assume that strings are never longer than 4kB.
+ Sandbox::SysCalls sys;
+ long new_size = (offset + 4096 + 4095) & ~4095;
+ void *new_start = sys.mremap(iter->second.start, size, new_size,
+ MREMAP_MAYMOVE);
+ if (new_start != MAP_FAILED) {
+ memory_ranges_.clear();
+ memory_ranges_.insert(std::make_pair(0,
+ Range(new_start, reinterpret_cast<void *>(
+ reinterpret_cast<char *>(new_start) + new_size),
+ PROT_READ)));
+ iter = memory_ranges_.begin();
+ goto ok;
+ }
+ }
+ // Unlike the buffer-based get(), a failed or skipped expansion is not an
+ // error here: execution falls through and reads whatever is accessible.
+ }
+ok:
+ const char *start = reinterpret_cast<char *>(iter->second.start) + offset;
+ // NOTE(review): "offset" is added to both ends, so "stop - start" is always
+ // the full size of the range rather than the number of bytes remaining
+ // after "offset". The read can therefore extend past the end of the range;
+ // getBytes() clamps VDSO reads and tolerates EFAULT otherwise. Confirm
+ // whether this is intended.
+ const char *stop = reinterpret_cast<char *>(iter->second.stop) + offset;
+ char buf[4096] = { 0 };
+ // The return value is ignored; on failure "buf" stays zero-filled as far as
+ // it was not written, which yields a shorter or empty string.
+ getBytes(buf, start, stop - start >= (int)sizeof(buf) ?
+ sizeof(buf) - 1 : stop - start);
+ start = buf;
+ stop = buf;
+ // The last byte of "buf" is never written (at most sizeof(buf) - 1 bytes
+ // are requested), so this scan always terminates inside the buffer.
+ while (*stop) {
+ ++stop;
+ }
+ std::string s = stop > start ? std::string(start, stop - start) : "";
+ return s;
+}
+
+// Fetches "len" bytes at "offset" into "buf". When a Maps object is attached
+// the request is forwarded to it; otherwise the data is read locally via
+// get(). An invalid library zero-fills "buf" and yields NULL.
+char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) {
+  if (valid_) {
+    if (!maps_) {
+      return get(offset, buf, len);
+    }
+    return maps_->forwardGetRequest(this, offset, buf, len);
+  }
+  memset(buf, 0, len);
+  return NULL;
+}
+
+// String flavor of getOriginal(): returns the string found at "offset",
+// forwarding the request to the attached Maps object if there is one and
+// reading locally via get() otherwise. An invalid library yields "".
+std::string Library::getOriginal(Elf_Addr offset) {
+  if (valid_) {
+    if (!maps_) {
+      return get(offset);
+    }
+    return maps_->forwardGetRequest(this, offset);
+  }
+  return "";
+}
+
+// Returns the cached ELF header, or NULL if this library is not valid.
+const Elf_Ehdr* Library::getEhdr() {
+  if (valid_) {
+    return &ehdr_;
+  }
+  return NULL;
+}
+
+// Looks up the section header recorded under the name "section". Returns
+// NULL if the library is not valid or has no section with that name.
+const Elf_Shdr* Library::getSection(const std::string& section) {
+  if (valid_) {
+    SectionTable::const_iterator entry = section_table_.find(section);
+    if (entry != section_table_.end()) {
+      return &entry->second.second;
+    }
+  }
+  return NULL;
+}
+
+// Returns the index recorded for the section named "section", or -1 if the
+// library is not valid or has no section with that name.
+const int Library::getSectionIndex(const std::string& section) {
+  if (valid_) {
+    SectionTable::const_iterator entry = section_table_.find(section);
+    if (entry != section_table_.end()) {
+      return entry->second.first;
+    }
+  }
+  return -1;
+}
+
+// Returns the address (the recorded PLT entry value plus asr_offset_) that
+// belongs to "symbol", or NULL if there is no PLT entry with that name.
+void **Library::getRelocation(const std::string& symbol) {
+  PltTable::const_iterator entry = plt_entries_.find(symbol);
+  if (entry != plt_entries_.end()) {
+    return reinterpret_cast<void **>(asr_offset_ + entry->second);
+  }
+  return NULL;
+}
+
+// Returns the address (st_value plus asr_offset_) of "symbol". Unknown
+// symbols and symbols whose st_value is zero yield NULL.
+void *Library::getSymbol(const std::string& symbol) {
+  SymbolTable::const_iterator entry = symbols_.find(symbol);
+  if (entry != symbols_.end() && entry->second.st_value) {
+    return asr_offset_ + entry->second.st_value;
+  }
+  return NULL;
+}
+
+// Re-applies the recorded protection bits to every memory range of this
+// library, adding PROT_WRITE when "state" is true. The result of the
+// individual mprotect() calls is not checked.
+void Library::makeWritable(bool state) const {
+  const int write_bit = state ? PROT_WRITE : 0;
+  RangeMap::const_iterator it = memory_ranges_.begin();
+  while (it != memory_ranges_.end()) {
+    const Range& r = it->second;
+    char* const first = reinterpret_cast<char *>(r.start);
+    char* const last = reinterpret_cast<char *>(r.stop);
+    long bytes = last - first;
+    Sandbox::SysCalls sys;
+    sys.mprotect(r.start, bytes, r.prot | write_bit);
+    ++it;
+  }
+}
+
+// Decides whether the instruction with opcode "insn" may be moved out of the
+// function that is being patched.
+//
+// "insn" is the opcode as reported by next_inst(). Callers in this file
+// compare it against values such as 0x0F05 (SYSCALL) and 0x0F80..0x0F8F
+// (Jcc), i.e. two-byte opcodes carry the 0x0F escape byte in the high byte.
+// Only single-byte opcodes are ever reported as safe.
+bool Library::isSafeInsn(unsigned short insn) {
+  // Check if the instruction has no unexpected side-effects. If so, it can
+  // be safely relocated from the function that we are patching into the
+  // out-of-line scratch space that we are setting up. This is often necessary
+  // to make room for the JMP into the scratch space.
+  //
+  // The first test must be restricted to single-byte opcodes below 0x40.
+  // Looking only at the low byte would also accept two-byte opcodes such as
+  // 0x0F05 (SYSCALL) or 0x0F34 (SYSENTER), which must never be relocated
+  // without being intercepted.
+  return (insn < 0x40 && (insn & 0x7) < 0x6
+          /* ADD, OR, ADC, SBB, AND, SUB, XOR, CMP */) ||
+  #if defined(__x86_64__)
+         insn == 0x63 /* MOVSXD */ ||
+  #endif
+         (insn >= 0x80 && insn <= 0x8E /* ADD, OR, ADC,
+         SBB, AND, SUB, XOR, CMP, TEST, XCHG, MOV, LEA */) ||
+         (insn == 0x90) || /* NOP */
+         (insn >= 0xA0 && insn <= 0xA9) /* MOV, TEST */ ||
+         (insn >= 0xB0 && insn <= 0xBF /* MOV */) ||
+         (insn >= 0xC0 && insn <= 0xC1) || /* Bit Shift */
+         (insn >= 0xD0 && insn <= 0xD3) || /* Bit Shift */
+         (insn >= 0xC6 && insn <= 0xC7 /* MOV */) ||
+         (insn == 0xF7) /* TEST, NOT, NEG, MUL, IMUL, DIV, IDIV */;
+}
+
+// Hands out "needed" bytes of scratch memory close to "near".
+//
+// "*extraSpace" / "*extraLength" describe the current scratch page and the
+// number of bytes still unused in it; space is handed out from the end of
+// the page downwards. A fresh 4096 byte page is requested from "maps" when
+// the current one is too small or more than 1536MB away from "near"; the
+// previous page, if any, is then made read-only and executable.
+//
+// Calls Sandbox::die(), which is expected not to return, if no scratch page
+// is available.
+char* Library::getScratchSpace(const Maps* maps, char* near, int needed,
+                               char** extraSpace, int* extraLength) {
+  const bool need_new_page =
+      needed > *extraLength ||
+      labs(*extraSpace - reinterpret_cast<char *>(near)) > (1536 << 20);
+  if (need_new_page) {
+    if (*extraSpace) {
+      // Retire the old page: it must not stay writable once we are done.
+      Sandbox::SysCalls sys;
+      sys.mprotect(*extraSpace, 4096, PROT_READ|PROT_EXEC);
+    }
+    // The replacement page starts out both writable and executable.
+    *extraLength = 4096;
+    *extraSpace = maps->allocNearAddr(near, *extraLength,
+                                      PROT_READ|PROT_WRITE|PROT_EXEC);
+  }
+  if (!*extraSpace) {
+    Sandbox::die("Insufficient space to intercept system call");
+  }
+  *extraLength -= needed;
+  return *extraSpace + *extraLength;
+}
+
+// Rewrites the machine code between "start" and "end" so that system calls
+// no longer enter the kernel directly.
+//
+// The code is disassembled twice. The first pass records the targets of all
+// direct branches. The second pass keeps the last few instructions in a ring
+// buffer and, for each system call instruction (x86-64: SYSCALL; x86-32:
+// INT $0x80 or a %gs-relative CALL that resolves to __kernel_vsyscall),
+// replaces it and enough neighboring relocatable instructions with a five
+// byte JMP/CALL into scratch space obtained from getScratchSpace(). The
+// scratch code contains the displaced instructions and forwards to
+// syscallWrapper. When processing the VDSO on x86-64 with a non-zero
+// vsys_offset_, indirect CALLs are redirected in a similar way.
+//
+// If fewer than five bytes can be freed up, a system call is replaced with
+// INT $0 instead; an indirect CALL that cannot be redirected is fatal.
+//
+// "maps", "extraSpace" and "extraLength" are passed through to
+// getScratchSpace().
+void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
+ char *end, char** extraSpace,
+ int* extraLength) {
+ // First pass: collect the destination of every direct branch (short and
+ // near Jcc/JMP, and CALL) so that we never move an instruction that some
+ // other instruction jumps to.
+ std::set<char *> branch_targets;
+ for (char *ptr = start; ptr < end; ) {
+ unsigned short insn = next_inst((const char **)&ptr, __WORDSIZE == 64);
+ char *target;
+ if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ || insn == 0xEB /* JMP */) {
+ target = ptr + (reinterpret_cast<signed char *>(ptr))[-1];
+ } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ ||
+ (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) {
+ target = ptr + (reinterpret_cast<int *>(ptr))[-1];
+ } else {
+ continue;
+ }
+ branch_targets.insert(target);
+ }
+ struct Code {
+ char* addr;
+ int len;
+ unsigned short insn;
+ bool is_ip_relative;
+ } code[5] = { { 0 } };
+ int codeIdx = 0;
+ char* ptr = start;
+ while (ptr < end) {
+ // Keep a ring-buffer of the last few instruction in order to find the
+ // correct place to patch the code.
+ char *mod_rm;
+ code[codeIdx].addr = ptr;
+ code[codeIdx].insn = next_inst((const char **)&ptr, __WORDSIZE == 64,
+ 0, 0, &mod_rm, 0, 0);
+ code[codeIdx].len = ptr - code[codeIdx].addr;
+ code[codeIdx].is_ip_relative = mod_rm && (*mod_rm & 0xC7) == 0x5;
+
+ // Whenever we find a system call, we patch it with a jump to out-of-line
+ // code that redirects to our system call wrapper.
+ bool is_syscall = true;
+ #if defined(__x86_64__)
+ bool is_indirect_call = false;
+ if (code[codeIdx].insn == 0x0F05 /* SYSCALL */ ||
+ // In addition, on x86-64, we need to redirect all CALLs between the
+ // VDSO and the VSyscalls page. We want these to jump to our own
+ // modified copy of the VSyscalls. As we know that the VSyscalls are
+ // always more than 2GB away from the VDSO, the compiler has to
+ // generate some form of indirect jumps. We can find all indirect
+ // CALLs and redirect them to a separate scratch area, where we can
+ // inspect the destination address. If it indeed points to the
+ // VSyscall area, we then adjust the destination address accordingly.
+ (is_indirect_call =
+ (isVDSO_ && vsys_offset_ && code[codeIdx].insn == 0xFF &&
+ !code[codeIdx].is_ip_relative &&
+ mod_rm && (*mod_rm & 0x38) == 0x10 /* CALL (indirect) */))) {
+ is_syscall = !is_indirect_call;
+ #elif defined(__i386__)
+ bool is_gs_call = false;
+ if (code[codeIdx].len == 7 &&
+ code[codeIdx].insn == 0xFF &&
+ code[codeIdx].addr[2] == '\x15' /* CALL (indirect) */ &&
+ code[codeIdx].addr[0] == '\x65' /* %gs prefix */) {
+ char* target;
+ asm volatile("mov %%gs:(%1), %0\n"
+ : "=a"(target)
+ : "c"(*reinterpret_cast<int *>(code[codeIdx].addr+3)));
+ if (target == __kernel_vsyscall) {
+ is_gs_call = true;
+ // TODO(markus): also handle the other vsyscalls
+ }
+ }
+ if (is_gs_call ||
+ (code[codeIdx].insn == 0xCD &&
+ code[codeIdx].addr[1] == '\x80' /* INT $0x80 */)) {
+ #else
+ #error Unsupported target platform
+ #endif
+ // Found a system call. Search backwards to figure out how to redirect
+ // the code. We will need to overwrite a couple of instructions and,
+ // of course, move these instructions somewhere else.
+ int startIdx = codeIdx;
+ int endIdx = codeIdx;
+ int length = code[codeIdx].len;
+ for (int idx = codeIdx;
+ (idx = (idx + (sizeof(code) / sizeof(struct Code)) - 1) %
+ (sizeof(code) / sizeof(struct Code))) != codeIdx; ) {
+ // NOTE(review): std::upper_bound() on std::set iterators has to step
+ // through the set linearly; branch_targets.upper_bound() performs the
+ // same lookup in logarithmic time. The same applies to the
+ // std::lower_bound() call in the forward search below.
+ std::set<char *>::const_iterator iter =
+ std::upper_bound(branch_targets.begin(), branch_targets.end(),
+ code[idx].addr);
+ if (iter != branch_targets.end() && *iter < ptr) {
+ // Found a branch pointing to somewhere past our instruction. This
+ // instruction cannot be moved safely. Leave it in place.
+ break;
+ }
+ if (code[idx].addr && !code[idx].is_ip_relative &&
+ isSafeInsn(code[idx].insn)) {
+ // These are all benign instructions with no side-effects and no
+ // dependency on the program counter. We should be able to safely
+ // relocate them.
+ startIdx = idx;
+ length = ptr - code[startIdx].addr;
+ } else {
+ break;
+ }
+ }
+ // Search forward past the system call, too. Sometimes, we can only
+ // find relocatable instructions following the system call.
+ #if defined(__i386__)
+ findEndIdx:
+ #endif
+ char *next = ptr;
+ for (int i = codeIdx;
+ (i = (i + 1) % (sizeof(code) / sizeof(struct Code))) != startIdx;
+ ) {
+ std::set<char *>::const_iterator iter =
+ std::lower_bound(branch_targets.begin(), branch_targets.end(),
+ next);
+ if (iter != branch_targets.end() && *iter == next) {
+ // Found branch target pointing to our instruction
+ break;
+ }
+ char *tmp_rm;
+ code[i].addr = next;
+ code[i].insn = next_inst((const char **)&next, __WORDSIZE == 64,
+ 0, 0, &tmp_rm, 0, 0);
+ code[i].len = next - code[i].addr;
+ code[i].is_ip_relative = tmp_rm && (*tmp_rm & 0xC7) == 0x5;
+ if (!code[i].is_ip_relative && isSafeInsn(code[i].insn)) {
+ endIdx = i;
+ length = next - code[startIdx].addr;
+ } else {
+ break;
+ }
+ }
+ // We now know, how many instructions neighboring the system call we
+ // can safely overwrite. We need five bytes to insert a JMP/CALL and a
+ // 32bit address. We then jump to a code fragment that safely forwards
+ // to our system call wrapper. On x86-64, this is complicated by
+ // the fact that the API allows up to 128 bytes of red-zones below the
+ // current stack pointer. So, we cannot write to the stack until we
+ // have adjusted the stack pointer.
+ //
+ // .. .. .. .. ; any leading instructions copied from original code
+ // 48 81 EC 80 00 00 00 SUB $0x80, %rsp
+ // 50 PUSH %rax
+ // 48 8D 05 .. .. .. .. LEA ...(%rip), %rax
+ // 50 PUSH %rax
+ // 48 B8 .. .. .. .. MOV $syscallWrapper, %rax
+ // .. .. .. ..
+ // 50 PUSH %rax
+ // 48 8D 05 06 00 00 00 LEA 6(%rip), %rax
+ // 48 87 44 24 10 XCHG %rax, 16(%rsp)
+ // C3 RETQ
+ // 48 81 C4 80 00 00 00 ADD $0x80, %rsp
+ // .. .. .. .. ; any trailing instructions copied from original code
+ // E9 .. .. .. .. JMPQ ...
+ //
+ // Total: 52 bytes + any bytes that were copied
+ //
+ // On x86-32, the stack is available and we can do:
+ //
+ // TODO(markus): Try to maintain frame pointers on x86-32
+ //
+ // .. .. .. .. ; any leading instructions copied from original code
+ // 68 .. .. .. .. PUSH return_addr
+ // 68 .. .. .. .. PUSH $syscallWrapper
+ // C3 RET
+ // .. .. .. .. ; any trailing instructions copied from original code
+ // C3 RET
+ //
+ // Total: 12 bytes + any bytes that were copied
+ //
+ // For indirect jumps from the VDSO to the VSyscall page, we instead
+ // replace the following code (this is only necessary on x86-64). This
+ // time, we don't have to worry about red zones:
+ //
+ // .. .. .. .. ; any leading instructions copied from original code
+ // E8 00 00 00 00 CALL .
+ // 48 83 04 24 .. ADDQ $.., (%rsp)
+ // FF .. .. .. .. .. PUSH .. ; from original CALL instruction
+ // 48 81 3C 24 00 00 00 FF CMPQ $0xFFFFFFFFFF000000, 0(%rsp)
+ // 72 10 JB . + 16
+ // 81 2C 24 .. .. .. .. SUBL ..., 0(%rsp)
+ // C7 44 24 04 00 00 00 00 MOVL $0, 4(%rsp)
+ // C3 RETQ
+ // 48 87 04 24 XCHG %rax,(%rsp)
+ // 48 89 44 24 08 MOV %rax,0x8(%rsp)
+ // 58 POP %rax
+ // C3 RETQ
+ // .. .. .. .. ; any trailing instructions copied from original code
+ // E9 .. .. .. .. JMPQ ...
+ //
+ // Total: 52 bytes + any bytes that were copied
+
+ if (length < 5) {
+ // There are a very small number of instruction sequences that we
+ // cannot easily intercept, and that have been observed in real world
+ // examples. Handle them here:
+ #if defined(__i386__)
+ int diff;
+ if (!memcmp(code[codeIdx].addr, "\xCD\x80\xEB", 3) &&
+ (diff = *reinterpret_cast<signed char *>(
+ code[codeIdx].addr + 3)) < 0 && diff >= -6) {
+ // We have seen...
+ // for (;;) {
+ // _exit(0);
+ // }
+ // ..get compiled to:
+ // B8 01 00 00 00 MOV $__NR_exit, %eax
+ // 66 90 XCHG %ax, %ax
+ // 31 DB 0:XOR %ebx, %ebx
+ // CD 80 INT $0x80
+ // EB FA JMP 0b
+ // The JMP is really superfluous as the system call never returns.
+ // And there are in fact no returning system calls that need to be
+ // unconditionally repeated in an infinite loop.
+ // If we replace the JMP with NOPs, the system call can successfully
+ // be intercepted.
+ *reinterpret_cast<unsigned short *>(code[codeIdx].addr + 2) = 0x9090;
+ goto findEndIdx;
+ }
+ #endif
+ // If we cannot figure out any other way to intercept this system call,
+ // we replace it with a call to INT0. This causes a SEGV which we then
+ // handle in the signal handler. That's a lot slower than rewriting the
+ // instruction with a jump, but it should only happen very rarely.
+ if (is_syscall) {
+ // Two bytes are copied from a one character literal: 0xCD followed by
+ // the literal's terminating NUL, which together encode INT $0.
+ memcpy(code[codeIdx].addr, "\xCD", 2);
+ if (code[codeIdx].len > 2) {
+ memset(code[codeIdx].addr + 2, 0x90, code[codeIdx].len - 2);
+ }
+ goto replaced;
+ } else {
+ Sandbox::die("Cannot intercept system call");
+ }
+ }
+ // Determine the smallest group of instructions around the system call
+ // that frees up five bytes: first extend backwards (but not beyond
+ // startIdx), then forwards.
+ int needed = 5 - code[codeIdx].len;
+ int first = codeIdx;
+ while (needed > 0 && first != startIdx) {
+ first = (first + (sizeof(code) / sizeof(struct Code)) - 1) %
+ (sizeof(code) / sizeof(struct Code));
+ needed -= code[first].len;
+ }
+ int second = codeIdx;
+ while (needed > 0) {
+ second = (second + 1) % (sizeof(code) / sizeof(struct Code));
+ needed -= code[second].len;
+ }
+ // Number of bytes that are moved from in front of and from behind the
+ // system call instruction, respectively.
+ int preamble = code[codeIdx].addr - code[first].addr;
+ int postamble = code[second].addr + code[second].len -
+ code[codeIdx].addr - code[codeIdx].len;
+
+ // The following is all the code that construct the various bits of
+ // assembly code.
+ #if defined(__x86_64__)
+ if (is_indirect_call) {
+ needed = 52 + preamble + code[codeIdx].len + postamble;
+ } else {
+ needed = 52 + preamble + postamble;
+ }
+ #elif defined(__i386__)
+ needed = 12 + preamble + postamble;
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Allocate scratch space and copy the preamble of code that was moved
+ // from the function that we are patching.
+ char* dest = getScratchSpace(maps, code[first].addr, needed,
+ extraSpace, extraLength);
+ memcpy(dest, code[first].addr, preamble);
+
+ // For indirect calls, we need to copy the actual CALL instruction and
+ // turn it into a PUSH instruction.
+ #if defined(__x86_64__)
+ if (is_indirect_call) {
+ memcpy(dest + preamble, "\xE8\x00\x00\x00\x00\x48\x83\x04\x24", 9);
+ dest[preamble + 9] = code[codeIdx].len + 42;
+ memcpy(dest + preamble + 10, code[codeIdx].addr, code[codeIdx].len);
+
+ // Convert CALL -> PUSH
+ dest[preamble + 10 + (mod_rm - code[codeIdx].addr)] |= 0x20;
+ preamble += 10 + code[codeIdx].len;
+ }
+ #endif
+
+ // Copy the static body of the assembly code.
+ // The 47 byte x86-64 variant spells out only 46 bytes; its last byte is
+ // the string literal's terminating NUL, which completes the immediate of
+ // the final ADD $0x80, %rsp.
+ memcpy(dest + preamble,
+ #if defined(__x86_64__)
+ is_indirect_call ?
+ "\x48\x81\x3C\x24\x00\x00\x00\xFF\x72\x10\x81\x2C\x24\x00\x00\x00"
+ "\x00\xC7\x44\x24\x04\x00\x00\x00\x00\xC3\x48\x87\x04\x24\x48\x89"
+ "\x44\x24\x08\x58\xC3" :
+ "\x48\x81\xEC\x80\x00\x00\x00\x50\x48\x8D\x05\x00\x00\x00\x00\x50"
+ "\x48\xB8\x00\x00\x00\x00\x00\x00\x00\x00\x50\x48\x8D\x05\x06\x00"
+ "\x00\x00\x48\x87\x44\x24\x10\xC3\x48\x81\xC4\x80\x00\x00",
+ is_indirect_call ? 37 : 47
+ #elif defined(__i386__)
+ "\x68\x00\x00\x00\x00\x68\x00\x00\x00\x00\xC3", 11
+ #else
+ #error Unsupported target platform
+ #endif
+ );
+
+ // Copy the postamble that was moved from the function that we are
+ // patching.
+ memcpy(dest + preamble +
+ #if defined(__x86_64__)
+ (is_indirect_call ? 37 : 47),
+ #elif defined(__i386__)
+ 11,
+ #else
+ #error Unsupported target platform
+ #endif
+ code[codeIdx].addr + code[codeIdx].len,
+ postamble);
+
+ // Patch up the various computed values
+ #if defined(__x86_64__)
+ int post = preamble + (is_indirect_call ? 37 : 47) + postamble;
+ dest[post] = '\xE9';
+ *reinterpret_cast<int *>(dest + post + 1) =
+ (code[second].addr + code[second].len) - (dest + post + 5);
+ if (is_indirect_call) {
+ *reinterpret_cast<int *>(dest + preamble + 13) = vsys_offset_;
+ } else {
+ *reinterpret_cast<int *>(dest + preamble + 11) =
+ (code[second].addr + code[second].len) - (dest + preamble + 15);
+ *reinterpret_cast<void **>(dest + preamble + 18) =
+ reinterpret_cast<void *>(&syscallWrapper);
+ }
+ #elif defined(__i386__)
+ *(dest + preamble + 11 + postamble) = '\xC3';
+ *reinterpret_cast<char **>(dest + preamble + 1) =
+ dest + preamble + 11;
+ *reinterpret_cast<void (**)()>(dest + preamble + 6) = syscallWrapper;
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Pad unused space in the original function with NOPs
+ memset(code[first].addr, 0x90 /* NOP */,
+ code[second].addr + code[second].len - code[first].addr);
+
+ // Replace the system call with an unconditional jump to our new code.
+ #if defined(__x86_64__)
+ *code[first].addr = '\xE9'; // JMPQ
+ #elif defined(__i386__)
+ *code[first].addr = '\xE8'; // CALL
+ #else
+ #error Unsupported target platform
+ #endif
+ *reinterpret_cast<int *>(code[first].addr + 1) =
+ dest - (code[first].addr + 5);
+ }
+ replaced:
+ codeIdx = (codeIdx + 1) % (sizeof(code) / sizeof(struct Code));
+ }
+}
+
+// Redirects the well-known VDSO entry points on x86-32 to syscallWrapper.
+// This function does nothing on other platforms.
+//
+// __kernel_vsyscall is overwritten with a direct JMP. __kernel_sigreturn and
+// __kernel_rt_sigreturn are overwritten with a JMP to a small stub in scratch
+// space (obtained from getScratchSpace() using "extraSpace"/"extraLength")
+// that loads the system call number and then jumps to syscallWrapper.
+//
+// Returns without patching anything if __kernel_vsyscall is not set or if
+// its page cannot be made writable.
+void Library::patchVDSO(char** extraSpace, int* extraLength){
+  #if defined(__i386__)
+  Sandbox::SysCalls sys;
+  if (!__kernel_vsyscall ||
+      sys.mprotect(reinterpret_cast<void *>(
+                       reinterpret_cast<long>(__kernel_vsyscall) & ~0xFFF),
+                   4096, PROT_READ|PROT_WRITE|PROT_EXEC)) {
+    return;
+  }
+
+  // x86-32 has a small number of well-defined functions in the VDSO library.
+  // These functions do not easily lend themselves to be rewritten by the
+  // automatic code. Instead, we explicitly find new definitions for them.
+  //
+  // We don't bother with optimizing the syscall instruction instead always
+  // use INT $0x80, no matter whether the hardware supports more modern
+  // calling conventions.
+  //
+  // TODO(markus): Investigate whether it is worthwhile to optimize this
+  //               code path and use the platform-specific entry code.
+  //
+  // All jumps written below are "E9 rel32": the opcode byte is followed by a
+  // 32bit displacement relative to the end of the five byte instruction. The
+  // displacement must be stored as a full 32bit value; storing it through a
+  // "char *" would only set its lowest byte.
+  if (__kernel_vsyscall) {
+    // Replace the kernel entry point with:
+    //
+    // E9 .. .. .. ..    JMP syscallWrapper
+    *__kernel_vsyscall = '\xE9';
+    *reinterpret_cast<int *>(__kernel_vsyscall + 1) =
+        reinterpret_cast<char *>(&syscallWrapper) -
+        reinterpret_cast<char *>(__kernel_vsyscall + 5);
+  }
+  if (__kernel_sigreturn) {
+    // Replace the sigreturn() system call with a jump to code that does:
+    //
+    // 58                POP %eax
+    // B8 77 00 00 00    MOV $0x77, %eax
+    // E9 .. .. .. ..    JMP syscallWrapper
+    char* dest = getScratchSpace(maps_, __kernel_sigreturn, 11, extraSpace,
+                                 extraLength);
+    memcpy(dest, "\x58\xB8\x77\x00\x00\x00\xE9", 7);
+    *reinterpret_cast<int *>(dest + 7) =
+        reinterpret_cast<char *>(&syscallWrapper) -
+        reinterpret_cast<char *>(dest + 11);
+    *__kernel_sigreturn = '\xE9';
+    *reinterpret_cast<int *>(__kernel_sigreturn + 1) =
+        dest - reinterpret_cast<char *>(__kernel_sigreturn + 5);
+  }
+  if (__kernel_rt_sigreturn) {
+    // Replace the rt_sigreturn() system call with a jump to code that does:
+    //
+    // B8 AD 00 00 00    MOV $0xAD, %eax
+    // E9 .. .. .. ..    JMP syscallWrapper
+    char* dest = getScratchSpace(maps_, __kernel_rt_sigreturn, 10, extraSpace,
+                                 extraLength);
+    memcpy(dest, "\xB8\xAD\x00\x00\x00\xE9", 6);
+    *reinterpret_cast<int *>(dest + 6) =
+        reinterpret_cast<char *>(&syscallWrapper) -
+        reinterpret_cast<char *>(dest + 10);
+    *__kernel_rt_sigreturn = '\xE9';
+    *reinterpret_cast<int *>(__kernel_rt_sigreturn + 1) =
+        dest - reinterpret_cast<char *>(__kernel_rt_sigreturn + 5);
+  }
+  #endif
+}
+
+// Creates a patched, relocated copy of the x86-64 VSyscall page.
+//
+// The page reported by maps_->vsyscall() is copied to memory obtained from
+// maps_->allocNearAddr(). In the copy, IP-relative operands are adjusted so
+// that they still refer to the variables in the original page, and system
+// calls are redirected by patchSystemCallsInFunction(). The copy is finally
+// made read-only and executable.
+//
+// Returns the distance between the original page and the copy (original
+// minus copy), or 0 if there is no VSyscall page or the target is not
+// x86-64.
+// NOTE(review): the result of allocNearAddr() is used without a NULL check,
+// and the pointer difference is returned as an "int" -- confirm that both
+// are guaranteed to be fine for the addresses involved.
+int Library::patchVSystemCalls() {
+ #if defined(__x86_64__)
+ // VSyscalls live in a shared 4kB page at the top of the address space. This
+ // page cannot be unmapped nor remapped. We have to create a copy within
+ // 2GB of the page, and rewrite all IP-relative accesses to shared variables.
+ // As the top of the address space is not accessible by mmap(), this means
+ // that we need to wrap around addresses to the bottom 2GB of the address
+ // space.
+ // Only x86-64 has VSyscalls.
+ if (maps_->vsyscall()) {
+ char* copy = maps_->allocNearAddr(maps_->vsyscall(), 0x1000,
+ PROT_READ|PROT_WRITE);
+ // The copy itself is handed to patchSystemCallsInFunction() as scratch
+ // space; no Maps object is passed in that call.
+ char* extraSpace = copy;
+ int extraLength = 0x1000;
+ memcpy(copy, maps_->vsyscall(), 0x1000);
+ long adjust = (long)maps_->vsyscall() - (long)copy;
+ for (int vsys = 0; vsys < 0x1000; vsys += 0x400) {
+ char* start = copy + vsys;
+ char* end = start + 0x400;
+
+ // There can only be up to four VSyscalls starting at an offset of
+ // n*0x400, each. VSyscalls are invoked by functions in the VDSO
+ // and provide fast implementations of a time source. We don't exactly
+ // know where the code and where the data is in the VSyscalls page.
+ // So, we disassemble the code for each function and find all branch
+ // targets within the function in order to find the last address of
+ // function.
+ for (char *last = start, *vars = end, *ptr = start; ptr < end; ) {
+ // Target of the "goto"s further down: decoding resumes here, without
+ // re-checking the loop condition, when more code is found behind the
+ // end of a function.
+ new_function:
+ char* mod_rm;
+ unsigned short insn = next_inst((const char **)&ptr, true, 0, 0,
+ &mod_rm, 0, 0);
+ if (mod_rm && (*mod_rm & 0xC7) == 0x5) {
+ // Instruction has IP relative addressing mode. Adjust to reference
+ // the variables in the original VSyscall segment.
+ long offset = *reinterpret_cast<int *>(mod_rm + 1);
+ char* var = ptr + offset;
+ if (var >= ptr && var < vars) {
+ // Variables are stored somewhere past all the functions. Remember
+ // the first variable in the VSyscall slot, so that we stop
+ // scanning for instructions once we reach that address.
+ vars = var;
+ }
+ offset += adjust;
+ // The adjusted displacement must still fit into 32 signed bits.
+ if ((offset >> 32) && (offset >> 32) != -1) {
+ Sandbox::die("Cannot patch [vsystemcall]");
+ }
+ *reinterpret_cast<int *>(mod_rm + 1) = offset;
+ }
+
+ // Check for jump targets to higher addresses (but within our own
+ // VSyscall slot). They extend the possible end-address of this
+ // function.
+ char *target = 0;
+ if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ ||
+ insn == 0xEB /* JMP */) {
+ target = ptr + (reinterpret_cast<signed char *>(ptr))[-1];
+ } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ ||
+ (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) {
+ target = ptr + (reinterpret_cast<int *>(ptr))[-1];
+ }
+
+ // The function end is found, once the loop reaches the last valid
+ // address in the VSyscall slot, or once it finds a RET instruction
+ // that is not followed by any jump targets. Unconditional jumps that
+ // point backwards are treated the same as a RET instruction.
+ if (insn == 0xC3 /* RET */ ||
+ (target < ptr &&
+ (insn == 0xEB /* JMP */ || insn == 0xE9 /* JMP */))) {
+ if (last >= ptr) {
+ continue;
+ } else {
+ // The function can optionally be followed by more functions in
+ // the same VSyscall slot. Allow for alignment to a 16 byte
+ // boundary. If we then find more non-zero bytes, and if this is
+ // not the known start of the variables, assume a new function
+ // started.
+ for (; ptr < vars; ++ptr) {
+ if ((long)ptr & 0xF) {
+ if (*ptr && *ptr != '\x90' /* NOP */) {
+ goto new_function;
+ }
+ *ptr = '\x90'; // NOP
+ } else {
+ if (*ptr && *ptr != '\x90' /* NOP */) {
+ goto new_function;
+ }
+ break;
+ }
+ }
+
+ // Translate all SYSCALLs to jumps into our system call handler.
+ patchSystemCallsInFunction(NULL, start, ptr,
+ &extraSpace, &extraLength);
+ break;
+ }
+ }
+
+ // Adjust assumed end address for this function, if a valid jump
+ // target has been found that originates from the current instruction.
+ // NOTE(review): the limit used here is start + 0x100, although a slot
+ // spans 0x400 bytes -- confirm that this is intended.
+ if (target > last && target < start + 0x100) {
+ last = target;
+ }
+ }
+ }
+
+ // We are done. Write-protect our code and make it executable.
+ Sandbox::SysCalls sys;
+ sys.mprotect(copy, 0x1000, PROT_READ|PROT_EXEC);
+ return maps_->vsyscall() - copy;
+ }
+ #endif
+ return 0;
+}
+
+// Patches all system calls in this library.
+//
+// For the VDSO, the VSyscall page is handled first (its offset is stored in
+// vsys_offset_); on x86-32 the VDSO is then patched by patchVDSO() and
+// nothing else is done. For everything else, the ".text" section is scanned
+// byte by byte for opcode patterns that might be system calls. The section
+// is split into presumed functions at 16 byte boundaries that are preceded
+// by NOP padding, and each presumed function that contains a candidate is
+// handed to patchSystemCallsInFunction(). Nothing happens if the library is
+// not valid or has no ".text" section.
+void Library::patchSystemCalls() {
+ if (!valid_) {
+ return;
+ }
+ int extraLength = 0;
+ char* extraSpace = NULL;
+ if (isVDSO_) {
+ // patchVDSO() calls patchSystemCallsInFunction() which needs vsys_offset_
+ // iff processing the VDSO library. So, make sure we call
+ // patchVSystemCalls() first.
+ vsys_offset_ = patchVSystemCalls();
+ #if defined(__i386__)
+ patchVDSO(&extraSpace, &extraLength);
+ return;
+ #endif
+ }
+ SectionTable::const_iterator iter;
+ if ((iter = section_table_.find(".text")) == section_table_.end()) {
+ return;
+ }
+ const Elf_Shdr& shdr = iter->second.second;
+ char* start = reinterpret_cast<char *>(shdr.sh_addr + asr_offset_);
+ char* stop = start + shdr.sh_size;
+ // "func" is the presumed start of the current function, "nopcount" the
+ // number of consecutive NOP bytes just seen, and "has_syscall" records
+ // whether a candidate was found since "func".
+ char* func = start;
+ int nopcount = 0;
+ bool has_syscall = false;
+ // NOTE(review): the pattern checks read ptr[1] (and ptr[2] on x86-32)
+ // without comparing against "stop", so they can look at up to two bytes
+ // past the end of the section.
+ for (char *ptr = start; ptr < stop; ptr++) {
+ #if defined(__x86_64__)
+ if ((*ptr == '\x0F' && ptr[1] == '\x05' /* SYSCALL */) ||
+ (isVDSO_ && *ptr == '\xFF')) {
+ #elif defined(__i386__)
+ if ((*ptr == '\xCD' && ptr[1] == '\x80' /* INT $0x80 */) ||
+ (*ptr == '\x65' && ptr[1] == '\xFF' &&
+ ptr[2] == '\x15' /* CALL %gs:.. */)) {
+ #else
+ #error Unsupported target platform
+ #endif
+ // Skip one additional byte of the pattern that was just matched.
+ ptr++;
+ has_syscall = true;
+ nopcount = 0;
+ } else if (*ptr == '\x90' /* NOP */) {
+ nopcount++;
+ } else if (!(reinterpret_cast<long>(ptr) & 0xF)) {
+ if (nopcount > 2) {
+ // This is very likely the beginning of a new function. Functions
+ // are aligned on 16 byte boundaries and the preceding function is
+ // padded out with NOPs.
+ //
+ // For performance reasons, we quickly scan the entire text segment
+ // for potential SYSCALLs, and then patch the code in increments of
+ // individual functions.
+ if (has_syscall) {
+ has_syscall = false;
+ // Our quick scan of the function found a potential system call.
+ // Do a more thorough scan, now.
+ patchSystemCallsInFunction(maps_, func, ptr, &extraSpace,
+ &extraLength);
+ }
+ func = ptr;
+ }
+ nopcount = 0;
+ } else {
+ nopcount = 0;
+ }
+ }
+ if (has_syscall) {
+ // Patch any remaining system calls that were in the last function before
+ // the loop terminated.
+ patchSystemCallsInFunction(maps_, func, stop, &extraSpace, &extraLength);
+ }
+
+ // Mark our scratch space as write-protected and executable.
+ if (extraSpace) {
+ Sandbox::SysCalls sys;
+ sys.mprotect(extraSpace, 4096, PROT_READ|PROT_EXEC);
+ }
+}
+
+// Reads the ELF header, derives asr_offset_ from the PT_DYNAMIC program
+// header and fills section_table_ with every section header that can be
+// read, keyed by section name.
+//
+// Returns false (and clears valid_) if the data does not look like an ELF
+// file. Otherwise returns true, except for the VDSO, where the result of
+// parseSymbols() is returned.
+bool Library::parseElf() {
+  valid_ = true;
+
+  // Sanity-check the ELF header and fetch the header of the section that
+  // holds the section names.
+  Elf_Shdr names_shdr;
+  const bool looks_like_elf =
+      getOriginal(0, &ehdr_) &&
+      ehdr_.e_ehsize >= sizeof(Elf_Ehdr) &&
+      ehdr_.e_phentsize >= sizeof(Elf_Phdr) &&
+      ehdr_.e_shentsize >= sizeof(Elf_Shdr) &&
+      getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize,
+                  &names_shdr);
+  if (!looks_like_elf) {
+    // Memory mappings do not have to be ELF files. Anything that we cannot
+    // identify is simply skipped.
+    valid_ = false;
+    return false;
+  }
+
+  // Locate the PT_DYNAMIC segment; PLT entries and symbols are relative to
+  // it. In the child this is probably inaccurate, because computing it
+  // requires the original memory mappings.
+  for (int n = 0; n < ehdr_.e_phnum; n++) {
+    Elf_Phdr segment;
+    if (!getOriginal(ehdr_.e_phoff + n*ehdr_.e_phentsize, &segment) ||
+        segment.p_type != PT_DYNAMIC) {
+      continue;
+    }
+    RangeMap::const_iterator range =
+        memory_ranges_.lower_bound(segment.p_offset);
+    if (range != memory_ranges_.end()) {
+      asr_offset_ = reinterpret_cast<char *>(range->second.start) -
+                    (segment.p_vaddr - (segment.p_offset - range->first));
+    }
+    break;
+  }
+
+  // Walk the section table and remember every section by name. Headers that
+  // cannot be read are skipped.
+  for (int n = 0; n < ehdr_.e_shnum; n++) {
+    Elf_Shdr section;
+    if (getOriginal(ehdr_.e_shoff + n*ehdr_.e_shentsize, &section)) {
+      section_table_.insert(
+          std::make_pair(getOriginal(names_shdr.sh_offset + section.sh_name),
+                         std::make_pair(n, section)));
+    }
+  }
+
+  if (!isVDSO_) {
+    return true;
+  }
+  return parseSymbols();
+}
+
+// Populates plt_entries_ (JUMP_SLOT relocations keyed by symbol name) and
+// symbols_ (entries of ".dynsym" keyed by name), then records the addresses
+// of the __kernel_* entry points if the library exports them.
+//
+// Returns false without doing anything if the library is not valid. Returns
+// false and clears valid_ if a PLT or symbol table entry is malformed.
+// Returns true otherwise, including when the library has no ".dynsym".
+bool Library::parseSymbols() {
+  if (!valid_) {
+    return false;
+  }
+
+  // Find PLT and symbol tables
+  const Elf_Shdr* plt = getSection(ELF_REL_PLT);
+  const Elf_Shdr* symtab = getSection(".dynsym");
+  Elf_Shdr strtab = { 0 };
+  if (symtab) {
+    // sh_link of the symbol table names the section holding its strings.
+    if (symtab->sh_link >= ehdr_.e_shnum ||
+        !getOriginal(ehdr_.e_shoff + symtab->sh_link * ehdr_.e_shentsize,
+                     &strtab)) {
+      Debug::message("Cannot find valid symbol table\n");
+      valid_ = false;
+      return false;
+    }
+  }
+
+  if (plt && symtab) {
+    // Parse PLT table and add its entries (walked from last to first)
+    for (int i = plt->sh_size/sizeof(Elf_Rel); --i >= 0; ) {
+      Elf_Rel rel;
+      if (!getOriginal(plt->sh_offset + i * sizeof(Elf_Rel), &rel) ||
+          ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym) >= symtab->sh_size) {
+        Debug::message("Encountered invalid plt entry\n");
+        valid_ = false;
+        return false;
+      }
+
+      // Only jump slots are of interest; other relocation types are skipped.
+      if (ELF_R_TYPE(rel.r_info) != ELF_JUMP_SLOT) {
+        continue;
+      }
+      Elf_Sym sym;
+      if (!getOriginal(symtab->sh_offset +
+                       ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym), &sym) ||
+          sym.st_shndx >= ehdr_.e_shnum) {
+        Debug::message("Encountered invalid symbol for plt entry\n");
+        valid_ = false;
+        return false;
+      }
+      std::string name = getOriginal(strtab.sh_offset + sym.st_name);
+      if (name.empty()) {
+        continue;
+      }
+      plt_entries_.insert(std::make_pair(name, rel.r_offset));
+    }
+  }
+
+  if (symtab) {
+    // Parse symbol table and add its entries
+    for (Elf_Addr addr = 0; addr < symtab->sh_size; addr += sizeof(Elf_Sym)) {
+      Elf_Sym sym;
+      // Section indices at or above SHN_LORESERVE are reserved values and
+      // are accepted even though they exceed e_shnum.
+      if (!getOriginal(symtab->sh_offset + addr, &sym) ||
+          (sym.st_shndx >= ehdr_.e_shnum &&
+           sym.st_shndx < SHN_LORESERVE)) {
+        Debug::message("Encountered invalid symbol\n");
+        valid_ = false;
+        return false;
+      }
+      std::string name = getOriginal(strtab.sh_offset + sym.st_name);
+      if (name.empty()) {
+        continue;
+      }
+      symbols_.insert(std::make_pair(name, sym));
+    }
+  }
+
+  // Remember where the kernel entry points live. The stored address is the
+  // symbol value rebased by asr_offset_; symbols with a zero value are
+  // ignored.
+  SymbolTable::const_iterator iter = symbols_.find("__kernel_vsyscall");
+  if (iter != symbols_.end() && iter->second.st_value) {
+    __kernel_vsyscall = asr_offset_ + iter->second.st_value;
+  }
+  iter = symbols_.find("__kernel_sigreturn");
+  if (iter != symbols_.end() && iter->second.st_value) {
+    __kernel_sigreturn = asr_offset_ + iter->second.st_value;
+  }
+  iter = symbols_.find("__kernel_rt_sigreturn");
+  if (iter != symbols_.end() && iter->second.st_value) {
+    __kernel_rt_sigreturn = asr_offset_ + iter->second.st_value;
+  }
+
+  return true;
+}
+
+// Remembers the Maps object for this library. Once maps_ is set, the
+// getOriginal() template forwards its reads through
+// Maps::forwardGetRequest() instead of reading the local mapping.
+void Library::recoverOriginalDataParent(Maps* maps) {
+  this->maps_ = maps;
+}
+
+// Makes the complete original file contents of this library available in the
+// child by growing the mapping that starts at file offset zero so that it
+// covers the whole file.
+//
+// - For the VDSO nothing needs to be recovered; the library is just marked
+//   valid.
+// - If there are no memory ranges, or the range with the lowest file offset
+//   (memory_ranges_ is sorted in descending order, so that is rbegin()) does
+//   not start at offset zero, all ranges are dropped and valid_ is left
+//   untouched.
+// - Otherwise the offset-zero mapping is mremap()ed to a new address with a
+//   length that reaches the end of the highest-offset range, and a private
+//   copy of the original bytes is put back at the old address. On success
+//   memory_ranges_ is replaced by a single PROT_READ range describing the
+//   relocated mapping and valid_ is set. On failure the ranges are cleared.
+//
+// The "filename" parameter is currently unused.
+void Library::recoverOriginalDataChild(const std::string& filename) {
+  if (isVDSO_) {
+    valid_ = true;
+    return;
+  }
+  if (memory_ranges_.empty() || memory_ranges_.rbegin()->first) {
+ failed:
+    memory_ranges_.clear();
+  } else {
+    const Range& range = memory_ranges_.rbegin()->second;
+    // Argument block read by the assembly below. Every field is one machine
+    // word, so the fields live at offsets 0x0/0x8/0x10/0x18/0x20 on x86-64
+    // and 0/4/8/12/16 on i386. new_addr stays NULL unless the relocation
+    // succeeds.
+    struct Args {
+      void* old_addr;
+      long  old_length;
+      void* new_addr;
+      long  new_length;
+      long  prot;
+    } args = {
+      range.start,
+      (reinterpret_cast<long>(range.stop) -
+       reinterpret_cast<long>(range.start) + 4095) & ~4095,
+      0,
+      (memory_ranges_.begin()->first +
+       (reinterpret_cast<long>(memory_ranges_.begin()->second.stop) -
+        reinterpret_cast<long>(memory_ranges_.begin()->second.start)) +
+       4095) & ~4095,
+      range.prot
+    };
+    // We find the memory mapping that starts at file offset zero and
+    // extend it to cover the entire file. This is a little difficult to
+    // do, as the mapping needs to be moved to a different address. But
+    // we are potentially running code that is inside of this mapping at the
+    // time when it gets moved.
+    //
+    // We have to write the code in assembly. We allocate temporary
+    // storage and copy the critical code into this page. We then execute
+    // from this page, while we relocate the mapping. Finally, we allocate
+    // memory at the original location and copy the original data into it.
+    // The program can now resume execution.
+    #if defined(__x86_64__)
+    asm volatile(
+        // new_addr = 4096 + mmap(0, new_length + 4096,
+        //                        PROT_READ|PROT_WRITE|PROT_EXEC,
+        //                        MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+        "mov $0, %%r9\n"
+        "mov $-1, %%r8\n"
+        "mov $0x22, %%r10\n"
+        "mov $7, %%rdx\n"
+        "mov 0x18(%0), %%rsi\n"
+        "add $4096, %%rsi\n"
+        "mov $0, %%rdi\n"
+        "mov $9, %%rax\n"
+        "syscall\n"
+        "cmp $-4096, %%rax\n"
+        // If the allocation failed, %r12 has not been set up yet. Skip the
+        // munmap() at label 6 entirely, so that it cannot be applied to
+        // whatever address happens to be in %r12.
+        "ja 7f\n"
+        "mov %%rax, %%r12\n"
+        "add $4096, %%r12\n"
+
+        // memcpy(new_addr - 4096, &&asm, asm_length)
+        // Copies the code between labels 2 and 6 into the scratch page.
+        "lea 2f(%%rip), %%rsi\n"
+        "lea 6f(%%rip), %%rdi\n"
+        "sub %%rsi, %%rdi\n"
+        "0:sub $1, %%rdi\n"
+        "test %%rdi, %%rdi\n"
+        "js 1f\n"
+        "movzbl (%%rsi, %%rdi, 1), %%ebx\n"
+        "mov %%bl, (%%rax, %%rdi, 1)\n"
+        "jmp 0b\n"
+        "1:\n"
+
+        // ((void (*)())new_addr - 4096)()
+        // The address of label 6 is pushed as the return address, so the
+        // "retq" at label 5 resumes execution there.
+        "lea 6f(%%rip), %%rbx\n"
+        "push %%rbx\n"
+        "jmp *%%rax\n"
+
+        // mremap(old_addr, old_length, new_length,
+        //        MREMAP_MAYMOVE|MREMAP_FIXED, new_addr)
+        "2:mov %%r12, %%r8\n"
+        "mov $3, %%r10\n"
+        "mov 0x18(%0), %%rdx\n"
+        "mov 0x8(%0), %%rsi\n"
+        "mov 0(%0), %%rdi\n"
+        "mov $25, %%rax\n"
+        "syscall\n"
+        "cmp $-4096, %%rax\n"
+        "ja 5f\n"
+
+        // mmap(old_addr, old_length, PROT_WRITE,
+        //      MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0)
+        "mov $0, %%r9\n"
+        "mov $-1, %%r8\n"
+        "mov $0x32, %%r10\n"
+        "mov $2, %%rdx\n"
+        "mov 0x8(%0), %%rsi\n"
+        "mov 0(%0), %%rdi\n"
+        "mov $9, %%rax\n"
+        "syscall\n"
+        // -12 is -ENOMEM: in that case the copy below is skipped and
+        // execution continues at label 4.
+        "cmp $-12, %%eax\n"
+        "jz 4f\n"
+        "cmp $-4096, %%rax\n"
+        "ja 5f\n"
+
+        // memcpy(old_addr, new_addr, old_length)
+        "mov 0x8(%0), %%rdi\n"
+        "3:sub $1, %%rdi\n"
+        "test %%rdi, %%rdi\n"
+        "js 4f\n"
+        "movzbl (%%r12, %%rdi, 1), %%ebx\n"
+        "mov %%bl, (%%rax, %%rdi, 1)\n"
+        "jmp 3b\n"
+        "4:\n"
+
+        // mprotect(old_addr, old_length, prot)
+        "mov 0x20(%0), %%rdx\n"
+        "mov 0x8(%0), %%rsi\n"
+        "mov %%rax, %%rdi\n"
+        "mov $10, %%rax\n"
+        "syscall\n"
+
+        // args.new_addr = new_addr
+        "mov %%r12, 0x10(%0)\n"
+        "5:retq\n"
+
+        // munmap(new_addr - 4096, 4096)
+        "6:mov $4096, %%rsi\n"
+        "mov %%r12, %%rdi\n"
+        "sub %%rsi, %%rdi\n"
+        "mov $11, %%rax\n"
+        "syscall\n"
+        // Landing pad for the case where the scratch page was never
+        // allocated.
+        "7:\n"
+        :
+        : "q"(&args)
+        : "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+          "r8", "r9", "r10", "r11", "r12", "memory");
+    #elif defined(__i386__)
+    asm volatile(
+        "push %%ebp\n"
+        "push %%ebx\n"
+        "push %%edi\n"
+
+        // new_addr = 4096 + mmap(0, new_length + 4096,
+        //                        PROT_READ|PROT_WRITE|PROT_EXEC,
+        //                        MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+        "mov $0, %%ebp\n"
+        "mov $0x22, %%esi\n"
+        "mov $7, %%edx\n"
+        "mov 12(%%edi), %%ecx\n"
+        "add $4096, %%ecx\n"
+        "mov $-1, %%edi\n"
+        "mov $0, %%ebx\n"
+        "mov $192, %%eax\n"
+        "int $0x80\n"
+        "cmp $-4096, %%eax\n"
+        "ja 6f\n"
+        "mov %%eax, %%ebp\n"
+        "add $4096, %%ebp\n"
+
+        // memcpy(new_addr - 4096, &&asm, asm_length)
+        "lea 2f, %%ecx\n"
+        "lea 6f, %%ebx\n"
+        "sub %%ecx, %%ebx\n"
+        "0:dec %%ebx\n"
+        "test %%ebx, %%ebx\n"
+        "js 1f\n"
+        "movzbl (%%ecx, %%ebx, 1), %%edx\n"
+        "mov %%dl, (%%eax, %%ebx, 1)\n"
+        "jmp 0b\n"
+        "1:\n"
+
+        // ((void (*)())new_addr - 4096)()
+        "lea 6f, %%ebx\n"
+        "push %%ebx\n"
+        "jmp *%%eax\n"
+
+        // mremap(old_addr, old_length, new_length,
+        //        MREMAP_MAYMOVE|MREMAP_FIXED, new_addr)
+        // From here on the stack holds new_addr at 0(%esp), the return
+        // address (label 6) at 4(%esp) and the saved &args at 8(%esp).
+        "2:push %%ebp\n"
+        "mov $3, %%esi\n"
+        "mov 8(%%esp), %%edi\n"
+        "mov 12(%%edi), %%edx\n"
+        "mov 4(%%edi), %%ecx\n"
+        "mov 0(%%edi), %%ebx\n"
+        "mov %%ebp, %%edi\n"
+        "mov $163, %%eax\n"
+        "int $0x80\n"
+        "cmp $-4096, %%eax\n"
+        "ja 5f\n"
+
+        // mmap(old_addr, old_length, PROT_WRITE,
+        //      MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0)
+        "mov $0, %%ebp\n"
+        "mov $0x32, %%esi\n"
+        "mov $2, %%edx\n"
+        "mov 8(%%esp), %%edi\n"
+        "mov 4(%%edi), %%ecx\n"
+        "mov 0(%%edi), %%ebx\n"
+        "mov $-1, %%edi\n"
+        "mov $192, %%eax\n"
+        "int $0x80\n"
+        // -12 is -ENOMEM: in that case the copy below is skipped and
+        // execution continues at label 4.
+        "cmp $-12, %%eax\n"
+        "jz 4f\n"
+        "cmp $-4096, %%eax\n"
+        "ja 5f\n"
+
+        // memcpy(old_addr, new_addr, old_length)
+        "mov 0(%%esp), %%ecx\n"
+        "mov 8(%%esp), %%edi\n"
+        "mov 4(%%edi), %%ebx\n"
+        "3:dec %%ebx\n"
+        "test %%ebx, %%ebx\n"
+        "js 4f\n"
+        "movzbl (%%ecx, %%ebx, 1), %%edx\n"
+        "mov %%dl, (%%eax, %%ebx, 1)\n"
+        "jmp 3b\n"
+        "4:\n"
+
+        // mprotect(old_addr, old_length, prot)
+        "mov 8(%%esp), %%edi\n"
+        "mov 16(%%edi), %%edx\n"
+        "mov 4(%%edi), %%ecx\n"
+        "mov %%eax, %%ebx\n"
+        "mov $125, %%eax\n"
+        "int $0x80\n"
+
+        // args.new_addr = new_addr
+        "mov 8(%%esp), %%edi\n"
+        "mov 0(%%esp), %%ebp\n"
+        "mov %%ebp, 0x8(%%edi)\n"
+
+        // Pops new_addr into %ebx, which label 6 uses as the munmap() base.
+        "5:pop %%ebx\n"
+        "ret\n"
+
+        // munmap(new_addr - 4096, 4096)
+        "6:mov $4096, %%ecx\n"
+        "sub %%ecx, %%ebx\n"
+        "mov $91, %%eax\n"
+        "int $0x80\n"
+        "pop %%edi\n"
+        "pop %%ebx\n"
+        "pop %%ebp\n"
+        :
+        : "D"(&args)
+        : "eax", "ecx", "edx", "esi", "memory");
+    #else
+    #error Unsupported target platform
+    #endif
+    // The assembly stores new_addr only after the mapping has been moved.
+    if (!args.new_addr) {
+      goto failed;
+    }
+
+    // From now on the library is described by one read-only range that
+    // starts at file offset zero and spans the relocated mapping.
+    memory_ranges_.clear();
+    memory_ranges_.insert(std::make_pair(0, Range(args.new_addr,
+        reinterpret_cast<char *>(args.new_addr) + args.new_length,
+        PROT_READ)));
+    valid_ = true;
+  }
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/library.h b/sandbox/linux/seccomp/library.h
new file mode 100644
index 0000000..002992b
--- /dev/null
+++ b/sandbox/linux/seccomp/library.h
@@ -0,0 +1,164 @@
+#ifndef LIBRARY_H__
+#define LIBRARY_H__
+
+#include <elf.h>
+#include <map>
+#include <set>
+#include <string>
+#include <string.h>
+#include <sys/mman.h>
+
+#include "maps.h"
+
+#if defined(__x86_64__)
+typedef Elf64_Ehdr Elf_Ehdr;
+typedef Elf64_Shdr Elf_Shdr;
+typedef Elf64_Sym Elf_Sym;
+typedef Elf64_Addr Elf_Addr;
+#elif defined(__i386__)
+typedef Elf32_Ehdr Elf_Ehdr;
+typedef Elf32_Shdr Elf_Shdr;
+typedef Elf32_Sym Elf_Sym;
+typedef Elf32_Addr Elf_Addr;
+#else
+#error Unsupported target platform
+#endif
+
+struct SyscallTable;
+namespace playground {
+
+// Describes one ELF object (shared library, executable or the VDSO) that is
+// mapped into the process: the memory ranges backing it, the parsed ELF
+// header, and its section, symbol and PLT tables.
+class Library {
+  friend class Maps;
+ public:
+  Library() :
+      valid_(false),
+      isVDSO_(false),
+      asr_offset_(0),
+      vsys_offset_(0),
+      maps_(0) {
+  }
+
+  // Registers the mapping [start, stop) as backing the file contents that
+  // begin at "offset". The isVDSO flag of the most recent call wins.
+  void addMemoryRange(void* start, void* stop, Elf_Addr offset, int prot,
+                      int isVDSO) {
+    memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot)));
+    isVDSO_ = isVDSO;
+  }
+
+  char *get(Elf_Addr offset, char *buf, size_t len);
+  std::string get(Elf_Addr offset);
+  char *getOriginal(Elf_Addr offset, char *buf, size_t len);
+  std::string getOriginal(Elf_Addr offset);
+
+  // Reads sizeof(T) bytes at file offset "offset" into *t by way of the
+  // byte-oriented get() overload. If the library is not valid, *t is zeroed
+  // and NULL is returned.
+  template<class T>T* get(Elf_Addr offset, T* t) {
+    if (!valid_) {
+      memset(t, 0, sizeof(T));
+      return NULL;
+    }
+    return reinterpret_cast<T *>(get(offset, reinterpret_cast<char *>(t),
+                                     sizeof(T)));
+  }
+
+  // Like get(), except that the read is forwarded through
+  // Maps::forwardGetRequest() whenever maps_ has been set. If the library
+  // is not valid, *t is zeroed and NULL is returned.
+  template<class T>T* getOriginal(Elf_Addr offset, T* t) {
+    if (!valid_) {
+      memset(t, 0, sizeof(T));
+      // This function returns a pointer, so report failure as NULL, exactly
+      // like get() does.
+      return NULL;
+    }
+    if (maps_) {
+      return reinterpret_cast<T *>(maps_->forwardGetRequest(
+          this, offset, reinterpret_cast<char *>(t), sizeof(T)));
+    }
+    return get(offset, t);
+  }
+
+  // Stores *value at the absolute address "addr". No range checking is
+  // performed; returns false only if the library is not valid.
+  template<class T>bool set(void *addr, T* value) {
+    if (!valid_) {
+      return false;
+    }
+    *reinterpret_cast<T *>(addr) = *value;
+    return true;
+  }
+
+  // Stores *value at file offset "offset", provided that a registered memory
+  // range covers all sizeof(T) bytes. Returns false otherwise.
+  template<class T>bool set(Elf_Addr offset, T* value) {
+    if (!valid_) {
+      return false;
+    }
+    // memory_ranges_ is sorted in descending order, so lower_bound() yields
+    // the range with the largest starting offset that is <= "offset".
+    RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset);
+    if (iter == memory_ranges_.end()) {
+      return false;
+    }
+    offset -= iter->first;
+    const size_t length = reinterpret_cast<char *>(iter->second.stop) -
+                          reinterpret_cast<char *>(iter->second.start);
+    // Check the length first: "length - sizeof(T)" is unsigned and would
+    // wrap around for ranges that are shorter than T.
+    if (length < sizeof(T) || offset > length - sizeof(T)) {
+      return false;
+    }
+    *reinterpret_cast<T *>(
+        reinterpret_cast<char *>(iter->second.start) + offset) = *value;
+    return true;
+  }
+
+  const Elf_Ehdr* getEhdr();
+  const Elf_Shdr* getSection(const std::string& section);
+  const int getSectionIndex(const std::string& section);
+  void **getRelocation(const std::string& symbol);
+  void *getSymbol(const std::string& symbol);
+  void makeWritable(bool state) const;
+  void patchSystemCalls();
+  bool isVDSO() const { return isVDSO_; }
+
+ protected:
+  bool parseElf();
+  bool parseSymbols();
+  void recoverOriginalDataParent(Maps* maps);
+  void recoverOriginalDataChild(const std::string& child);
+
+ private:
+  // Orders file offsets from high to low. This is a plain function object;
+  // std::map does not require it to derive from std::binary_function, which
+  // was removed in C++17.
+  class GreaterThan {
+   public:
+    bool operator() (Elf_Addr s1, Elf_Addr s2) const {
+      return s1 > s2;
+    }
+  };
+
+  // One contiguous mapping [start, stop) and its protection flags.
+  struct Range {
+    Range(void* start, void* stop, int prot) :
+        start(start), stop(stop), prot(prot) { }
+    void* start;
+    void* stop;
+    int   prot;
+  };
+
+  // File offset -> mapping, highest offset first.
+  typedef std::map<Elf_Addr, Range, GreaterThan> RangeMap;
+  // Section name -> (section index, section header).
+  typedef std::map<std::string, std::pair<int, Elf_Shdr> > SectionTable;
+  typedef std::map<std::string, Elf_Sym> SymbolTable;
+  typedef std::map<std::string, Elf_Addr> PltTable;
+
+  char* getBytes(char* dst, const char* src, ssize_t len);
+  static bool isSafeInsn(unsigned short insn);
+  static int isSimpleSystemCall(char *start, char *end);
+  static char* getScratchSpace(const Maps* maps, char* near, int needed,
+                               char** extraSpace, int* extraLength);
+  void patchSystemCallsInFunction(const Maps* maps, char *start, char *end,
+                                  char** extraSpace, int* extraLength);
+  int patchVSystemCalls();
+  void patchVDSO(char** extraSpace, int* extraLength);
+
+  RangeMap memory_ranges_;
+  bool valid_;
+  bool isVDSO_;
+  char* asr_offset_;
+  int vsys_offset_;
+  Maps* maps_;
+  Elf_Ehdr ehdr_;
+  SectionTable section_table_;
+  SymbolTable symbols_;
+  PltTable plt_entries_;
+  static char* __kernel_vsyscall;
+  static char* __kernel_sigreturn;
+  static char* __kernel_rt_sigreturn;
+};
+
+} // namespace
+
+#endif // LIBRARY_H__
diff --git a/sandbox/linux/seccomp/linux_syscall_support.h b/sandbox/linux/seccomp/linux_syscall_support.h
new file mode 100644
index 0000000..876c279
--- /dev/null
+++ b/sandbox/linux/seccomp/linux_syscall_support.h
@@ -0,0 +1,3173 @@
+/* Copyright (c) 2005-2008, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Markus Gutschke
+ */
+
+/* This file includes Linux-specific support functions common to the
+ * coredumper and the thread lister; primarily, this is a collection
+ * of direct system calls, and a couple of symbols missing from
+ * standard header files.
+ * There are a few options that the including file can set to control
+ * the behavior of this file:
+ *
+ * SYS_CPLUSPLUS:
+ * The entire header file will normally be wrapped in 'extern "C" { }',
+ * making it suitable for compilation as both C and C++ source. If you
+ * do not want to do this, you can set the SYS_CPLUSPLUS macro to inhibit
+ * the wrapping. N.B. doing so will suppress inclusion of all prerequisite
+ * system header files, too. It is the caller's responsibility to provide
+ * the necessary definitions.
+ *
+ * SYS_ERRNO:
+ * All system calls will update "errno" unless overridden by setting the
+ * SYS_ERRNO macro prior to including this file. SYS_ERRNO should be
+ * an l-value.
+ *
+ * SYS_INLINE:
+ * New symbols will be defined "static inline", unless overridden by
+ * the SYS_INLINE macro.
+ *
+ * SYS_LINUX_SYSCALL_SUPPORT_H
+ * This macro is used to avoid multiple inclusions of this header file.
+ * If you need to include this file more than once, make sure to
+ * unset SYS_LINUX_SYSCALL_SUPPORT_H before each inclusion.
+ *
+ * SYS_PREFIX:
+ * New system calls will have a prefix of "sys_" unless overridden by
+ * the SYS_PREFIX macro. Valid values for this macro are [0..9] which
+ * results in prefixes "sys[0..9]_". It is also possible to set this
+ * macro to -1, which avoids all prefixes.
+ *
+ * This file defines a few internal symbols that all start with "LSS_".
+ * Do not access these symbols from outside this file. They are not part
+ * of the supported API.
+ */
+#ifndef SYS_LINUX_SYSCALL_SUPPORT_H
+#define SYS_LINUX_SYSCALL_SUPPORT_H
+
+/* We currently only support x86-32, x86-64, ARM, MIPS, and PPC on Linux.
+ * Porting to other related platforms should not be difficult.
+ */
+#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__) || \
+ defined(__mips__) || defined(__PPC__)) && defined(__linux)
+
+#ifndef SYS_CPLUSPLUS
+#ifdef __cplusplus
+/* Some system header files in older versions of gcc neglect to properly
+ * handle being included from C++. As it appears to be harmless to have
+ * multiple nested 'extern "C"' blocks, just add another one here.
+ */
+extern "C" {
+#endif
+
+#include <errno.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <string.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <linux/unistd.h>
+#include <endian.h>
+
+#ifdef __mips__
+/* Include definitions of the ABI currently in use. */
+#include <sgidefs.h>
+#endif
+
+#endif
+
+/* As glibc often provides subtly incompatible data structures (and implicit
+ * wrapper functions that convert them), we provide our own kernel data
+ * structures for use by the system calls.
+ * These structures have been developed by using Linux 2.6.23 headers for
+ * reference. Note though, we do not care about exact API compatibility
+ * with the kernel, and in fact the kernel often does not have a single
+ * API that works across architectures. Instead, we try to mimic the glibc
+ * API where reasonable, and only guarantee ABI compatibility with the
+ * kernel headers.
+ * Most notably, here are a few changes that were made to the structures
+ * defined by kernel headers:
+ *
+ * - we only define structures, but not symbolic names for kernel data
+ * types. For the latter, we directly use the native C datatype
+ * (i.e. "unsigned" instead of "mode_t").
+ * - in a few cases, it is possible to define identical structures for
+ * both 32bit (e.g. i386) and 64bit (e.g. x86-64) platforms by
+ * standardizing on the 64bit version of the data types. In particular,
+ * this means that we use "unsigned" where the 32bit headers say
+ * "unsigned long".
+ * - overall, we try to minimize the number of cases where we need to
+ * conditionally define different structures.
+ * - the "struct kernel_sigaction" class of structures have been
+ * modified to more closely mimic glibc's API by introducing an
+ * anonymous union for the function pointer.
+ * - a small number of field names had to have an underscore appended to
+ * them, because glibc defines a global macro by the same name.
+ */
+
+/* include/linux/dirent.h */
+struct kernel_dirent64 {
+ unsigned long long d_ino;
+ long long d_off;
+ unsigned short d_reclen;
+ unsigned char d_type;
+ char d_name[256];
+};
+
+/* include/linux/dirent.h */
+struct kernel_dirent {
+ long d_ino;
+ long d_off;
+ unsigned short d_reclen;
+ char d_name[256];
+};
+
+/* include/linux/uio.h */
+struct kernel_iovec {
+ void *iov_base;
+ unsigned long iov_len;
+};
+
+/* include/linux/socket.h */
+struct kernel_msghdr {
+ void *msg_name;
+ int msg_namelen;
+ struct kernel_iovec*msg_iov;
+ unsigned long msg_iovlen;
+ void *msg_control;
+ unsigned long msg_controllen;
+ unsigned msg_flags;
+};
+
+/* include/asm-generic/poll.h */
+struct kernel_pollfd {
+ int fd;
+ short events;
+ short revents;
+};
+
+/* include/linux/resource.h */
+struct kernel_rlimit {
+ unsigned long rlim_cur;
+ unsigned long rlim_max;
+};
+
+/* include/linux/time.h */
+struct kernel_timespec {
+ long tv_sec;
+ long tv_nsec;
+};
+
+/* include/linux/time.h */
+struct kernel_timeval {
+ long tv_sec;
+ long tv_usec;
+};
+
+/* include/linux/resource.h */
+struct kernel_rusage {
+ struct kernel_timeval ru_utime;
+ struct kernel_timeval ru_stime;
+ long ru_maxrss;
+ long ru_ixrss;
+ long ru_idrss;
+ long ru_isrss;
+ long ru_minflt;
+ long ru_majflt;
+ long ru_nswap;
+ long ru_inblock;
+ long ru_oublock;
+ long ru_msgsnd;
+ long ru_msgrcv;
+ long ru_nsignals;
+ long ru_nvcsw;
+ long ru_nivcsw;
+};
+
+struct siginfo;
+#if defined(__i386__) || defined(__ARM_ARCH_3__) || defined(__PPC__)
+
+/* include/asm-{arm,i386,mips,ppc}/signal.h */
+struct kernel_old_sigaction {
+ union {
+ void (*sa_handler_)(int);
+ void (*sa_sigaction_)(int, struct siginfo *, void *);
+ };
+ unsigned long sa_mask;
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+} __attribute__((packed,aligned(4)));
+#elif (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
+ #define kernel_old_sigaction kernel_sigaction
+#endif
+
+/* Some kernel functions (e.g. sigaction() in 2.6.23) require that the
+ * size argument exactly match the size of the signal set, even though the
+ * API was intended to be extensible. We define our own KERNEL_NSIG to deal
+ * with this.
+ * Please note that glibc provides signals [1.._NSIG-1], whereas the
+ * kernel (and this header) provides the range [1..KERNEL_NSIG]. The
+ * actual number of signals is obviously the same, but the constants
+ * differ by one.
+ */
+#ifdef __mips__
+#define KERNEL_NSIG 128
+#else
+#define KERNEL_NSIG 64
+#endif
+
+/* include/asm-{arm,i386,mips,x86_64}/signal.h */
+struct kernel_sigset_t {
+ unsigned long sig[(KERNEL_NSIG + 8*sizeof(unsigned long) - 1)/
+ (8*sizeof(unsigned long))];
+};
+
+/* include/asm-{arm,i386,mips,x86_64,ppc}/signal.h */
+struct kernel_sigaction {
+#ifdef __mips__
+ unsigned long sa_flags;
+ union {
+ void (*sa_handler_)(int);
+ void (*sa_sigaction_)(int, struct siginfo *, void *);
+ };
+ struct kernel_sigset_t sa_mask;
+#else
+ union {
+ void (*sa_handler_)(int);
+ void (*sa_sigaction_)(int, struct siginfo *, void *);
+ };
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+ struct kernel_sigset_t sa_mask;
+#endif
+};
+
+/* include/linux/socket.h */
+struct kernel_sockaddr {
+ unsigned short sa_family;
+ char sa_data[14];
+};
+
+/* include/asm-{arm,i386,mips,ppc}/stat.h */
+#ifdef __mips__
+#if _MIPS_SIM == _MIPS_SIM_ABI64
+struct kernel_stat {
+#else
+struct kernel_stat64 {
+#endif
+ unsigned st_dev;
+ unsigned __pad0[3];
+ unsigned long long st_ino;
+ unsigned st_mode;
+ unsigned st_nlink;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned st_rdev;
+ unsigned __pad1[3];
+ long long st_size;
+ unsigned st_atime_;
+ unsigned st_atime_nsec_;
+ unsigned st_mtime_;
+ unsigned st_mtime_nsec_;
+ unsigned st_ctime_;
+ unsigned st_ctime_nsec_;
+ unsigned st_blksize;
+ unsigned __pad2;
+ unsigned long long st_blocks;
+};
+#elif defined __PPC__
+struct kernel_stat64 {
+ unsigned long long st_dev;
+ unsigned long long st_ino;
+ unsigned st_mode;
+ unsigned st_nlink;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned long long st_rdev;
+ unsigned short int __pad2;
+ long long st_size;
+ long st_blksize;
+ long long st_blocks;
+ long st_atime_;
+ unsigned long st_atime_nsec_;
+ long st_mtime_;
+ unsigned long st_mtime_nsec_;
+ long st_ctime_;
+ unsigned long st_ctime_nsec_;
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+#else
+struct kernel_stat64 {
+ unsigned long long st_dev;
+ unsigned char __pad0[4];
+ unsigned __st_ino;
+ unsigned st_mode;
+ unsigned st_nlink;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned long long st_rdev;
+ unsigned char __pad3[4];
+ long long st_size;
+ unsigned st_blksize;
+ unsigned long long st_blocks;
+ unsigned st_atime_;
+ unsigned st_atime_nsec_;
+ unsigned st_mtime_;
+ unsigned st_mtime_nsec_;
+ unsigned st_ctime_;
+ unsigned st_ctime_nsec_;
+ unsigned long long st_ino;
+};
+#endif
+
+/* include/asm-{arm,i386,mips,x86_64,ppc}/stat.h */
+#if defined(__i386__) || defined(__ARM_ARCH_3__)
+struct kernel_stat {
+ /* The kernel headers suggest that st_dev and st_rdev should be 32bit
+ * quantities encoding 12bit major and 20bit minor numbers in an interleaved
+ * format. In reality, we do not see useful data in the top bits. So,
+ * we'll leave the padding in here, until we find a better solution.
+ */
+ unsigned short st_dev;
+ short pad1;
+ unsigned st_ino;
+ unsigned short st_mode;
+ unsigned short st_nlink;
+ unsigned short st_uid;
+ unsigned short st_gid;
+ unsigned short st_rdev;
+ short pad2;
+ unsigned st_size;
+ unsigned st_blksize;
+ unsigned st_blocks;
+ unsigned st_atime_;
+ unsigned st_atime_nsec_;
+ unsigned st_mtime_;
+ unsigned st_mtime_nsec_;
+ unsigned st_ctime_;
+ unsigned st_ctime_nsec_;
+ unsigned __unused4;
+ unsigned __unused5;
+};
+#elif defined(__x86_64__)
+struct kernel_stat {
+ unsigned long st_dev;
+ unsigned long st_ino;
+ unsigned long st_nlink;
+ unsigned st_mode;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned __pad0;
+ unsigned long st_rdev;
+ long st_size;
+ long st_blksize;
+ long st_blocks;
+ unsigned long st_atime_;
+ unsigned long st_atime_nsec_;
+ unsigned long st_mtime_;
+ unsigned long st_mtime_nsec_;
+ unsigned long st_ctime_;
+ unsigned long st_ctime_nsec_;
+ long __unused[3];
+};
+#elif defined(__PPC__)
+struct kernel_stat {
+ unsigned st_dev;
+ unsigned long st_ino; // ino_t
+ unsigned long st_mode; // mode_t
+ unsigned short st_nlink; // nlink_t
+ unsigned st_uid; // uid_t
+ unsigned st_gid; // gid_t
+ unsigned st_rdev;
+ long st_size; // off_t
+ unsigned long st_blksize;
+ unsigned long st_blocks;
+ unsigned long st_atime_;
+ unsigned long st_atime_nsec_;
+ unsigned long st_mtime_;
+ unsigned long st_mtime_nsec_;
+ unsigned long st_ctime_;
+ unsigned long st_ctime_nsec_;
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+#elif (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64)
+struct kernel_stat {
+ unsigned st_dev;
+ int st_pad1[3];
+ unsigned st_ino;
+ unsigned st_mode;
+ unsigned st_nlink;
+ unsigned st_uid;
+ unsigned st_gid;
+ unsigned st_rdev;
+ int st_pad2[2];
+ long st_size;
+ int st_pad3;
+ long st_atime_;
+ long st_atime_nsec_;
+ long st_mtime_;
+ long st_mtime_nsec_;
+ long st_ctime_;
+ long st_ctime_nsec_;
+ int st_blksize;
+ int st_blocks;
+ int st_pad4[14];
+};
+#endif
+
+/* include/asm-{arm,i386,mips,x86_64,ppc}/statfs.h */
+#ifdef __mips__
+#if _MIPS_SIM != _MIPS_SIM_ABI64
+struct kernel_statfs64 {
+ unsigned long f_type;
+ unsigned long f_bsize;
+ unsigned long f_frsize;
+ unsigned long __pad;
+ unsigned long long f_blocks;
+ unsigned long long f_bfree;
+ unsigned long long f_files;
+ unsigned long long f_ffree;
+ unsigned long long f_bavail;
+ struct { int val[2]; } f_fsid;
+ unsigned long f_namelen;
+ unsigned long f_spare[6];
+};
+#endif
+#elif !defined(__x86_64__)
+struct kernel_statfs64 {
+ unsigned long f_type;
+ unsigned long f_bsize;
+ unsigned long long f_blocks;
+ unsigned long long f_bfree;
+ unsigned long long f_bavail;
+ unsigned long long f_files;
+ unsigned long long f_ffree;
+ struct { int val[2]; } f_fsid;
+ unsigned long f_namelen;
+ unsigned long f_frsize;
+ unsigned long f_spare[5];
+};
+#endif
+
+/* include/asm-{arm,i386,mips,x86_64,ppc,generic}/statfs.h */
+#ifdef __mips__
+struct kernel_statfs {
+ long f_type;
+ long f_bsize;
+ long f_frsize;
+ long f_blocks;
+ long f_bfree;
+ long f_files;
+ long f_ffree;
+ long f_bavail;
+ struct { int val[2]; } f_fsid;
+ long f_namelen;
+ long f_spare[6];
+};
+#else
+struct kernel_statfs {
+ /* x86_64 actually defines all these fields as signed, whereas all other */
+ /* platforms define them as unsigned. Leaving them at unsigned should not */
+ /* cause any problems. */
+ unsigned long f_type;
+ unsigned long f_bsize;
+ unsigned long f_blocks;
+ unsigned long f_bfree;
+ unsigned long f_bavail;
+ unsigned long f_files;
+ unsigned long f_ffree;
+ struct { int val[2]; } f_fsid;
+ unsigned long f_namelen;
+ unsigned long f_frsize;
+ unsigned long f_spare[5];
+};
+#endif
+
+
+/* Definitions missing from the standard header files */
+#ifndef O_DIRECTORY
+#if defined(__ARM_ARCH_3__)
+#define O_DIRECTORY 0040000
+#else
+#define O_DIRECTORY 0200000
+#endif
+#endif
+#ifndef NT_PRXFPREG
+#define NT_PRXFPREG 0x46e62b7f
+#endif
+#ifndef PTRACE_GETFPXREGS
+#define PTRACE_GETFPXREGS ((enum __ptrace_request)18)
+#endif
+#ifndef PR_GET_DUMPABLE
+#define PR_GET_DUMPABLE 3
+#endif
+#ifndef PR_SET_DUMPABLE
+#define PR_SET_DUMPABLE 4
+#endif
+#ifndef PR_GET_SECCOMP
+#define PR_GET_SECCOMP 21
+#endif
+#ifndef PR_SET_SECCOMP
+#define PR_SET_SECCOMP 22
+#endif
+#ifndef AT_FDCWD
+#define AT_FDCWD (-100)
+#endif
+#ifndef AT_SYMLINK_NOFOLLOW
+#define AT_SYMLINK_NOFOLLOW 0x100
+#endif
+#ifndef AT_REMOVEDIR
+#define AT_REMOVEDIR 0x200
+#endif
+#ifndef MREMAP_FIXED
+#define MREMAP_FIXED 2
+#endif
+#ifndef SA_RESTORER
+#define SA_RESTORER 0x04000000
+#endif
+#ifndef CPUCLOCK_PROF
+#define CPUCLOCK_PROF 0
+#endif
+#ifndef CPUCLOCK_VIRT
+#define CPUCLOCK_VIRT 1
+#endif
+#ifndef CPUCLOCK_SCHED
+#define CPUCLOCK_SCHED 2
+#endif
+#ifndef CPUCLOCK_PERTHREAD_MASK
+#define CPUCLOCK_PERTHREAD_MASK 4
+#endif
+#ifndef MAKE_PROCESS_CPUCLOCK
+#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
+ ((~(int)(pid) << 3) | (int)(clock))
+#endif
+#ifndef MAKE_THREAD_CPUCLOCK
+#define MAKE_THREAD_CPUCLOCK(tid, clock) \
+ ((~(int)(tid) << 3) | (int)((clock) | CPUCLOCK_PERTHREAD_MASK))
+#endif
+
+#if defined(__x86_64__)
+#ifndef ARCH_SET_GS
+#define ARCH_SET_GS 0x1001
+#endif
+#ifndef ARCH_GET_GS
+#define ARCH_GET_GS 0x1004
+#endif
+#endif
+
+#if defined(__i386__)
+#ifndef __NR_quotactl
+#define __NR_quotactl 131
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid 164
+#define __NR_getresuid 165
+#define __NR_setresgid 170
+#define __NR_getresgid 171
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigreturn 173
+#define __NR_rt_sigaction 174
+#define __NR_rt_sigprocmask 175
+#define __NR_rt_sigpending 176
+#define __NR_rt_sigsuspend 179
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64 180
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 181
+#endif
+#ifndef __NR_ugetrlimit
+#define __NR_ugetrlimit 191
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64 195
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64 197
+#endif
+#ifndef __NR_setresuid32
+#define __NR_setresuid32 208
+#define __NR_getresuid32 209
+#define __NR_setresgid32 210
+#define __NR_getresgid32 211
+#endif
+#ifndef __NR_setfsuid32
+#define __NR_setfsuid32 215
+#define __NR_setfsgid32 216
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 220
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead 225
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr 226
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr 227
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr 229
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr 230
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr 232
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr 233
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill 238
+#endif
+#ifndef __NR_futex
+#define __NR_futex 240
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 241
+#define __NR_sched_getaffinity 242
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address 258
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime 265
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres 266
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 268
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 269
+#endif
+#ifndef __NR_fadvise64_64
+#define __NR_fadvise64_64 272
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set 289
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get 290
+#endif
+#ifndef __NR_openat
+#define __NR_openat 295
+#endif
+#ifndef __NR_fstatat64
+#define __NR_fstatat64 300
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat 301
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages 317
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu 318
+#endif
+#ifndef __NR_fallocate
+#define __NR_fallocate 324
+#endif
+/* End of i386 definitions */
+#elif defined(__ARM_ARCH_3__)
+#ifndef __NR_setresuid
+#define __NR_setresuid (__NR_SYSCALL_BASE + 164)
+#define __NR_getresuid (__NR_SYSCALL_BASE + 165)
+#define __NR_setresgid (__NR_SYSCALL_BASE + 170)
+#define __NR_getresgid (__NR_SYSCALL_BASE + 171)
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigreturn (__NR_SYSCALL_BASE + 173)
+#define __NR_rt_sigaction (__NR_SYSCALL_BASE + 174)
+#define __NR_rt_sigprocmask (__NR_SYSCALL_BASE + 175)
+#define __NR_rt_sigpending (__NR_SYSCALL_BASE + 176)
+#define __NR_rt_sigsuspend (__NR_SYSCALL_BASE + 179)
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64 (__NR_SYSCALL_BASE + 180)
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 (__NR_SYSCALL_BASE + 181)
+#endif
+#ifndef __NR_ugetrlimit
+#define __NR_ugetrlimit (__NR_SYSCALL_BASE + 191)
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64 (__NR_SYSCALL_BASE + 195)
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64 (__NR_SYSCALL_BASE + 197)
+#endif
+#ifndef __NR_setresuid32
+#define __NR_setresuid32 (__NR_SYSCALL_BASE + 208)
+#define __NR_getresuid32 (__NR_SYSCALL_BASE + 209)
+#define __NR_setresgid32 (__NR_SYSCALL_BASE + 210)
+#define __NR_getresgid32 (__NR_SYSCALL_BASE + 211)
+#endif
+#ifndef __NR_setfsuid32
+#define __NR_setfsuid32 (__NR_SYSCALL_BASE + 215)
+#define __NR_setfsgid32 (__NR_SYSCALL_BASE + 216)
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 (__NR_SYSCALL_BASE + 217)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid (__NR_SYSCALL_BASE + 224)
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead (__NR_SYSCALL_BASE + 225)
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr (__NR_SYSCALL_BASE + 226)
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr (__NR_SYSCALL_BASE + 227)
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr (__NR_SYSCALL_BASE + 229)
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr (__NR_SYSCALL_BASE + 230)
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr (__NR_SYSCALL_BASE + 232)
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr (__NR_SYSCALL_BASE + 233)
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill (__NR_SYSCALL_BASE + 238)
+#endif
+#ifndef __NR_futex
+#define __NR_futex (__NR_SYSCALL_BASE + 240)
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity (__NR_SYSCALL_BASE + 241)
+#define __NR_sched_getaffinity (__NR_SYSCALL_BASE + 242)
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address (__NR_SYSCALL_BASE + 256)
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime (__NR_SYSCALL_BASE + 263)
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres (__NR_SYSCALL_BASE + 264)
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 (__NR_SYSCALL_BASE + 266)
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 (__NR_SYSCALL_BASE + 267)
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set (__NR_SYSCALL_BASE + 314)
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get (__NR_SYSCALL_BASE + 315)
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages (__NR_SYSCALL_BASE + 344)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu (__NR_SYSCALL_BASE + 345)
+#endif
+/* End of ARM 3 definitions */
+#elif defined(__x86_64__)
+#ifndef __NR_pread64
+#define __NR_pread64 17
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 18
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid 117
+#define __NR_getresuid 118
+#define __NR_setresgid 119
+#define __NR_getresgid 120
+#endif
+#ifndef __NR_quotactl
+#define __NR_quotactl 179
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid 186
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead 187
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr 188
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr 189
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr 191
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr 192
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr 194
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr 195
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill 200
+#endif
+#ifndef __NR_futex
+#define __NR_futex 202
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 203
+#define __NR_sched_getaffinity 204
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 217
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address 218
+#endif
+#ifndef __NR_fadvise64
+#define __NR_fadvise64 221
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime 228
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres 229
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set 251
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get 252
+#endif
+#ifndef __NR_openat
+#define __NR_openat 257
+#endif
+#ifndef __NR_newfstatat
+#define __NR_newfstatat 262
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat 263
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages 279
+#endif
+#ifndef __NR_fallocate
+#define __NR_fallocate 285
+#endif
+/* End of x86-64 definitions */
+#elif defined(__mips__)
+#if _MIPS_SIM == _MIPS_SIM_ABI32
+#ifndef __NR_setresuid
+#define __NR_setresuid (__NR_Linux + 185)
+#define __NR_getresuid (__NR_Linux + 186)
+#define __NR_setresgid (__NR_Linux + 190)
+#define __NR_getresgid (__NR_Linux + 191)
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigreturn (__NR_Linux + 193)
+#define __NR_rt_sigaction (__NR_Linux + 194)
+#define __NR_rt_sigprocmask (__NR_Linux + 195)
+#define __NR_rt_sigpending (__NR_Linux + 196)
+#define __NR_rt_sigsuspend (__NR_Linux + 199)
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64 (__NR_Linux + 200)
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 (__NR_Linux + 201)
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64 (__NR_Linux + 213)
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64 (__NR_Linux + 215)
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 (__NR_Linux + 219)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid (__NR_Linux + 222)
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead (__NR_Linux + 223)
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr (__NR_Linux + 224)
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr (__NR_Linux + 225)
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr (__NR_Linux + 227)
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr (__NR_Linux + 228)
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr (__NR_Linux + 230)
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr (__NR_Linux + 231)
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill (__NR_Linux + 236)
+#endif
+#ifndef __NR_futex
+#define __NR_futex (__NR_Linux + 238)
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity (__NR_Linux + 239)
+#define __NR_sched_getaffinity (__NR_Linux + 240)
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address (__NR_Linux + 252)
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 (__NR_Linux + 255)
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 (__NR_Linux + 256)
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime (__NR_Linux + 263)
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres (__NR_Linux + 264)
+#endif
+#ifndef __NR_openat
+#define __NR_openat (__NR_Linux + 288)
+#endif
+#ifndef __NR_fstatat
+#define __NR_fstatat (__NR_Linux + 293)
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat (__NR_Linux + 294)
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages (__NR_Linux + 308)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu (__NR_Linux + 312)
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set (__NR_Linux + 314)
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get (__NR_Linux + 315)
+#endif
+/* End of MIPS (old 32bit API) definitions */
+#elif _MIPS_SIM == _MIPS_SIM_ABI64
+#ifndef __NR_pread64
+#define __NR_pread64 (__NR_Linux + 16)
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 (__NR_Linux + 17)
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid (__NR_Linux + 115)
+#define __NR_getresuid (__NR_Linux + 116)
+#define __NR_setresgid (__NR_Linux + 117)
+#define __NR_getresgid (__NR_Linux + 118)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid (__NR_Linux + 178)
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead (__NR_Linux + 179)
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr (__NR_Linux + 180)
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr (__NR_Linux + 181)
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr (__NR_Linux + 183)
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr (__NR_Linux + 184)
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr (__NR_Linux + 186)
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr (__NR_Linux + 187)
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill (__NR_Linux + 192)
+#endif
+#ifndef __NR_futex
+#define __NR_futex (__NR_Linux + 194)
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity (__NR_Linux + 195)
+#define __NR_sched_getaffinity (__NR_Linux + 196)
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address (__NR_Linux + 212)
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime (__NR_Linux + 222)
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres (__NR_Linux + 223)
+#endif
+#ifndef __NR_openat
+#define __NR_openat (__NR_Linux + 247)
+#endif
+#ifndef __NR_fstatat
+#define __NR_fstatat (__NR_Linux + 252)
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat (__NR_Linux + 253)
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages (__NR_Linux + 267)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu (__NR_Linux + 271)
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set (__NR_Linux + 273)
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get (__NR_Linux + 274)
+#endif
+/* End of MIPS (64bit API) definitions */
+#else
+#ifndef __NR_setresuid
+#define __NR_setresuid (__NR_Linux + 115)
+#define __NR_getresuid (__NR_Linux + 116)
+#define __NR_setresgid (__NR_Linux + 117)
+#define __NR_getresgid (__NR_Linux + 118)
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid (__NR_Linux + 178)
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead (__NR_Linux + 179)
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr (__NR_Linux + 180)
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr (__NR_Linux + 181)
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr (__NR_Linux + 183)
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr (__NR_Linux + 184)
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr (__NR_Linux + 186)
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr (__NR_Linux + 187)
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill (__NR_Linux + 192)
+#endif
+#ifndef __NR_futex
+#define __NR_futex (__NR_Linux + 194)
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity (__NR_Linux + 195)
+#define __NR_sched_getaffinity (__NR_Linux + 196)
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address (__NR_Linux + 213)
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 (__NR_Linux + 217)
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 (__NR_Linux + 218)
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime (__NR_Linux + 226)
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres (__NR_Linux + 227)
+#endif
+#ifndef __NR_openat
+#define __NR_openat (__NR_Linux + 251)
+#endif
+#ifndef __NR_fstatat
+#define __NR_fstatat (__NR_Linux + 256)
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat (__NR_Linux + 257)
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages (__NR_Linux + 271)
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu (__NR_Linux + 275)
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set (__NR_Linux + 277)
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get (__NR_Linux + 278)
+#endif
+/* End of MIPS (new 32bit API) definitions */
+#endif
+/* End of MIPS definitions */
+#elif defined(__PPC__)
+#ifndef __NR_setfsuid
+#define __NR_setfsuid 138
+#define __NR_setfsgid 139
+#endif
+#ifndef __NR_setresuid
+#define __NR_setresuid 164
+#define __NR_getresuid 165
+#define __NR_setresgid 169
+#define __NR_getresgid 170
+#endif
+#ifndef __NR_rt_sigaction
+#define __NR_rt_sigreturn 172
+#define __NR_rt_sigaction 173
+#define __NR_rt_sigprocmask 174
+#define __NR_rt_sigpending 175
+#define __NR_rt_sigsuspend 178
+#endif
+#ifndef __NR_pread64
+#define __NR_pread64 179
+#endif
+#ifndef __NR_pwrite64
+#define __NR_pwrite64 180
+#endif
+#ifndef __NR_ugetrlimit
+#define __NR_ugetrlimit 190
+#endif
+#ifndef __NR_readahead
+#define __NR_readahead 191
+#endif
+#ifndef __NR_stat64
+#define __NR_stat64 195
+#endif
+#ifndef __NR_fstat64
+#define __NR_fstat64 197
+#endif
+#ifndef __NR_getdents64
+#define __NR_getdents64 202
+#endif
+#ifndef __NR_gettid
+#define __NR_gettid 207
+#endif
+#ifndef __NR_tkill
+#define __NR_tkill 208
+#endif
+#ifndef __NR_setxattr
+#define __NR_setxattr 209
+#endif
+#ifndef __NR_lsetxattr
+#define __NR_lsetxattr 210
+#endif
+#ifndef __NR_getxattr
+#define __NR_getxattr 212
+#endif
+#ifndef __NR_lgetxattr
+#define __NR_lgetxattr 213
+#endif
+#ifndef __NR_listxattr
+#define __NR_listxattr 215
+#endif
+#ifndef __NR_llistxattr
+#define __NR_llistxattr 216
+#endif
+#ifndef __NR_futex
+#define __NR_futex 221
+#endif
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 222
+#define __NR_sched_getaffinity 223
+#endif
+#ifndef __NR_set_tid_address
+#define __NR_set_tid_address 232
+#endif
+#ifndef __NR_clock_gettime
+#define __NR_clock_gettime 246
+#endif
+#ifndef __NR_clock_getres
+#define __NR_clock_getres 247
+#endif
+#ifndef __NR_statfs64
+#define __NR_statfs64 252
+#endif
+#ifndef __NR_fstatfs64
+#define __NR_fstatfs64 253
+#endif
+#ifndef __NR_fadvise64_64
+#define __NR_fadvise64_64 254
+#endif
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set 273
+#endif
+#ifndef __NR_ioprio_get
+#define __NR_ioprio_get 274
+#endif
+#ifndef __NR_openat
+#define __NR_openat 286
+#endif
+#ifndef __NR_fstatat64
+#define __NR_fstatat64 291
+#endif
+#ifndef __NR_unlinkat
+#define __NR_unlinkat 292
+#endif
+#ifndef __NR_move_pages
+#define __NR_move_pages 301
+#endif
+#ifndef __NR_getcpu
+#define __NR_getcpu 302
+#endif
+/* End of powerpc definitions */
+#endif
+
+
+/* After forking, we must make sure to only call system calls. */
+#if __BOUNDED_POINTERS__
+ #error "Need to port invocations of syscalls for bounded ptrs"
+#else
+ /* The core dumper and the thread lister get executed after threads
+ * have been suspended. As a consequence, we cannot call any functions
+ * that acquire locks. Unfortunately, libc wraps most system calls
+ * (e.g. in order to implement pthread_atfork, and to make calls
+ * cancellable), which means we cannot call these functions. Instead,
+ * we have to call syscall() directly.
+ */
+ #undef LSS_ERRNO
+ #ifdef SYS_ERRNO
+ /* Allow the including file to override the location of errno. This can
+ * be useful when using clone() with the CLONE_VM option.
+ */
+ #define LSS_ERRNO SYS_ERRNO
+ #else
+ #define LSS_ERRNO errno
+ #endif
+
+ #undef LSS_INLINE
+ #ifdef SYS_INLINE
+ #define LSS_INLINE SYS_INLINE
+ #else
+ #define LSS_INLINE static inline
+ #endif
+
+ /* Allow the including file to override the prefix used for all new
+ * system calls. By default, it will be set to "sys_".
+ */
+ #undef LSS_NAME
+ #ifndef SYS_PREFIX
+ #define LSS_NAME(name) sys_##name
+ #elif SYS_PREFIX < 0
+ #define LSS_NAME(name) name
+ #elif SYS_PREFIX == 0
+ #define LSS_NAME(name) sys0_##name
+ #elif SYS_PREFIX == 1
+ #define LSS_NAME(name) sys1_##name
+ #elif SYS_PREFIX == 2
+ #define LSS_NAME(name) sys2_##name
+ #elif SYS_PREFIX == 3
+ #define LSS_NAME(name) sys3_##name
+ #elif SYS_PREFIX == 4
+ #define LSS_NAME(name) sys4_##name
+ #elif SYS_PREFIX == 5
+ #define LSS_NAME(name) sys5_##name
+ #elif SYS_PREFIX == 6
+ #define LSS_NAME(name) sys6_##name
+ #elif SYS_PREFIX == 7
+ #define LSS_NAME(name) sys7_##name
+ #elif SYS_PREFIX == 8
+ #define LSS_NAME(name) sys8_##name
+ #elif SYS_PREFIX == 9
+ #define LSS_NAME(name) sys9_##name
+ #endif
+
+ #undef LSS_RETURN
+ #if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__))
+ /* Failing system calls return a negative result in the range of
+ * -1..-4095. These are "errno" values with the sign inverted.
+ */
+ #define LSS_RETURN(type, res) \
+ do { \
+ if ((unsigned long)(res) >= (unsigned long)(-4095)) { \
+ LSS_ERRNO = -(res); \
+ res = -1; \
+ } \
+ return (type) (res); \
+ } while (0)
+ #elif defined(__mips__)
+ /* On MIPS, a failing system call returns its errno value as the result
+ * ("res") and signals the failure in a separate CPU register ("err").
+ */
+ #define LSS_RETURN(type, res, err) \
+ do { \
+ if (err) { \
+ LSS_ERRNO = (res); \
+ res = -1; \
+ } \
+ return (type) (res); \
+ } while (0)
+ #elif defined(__PPC__)
+ /* On PPC, a failing system call hands back its errno value in "res" and
+ * flags the failure by setting bit 0x10000000 in "err" (a separate CPU
+ * register). See linux/unistd.h.
+ *
+ * "err" is parenthesized so that an arbitrary expression may be passed
+ * without operator precedence changing the meaning of the bit test.
+ * On failure errno is stored in LSS_ERRNO and -1 is returned; otherwise
+ * "res" is returned, cast to "type".
+ */
+ #define LSS_RETURN(type, res, err) \
+ do { \
+ if ((err) & 0x10000000) { \
+ LSS_ERRNO = (res); \
+ res = -1; \
+ } \
+ return (type) (res); \
+ } while (0)
+ #endif
+ #if defined(__i386__)
+ /* In PIC mode (e.g. when building shared libraries), gcc for i386
+ * reserves ebx. Unfortunately, most distributions ship with implementations
+ * of _syscallX() which clobber ebx.
+ * Also, most definitions of _syscallX() neglect to mark "memory" as being
+ * clobbered. This causes problems with compilers that do a better job
+ * at optimizing across __asm__ calls.
+ * So, we just have to redefine all of the _syscallX() macros.
+ */
+ #undef LSS_BODY
+ #define LSS_BODY(type,args...) \
+ long __res; \
+ __asm__ __volatile__("push %%ebx\n" \
+ "movl %2,%%ebx\n" \
+ "int $0x80\n" \
+ "pop %%ebx" \
+ args \
+ : "memory"); \
+ LSS_RETURN(type,__res)
+ #undef _syscall0
+ #define _syscall0(type,name) \
+ type LSS_NAME(name)(void) { \
+ long __res; \
+ __asm__ volatile("int $0x80" \
+ : "=a" (__res) \
+ : "0" (__NR_##name) \
+ : "memory"); \
+ LSS_RETURN(type,__res); \
+ }
+ #undef _syscall1
+ #define _syscall1(type,name,type1,arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ LSS_BODY(type, \
+ : "=a" (__res) \
+ : "0" (__NR_##name), "ri" ((long)(arg1))); \
+ }
+ #undef _syscall2
+ #define _syscall2(type,name,type1,arg1,type2,arg2) \
+ type LSS_NAME(name)(type1 arg1,type2 arg2) { \
+ LSS_BODY(type, \
+ : "=a" (__res) \
+ : "0" (__NR_##name),"ri" ((long)(arg1)), "c" ((long)(arg2))); \
+ }
+ #undef _syscall3
+ #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
+ type LSS_NAME(name)(type1 arg1,type2 arg2,type3 arg3) { \
+ LSS_BODY(type, \
+ : "=a" (__res) \
+ : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \
+ "d" ((long)(arg3))); \
+ }
+ #undef _syscall4
+ #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ LSS_BODY(type, \
+ : "=a" (__res) \
+ : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \
+ "d" ((long)(arg3)),"S" ((long)(arg4))); \
+ }
+ #undef _syscall5
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ long __res; \
+ __asm__ __volatile__("push %%ebx\n" \
+ "movl %2,%%ebx\n" \
+ "movl %1,%%eax\n" \
+ "int $0x80\n" \
+ "pop %%ebx" \
+ : "=a" (__res) \
+ : "i" (__NR_##name), "ri" ((long)(arg1)), \
+ "c" ((long)(arg2)), "d" ((long)(arg3)), \
+ "S" ((long)(arg4)), "D" ((long)(arg5)) \
+ : "memory"); \
+ LSS_RETURN(type,__res); \
+ }
+ #undef _syscall6
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ long __res; \
+ struct { long __a1; long __a6; } __s = { (long)arg1, (long) arg6 }; \
+ __asm__ __volatile__("push %%ebp\n" \
+ "push %%ebx\n" \
+ "movl 4(%2),%%ebp\n" \
+ "movl 0(%2), %%ebx\n" \
+ "movl %1,%%eax\n" \
+ "int $0x80\n" \
+ "pop %%ebx\n" \
+ "pop %%ebp" \
+ : "=a" (__res) \
+ : "i" (__NR_##name), "0" ((long)(&__s)), \
+ "c" ((long)(arg2)), "d" ((long)(arg3)), \
+ "S" ((long)(arg4)), "D" ((long)(arg5)) \
+ : "memory"); \
+ LSS_RETURN(type,__res); \
+ }
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ long __res;
+ __asm__ __volatile__(/* if (fn == NULL)
+ * return -EINVAL;
+ */
+ "movl %3,%%ecx\n"
+ "jecxz 1f\n"
+
+ /* if (child_stack == NULL)
+ * return -EINVAL;
+ */
+ "movl %4,%%ecx\n"
+ "jecxz 1f\n"
+
+ /* Set up alignment of the child stack:
+ * child_stack = (child_stack & ~0xF) - 20;
+ */
+ "andl $-16,%%ecx\n"
+ "subl $20,%%ecx\n"
+
+ /* Push "arg" and "fn" onto the stack that will be
+ * used by the child.
+ */
+ "movl %6,%%eax\n"
+ "movl %%eax,4(%%ecx)\n"
+ "movl %3,%%eax\n"
+ "movl %%eax,(%%ecx)\n"
+
+ /* %eax = syscall(%eax = __NR_clone,
+ * %ebx = flags,
+ * %ecx = child_stack,
+ * %edx = parent_tidptr,
+ * %esi = newtls,
+ * %edi = child_tidptr)
+ * Also, make sure that %ebx gets preserved as it is
+ * used in PIC mode.
+ */
+ "movl %8,%%esi\n"
+ "movl %7,%%edx\n"
+ "movl %5,%%eax\n"
+ "movl %9,%%edi\n"
+ "pushl %%ebx\n"
+ "movl %%eax,%%ebx\n"
+ "movl %2,%%eax\n"
+ "int $0x80\n"
+
+ /* In the parent: restore %ebx
+ * In the child: move "fn" into %ebx
+ */
+ "popl %%ebx\n"
+
+ /* if (%eax != 0)
+ * return %eax;
+ */
+ "test %%eax,%%eax\n"
+ "jnz 1f\n"
+
+ /* In the child, now. Terminate frame pointer chain.
+ */
+ "movl $0,%%ebp\n"
+
+ /* Call "fn". "arg" is already on the stack.
+ */
+ "call *%%ebx\n"
+
+ /* Call _exit(%ebx). Unfortunately older versions
+ * of gcc restrict the number of arguments that can
+ * be passed to asm(). So, we need to hard-code the
+ * system call number.
+ */
+ "movl %%eax,%%ebx\n"
+ "movl $1,%%eax\n"
+ "int $0x80\n"
+
+ /* Return to parent.
+ */
+ "1:\n"
+ : "=a" (__res)
+ : "0"(-EINVAL), "i"(__NR_clone),
+ "m"(fn), "m"(child_stack), "m"(flags), "m"(arg),
+ "m"(parent_tidptr), "m"(newtls), "m"(child_tidptr)
+ : "memory", "ecx", "edx", "esi", "edi");
+ LSS_RETURN(int, __res);
+ }
+
+ #define __NR__fadvise64_64 __NR_fadvise64_64
+ LSS_INLINE _syscall6(int, _fadvise64_64, int, fd,
+ unsigned, offset_lo, unsigned, offset_hi,
+ unsigned, len_lo, unsigned, len_hi,
+ int, advice)
+
+ /* fadvise64() for i386: forwards to the six-argument fadvise64_64 wrapper,
+ * handing over each 64-bit quantity as a (low word, high word) pair.
+ */
+ LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset,
+ loff_t len, int advice) {
+ const unsigned offset_lo = (unsigned)offset;
+ const unsigned offset_hi = (unsigned)(offset >> 32);
+ const unsigned len_lo = (unsigned)len;
+ const unsigned len_hi = (unsigned)(len >> 32);
+ return LSS_NAME(_fadvise64_64)(fd, offset_lo, offset_hi, len_lo, len_hi,
+ advice);
+ }
+
+ #define __NR__fallocate __NR_fallocate
+ LSS_INLINE _syscall6(int, _fallocate, int, fd,
+ int, mode,
+ unsigned, offset_lo, unsigned, offset_hi,
+ unsigned, len_lo, unsigned, len_hi)
+
+ /* fallocate() for i386: splits the 64-bit offset and length into two
+ * 32-bit words each (in memory order) and forwards them to the
+ * six-argument _fallocate wrapper.
+ */
+ LSS_INLINE int LSS_NAME(fallocate)(int fd, int mode,
+ loff_t offset, loff_t len) {
+ union { loff_t value; unsigned word[2]; } off_parts, len_parts;
+ off_parts.value = offset;
+ len_parts.value = len;
+ return LSS_NAME(_fallocate)(fd, mode,
+ off_parts.word[0], off_parts.word[1],
+ len_parts.word[0], len_parts.word[1]);
+ }
+
+ LSS_INLINE _syscall1(int, set_thread_area, void *, u)
+ LSS_INLINE _syscall1(int, get_thread_area, void *, u)
+
+ /* Returns the address of a code stub, embedded in this function's
+ * inline assembly, that issues the rt_sigreturn system call.
+ */
+ LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) {
+ /* On i386, the kernel does not know how to return from a signal
+ * handler. Instead, it relies on user space to provide a
+ * restorer function that calls the {rt_,}sigreturn() system call.
+ * Unfortunately, we cannot just reference the glibc version of this
+ * function, as glibc goes out of its way to make it inaccessible.
+ */
+ void (*res)(void);
+ /* "call 2f" pushes the address of label 0 and jumps over the stub, so
+ * the stub itself is never executed here. At label 2 that address is
+ * popped into the result and advanced by (1b-0b), i.e. past whatever
+ * padding ".align 16" inserted, which yields the address of label 1.
+ */
+ __asm__ __volatile__("call 2f\n"
+ "0:.align 16\n"
+ "1:movl %1,%%eax\n"
+ "int $0x80\n"
+ "2:popl %0\n"
+ "addl $(1b-0b),%0\n"
+ : "=a" (res)
+ : "i" (__NR_rt_sigreturn));
+ return res;
+ }
+ /* Returns the address of a code stub, embedded in this function's
+ * inline assembly, that issues the (non-rt) sigreturn system call.
+ */
+ LSS_INLINE void (*LSS_NAME(restore)(void))(void) {
+ /* On i386, the kernel does not know how to return from a signal
+ * handler. Instead, it relies on user space to provide a
+ * restorer function that calls the {rt_,}sigreturn() system call.
+ * Unfortunately, we cannot just reference the glibc version of this
+ * function, as glibc goes out of its way to make it inaccessible.
+ */
+ void (*res)(void);
+ /* "call 2f" pushes the address of label 0 and jumps over the stub, so
+ * the stub itself is never executed here. At label 2 that address is
+ * popped into the result and advanced by (1b-0b), i.e. past whatever
+ * padding ".align 16" inserted, which yields the address of label 1.
+ *
+ * Unlike restore_rt, the stub first pops one word off the stack before
+ * loading the system call number.
+ * NOTE(review): presumably this discards the signal-number argument
+ * left on the stack for the handler -- confirm against the kernel's
+ * non-rt signal frame layout.
+ */
+ __asm__ __volatile__("call 2f\n"
+ "0:.align 16\n"
+ "1:pop %%eax\n"
+ "movl %1,%%eax\n"
+ "int $0x80\n"
+ "2:popl %0\n"
+ "addl $(1b-0b),%0\n"
+ : "=a" (res)
+ : "i" (__NR_sigreturn));
+ return res;
+ }
+ #elif defined(__x86_64__)
+ /* There are no known problems with any of the _syscallX() macros
+ * currently shipping for x86_64, but we still need to be able to define
+ * our own version so that we can override the location of errno
+ * (e.g. when using the clone() system call with the CLONE_VM
+ * option).
+ */
+ #undef LSS_BODY
+ #define LSS_BODY(type,name, ...) \
+ long __res; \
+ __asm__ __volatile__("syscall" : "=a" (__res) : "0" (__NR_##name), \
+ ##__VA_ARGS__ : "r11", "rcx", "memory"); \
+ LSS_RETURN(type, __res)
+ #undef _syscall0
+ /* Defines a wrapper for a system call that takes no arguments. The
+ * parameter list is spelled "(void)" so that C callers get a real
+ * prototype; an empty "()" declares an unprototyped function in C.
+ */
+ #define _syscall0(type,name) \
+ type LSS_NAME(name)(void) { \
+ LSS_BODY(type, name); \
+ }
+ #undef _syscall1
+ #define _syscall1(type,name,type1,arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ LSS_BODY(type, name, "D" ((long)(arg1))); \
+ }
+ #undef _syscall2
+ #define _syscall2(type,name,type1,arg1,type2,arg2) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2) { \
+ LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2))); \
+ }
+ #undef _syscall3
+ #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \
+ LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2)), \
+ "d" ((long)(arg3))); \
+ }
+ #undef _syscall4
+ #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ long __res; \
+ __asm__ __volatile__("movq %5,%%r10; syscall" : \
+ "=a" (__res) : "0" (__NR_##name), \
+ "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \
+ "g" ((long)(arg4)) : "r10", "r11", "rcx", "memory"); \
+ LSS_RETURN(type, __res); \
+ }
+ #undef _syscall5
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ long __res; \
+ __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; syscall" : \
+ "=a" (__res) : "0" (__NR_##name), \
+ "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \
+ "g" ((long)(arg4)), "g" ((long)(arg5)) : \
+ "r8", "r10", "r11", "rcx", "memory"); \
+ LSS_RETURN(type, __res); \
+ }
+ #undef _syscall6
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ long __res; \
+ __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; movq %7,%%r9;" \
+ "syscall" : \
+ "=a" (__res) : "0" (__NR_##name), \
+ "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \
+ "g" ((long)(arg4)), "g" ((long)(arg5)), "g" ((long)(arg6)) : \
+ "r8", "r9", "r10", "r11", "rcx", "memory"); \
+ LSS_RETURN(type, __res); \
+ }
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ long __res;
+ {
+ register void *__tls __asm__("r8") = newtls;
+ register int *__ctid __asm__("r10") = child_tidptr;
+ __asm__ __volatile__(/* if (fn == NULL)
+ * return -EINVAL;
+ */
+ "testq %4,%4\n"
+ "jz 1f\n"
+
+ /* if (child_stack == NULL)
+ * return -EINVAL;
+ */
+ "testq %5,%5\n"
+ "jz 1f\n"
+
+ /* childstack -= 2*sizeof(void *);
+ */
+ "subq $16,%5\n"
+
+ /* Push "arg" and "fn" onto the stack that will be
+ * used by the child.
+ */
+ "movq %7,8(%5)\n"
+ "movq %4,0(%5)\n"
+
+ /* %rax = syscall(%rax = __NR_clone,
+ * %rdi = flags,
+ * %rsi = child_stack,
+ * %rdx = parent_tidptr,
+ * %r8 = new_tls,
+ * %r10 = child_tidptr)
+ */
+ "movq %2,%%rax\n"
+ "syscall\n"
+
+ /* if (%rax != 0)
+ * return;
+ */
+ "testq %%rax,%%rax\n"
+ "jnz 1f\n"
+
+ /* In the child. Terminate frame pointer chain.
+ */
+ "xorq %%rbp,%%rbp\n"
+
+ /* Call "fn(arg)".
+ */
+ "popq %%rax\n"
+ "popq %%rdi\n"
+ "call *%%rax\n"
+
+ /* Call _exit(%rdi), using the return value of "fn" as the exit status.
+ */
+ "movq %%rax,%%rdi\n"
+ "movq %3,%%rax\n"
+ "syscall\n"
+
+ /* Return to parent.
+ */
+ "1:\n"
+ : "=a" (__res)
+ : "0"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit),
+ "r"(fn), "S"(child_stack), "D"(flags), "r"(arg),
+ "d"(parent_tidptr), "r"(__tls), "r"(__ctid)
+ : "memory", "r11", "rcx");
+ }
+ LSS_RETURN(int, __res);
+ }
+ LSS_INLINE _syscall2(int, arch_prctl, int, c, void *, a)
+ LSS_INLINE _syscall4(int, fadvise64, int, fd, loff_t, offset, loff_t, len,
+ int, advice)
+
+ /* Returns the address of a code stub, embedded in this function's
+ * inline assembly, that issues the rt_sigreturn system call.
+ */
+ LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) {
+ /* On x86-64, the kernel does not know how to return from
+ * a signal handler. Instead, it relies on user space to provide a
+ * restorer function that calls the rt_sigreturn() system call.
+ * Unfortunately, we cannot just reference the glibc version of this
+ * function, as glibc goes out of its way to make it inaccessible.
+ */
+ void (*res)(void);
+ /* "call 2f" pushes the address of label 0 and jumps over the stub, so
+ * the stub itself is never executed here. At label 2 that address is
+ * popped into the result and advanced by (1b-0b), i.e. past whatever
+ * padding ".align 16" inserted, which yields the address of label 1.
+ */
+ __asm__ __volatile__("call 2f\n"
+ "0:.align 16\n"
+ "1:movq %1,%%rax\n"
+ "syscall\n"
+ "2:popq %0\n"
+ "addq $(1b-0b),%0\n"
+ : "=a" (res)
+ : "i" (__NR_rt_sigreturn));
+ return res;
+ }
+ #elif defined(__ARM_ARCH_3__)
+ /* Most definitions of _syscallX() neglect to mark "memory" as being
+ * clobbered. This causes problems with compilers that do a better job
+ * at optimizing across __asm__ calls.
+ * So, we just have to redefine all of the _syscallX() macros.
+ */
+ #undef LSS_REG
+ #define LSS_REG(r,a) register long __r##r __asm__("r"#r) = (long)a
+ #undef LSS_BODY
+ #define LSS_BODY(type,name,args...) \
+ register long __res_r0 __asm__("r0"); \
+ long __res; \
+ __asm__ __volatile__ (__syscall(name) \
+ : "=r"(__res_r0) : args : "lr", "memory"); \
+ __res = __res_r0; \
+ LSS_RETURN(type, __res)
+ #undef _syscall0
+ /* Defines a wrapper for a system call that takes no arguments. The
+ * parameter list is spelled "(void)" so that C callers get a real
+ * prototype; an empty "()" declares an unprototyped function in C.
+ */
+ #define _syscall0(type, name) \
+ type LSS_NAME(name)(void) { \
+ LSS_BODY(type, name); \
+ }
+ #undef _syscall1
+ #define _syscall1(type, name, type1, arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ LSS_REG(0, arg1); LSS_BODY(type, name, "r"(__r0)); \
+ }
+ #undef _syscall2
+ #define _syscall2(type, name, type1, arg1, type2, arg2) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1)); \
+ }
+ #undef _syscall3
+ #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2)); \
+ }
+ #undef _syscall4
+ #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_REG(3, arg4); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3)); \
+ }
+ #undef _syscall5
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_REG(3, arg4); LSS_REG(4, arg5); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \
+ "r"(__r4)); \
+ }
+ #undef _syscall6
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_REG(3, arg4); LSS_REG(4, arg5); LSS_REG(5, arg6); \
+ LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \
+ "r"(__r4), "r"(__r5)); \
+ }
+ /* ARM clone(): issues the clone system call and, in the child, calls
+ * fn(arg) on the supplied child stack and then issues the exit system call
+ * with fn's result in r0.
+ *
+ * If "fn" or "child_stack" is NULL, -EINVAL is placed in the result without
+ * making the system call. Otherwise a non-zero system call result is
+ * returned as-is. In every case the value goes through LSS_RETURN(int, ...).
+ */
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ long __res;
+ {
+ /* The system call arguments are pinned to r0..r4. */
+ register int __flags __asm__("r0") = flags;
+ register void *__stack __asm__("r1") = child_stack;
+ register void *__ptid __asm__("r2") = parent_tidptr;
+ register void *__tls __asm__("r3") = newtls;
+ register int *__ctid __asm__("r4") = child_tidptr;
+ /* Operand map: %0 = __res, %1 = -EINVAL, %2 = fn, %3 = child stack,
+ * %4 = flags, %5 = arg.
+ */
+ __asm__ __volatile__(/* if (fn == NULL || child_stack == NULL)
+ * return -EINVAL;
+ */
+ "cmp %2,#0\n"
+ "cmpne %3,#0\n"
+ "moveq %0,%1\n"
+ "beq 1f\n"
+
+ /* Push "arg" and "fn" onto the stack that will be
+ * used by the child.
+ */
+ "str %5,[%3,#-4]!\n"
+ "str %2,[%3,#-4]!\n"
+
+ /* %r0 = syscall(%r0 = flags,
+ * %r1 = child_stack,
+ * %r2 = parent_tidptr,
+ * %r3 = newtls,
+ * %r4 = child_tidptr)
+ */
+ __syscall(clone)"\n"
+
+ /* if (%r0 != 0)
+ * return %r0;
+ */
+ "movs %0,r0\n"
+ "bne 1f\n"
+
+ /* In the child, now. Call "fn(arg)".
+ * "arg" is reloaded from [sp, #4] and "fn" from [sp],
+ * where they were stored above.
+ */
+ "ldr r0,[sp, #4]\n"
+ "mov lr,pc\n"
+ "ldr pc,[sp]\n"
+
+ /* Call _exit(%r0).
+ */
+ __syscall(exit)"\n"
+ "1:\n"
+ : "=r" (__res)
+ : "i"(-EINVAL),
+ "r"(fn), "r"(__stack), "r"(__flags), "r"(arg),
+ "r"(__ptid), "r"(__tls), "r"(__ctid)
+ : "lr", "memory");
+ }
+ LSS_RETURN(int, __res);
+ }
+ #elif defined(__mips__)
+ /* LSS_REG(r, a): declares the register variable __r<r>, bound to MIPS
+ * register $<r>, and initializes it with "a" cast to unsigned long.
+ */
+ #undef LSS_REG
+ #define LSS_REG(r,a) register unsigned long __r##r __asm__("$"#r) = \
+ (unsigned long)(a)
+ /* LSS_BODY(type, name, r7, ...): loads __NR_<name> into $2, executes
+ * "syscall", and passes both $2 (__v0) and $7 (__r7) to LSS_RETURN().
+ * "r7" is the constraint string for the __r7 output operand; callers in
+ * this file pass "=r" when __r7 is only an output and "+r" when it also
+ * carries the fourth argument. The remaining arguments are the asm input
+ * operands.
+ */
+ #undef LSS_BODY
+ #define LSS_BODY(type,name,r7,...) \
+ register unsigned long __v0 __asm__("$2") = __NR_##name; \
+ __asm__ __volatile__ ("syscall\n" \
+ : "=&r"(__v0), r7 (__r7) \
+ : "0"(__v0), ##__VA_ARGS__ \
+ : "$8", "$9", "$10", "$11", "$12", \
+ "$13", "$14", "$15", "$24", "memory"); \
+ LSS_RETURN(type, __v0, __r7)
+ /* _syscall0() .. _syscall4(): each macro expands to the definition of
+ * LSS_NAME(name). Arguments one to four are bound to $4..$7 with LSS_REG().
+ * For zero to three arguments, __r7 is declared separately as an
+ * uninitialized output-only register ("=r"). With four arguments,
+ * LSS_REG(7, arg4) declares __r7, and it is passed as an in/out operand
+ * ("+r").
+ */
+ #undef _syscall0
+ #define _syscall0(type, name) \
+ type LSS_NAME(name)() { \
+ register unsigned long __r7 __asm__("$7"); \
+ LSS_BODY(type, name, "=r"); \
+ }
+ #undef _syscall1
+ #define _syscall1(type, name, type1, arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ register unsigned long __r7 __asm__("$7"); \
+ LSS_REG(4, arg1); LSS_BODY(type, name, "=r", "r"(__r4)); \
+ }
+ #undef _syscall2
+ #define _syscall2(type, name, type1, arg1, type2, arg2) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2) { \
+ register unsigned long __r7 __asm__("$7"); \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); \
+ LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5)); \
+ }
+ #undef _syscall3
+ #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \
+ register unsigned long __r7 __asm__("$7"); \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5), "r"(__r6)); \
+ }
+ #undef _syscall4
+ #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); \
+ LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6)); \
+ }
+ #undef _syscall5
+ #if _MIPS_SIM == _MIPS_SIM_ABI32
+ /* The old 32bit MIPS system call API passes the fifth and sixth argument
+ * on the stack, whereas the new APIs use registers "r8" and "r9".
+ *
+ * This variant loads arg5 from its memory operand (%6) into $2, lowers the
+ * stack pointer ($29) by 32 bytes, stores the value at offset 16 of the
+ * new frame, and only then loads the system call number into $2. The
+ * stack pointer is restored after the system call.
+ */
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); \
+ register unsigned long __v0 __asm__("$2"); \
+ __asm__ __volatile__ (".set noreorder\n" \
+ "lw $2, %6\n" \
+ "subu $29, 32\n" \
+ "sw $2, 16($29)\n" \
+ "li $2, %2\n" \
+ "syscall\n" \
+ "addiu $29, 32\n" \
+ ".set reorder\n" \
+ : "=&r"(__v0), "+r" (__r7) \
+ : "i" (__NR_##name), "r"(__r4), "r"(__r5), \
+ "r"(__r6), "m" ((unsigned long)arg5) \
+ : "$8", "$9", "$10", "$11", "$12", \
+ "$13", "$14", "$15", "$24", "memory"); \
+ LSS_RETURN(type, __v0, __r7); \
+ }
+ #else
+ /* All other MIPS ABIs: the fifth argument is bound to $8 and the generic
+ * LSS_BODY() is used.
+ */
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); LSS_REG(8, arg5); \
+ LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \
+ "r"(__r8)); \
+ }
+ #endif
+ #undef _syscall6
+ #if _MIPS_SIM == _MIPS_SIM_ABI32
+ /* The old 32bit MIPS system call API passes the fifth and sixth argument
+ * on the stack, whereas the new APIs use registers "r8" and "r9".
+ *
+ * arg5 and arg6 are register operands (%6 and %7, "r" constraints), so
+ * they are stored straight into the outgoing argument slots at offsets 16
+ * and 20 of the freshly reserved 32-byte frame. They must not be accessed
+ * with "lw", which requires a memory operand. The early-clobber output in
+ * $2 and the clobber list keep the compiler from placing either operand in
+ * $2 or $8..$15, so loading the system call number into $2 afterwards
+ * cannot overwrite them.
+ */
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); \
+ register unsigned long __v0 __asm__("$2"); \
+ __asm__ __volatile__ (".set noreorder\n" \
+ "subu $29, 32\n" \
+ "sw %6, 16($29)\n" \
+ "sw %7, 20($29)\n" \
+ "li $2, %2\n" \
+ "syscall\n" \
+ "addiu $29, 32\n" \
+ ".set reorder\n" \
+ : "=&r"(__v0), "+r" (__r7) \
+ : "i" (__NR_##name), "r"(__r4), "r"(__r5), \
+ "r"(__r6), "r" ((unsigned long)arg5), \
+ "r" ((unsigned long)arg6) \
+ : "$8", "$9", "$10", "$11", "$12", \
+ "$13", "$14", "$15", "$24", "memory"); \
+ LSS_RETURN(type, __v0, __r7); \
+ }
+ #else
+ /* All other MIPS ABIs: the fifth and sixth arguments are bound to $8 and
+ * $9 and the generic LSS_BODY() is used.
+ */
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5,type6 arg6) { \
+ LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \
+ LSS_REG(7, arg4); LSS_REG(8, arg5); LSS_REG(9, arg6); \
+ LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \
+ "r"(__r8), "r"(__r9)); \
+ }
+ #endif
+ /* MIPS clone(): issues the clone system call and, in the child, calls
+ * fn(arg) on the supplied child stack and then issues the exit system call
+ * with the value fn left in $2.
+ *
+ * $2 is preloaded with -EINVAL, and the system call is skipped when "fn" or
+ * "child_stack" is zero. The final $2/$7 pair is handed to
+ * LSS_RETURN(int, __v0, __r7).
+ *
+ * Operand map: %0 = __v0 ($2), %1 = __r7 ($7), %2 = -EINVAL,
+ * %3 = __NR_clone, %4 = __NR_exit, %5 = fn, %6 = child stack ($5),
+ * %7 = flags ($4), %8 = arg, %9 = parent_tidptr ($6), %10 = newtls ($7),
+ * %11 = child_tidptr ($8).
+ */
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ register unsigned long __v0 __asm__("$2");
+ register unsigned long __r7 __asm__("$7") = (unsigned long)newtls;
+ {
+ register int __flags __asm__("$4") = flags;
+ register void *__stack __asm__("$5") = child_stack;
+ register void *__ptid __asm__("$6") = parent_tidptr;
+ register int *__ctid __asm__("$8") = child_tidptr;
+ __asm__ __volatile__(
+ /* Lower the caller's stack pointer; the matching adjustment at label 1
+ * restores it.
+ */
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+ "subu $29,24\n"
+ #elif _MIPS_SIM == _MIPS_SIM_NABI32
+ "sub $29,16\n"
+ #else
+ "dsubu $29,16\n"
+ #endif
+
+ /* if (fn == NULL || child_stack == NULL)
+ * return -EINVAL;
+ */
+ "li %0,%2\n"
+ "beqz %5,1f\n"
+ "beqz %6,1f\n"
+
+ /* Push "arg" and "fn" onto the stack that will be
+ * used by the child.
+ */
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+ "subu %6,32\n"
+ "sw %5,0(%6)\n"
+ "sw %8,4(%6)\n"
+ #elif _MIPS_SIM == _MIPS_SIM_NABI32
+ "sub %6,32\n"
+ "sw %5,0(%6)\n"
+ "sw %8,8(%6)\n"
+ #else
+ "dsubu %6,32\n"
+ "sd %5,0(%6)\n"
+ "sd %8,8(%6)\n"
+ #endif
+
+ /* $7 = syscall($4 = flags,
+ * $5 = child_stack,
+ * $6 = parent_tidptr,
+ * $7 = newtls,
+ * $8 = child_tidptr)
+ */
+ "li $2,%3\n"
+ "syscall\n"
+
+ /* if ($7 != 0)
+ * return $2;
+ * A non-zero $2 likewise branches to the common exit at label 1.
+ */
+ "bnez $7,1f\n"
+ "bnez $2,1f\n"
+
+ /* In the child, now. Call "fn(arg)".
+ * "fn" is reloaded into $25 and "arg" into $4 from the slots that
+ * were written on the child stack above.
+ */
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+ "lw $25,0($29)\n"
+ "lw $4,4($29)\n"
+ #elif _MIPS_SIM == _MIPS_SIM_NABI32
+ "lw $25,0($29)\n"
+ "lw $4,8($29)\n"
+ #else
+ "ld $25,0($29)\n"
+ "ld $4,8($29)\n"
+ #endif
+ "jalr $25\n"
+
+ /* Call _exit($2)
+ */
+ "move $4,$2\n"
+ "li $2,%4\n"
+ "syscall\n"
+
+ "1:\n"
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32
+ "addu $29, 24\n"
+ #elif _MIPS_SIM == _MIPS_SIM_NABI32
+ "add $29, 16\n"
+ #else
+ "daddu $29,16\n"
+ #endif
+ : "=&r" (__v0), "=r" (__r7)
+ : "i"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit),
+ "r"(fn), "r"(__stack), "r"(__flags), "r"(arg),
+ "r"(__ptid), "r"(__r7), "r"(__ctid)
+ : "$9", "$10", "$11", "$12", "$13", "$14", "$15",
+ "$24", "memory");
+ }
+ LSS_RETURN(int, __v0, __r7);
+ }
+ #elif defined (__PPC__)
+ /* LSS_LOADARGS_<n>(name, arg1..argn): assigns the system call number
+ * __NR_<name> to __sc_0 and the n arguments, cast to unsigned long, to
+ * __sc_3 .. __sc_8. Each level chains to the previous one. The __sc_*
+ * variables are declared by LSS_BODY().
+ */
+ #undef LSS_LOADARGS_0
+ #define LSS_LOADARGS_0(name, dummy...) \
+ __sc_0 = __NR_##name
+ #undef LSS_LOADARGS_1
+ #define LSS_LOADARGS_1(name, arg1) \
+ LSS_LOADARGS_0(name); \
+ __sc_3 = (unsigned long) (arg1)
+ #undef LSS_LOADARGS_2
+ #define LSS_LOADARGS_2(name, arg1, arg2) \
+ LSS_LOADARGS_1(name, arg1); \
+ __sc_4 = (unsigned long) (arg2)
+ #undef LSS_LOADARGS_3
+ #define LSS_LOADARGS_3(name, arg1, arg2, arg3) \
+ LSS_LOADARGS_2(name, arg1, arg2); \
+ __sc_5 = (unsigned long) (arg3)
+ #undef LSS_LOADARGS_4
+ #define LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4) \
+ LSS_LOADARGS_3(name, arg1, arg2, arg3); \
+ __sc_6 = (unsigned long) (arg4)
+ #undef LSS_LOADARGS_5
+ #define LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5) \
+ LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4); \
+ __sc_7 = (unsigned long) (arg5)
+ #undef LSS_LOADARGS_6
+ #define LSS_LOADARGS_6(name, arg1, arg2, arg3, arg4, arg5, arg6) \
+ LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5); \
+ __sc_8 = (unsigned long) (arg6)
+ /* LSS_ASMINPUT_<n>: the asm input operand list for an n-argument system
+ * call. Each entry uses a matching constraint ("0", "1", ...) that ties the
+ * input to the output operand with the same index in LSS_BODY().
+ */
+ #undef LSS_ASMINPUT_0
+ #define LSS_ASMINPUT_0 "0" (__sc_0)
+ #undef LSS_ASMINPUT_1
+ #define LSS_ASMINPUT_1 LSS_ASMINPUT_0, "1" (__sc_3)
+ #undef LSS_ASMINPUT_2
+ #define LSS_ASMINPUT_2 LSS_ASMINPUT_1, "2" (__sc_4)
+ #undef LSS_ASMINPUT_3
+ #define LSS_ASMINPUT_3 LSS_ASMINPUT_2, "3" (__sc_5)
+ #undef LSS_ASMINPUT_4
+ #define LSS_ASMINPUT_4 LSS_ASMINPUT_3, "4" (__sc_6)
+ #undef LSS_ASMINPUT_5
+ #define LSS_ASMINPUT_5 LSS_ASMINPUT_4, "5" (__sc_7)
+ #undef LSS_ASMINPUT_6
+ #define LSS_ASMINPUT_6 LSS_ASMINPUT_5, "6" (__sc_8)
+ /* LSS_BODY(nr, type, name, args...): body of an nr-argument PowerPC system
+ * call wrapper. It declares register variables for r0 and r3..r8, loads
+ * them through LSS_LOADARGS_<nr>(), executes "sc", and then copies the
+ * condition register into operand 0 (r0) with "mfcr". The value left in r3
+ * becomes __sc_ret and the condition register snapshot becomes __sc_err.
+ * Both are passed to LSS_RETURN().
+ */
+ #undef LSS_BODY
+ #define LSS_BODY(nr, type, name, args...) \
+ long __sc_ret, __sc_err; \
+ { \
+ register unsigned long __sc_0 __asm__ ("r0"); \
+ register unsigned long __sc_3 __asm__ ("r3"); \
+ register unsigned long __sc_4 __asm__ ("r4"); \
+ register unsigned long __sc_5 __asm__ ("r5"); \
+ register unsigned long __sc_6 __asm__ ("r6"); \
+ register unsigned long __sc_7 __asm__ ("r7"); \
+ register unsigned long __sc_8 __asm__ ("r8"); \
+ \
+ LSS_LOADARGS_##nr(name, args); \
+ __asm__ __volatile__ \
+ ("sc\n\t" \
+ "mfcr %0" \
+ : "=&r" (__sc_0), \
+ "=&r" (__sc_3), "=&r" (__sc_4), \
+ "=&r" (__sc_5), "=&r" (__sc_6), \
+ "=&r" (__sc_7), "=&r" (__sc_8) \
+ : LSS_ASMINPUT_##nr \
+ : "cr0", "ctr", "memory", \
+ "r9", "r10", "r11", "r12"); \
+ __sc_ret = __sc_3; \
+ __sc_err = __sc_0; \
+ } \
+ LSS_RETURN(type, __sc_ret, __sc_err)
+ /* _syscall0() .. _syscall6(): each macro expands to the definition of
+ * LSS_NAME(name) with the listed (type, arg) parameters and forwards the
+ * argument count and arguments to LSS_BODY().
+ */
+ #undef _syscall0
+ #define _syscall0(type, name) \
+ type LSS_NAME(name)(void) { \
+ LSS_BODY(0, type, name); \
+ }
+ #undef _syscall1
+ #define _syscall1(type, name, type1, arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ LSS_BODY(1, type, name, arg1); \
+ }
+ #undef _syscall2
+ #define _syscall2(type, name, type1, arg1, type2, arg2) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2) { \
+ LSS_BODY(2, type, name, arg1, arg2); \
+ }
+ #undef _syscall3
+ #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \
+ LSS_BODY(3, type, name, arg1, arg2, arg3); \
+ }
+ #undef _syscall4
+ #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ LSS_BODY(4, type, name, arg1, arg2, arg3, arg4); \
+ }
+ #undef _syscall5
+ #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ LSS_BODY(5, type, name, arg1, arg2, arg3, arg4, arg5); \
+ }
+ #undef _syscall6
+ #define _syscall6(type, name, type1, arg1, type2, arg2, type3, arg3, \
+ type4, arg4, type5, arg5, type6, arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ LSS_BODY(6, type, name, arg1, arg2, arg3, arg4, arg5, arg6); \
+ }
+ /* clone function adapted from glibc 2.3.6 clone.S */
+ /* TODO(csilvers): consider wrapping some args up in a struct, like we
+ * do for i386's _syscall6, so we can compile successfully on gcc 2.95
+ */
+ /* PowerPC clone(): issues the clone system call and, in the child, calls
+ * fn(arg) and then issues the exit system call with fn's result in r3.
+ *
+ * Operand map: %0 = __ret, %1 = __err, %4 = __NR_clone, %5 = __NR_exit,
+ * %6 = fn (r8), %7 = child stack (r4), %8 = flags (r3), %9 = arg (r9).
+ * __ret and __err start out as -1 and EINVAL through the matching
+ * constraints "0" and "1". The final pair is passed to LSS_RETURN().
+ */
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ long __ret, __err;
+ {
+ register int (*__fn)(void *) __asm__ ("r8") = fn;
+ register void *__cstack __asm__ ("r4") = child_stack;
+ register int __flags __asm__ ("r3") = flags;
+ register void * __arg __asm__ ("r9") = arg;
+ register int * __ptidptr __asm__ ("r5") = parent_tidptr;
+ register void * __newtls __asm__ ("r6") = newtls;
+ register int * __ctidptr __asm__ ("r7") = child_tidptr;
+ __asm__ __volatile__(
+ /* check for fn == NULL
+ * and child_stack == NULL
+ */
+ "cmpwi cr0, %6, 0\n\t"
+ "cmpwi cr1, %7, 0\n\t"
+ "cror cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t"
+ "beq- cr0, 1f\n\t"
+
+ /* set up stack frame for child */
+ "clrrwi %7, %7, 4\n\t"
+ "li 0, 0\n\t"
+ "stwu 0, -16(%7)\n\t"
+
+ /* fn, child_stack and arg are saved across the syscall in
+ * r28, r29 and r27 respectively
+ */
+ "mr 28, %6\n\t"
+ "mr 29, %7\n\t"
+ "mr 27, %9\n\t"
+
+ /* syscall */
+ "li 0, %4\n\t"
+ /* flags already in r3
+ * child_stack already in r4
+ * ptidptr already in r5
+ * newtls already in r6
+ * ctidptr already in r7
+ */
+ "sc\n\t"
+
+ /* Test if syscall was successful */
+ "cmpwi cr1, 3, 0\n\t"
+ "crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t"
+ "bne- cr1, 1f\n\t"
+
+ /* Do the function call */
+ "mtctr 28\n\t"
+ "mr 3, 27\n\t"
+ "bctrl\n\t"
+
+ /* Call _exit(r3) */
+ "li 0, %5\n\t"
+ "sc\n\t"
+
+ /* Return to parent */
+ "1:\n"
+ "mfcr %1\n\t"
+ "mr %0, 3\n\t"
+ : "=r" (__ret), "=r" (__err)
+ : "0" (-1), "1" (EINVAL),
+ "i" (__NR_clone), "i" (__NR_exit),
+ "r" (__fn), "r" (__cstack), "r" (__flags),
+ "r" (__arg), "r" (__ptidptr), "r" (__newtls),
+ "r" (__ctidptr)
+ : "cr0", "cr1", "memory", "ctr",
+ "r0", "r29", "r27", "r28");
+ }
+ LSS_RETURN(int, __ret, __err);
+ }
+ #endif
+ /* Architecture-independent wrappers. Each _syscallN() line below expands
+ * to the definition of LSS_NAME(<name>) with the listed (type, name)
+ * parameter pairs. The __NR__<name> aliases map the underscore-prefixed
+ * wrapper names (_exit, _gettid, _mremap) onto the regular system call
+ * numbers, because the macros derive the number from __NR_<name>.
+ */
+ #define __NR__exit __NR_exit
+ #define __NR__gettid __NR_gettid
+ #define __NR__mremap __NR_mremap
+ LSS_INLINE _syscall1(int, brk, void *, e)
+ LSS_INLINE _syscall1(int, chdir, const char *,p)
+ LSS_INLINE _syscall1(int, close, int, f)
+ LSS_INLINE _syscall2(int, clock_getres, int, c,
+ struct kernel_timespec*, t)
+ LSS_INLINE _syscall2(int, clock_gettime, int, c,
+ struct kernel_timespec*, t)
+ LSS_INLINE _syscall1(int, dup, int, f)
+ LSS_INLINE _syscall2(int, dup2, int, s,
+ int, d)
+ LSS_INLINE _syscall3(int, execve, const char*, f,
+ const char*const*,a,const char*const*, e)
+ LSS_INLINE _syscall1(int, _exit, int, e)
+ LSS_INLINE _syscall1(int, exit_group, int, e)
+ LSS_INLINE _syscall3(int, fcntl, int, f,
+ int, c, long, a)
+ LSS_INLINE _syscall0(pid_t, fork)
+ LSS_INLINE _syscall2(int, fstat, int, f,
+ struct kernel_stat*, b)
+ LSS_INLINE _syscall2(int, fstatfs, int, f,
+ struct kernel_statfs*, b)
+ LSS_INLINE _syscall2(int, ftruncate, int, f,
+ off_t, l)
+ LSS_INLINE _syscall4(int, futex, int*, a,
+ int, o, int, v,
+ struct kernel_timespec*, t)
+ LSS_INLINE _syscall3(int, getdents, int, f,
+ struct kernel_dirent*, d, int, c)
+ LSS_INLINE _syscall3(int, getdents64, int, f,
+ struct kernel_dirent64*, d, int, c)
+ LSS_INLINE _syscall0(gid_t, getegid)
+ LSS_INLINE _syscall0(uid_t, geteuid)
+ LSS_INLINE _syscall0(pid_t, getpgrp)
+ LSS_INLINE _syscall0(pid_t, getpid)
+ LSS_INLINE _syscall0(pid_t, getppid)
+ LSS_INLINE _syscall2(int, getpriority, int, a,
+ int, b)
+ LSS_INLINE _syscall3(int, getresgid, gid_t *, r,
+ gid_t *, e, gid_t *, s)
+ LSS_INLINE _syscall3(int, getresuid, uid_t *, r,
+ uid_t *, e, uid_t *, s)
+ LSS_INLINE _syscall2(int, getrlimit, int, r,
+ struct kernel_rlimit*, l)
+ LSS_INLINE _syscall1(pid_t, getsid, pid_t, p)
+ LSS_INLINE _syscall0(pid_t, _gettid)
+ LSS_INLINE _syscall2(int, gettimeofday, struct timeval *, v,
+ struct timezone *, z)
+ /* More architecture-independent wrappers, generated the same way: each
+ * _syscallN() line defines LSS_NAME(<name>) with the listed parameters.
+ * quotactl is only generated when the kernel headers define __NR_quotactl.
+ */
+ LSS_INLINE _syscall5(int, setxattr, const char *,p,
+ const char *, n, const void *,v,
+ size_t, s, int, f)
+ LSS_INLINE _syscall5(int, lsetxattr, const char *,p,
+ const char *, n, const void *,v,
+ size_t, s, int, f)
+ LSS_INLINE _syscall4(ssize_t, getxattr, const char *,p,
+ const char *, n, void *, v, size_t, s)
+ LSS_INLINE _syscall4(ssize_t, lgetxattr, const char *,p,
+ const char *, n, void *, v, size_t, s)
+ LSS_INLINE _syscall3(ssize_t, listxattr, const char *,p,
+ char *, l, size_t, s)
+ LSS_INLINE _syscall3(ssize_t, llistxattr, const char *,p,
+ char *, l, size_t, s)
+ LSS_INLINE _syscall3(int, ioctl, int, d,
+ int, r, void *, a)
+ LSS_INLINE _syscall2(int, ioprio_get, int, which,
+ int, who)
+ LSS_INLINE _syscall3(int, ioprio_set, int, which,
+ int, who, int, ioprio)
+ LSS_INLINE _syscall2(int, kill, pid_t, p,
+ int, s)
+ LSS_INLINE _syscall3(off_t, lseek, int, f,
+ off_t, o, int, w)
+ LSS_INLINE _syscall2(int, munmap, void*, s,
+ size_t, l)
+ LSS_INLINE _syscall6(long, move_pages, pid_t, p,
+ unsigned long, n, void **,g, int *, d,
+ int *, s, int, f)
+ LSS_INLINE _syscall3(int, mprotect, const void *,a,
+ size_t, l, int, p)
+ LSS_INLINE _syscall5(void*, _mremap, void*, o,
+ size_t, os, size_t, ns,
+ unsigned long, f, void *, a)
+ LSS_INLINE _syscall3(int, open, const char*, p,
+ int, f, int, m)
+ LSS_INLINE _syscall3(int, poll, struct kernel_pollfd*, u,
+ unsigned int, n, int, t)
+ LSS_INLINE _syscall2(int, prctl, int, o,
+ long, a)
+ LSS_INLINE _syscall4(long, ptrace, int, r,
+ pid_t, p, void *, a, void *, d)
+ #if defined(__NR_quotactl)
+ // Defined on x86_64 / i386 only
+ LSS_INLINE _syscall4(int, quotactl, int, cmd, const char *, special,
+ int, id, caddr_t, addr)
+ #endif
+ /* Remaining architecture-independent wrappers (I/O, realtime signals,
+ * scheduling, credentials, stat, thread signalling). Each _syscallN() line
+ * defines LSS_NAME(<name>) with the listed parameters. sigreturn and getcpu
+ * are only generated when the corresponding __NR_ constant is defined.
+ */
+ LSS_INLINE _syscall3(ssize_t, read, int, f,
+ void *, b, size_t, c)
+ LSS_INLINE _syscall3(int, readlink, const char*, p,
+ char*, b, size_t, s)
+ LSS_INLINE _syscall4(int, rt_sigaction, int, s,
+ const struct kernel_sigaction*, a,
+ struct kernel_sigaction*, o, size_t, c)
+ LSS_INLINE _syscall2(int, rt_sigpending, struct kernel_sigset_t *, s,
+ size_t, c)
+ LSS_INLINE _syscall4(int, rt_sigprocmask, int, h,
+ const struct kernel_sigset_t*, s,
+ struct kernel_sigset_t*, o, size_t, c);
+ LSS_INLINE _syscall1(int, rt_sigreturn, unsigned long, u);
+ LSS_INLINE _syscall2(int, rt_sigsuspend,
+ const struct kernel_sigset_t*, s, size_t, c);
+ LSS_INLINE _syscall3(int, sched_getaffinity,pid_t, p,
+ unsigned int, l, unsigned long *, m)
+ LSS_INLINE _syscall3(int, sched_setaffinity,pid_t, p,
+ unsigned int, l, unsigned long *, m)
+ LSS_INLINE _syscall0(int, sched_yield)
+ LSS_INLINE _syscall1(long, set_tid_address, int *, t)
+ LSS_INLINE _syscall1(int, setfsgid, gid_t, g)
+ LSS_INLINE _syscall1(int, setfsuid, uid_t, u)
+ LSS_INLINE _syscall1(int, setuid, uid_t, u)
+ LSS_INLINE _syscall1(int, setgid, gid_t, g)
+ LSS_INLINE _syscall2(int, setpgid, pid_t, p,
+ pid_t, g)
+ LSS_INLINE _syscall3(int, setpriority, int, a,
+ int, b, int, p)
+ LSS_INLINE _syscall3(int, setresgid, gid_t, r,
+ gid_t, e, gid_t, s)
+ LSS_INLINE _syscall3(int, setresuid, uid_t, r,
+ uid_t, e, uid_t, s)
+ LSS_INLINE _syscall2(int, setrlimit, int, r,
+ const struct kernel_rlimit*, l)
+ LSS_INLINE _syscall0(pid_t, setsid)
+ LSS_INLINE _syscall2(int, sigaltstack, const stack_t*, s,
+ const stack_t*, o)
+ #if defined(__NR_sigreturn)
+ LSS_INLINE _syscall1(int, sigreturn, unsigned long, u);
+ #endif
+ LSS_INLINE _syscall2(int, stat, const char*, f,
+ struct kernel_stat*, b)
+ LSS_INLINE _syscall2(int, statfs, const char*, f,
+ struct kernel_statfs*, b)
+ LSS_INLINE _syscall3(int, tgkill, pid_t, p,
+ pid_t, t, int, s)
+ LSS_INLINE _syscall2(int, tkill, pid_t, p,
+ int, s)
+ LSS_INLINE _syscall3(ssize_t, write, int, f,
+ const void *, b, size_t, c)
+ LSS_INLINE _syscall3(ssize_t, writev, int, f,
+ const struct kernel_iovec*, v, size_t, c)
+ LSS_INLINE _syscall1(int, unlink, const char*, f)
+ #if defined(__NR_getcpu)
+ LSS_INLINE _syscall3(long, getcpu, unsigned *, cpu,
+ unsigned *, node, void *, unused);
+ #endif
+ /* Socket operations are generated as individual system call wrappers on
+ * x86-64 and on MIPS ABIs other than the old 32-bit one.
+ */
+ #if defined(__x86_64__) || \
+ (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32)
+ LSS_INLINE _syscall3(int, recvmsg, int, s,
+ struct kernel_msghdr*, m, int, f)
+ LSS_INLINE _syscall3(int, sendmsg, int, s,
+ const struct kernel_msghdr*, m, int, f)
+ LSS_INLINE _syscall6(int, sendto, int, s,
+ const void*, m, size_t, l,
+ int, f,
+ const struct kernel_sockaddr*, a, int, t)
+ LSS_INLINE _syscall2(int, shutdown, int, s,
+ int, h)
+ LSS_INLINE _syscall3(int, socket, int, d,
+ int, t, int, p)
+ LSS_INLINE _syscall4(int, socketpair, int, d,
+ int, t, int, p, int*, s)
+ #endif
+ #if defined(__x86_64__)
+ /* fallocate(fd, mode, offset, len): generated only in this x86-64 section
+ * via the four-argument _syscall4() macro.
+ */
+ LSS_INLINE _syscall4(int, fallocate, int, fd, int, mode,
+ loff_t, offset, loff_t, len)
+
+ /* getresgid32(): in this x86-64 section it is a plain alias that forwards
+ * the three result pointers to LSS_NAME(getresgid) and returns its result.
+ */
+ LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid,
+ gid_t *egid,
+ gid_t *sgid) {
+ const int status = LSS_NAME(getresgid)(rgid, egid, sgid);
+ return status;
+ }
+
+ /* getresuid32(): in this x86-64 section it is a plain alias that forwards
+ * the three result pointers to LSS_NAME(getresuid) and returns its result.
+ */
+ LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid,
+ uid_t *euid,
+ uid_t *suid) {
+ const int status = LSS_NAME(getresuid)(ruid, euid, suid);
+ return status;
+ }
+
+ /* mmap(): six-argument wrapper generated in this x86-64 section; the file
+ * offset parameter "o" has type __off64_t.
+ */
+ LSS_INLINE _syscall6(void*, mmap, void*, s,
+ size_t, l, int, p,
+ int, f, int, d,
+ __off64_t, o)
+
+ /* newfstatat(d, p, b, f): four-argument wrapper that fills a
+ * struct kernel_stat.
+ */
+ LSS_INLINE _syscall4(int, newfstatat, int, d,
+ const char *, p,
+ struct kernel_stat*, b, int, f)
+
+ /* setfsgid32(): x86-64 alias that forwards to LSS_NAME(setfsgid). */
+ LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) {
+ const int status = LSS_NAME(setfsgid)(gid);
+ return status;
+ }
+
+ /* setfsuid32(): x86-64 alias that forwards to LSS_NAME(setfsuid). */
+ LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) {
+ const int status = LSS_NAME(setfsuid)(uid);
+ return status;
+ }
+
+ /* setresgid32(): x86-64 alias that forwards to LSS_NAME(setresgid). */
+ LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) {
+ const int status = LSS_NAME(setresgid)(rgid, egid, sgid);
+ return status;
+ }
+
+ /* setresuid32(): x86-64 alias that forwards to LSS_NAME(setresuid). */
+ LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) {
+ const int status = LSS_NAME(setresuid)(ruid, euid, suid);
+ return status;
+ }
+
+ /* sigaction() for x86-64.
+ *
+ * On x86_64, the kernel requires us to always set our own SA_RESTORER in
+ * order to be able to return from a signal handler. This function must
+ * have a "magic" signature that the "gdb" (and maybe the kernel?) can
+ * recognize.
+ *
+ * If the caller passes an action without SA_RESTORER, a private copy is
+ * made, the flag is set and the restorer is pointed at the stub returned by
+ * LSS_NAME(restore_rt)(). The caller's structure is never modified. A NULL
+ * "act", or one that already carries SA_RESTORER, is passed through as-is.
+ */
+ LSS_INLINE int LSS_NAME(sigaction)(int signum,
+ const struct kernel_sigaction *act,
+ struct kernel_sigaction *oldact) {
+ struct kernel_sigaction with_restorer;
+ const struct kernel_sigaction *effective = act;
+ if (act != NULL && !(act->sa_flags & SA_RESTORER)) {
+ with_restorer = *act;
+ with_restorer.sa_flags |= SA_RESTORER;
+ with_restorer.sa_restorer = LSS_NAME(restore_rt)();
+ effective = &with_restorer;
+ }
+ return LSS_NAME(rt_sigaction)(signum, effective, oldact,
+ (KERNEL_NSIG+7)/8);
+ }
+
+ /* sigpending(): x86-64 forwards to rt_sigpending with the signal set size
+ * in bytes, (KERNEL_NSIG+7)/8.
+ */
+ LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) {
+ const int status = LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8);
+ return status;
+ }
+
+ /* sigprocmask(): x86-64 forwards to rt_sigprocmask with the signal set
+ * size in bytes, (KERNEL_NSIG+7)/8.
+ */
+ LSS_INLINE int LSS_NAME(sigprocmask)(int how,
+ const struct kernel_sigset_t *set,
+ struct kernel_sigset_t *oldset) {
+ const int status = LSS_NAME(rt_sigprocmask)(how, set, oldset,
+ (KERNEL_NSIG+7)/8);
+ return status;
+ }
+
+ /* sigsuspend(): x86-64 forwards to rt_sigsuspend with the signal set size
+ * in bytes, (KERNEL_NSIG+7)/8.
+ */
+ LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) {
+ const int status = LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8);
+ return status;
+ }
+ #endif
+ #if defined(__x86_64__) || defined(__ARM_ARCH_3__) || \
+ (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32)
+ /* wait4(p, s, o, r): four-argument system call wrapper. */
+ LSS_INLINE _syscall4(pid_t, wait4, pid_t, p,
+ int*, s, int, o,
+ struct kernel_rusage*, r)
+
+ /* waitpid(): implemented on top of wait4() with a null resource-usage
+ * pointer.
+ */
+ LSS_INLINE pid_t LSS_NAME(waitpid)(pid_t pid, int *status, int options){
+ const pid_t reaped = LSS_NAME(wait4)(pid, status, options, 0);
+ return reaped;
+ }
+ #endif
+ /* openat() and unlinkat() wrappers are generated for i386 and x86-64
+ * only.
+ */
+ #if defined(__i386__) || defined(__x86_64__)
+ LSS_INLINE _syscall4(int, openat, int, d, const char *, p, int, f, int, m)
+ LSS_INLINE _syscall3(int, unlinkat, int, d, const char *, p, int, f)
+ #endif
+ #if defined(__i386__) || defined(__ARM_ARCH_3__)
+ /* Raw wrappers for the 32-bit-ID system calls. They are generated under
+ * underscore-prefixed names (the __NR__<name> aliases supply the system
+ * call numbers) so that the public getresgid32() etc. defined further down
+ * can add their own fallback logic around them.
+ */
+ #define __NR__getresgid32 __NR_getresgid32
+ #define __NR__getresuid32 __NR_getresuid32
+ #define __NR__setfsgid32 __NR_setfsgid32
+ #define __NR__setfsuid32 __NR_setfsuid32
+ #define __NR__setresgid32 __NR_setresgid32
+ #define __NR__setresuid32 __NR_setresuid32
+ LSS_INLINE _syscall2(int, ugetrlimit, int, r,
+ struct kernel_rlimit*, l)
+ LSS_INLINE _syscall3(int, _getresgid32, gid_t *, r,
+ gid_t *, e, gid_t *, s)
+ LSS_INLINE _syscall3(int, _getresuid32, uid_t *, r,
+ uid_t *, e, uid_t *, s)
+ LSS_INLINE _syscall1(int, _setfsgid32, gid_t, f)
+ LSS_INLINE _syscall1(int, _setfsuid32, uid_t, f)
+ LSS_INLINE _syscall3(int, _setresgid32, gid_t, r,
+ gid_t, e, gid_t, s)
+ LSS_INLINE _syscall3(int, _setresuid32, uid_t, r,
+ uid_t, e, uid_t, s)
+
+ /* getresgid32(): reads the real, effective and saved group IDs.
+ *
+ * The 32-bit system call is tried first. If it fails with ENOSYS, the
+ * 16-bit getresgid() is used instead. A NULL pointer in that fallback path
+ * is reported like any other failure in this file: LSS_ERRNO is set to
+ * EFAULT and -1 is returned, rather than a positive errno value that a
+ * "< 0" check would miss.
+ */
+ LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid,
+ gid_t *egid,
+ gid_t *sgid) {
+ int rc;
+ if ((rc = LSS_NAME(_getresgid32)(rgid, egid, sgid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((rgid == NULL) || (egid == NULL) || (sgid == NULL)) {
+ LSS_ERRNO = EFAULT;
+ return -1;
+ }
+ // Clear the high bits first, since getresgid only sets 16 bits
+ *rgid = *egid = *sgid = 0;
+ rc = LSS_NAME(getresgid)(rgid, egid, sgid);
+ }
+ return rc;
+ }
+
+ /* getresuid32(): reads the real, effective and saved user IDs.
+ *
+ * The 32-bit system call is tried first. If it fails with ENOSYS, the
+ * 16-bit getresuid() is used instead. A NULL pointer in that fallback path
+ * is reported like any other failure in this file: LSS_ERRNO is set to
+ * EFAULT and -1 is returned, rather than a positive errno value that a
+ * "< 0" check would miss.
+ */
+ LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid,
+ uid_t *euid,
+ uid_t *suid) {
+ int rc;
+ if ((rc = LSS_NAME(_getresuid32)(ruid, euid, suid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((ruid == NULL) || (euid == NULL) || (suid == NULL)) {
+ LSS_ERRNO = EFAULT;
+ return -1;
+ }
+ // Clear the high bits first, since getresuid only sets 16 bits
+ *ruid = *euid = *suid = 0;
+ rc = LSS_NAME(getresuid)(ruid, euid, suid);
+ }
+ return rc;
+ }
+
+ /* setfsgid32(): sets the filesystem group ID.
+ *
+ * The 32-bit system call is tried first. If it fails with ENOSYS, the
+ * 16-bit setfsgid() is used, provided "gid" fits into 16 bits. An ID that
+ * does not fit is rejected with LSS_ERRNO = EINVAL and a return value of
+ * -1; a positive EINVAL could not be told apart from an ordinary
+ * non-negative result.
+ */
+ LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) {
+ int rc;
+ if ((rc = LSS_NAME(_setfsgid32)(gid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((unsigned int)gid & ~0xFFFFu) {
+ LSS_ERRNO = EINVAL;
+ rc = -1;
+ } else {
+ rc = LSS_NAME(setfsgid)(gid);
+ }
+ }
+ return rc;
+ }
+
+ /* setfsuid32(): sets the filesystem user ID.
+ *
+ * The 32-bit system call is tried first. If it fails with ENOSYS, the
+ * 16-bit setfsuid() is used, provided "uid" fits into 16 bits. An ID that
+ * does not fit is rejected with LSS_ERRNO = EINVAL and a return value of
+ * -1; a positive EINVAL could not be told apart from an ordinary
+ * non-negative result.
+ */
+ LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) {
+ int rc;
+ if ((rc = LSS_NAME(_setfsuid32)(uid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((unsigned int)uid & ~0xFFFFu) {
+ LSS_ERRNO = EINVAL;
+ rc = -1;
+ } else {
+ rc = LSS_NAME(setfsuid)(uid);
+ }
+ }
+ return rc;
+ }
+
+ /* setresgid32(): sets the real, effective and saved group IDs.
+ *
+ * The 32-bit system call is tried first. If it fails with ENOSYS, the
+ * 16-bit setresgid() is used, provided every ID fits into 16 bits. An ID
+ * that does not fit is rejected with LSS_ERRNO = EINVAL and a return value
+ * of -1, so that callers testing for "< 0" see the failure.
+ */
+ LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) {
+ int rc;
+ if ((rc = LSS_NAME(_setresgid32)(rgid, egid, sgid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((unsigned int)rgid & ~0xFFFFu ||
+ (unsigned int)egid & ~0xFFFFu ||
+ (unsigned int)sgid & ~0xFFFFu) {
+ LSS_ERRNO = EINVAL;
+ rc = -1;
+ } else {
+ rc = LSS_NAME(setresgid)(rgid, egid, sgid);
+ }
+ }
+ return rc;
+ }
+
+ /* setresuid32(): sets the real, effective and saved user IDs.
+ *
+ * The 32-bit system call is tried first. If it fails with ENOSYS, the
+ * 16-bit setresuid() is used, provided every ID fits into 16 bits. An ID
+ * that does not fit is rejected with LSS_ERRNO = EINVAL and a return value
+ * of -1, so that callers testing for "< 0" see the failure.
+ */
+ LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) {
+ int rc;
+ if ((rc = LSS_NAME(_setresuid32)(ruid, euid, suid)) < 0 &&
+ LSS_ERRNO == ENOSYS) {
+ if ((unsigned int)ruid & ~0xFFFFu ||
+ (unsigned int)euid & ~0xFFFFu ||
+ (unsigned int)suid & ~0xFFFFu) {
+ LSS_ERRNO = EINVAL;
+ rc = -1;
+ } else {
+ rc = LSS_NAME(setresuid)(ruid, euid, suid);
+ }
+ }
+ return rc;
+ }
+ #endif
+ /* sigemptyset(): clears every bit of the signal set. Always returns 0. */
+ LSS_INLINE int LSS_NAME(sigemptyset)(struct kernel_sigset_t *set) {
+ size_t word;
+ for (word = 0; word < sizeof(set->sig)/sizeof(set->sig[0]); ++word) {
+ set->sig[word] = 0;
+ }
+ return 0;
+ }
+
+ /* sigfillset(): sets every bit of the signal set. Always returns 0. */
+ LSS_INLINE int LSS_NAME(sigfillset)(struct kernel_sigset_t *set) {
+ size_t word;
+ for (word = 0; word < sizeof(set->sig)/sizeof(set->sig[0]); ++word) {
+ set->sig[word] = ~0UL;
+ }
+ return 0;
+ }
+
+ /* sigaddset(): adds signal "signum" (numbered from 1) to the set.
+ * Returns 0 on success. If "signum" is below 1 or beyond the number of bits
+ * in the set, LSS_ERRNO is set to EINVAL and -1 is returned.
+ */
+ LSS_INLINE int LSS_NAME(sigaddset)(struct kernel_sigset_t *set,
+ int signum) {
+ const size_t bits_per_word = 8*sizeof(set->sig[0]);
+ size_t bit;
+ if (signum < 1 || signum > (int)(8*sizeof(set->sig))) {
+ LSS_ERRNO = EINVAL;
+ return -1;
+ }
+ /* Signal N lives at zero-based bit position N-1. */
+ bit = (size_t)(signum - 1);
+ set->sig[bit / bits_per_word] |= 1UL << (bit % bits_per_word);
+ return 0;
+ }
+
+ /* sigdelset(): removes signal "signum" (numbered from 1) from the set.
+ * Returns 0 on success. If "signum" is below 1 or beyond the number of bits
+ * in the set, LSS_ERRNO is set to EINVAL and -1 is returned.
+ */
+ LSS_INLINE int LSS_NAME(sigdelset)(struct kernel_sigset_t *set,
+ int signum) {
+ const size_t bits_per_word = 8*sizeof(set->sig[0]);
+ size_t bit;
+ if (signum < 1 || signum > (int)(8*sizeof(set->sig))) {
+ LSS_ERRNO = EINVAL;
+ return -1;
+ }
+ /* Signal N lives at zero-based bit position N-1. */
+ bit = (size_t)(signum - 1);
+ set->sig[bit / bits_per_word] &= ~(1UL << (bit % bits_per_word));
+ return 0;
+ }
+
+ /* sigismember(): tests whether signal "signum" (numbered from 1) is in the
+ * set. Returns 1 if it is and 0 if it is not. If "signum" is below 1 or
+ * beyond the number of bits in the set, LSS_ERRNO is set to EINVAL and -1
+ * is returned.
+ */
+ LSS_INLINE int LSS_NAME(sigismember)(struct kernel_sigset_t *set,
+ int signum) {
+ const size_t bits_per_word = 8*sizeof(set->sig[0]);
+ size_t bit;
+ if (signum < 1 || signum > (int)(8*sizeof(set->sig))) {
+ LSS_ERRNO = EINVAL;
+ return -1;
+ }
+ /* Signal N lives at zero-based bit position N-1. */
+ bit = (size_t)(signum - 1);
+ return !!(set->sig[bit / bits_per_word] & (1UL << (bit % bits_per_word)));
+ }
+ #if defined(__i386__) || defined(__ARM_ARCH_3__) || \
+ (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || defined(__PPC__)
+ /* Legacy system calls for the 32-bit architectures selected by the
+ * enclosing #if. The non-realtime signal calls and socketcall are
+ * generated under underscore-prefixed names (the __NR__<name> aliases
+ * supply the numbers), so that the public sigaction() etc. defined below
+ * can use them as a fallback. In this section mmap takes a single pointer
+ * argument, and _sigsuspend takes one argument on PowerPC and three
+ * elsewhere.
+ */
+ #define __NR__sigaction __NR_sigaction
+ #define __NR__sigpending __NR_sigpending
+ #define __NR__sigprocmask __NR_sigprocmask
+ #define __NR__sigsuspend __NR_sigsuspend
+ #define __NR__socketcall __NR_socketcall
+ LSS_INLINE _syscall2(int, fstat64, int, f,
+ struct kernel_stat64 *, b)
+ LSS_INLINE _syscall5(int, _llseek, uint, fd, ulong, hi, ulong, lo,
+ loff_t *, res, uint, wh)
+ LSS_INLINE _syscall1(void*, mmap, void*, a)
+ LSS_INLINE _syscall6(void*, mmap2, void*, s,
+ size_t, l, int, p,
+ int, f, int, d,
+ __off64_t, o)
+ LSS_INLINE _syscall3(int, _sigaction, int, s,
+ const struct kernel_old_sigaction*, a,
+ struct kernel_old_sigaction*, o)
+ LSS_INLINE _syscall1(int, _sigpending, unsigned long*, s)
+ LSS_INLINE _syscall3(int, _sigprocmask, int, h,
+ const unsigned long*, s,
+ unsigned long*, o)
+ #ifdef __PPC__
+ LSS_INLINE _syscall1(int, _sigsuspend, unsigned long, s)
+ #else
+ LSS_INLINE _syscall3(int, _sigsuspend, const void*, a,
+ int, b,
+ unsigned long, s)
+ #endif
+ LSS_INLINE _syscall2(int, stat64, const char *, p,
+ struct kernel_stat64 *, b)
+
+ /* sigaction() with a fallback to the legacy system call.
+ *
+ * rt_sigaction() is tried first, using a local copy of "*act" (on i386 the
+ * copy gets a restorer when the caller supplied none). If that fails with
+ * ENOSYS, LSS_ERRNO is restored to its value on entry and the legacy
+ * _sigaction() is called with a kernel_old_sigaction built from the
+ * caller's "*act". On success the old action is translated back into
+ * "*oldact".
+ */
+ LSS_INLINE int LSS_NAME(sigaction)(int signum,
+ const struct kernel_sigaction *act,
+ struct kernel_sigaction *oldact) {
+ int old_errno = LSS_ERRNO;
+ int rc;
+ struct kernel_sigaction a;
+ if (act != NULL) {
+ a = *act;
+ #ifdef __i386__
+ /* On i386, the kernel requires us to always set our own
+ * SA_RESTORER when using realtime signals. Otherwise, it does not
+ * know how to return from a signal handler. This function must have
+ * a "magic" signature that the "gdb" (and maybe the kernel?) can
+ * recognize.
+ * Apparently, a SA_RESTORER is implicitly set by the kernel, when
+ * using non-realtime signals.
+ *
+ * TODO: Test whether ARM needs a restorer
+ */
+ if (!(a.sa_flags & SA_RESTORER)) {
+ a.sa_flags |= SA_RESTORER;
+ a.sa_restorer = (a.sa_flags & SA_SIGINFO)
+ ? LSS_NAME(restore_rt)() : LSS_NAME(restore)();
+ }
+ #endif
+ }
+ /* Pass the local copy when an action was supplied, NULL otherwise. */
+ rc = LSS_NAME(rt_sigaction)(signum, act ? &a : act, oldact,
+ (KERNEL_NSIG+7)/8);
+ if (rc < 0 && LSS_ERRNO == ENOSYS) {
+ struct kernel_old_sigaction oa, ooa, *ptr_a = &oa, *ptr_oa = &ooa;
+ if (!act) {
+ ptr_a = NULL;
+ } else {
+ /* The legacy structure is filled from the caller's "*act", not from
+ * the local copy "a" prepared above.
+ */
+ oa.sa_handler_ = act->sa_handler_;
+ memcpy(&oa.sa_mask, &act->sa_mask, sizeof(oa.sa_mask));
+ #ifndef __mips__
+ oa.sa_restorer = act->sa_restorer;
+ #endif
+ oa.sa_flags = act->sa_flags;
+ }
+ if (!oldact) {
+ ptr_oa = NULL;
+ }
+ /* Discard the ENOSYS left behind by the rt_sigaction() attempt. */
+ LSS_ERRNO = old_errno;
+ rc = LSS_NAME(_sigaction)(signum, ptr_a, ptr_oa);
+ if (rc == 0 && oldact) {
+ /* Give every byte of "*oldact" a defined value first (a copy of
+ * "*act", or zeros), then overwrite the fields that the legacy call
+ * reported.
+ */
+ if (act) {
+ memcpy(oldact, act, sizeof(*act));
+ } else {
+ memset(oldact, 0, sizeof(*oldact));
+ }
+ oldact->sa_handler_ = ptr_oa->sa_handler_;
+ oldact->sa_flags = ptr_oa->sa_flags;
+ memcpy(&oldact->sa_mask, &ptr_oa->sa_mask, sizeof(ptr_oa->sa_mask));
+ #ifndef __mips__
+ oldact->sa_restorer = ptr_oa->sa_restorer;
+ #endif
+ }
+ }
+ return rc;
+ }
+
+ /* sigpending(): stores the set of pending signals in "*set".
+ *
+ * rt_sigpending() is tried first. If it fails with ENOSYS, LSS_ERRNO is
+ * restored, the set is cleared, and the legacy _sigpending() fills in the
+ * first word only.
+ */
+ LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) {
+ const int saved_errno = LSS_ERRNO;
+ const int rt_rc = LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8);
+ if (rt_rc >= 0 || LSS_ERRNO != ENOSYS) {
+ return rt_rc;
+ }
+ LSS_ERRNO = saved_errno;
+ LSS_NAME(sigemptyset)(set);
+ return LSS_NAME(_sigpending)(&set->sig[0]);
+ }
+
+ /* sigprocmask(): examines and/or changes the signal mask.
+ *
+ * rt_sigprocmask() is tried first. If it fails with ENOSYS, LSS_ERRNO is
+ * restored and the legacy _sigprocmask() is called on the first word of
+ * each set. "*oldset" (if given) is cleared beforehand, because the legacy
+ * call writes only that first word.
+ */
+ LSS_INLINE int LSS_NAME(sigprocmask)(int how,
+ const struct kernel_sigset_t *set,
+ struct kernel_sigset_t *oldset) {
+ const int saved_errno = LSS_ERRNO;
+ const int rt_rc = LSS_NAME(rt_sigprocmask)(how, set, oldset,
+ (KERNEL_NSIG+7)/8);
+ const unsigned long *new_word = NULL;
+ unsigned long *old_word = NULL;
+ if (rt_rc >= 0 || LSS_ERRNO != ENOSYS) {
+ return rt_rc;
+ }
+ LSS_ERRNO = saved_errno;
+ if (oldset) {
+ LSS_NAME(sigemptyset)(oldset);
+ old_word = &oldset->sig[0];
+ }
+ if (set) {
+ new_word = &set->sig[0];
+ }
+ return LSS_NAME(_sigprocmask)(how, new_word, old_word);
+ }
+
+ /* Suspends the caller with the signal mask taken from *set.
+ * rt_sigsuspend() is tried first; only when it fails with ENOSYS is the
+ * caller's errno restored and the legacy _sigsuspend() call issued.
+ */
+ LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) {
+ int olderrno = LSS_ERRNO;
+ int rc = LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8);
+ if (rc < 0 && LSS_ERRNO == ENOSYS) {
+ LSS_ERRNO = olderrno;
+ /* The legacy call is given only the first mask word. On non-PPC
+ * targets two extra leading arguments (set, 0) precede it.
+ */
+ rc = LSS_NAME(_sigsuspend)(
+ #ifndef __PPC__
+ set, 0,
+ #endif
+ set->sig[0]);
+ }
+ return rc;
+ }
+ #endif
+ #if defined(__PPC__)
+ /* PPC helpers that issue socket operations through __NR_socketcall.
+ *
+ * LSS_SC_LOADARGS_<n> copies up to five call arguments into the register
+ * variables __sc_4..__sc_8 (bound to r4..r8).
+ *
+ * LSS_SC_BODY(nr, type, opt, args...) loads __NR_socketcall into r0 and
+ * the sub-call selector "opt" into r3, stores r4..r8 at offsets 20..36
+ * from the stack pointer, points r4 at that array and executes "sc".
+ * Afterwards the condition register is read into __sc_0 and r3 is taken
+ * as the result; both are handed to LSS_RETURN.
+ *
+ * NOTE(review): the asm lowers r1 by 48 bytes with "stwu" but no
+ * instruction in this sequence restores it -- confirm this is intended.
+ */
+ #undef LSS_SC_LOADARGS_0
+ #define LSS_SC_LOADARGS_0(dummy...)
+ #undef LSS_SC_LOADARGS_1
+ #define LSS_SC_LOADARGS_1(arg1) \
+ __sc_4 = (unsigned long) (arg1)
+ #undef LSS_SC_LOADARGS_2
+ #define LSS_SC_LOADARGS_2(arg1, arg2) \
+ LSS_SC_LOADARGS_1(arg1); \
+ __sc_5 = (unsigned long) (arg2)
+ #undef LSS_SC_LOADARGS_3
+ #define LSS_SC_LOADARGS_3(arg1, arg2, arg3) \
+ LSS_SC_LOADARGS_2(arg1, arg2); \
+ __sc_6 = (unsigned long) (arg3)
+ #undef LSS_SC_LOADARGS_4
+ #define LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4) \
+ LSS_SC_LOADARGS_3(arg1, arg2, arg3); \
+ __sc_7 = (unsigned long) (arg4)
+ #undef LSS_SC_LOADARGS_5
+ #define LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5) \
+ LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4); \
+ __sc_8 = (unsigned long) (arg5)
+ #undef LSS_SC_BODY
+ #define LSS_SC_BODY(nr, type, opt, args...) \
+ long __sc_ret, __sc_err; \
+ { \
+ register unsigned long __sc_0 __asm__ ("r0") = __NR_socketcall; \
+ register unsigned long __sc_3 __asm__ ("r3") = opt; \
+ register unsigned long __sc_4 __asm__ ("r4"); \
+ register unsigned long __sc_5 __asm__ ("r5"); \
+ register unsigned long __sc_6 __asm__ ("r6"); \
+ register unsigned long __sc_7 __asm__ ("r7"); \
+ register unsigned long __sc_8 __asm__ ("r8"); \
+ LSS_SC_LOADARGS_##nr(args); \
+ __asm__ __volatile__ \
+ ("stwu 1, -48(1)\n\t" \
+ "stw 4, 20(1)\n\t" \
+ "stw 5, 24(1)\n\t" \
+ "stw 6, 28(1)\n\t" \
+ "stw 7, 32(1)\n\t" \
+ "stw 8, 36(1)\n\t" \
+ "addi 4, 1, 20\n\t" \
+ "sc\n\t" \
+ "mfcr %0" \
+ : "=&r" (__sc_0), \
+ "=&r" (__sc_3), "=&r" (__sc_4), \
+ "=&r" (__sc_5), "=&r" (__sc_6), \
+ "=&r" (__sc_7), "=&r" (__sc_8) \
+ : LSS_ASMINPUT_##nr \
+ : "cr0", "ctr", "memory"); \
+ __sc_ret = __sc_3; \
+ __sc_err = __sc_0; \
+ } \
+ LSS_RETURN(type, __sc_ret, __sc_err)
+
+ /* PPC recvmsg(): three arguments forwarded through socketcall with
+ * sub-call number 17 (SYS_RECVMSG in <linux/net.h>).
+ */
+ LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg,
+ int flags){
+ LSS_SC_BODY(3, ssize_t, 17, s, msg, flags);
+ }
+
+ /* PPC sendmsg(): three arguments forwarded through socketcall with
+ * sub-call number 16 (SYS_SENDMSG in <linux/net.h>).
+ */
+ LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s,
+ const struct kernel_msghdr *msg,
+ int flags) {
+ LSS_SC_BODY(3, ssize_t, 16, s, msg, flags);
+ }
+
+ // TODO(csilvers): why is this ifdef'ed out?
+#if 0
+ LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len,
+ int flags,
+ const struct kernel_sockaddr *to,
+ unsigned int tolen) {
+ LSS_BODY(6, ssize_t, 11, s, buf, len, flags, to, tolen);
+ }
+#endif
+
+ /* PPC shutdown(): two arguments forwarded through socketcall with
+ * sub-call number 13 (SYS_SHUTDOWN in <linux/net.h>).
+ */
+ LSS_INLINE int LSS_NAME(shutdown)(int s, int how) {
+ LSS_SC_BODY(2, int, 13, s, how);
+ }
+
+ /* PPC socket(): three arguments forwarded through socketcall with
+ * sub-call number 1 (SYS_SOCKET in <linux/net.h>).
+ */
+ LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) {
+ LSS_SC_BODY(3, int, 1, domain, type, protocol);
+ }
+
+ /* PPC socketpair(): four arguments forwarded through socketcall with
+ * sub-call number 8 (SYS_SOCKETPAIR in <linux/net.h>).
+ */
+ LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol,
+ int sv[2]) {
+ LSS_SC_BODY(4, int, 8, d, type, protocol, sv);
+ }
+ #endif
+ #if defined(__i386__) || defined(__ARM_ARCH_3__) || \
+ (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
+ #define __NR__socketcall __NR_socketcall
+ LSS_INLINE _syscall2(int, _socketcall, int, c,
+ va_list, a)
+
+ /* Variadic front end for the multiplexed socketcall system call: "op"
+ * selects the socket operation and the remaining arguments are that
+ * operation's parameters. The va_list itself is passed as the second
+ * system call argument.
+ * NOTE(review): this presumably relies on va_list being a plain pointer
+ * to the stacked arguments on the ABIs selected by the surrounding #if
+ * -- confirm before enabling further targets.
+ */
+ LSS_INLINE int LSS_NAME(socketcall)(int op, ...) {
+ int rc;
+ va_list ap;
+ va_start(ap, op);
+ rc = LSS_NAME(_socketcall)(op, ap);
+ va_end(ap);
+ return rc;
+ }
+
+ /* recvmsg() via socketcall, sub-call number 17 (SYS_RECVMSG in
+ * <linux/net.h>). The int result is widened to ssize_t.
+ */
+ LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg,
+ int flags){
+ return (ssize_t)LSS_NAME(socketcall)(17, s, msg, flags);
+ }
+
+ /* sendmsg() via socketcall, sub-call number 16 (SYS_SENDMSG in
+ * <linux/net.h>). The int result is widened to ssize_t.
+ */
+ LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s,
+ const struct kernel_msghdr *msg,
+ int flags) {
+ return (ssize_t)LSS_NAME(socketcall)(16, s, msg, flags);
+ }
+
+ /* sendto() via socketcall, sub-call number 11 (SYS_SENDTO in
+ * <linux/net.h>). The int result is widened to ssize_t.
+ */
+ LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len,
+ int flags,
+ const struct kernel_sockaddr *to,
+ unsigned int tolen) {
+ return (ssize_t)LSS_NAME(socketcall)(11, s, buf, len, flags, to, tolen);
+ }
+
+ /* shutdown() via socketcall, sub-call number 13 (SYS_SHUTDOWN in
+ * <linux/net.h>).
+ */
+ LSS_INLINE int LSS_NAME(shutdown)(int s, int how) {
+ return LSS_NAME(socketcall)(13, s, how);
+ }
+
+ /* socket() via socketcall, sub-call number 1 (SYS_SOCKET in
+ * <linux/net.h>).
+ */
+ LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) {
+ return LSS_NAME(socketcall)(1, domain, type, protocol);
+ }
+
+ /* socketpair() via socketcall, sub-call number 8 (SYS_SOCKETPAIR in
+ * <linux/net.h>).
+ */
+ LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol,
+ int sv[2]) {
+ return LSS_NAME(socketcall)(8, d, type, protocol, sv);
+ }
+ #endif
+ #if defined(__i386__) || defined(__PPC__)
+ LSS_INLINE _syscall4(int, fstatat64, int, d,
+ const char *, p,
+ struct kernel_stat64 *, b, int, f)
+ #endif
+ #if defined(__i386__) || defined(__PPC__) || \
+ (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32)
+ LSS_INLINE _syscall3(pid_t, waitpid, pid_t, p,
+ int*, s, int, o)
+ #endif
+ #if defined(__mips__)
+ /* sys_pipe() on MIPS has non-standard calling conventions, as it returns
+ * both file handles through CPU registers.
+ */
+ /* MIPS pipe(): issues the raw system call with __NR_pipe in $2. After
+ * the call, a non-zero $7 is treated as failure, in which case $2 is
+ * stored in LSS_ERRNO and -1 is returned. Otherwise $2 and $3 are
+ * written to p[0] and p[1] and 0 is returned.
+ */
+ LSS_INLINE int LSS_NAME(pipe)(int *p) {
+ register unsigned long __v0 __asm__("$2") = __NR_pipe;
+ register unsigned long __v1 __asm__("$3");
+ /* NOTE(review): __r7 is declared "+r" (read-write) but has no
+ * initial value before the asm statement -- confirm this is intended.
+ */
+ register unsigned long __r7 __asm__("$7");
+ __asm__ __volatile__ ("syscall\n"
+ : "=&r"(__v0), "=&r"(__v1), "+r" (__r7)
+ : "0"(__v0)
+ : "$8", "$9", "$10", "$11", "$12",
+ "$13", "$14", "$15", "$24", "memory");
+ if (__r7) {
+ LSS_ERRNO = __v0;
+ return -1;
+ } else {
+ p[0] = __v0;
+ p[1] = __v1;
+ return 0;
+ }
+ }
+ #else
+ LSS_INLINE _syscall1(int, pipe, int *, p)
+ #endif
+ /* TODO(csilvers): see if ppc can/should support this as well */
+ #if defined(__i386__) || defined(__ARM_ARCH_3__) || \
+ (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64)
+ #define __NR__statfs64 __NR_statfs64
+ #define __NR__fstatfs64 __NR_fstatfs64
+ LSS_INLINE _syscall3(int, _statfs64, const char*, p,
+ size_t, s,struct kernel_statfs64*, b)
+ LSS_INLINE _syscall3(int, _fstatfs64, int, f,
+ size_t, s,struct kernel_statfs64*, b)
+ /* statfs64() wrapper: the raw call takes the buffer size as an explicit
+ * second argument, which is filled in here as sizeof(*b).
+ */
+ LSS_INLINE int LSS_NAME(statfs64)(const char *p,
+ struct kernel_statfs64 *b) {
+ return LSS_NAME(_statfs64)(p, sizeof(*b), b);
+ }
+ /* fstatfs64() wrapper: the raw call takes the buffer size as an
+ * explicit second argument, which is filled in here as sizeof(*b).
+ */
+ LSS_INLINE int LSS_NAME(fstatfs64)(int f,struct kernel_statfs64 *b) {
+ return LSS_NAME(_fstatfs64)(f, sizeof(*b), b);
+ }
+ #endif
+
+ /* execv(): runs "path" with "argv", passing along the current process
+  * environment as found in the global "environ".
+  */
+ LSS_INLINE int LSS_NAME(execv)(const char *path, const char *const argv[]) {
+   extern char **environ;
+   const char *const *envp = (const char *const *)environ;
+   return LSS_NAME(execve)(path, argv, envp);
+ }
+
+ /* Returns the caller's thread id. If the raw _gettid() call yields -1,
+  * the process id is returned instead.
+  */
+ LSS_INLINE pid_t LSS_NAME(gettid)() {
+   const pid_t tid = LSS_NAME(_gettid)();
+   return tid != -1 ? tid : LSS_NAME(getpid)();
+ }
+
+ /* mremap() with the optional fifth argument (the new address) supplied
+  * as a vararg. The vararg is always fetched and forwarded to the raw
+  * _mremap() call.
+  */
+ LSS_INLINE void *LSS_NAME(mremap)(void *old_address, size_t old_size,
+                                   size_t new_size, int flags, ...) {
+   void *new_address;
+   va_list extra;
+   va_start(extra, flags);
+   new_address = va_arg(extra, void *);
+   va_end(extra);
+   return LSS_NAME(_mremap)(old_address, old_size, new_size,
+                            flags, new_address);
+ }
+
+ /* Detaches from the traced thread "pid" and nudges it with SIGCONT.
+  * Returns the result of the PTRACE_DETACH request; LSS_ERRNO on return
+  * reflects that request only, not the follow-up signal delivery.
+  */
+ LSS_INLINE int LSS_NAME(ptrace_detach)(pid_t pid) {
+   /* PTRACE_DETACH can sometimes forget to wake up the tracee and it
+    * then sends job control signals to the real parent, rather than to
+    * the tracer. We reduce the risk of this happening by starting a
+    * whole new time slice, and then quickly sending a SIGCONT signal
+    * right after detaching from the tracee.
+    *
+    * We use tkill to ensure that we only issue a wakeup for the thread being
+    * detached. Large multi threaded apps can take a long time in the kernel
+    * processing SIGCONT.
+    */
+   int rc, err;
+   LSS_NAME(sched_yield)();
+   rc = LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0);
+   err = LSS_ERRNO;
+   /* Old systems don't have tkill. Fall back to kill() only when tkill
+    * itself failed with ENOSYS; errno may still hold a stale ENOSYS from an
+    * earlier call even though tkill succeeded, and kill() would then wake
+    * the whole process needlessly.
+    */
+   if (LSS_NAME(tkill)(pid, SIGCONT) < 0 && LSS_ERRNO == ENOSYS) {
+     LSS_NAME(kill)(pid, SIGCONT);
+   }
+   LSS_ERRNO = err;
+   return rc;
+ }
+
+ /* Sends signal "sig" to the calling process. */
+ LSS_INLINE int LSS_NAME(raise)(int sig) {
+   const pid_t self = LSS_NAME(getpid)();
+   return LSS_NAME(kill)(self, sig);
+ }
+
+ /* setpgrp(): implemented as setpgid(0, 0). */
+ LSS_INLINE int LSS_NAME(setpgrp)() {
+ return LSS_NAME(setpgid)(0, 0);
+ }
+
+ /* Minimal sysconf() that understands two names only:
+  *   _SC_OPEN_MAX  -> soft RLIMIT_NOFILE limit, or 8192 if getrlimit() fails
+  *   _SC_PAGESIZE  -> __getpagesize()
+  * Any other name sets LSS_ERRNO to ENOSYS and returns -1.
+  */
+ LSS_INLINE int LSS_NAME(sysconf)(int name) {
+   extern int __getpagesize(void);
+   if (name == _SC_OPEN_MAX) {
+     struct kernel_rlimit limit;
+     if (LSS_NAME(getrlimit)(RLIMIT_NOFILE, &limit) < 0) {
+       return 8192;
+     }
+     return limit.rlim_cur;
+   }
+   if (name == _SC_PAGESIZE) {
+     return __getpagesize();
+   }
+   LSS_ERRNO = ENOSYS;
+   return -1;
+ }
+ #if defined(__x86_64__) || \
+ (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64)
+ LSS_INLINE _syscall4(ssize_t, pread64, int, f,
+ void *, b, size_t, c,
+ loff_t, o)
+ LSS_INLINE _syscall4(ssize_t, pwrite64, int, f,
+ const void *, b, size_t, c,
+ loff_t, o)
+ LSS_INLINE _syscall3(int, readahead, int, f,
+ loff_t, o, unsigned, c)
+ #else
+ #define __NR__pread64 __NR_pread64
+ #define __NR__pwrite64 __NR_pwrite64
+ #define __NR__readahead __NR_readahead
+ LSS_INLINE _syscall5(ssize_t, _pread64, int, f,
+ void *, b, size_t, c, unsigned, o1,
+ unsigned, o2)
+ LSS_INLINE _syscall5(ssize_t, _pwrite64, int, f,
+ const void *, b, size_t, c, unsigned, o1,
+ long, o2)
+ LSS_INLINE _syscall4(int, _readahead, int, f,
+ unsigned, o1, unsigned, o2, size_t, c);
+ /* We force 64bit-wide parameters onto the stack, then access each
+ * 32-bit component individually. This guarantees that we build the
+ * correct parameters independent of the native byte-order of the
+ * underlying architecture.
+ */
+ /* pread64() for targets where the 64-bit offset must be passed as two
+  * 32-bit system call arguments. The halves are taken in memory order.
+  */
+ LSS_INLINE ssize_t LSS_NAME(pread64)(int fd, void *buf, size_t count,
+                                      loff_t off) {
+   union { loff_t off; unsigned arg[2]; } halves = { off };
+   const unsigned first = halves.arg[0];
+   const unsigned second = halves.arg[1];
+   return LSS_NAME(_pread64)(fd, buf, count, first, second);
+ }
+ /* pwrite64() for targets where the 64-bit offset must be passed as two
+  * 32-bit system call arguments. The halves are taken in memory order.
+  */
+ LSS_INLINE ssize_t LSS_NAME(pwrite64)(int fd, const void *buf,
+                                       size_t count, loff_t off) {
+   union { loff_t off; unsigned arg[2]; } halves = { off };
+   const unsigned first = halves.arg[0];
+   const unsigned second = halves.arg[1];
+   return LSS_NAME(_pwrite64)(fd, buf, count, first, second);
+ }
+ /* readahead() for targets where the 64-bit offset must be passed as two
+  * 32-bit system call arguments. The halves are taken in memory order.
+  */
+ LSS_INLINE int LSS_NAME(readahead)(int fd, loff_t off, int len) {
+   union { loff_t off; unsigned arg[2]; } halves = { off };
+   const unsigned first = halves.arg[0];
+   const unsigned second = halves.arg[1];
+   return LSS_NAME(_readahead)(fd, first, second, len);
+ }
+ #endif
+#endif
+
+#if defined(__cplusplus) && !defined(SYS_CPLUSPLUS)
+}
+#endif
+
+#endif
+#endif
diff --git a/sandbox/linux/seccomp/madvise.cc b/sandbox/linux/seccomp/madvise.cc
new file mode 100644
index 0000000..738da7f
--- /dev/null
+++ b/sandbox/linux/seccomp/madvise.cc
@@ -0,0 +1,75 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_madvise(void* start, size_t length, int advice) {
+  // Sandbox-side handler: packs the madvise() arguments into a request,
+  // writes it to the process file descriptor and reads back the result
+  // from the thread file descriptor.
+  Debug::syscall(__NR_madvise, "Executing handler");
+  struct {
+    int       sysnum;
+    long long cookie;
+    MAdvise   madvise_req;
+  } __attribute__((packed)) msg;
+  msg.sysnum             = __NR_madvise;
+  msg.cookie             = cookie();
+  msg.madvise_req.start  = start;
+  msg.madvise_req.len    = length;
+  msg.madvise_req.advice = advice;
+
+  long reply;
+  SysCalls sys;
+  const bool sent =
+      write(sys, processFdPub(), &msg, sizeof(msg)) == sizeof(msg);
+  const bool answered =
+      sent && read(sys, threadFdPub(), &reply, sizeof(reply)) == sizeof(reply);
+  if (!answered) {
+    die("Failed to forward madvise() request [sandbox]");
+  }
+  return static_cast<int>(reply);
+}
+
+bool Sandbox::process_madvise(int parentProc, int sandboxFd, int threadFdPub,
+                              int threadFd, SecureMem::Args* mem) {
+  // Trusted-side handler for a forwarded madvise(). Returns true when the
+  // call was sent on for execution and false when it was abandoned.
+  MAdvise madvise_req;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &madvise_req, sizeof(madvise_req)) !=
+      sizeof(madvise_req)) {
+    die("Failed to read parameters for madvise() [process]");
+  }
+
+  const bool hint_only = madvise_req.advice == MADV_NORMAL ||
+                         madvise_req.advice == MADV_RANDOM ||
+                         madvise_req.advice == MADV_SEQUENTIAL ||
+                         madvise_req.advice == MADV_WILLNEED;
+  if (!hint_only) {
+    // All other flags to madvise() are potentially dangerous (as opposed to
+    // merely affecting overall performance). Do not allow them on memory
+    // ranges that were part of the original mappings.
+    int rc = -EINVAL;
+    void *stop = reinterpret_cast<void *>(
+        (char *)madvise_req.start + madvise_req.len);
+    ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+        (void *)madvise_req.start);
+    if (iter != protectedMap_.begin()) {
+      --iter;
+    }
+    while (iter != protectedMap_.end() && iter->first < stop) {
+      void *region_end = reinterpret_cast<void *>(
+          reinterpret_cast<char *>(iter->first) + iter->second);
+      if (madvise_req.start < region_end && stop > iter->first) {
+        SecureMem::abandonSystemCall(threadFd, rc);
+        return false;
+      }
+      ++iter;
+    }
+    // Changing attributes on memory regions that were newly mapped inside
+    // of the sandbox is OK.
+  }
+
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_madvise,
+                            madvise_req.start, madvise_req.len,
+                            madvise_req.advice);
+  return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/maps.cc b/sandbox/linux/seccomp/maps.cc
new file mode 100644
index 0000000..606b65d
--- /dev/null
+++ b/sandbox/linux/seccomp/maps.cc
@@ -0,0 +1,330 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <iostream>
+#include <linux/unistd.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "library.h"
+#include "maps.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+// Parses the memory map listing in "maps_file", records every readable or
+// executable mapping that is backed by a named file (plus [vdso] and
+// [vsyscall]) in libs_, and then forks a helper child. The child stays
+// alive to answer REQ_GET / REQ_GET_STR requests sent by the parent over a
+// pair of pipes. pid_ stays -1 if the file cannot be opened; fork() also
+// leaves -1 there on failure.
+Maps::Maps(const std::string& maps_file) :
+ maps_file_(maps_file),
+ begin_iter_(this, true, false),
+ end_iter_(this, false, true),
+ pid_(-1),
+ vsyscall_(0) {
+ memset(fds_, -1, sizeof(fds_));
+ int fd = open(maps_file.c_str(), O_RDONLY);
+ Sandbox::SysCalls sys;
+ if (fd >= 0) {
+ char buf[256] = { 0 };
+ int len = 0, rc = 1;
+ bool long_line = false;
+ // Read the file through a small sliding buffer. One line is parsed per
+ // iteration; consumed bytes are shifted out at the bottom of the loop.
+ do {
+ if (rc > 0) {
+ rc = Sandbox::read(sys, fd, buf + len, sizeof(buf) - len - 1);
+ if (rc > 0) {
+ len += rc;
+ }
+ }
+ char *ptr = buf;
+ if (!long_line) {
+ long_line = true;
+ // Fields parsed in order: start, stop, permissions, offset, two
+ // whitespace-separated tokens kept together as "id", and the name.
+ unsigned long start = strtoul(ptr, &ptr, 16);
+ unsigned long stop = strtoul(ptr + 1, &ptr, 16);
+ while (*ptr == ' ' || *ptr == '\t') ++ptr;
+ char *perm_ptr = ptr;
+ while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr;
+ std::string perm(perm_ptr, ptr - perm_ptr);
+ unsigned long offset = strtoul(ptr, &ptr, 16);
+ while (*ptr == ' ' || *ptr == '\t') ++ptr;
+ char *id_ptr = ptr;
+ while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr;
+ while (*ptr == ' ' || *ptr == '\t') ++ptr;
+ while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr;
+ std::string id(id_ptr, ptr - id_ptr);
+ while (*ptr == ' ' || *ptr == '\t') ++ptr;
+ char *library_ptr = ptr;
+ while (*ptr && *ptr != ' ' && *ptr != '\t' && *ptr != '\n') ++ptr;
+ std::string library(library_ptr, ptr - library_ptr);
+ bool isVDSO = false;
+ if (library == "[vdso]") {
+ // /proc/self/maps has a misleading file offset in the [vdso] entry.
+ // Override it with a sane value.
+ offset = 0;
+ isVDSO = true;
+ } else if (library == "[vsyscall]") {
+ vsyscall_ = reinterpret_cast<char *>(start);
+ } else if (library.empty() || library[0] == '[') {
+ goto skip_entry;
+ }
+ int prot = 0;
+ if (perm.find('r') != std::string::npos) {
+ prot |= PROT_READ;
+ }
+ if (perm.find('w') != std::string::npos) {
+ prot |= PROT_WRITE;
+ }
+ if (perm.find('x') != std::string::npos) {
+ prot |= PROT_EXEC;
+ }
+ // Mappings that are neither readable nor executable are ignored.
+ if ((prot & (PROT_EXEC | PROT_READ)) == 0) {
+ goto skip_entry;
+ }
+ libs_[id + ' ' + library].addMemoryRange(
+ reinterpret_cast<void *>(start),
+ reinterpret_cast<void *>(stop),
+ Elf_Addr(offset),
+ prot, isVDSO);
+ }
+ skip_entry:
+ // Advance past the next newline (or to the end of the buffered data)
+ // and move the unconsumed remainder to the front of the buffer.
+ for (;;) {
+ if (!*ptr || *ptr++ == '\n') {
+ long_line = false;
+ memmove(buf, ptr, len - (ptr - buf));
+ memset(buf + len - (ptr - buf), 0, ptr - buf);
+ len -= (ptr - buf);
+ break;
+ }
+ }
+ } while (len || long_line);
+ NOINTR_SYS(close(fd));
+
+ // The runtime loader clobbers some of the data that we want to read,
+ // when it relocates objects. As we cannot trust the filename that we
+ // obtained from /proc/self/maps, we instead fork() a child process and
+ // use mremap() to uncover the obscured data.
+ // NOTE(review): the return values of both pipe() calls are ignored; if
+ // either fails, tmp_fds holds indeterminate values that are used below.
+ int tmp_fds[4];
+ pipe(tmp_fds);
+ pipe(tmp_fds + 2);
+ pid_ = fork();
+ if (pid_ >= 0) {
+ // Set up read and write file descriptors for exchanging data
+ // between parent and child.
+ // Parent: fds_[0] = tmp_fds[0], fds_[1] = tmp_fds[3].
+ // Child: fds_[0] = tmp_fds[2], fds_[1] = tmp_fds[1].
+ // Each side closes the two descriptors it does not use.
+ fds_[ !pid_] = tmp_fds[ !pid_];
+ fds_[!!pid_] = tmp_fds[2 + !!pid_];
+ NOINTR_SYS(close( tmp_fds[ !!pid_]));
+ NOINTR_SYS(close( tmp_fds[2 + !pid_]));
+
+ // Only the parent parses ELF data; entries that fail to parse are
+ // dropped from libs_.
+ for (LibraryMap::iterator iter = libs_.begin(); iter != libs_.end(); ){
+ Library* lib = &iter->second;
+ if (pid_) {
+ lib->recoverOriginalDataParent(this);
+ } else {
+ lib->recoverOriginalDataChild(strrchr(iter->first.c_str(), ' ') + 1);
+ }
+ if (pid_ && !lib->parseElf()) {
+ libs_.erase(iter++);
+ } else {
+ ++iter;
+ }
+ }
+
+ // Handle requests sent from the parent to the child
+ // The child never leaves this loop; it exits once a full Request can
+ // no longer be read from its input pipe.
+ if (!pid_) {
+ Request req;
+ for (;;) {
+ if (Sandbox::read(sys, fds_[0], &req, sizeof(Request)) !=
+ sizeof(Request)) {
+ _exit(0);
+ }
+ switch (req.type) {
+ case Request::REQ_GET:
+ {
+ // Reply: the length (or -1 on failure), then the data bytes.
+ char *buf = new char[req.length];
+ if (!req.library->get(req.offset, buf, req.length)) {
+ req.length = -1;
+ Sandbox::write(sys, fds_[1], &req.length,sizeof(req.length));
+ } else {
+ Sandbox::write(sys, fds_[1], &req.length,sizeof(req.length));
+ Sandbox::write(sys, fds_[1], buf, req.length);
+ }
+ delete[] buf;
+ }
+ break;
+ case Request::REQ_GET_STR:
+ {
+ // Reply: the string length, then the string bytes.
+ std::string s = req.library->get(req.offset);
+ req.length = s.length();
+ Sandbox::write(sys, fds_[1], &req.length, sizeof(req.length));
+ Sandbox::write(sys, fds_[1], s.c_str(), req.length);
+ }
+ break;
+ }
+ }
+ }
+ } else {
+ // fork() failed: release all four pipe descriptors.
+ for (int i = 0; i < 4; i++) {
+ NOINTR_SYS(close(tmp_fds[i]));
+ }
+ }
+ }
+}
+
+// Terminates and reaps the helper child (if one was started) and releases
+// the pipe descriptors used to talk to it.
+Maps::~Maps() {
+  Sandbox::SysCalls sys;
+  // pid_ is still -1 when the maps file could not be opened or fork()
+  // failed. kill(-1, SIGKILL) would signal every process this user is
+  // allowed to signal, so only act on a real child pid.
+  if (pid_ > 0) {
+    sys.kill(pid_, SIGKILL);
+    sys.waitpid(pid_, NULL, 0);
+  }
+  // fds_ entries are -1 unless the constructor installed a pipe end.
+  for (size_t i = 0; i < sizeof(fds_) / sizeof(fds_[0]); ++i) {
+    if (fds_[i] >= 0) {
+      sys.close(fds_[i]);
+    }
+  }
+}
+
+// Asks the helper child for "length" bytes at "offset" of "library".
+// On success the bytes are stored in "buf" and "buf" is returned. On any
+// failure (short write/read, or the child replying with length -1) "buf"
+// is zero-filled and NULL is returned.
+char *Maps::forwardGetRequest(Library *library, Elf_Addr offset,
+                              char *buf, size_t length) const {
+  Request req(Request::REQ_GET, library, offset, length);
+  Sandbox::SysCalls sys;
+  bool ok = Sandbox::write(sys, fds_[1], &req, sizeof(Request)) ==
+            sizeof(Request);
+  ok = ok && Sandbox::read(sys, fds_[0], &req.length, sizeof(req.length)) ==
+                 sizeof(req.length);
+  ok = ok && req.length != -1;
+  ok = ok && Sandbox::read(sys, fds_[0], buf, length) == (ssize_t)length;
+  if (ok) {
+    return buf;
+  }
+  memset(buf, 0, length);
+  return NULL;
+}
+
+// Asks the helper child for the string at "offset" of "library". The reply
+// is a length followed by that many bytes. Returns an empty string if any
+// part of the exchange falls short.
+std::string Maps::forwardGetRequest(Library *library,
+                                    Elf_Addr offset) const {
+  Request req(Request::REQ_GET_STR, library, offset, -1);
+  Sandbox::SysCalls sys;
+  const bool have_length =
+      Sandbox::write(sys, fds_[1], &req, sizeof(Request)) == sizeof(Request) &&
+      Sandbox::read(sys, fds_[0], &req.length, sizeof(req.length)) ==
+          sizeof(req.length);
+  if (!have_length) {
+    return "";
+  }
+  char *payload = new char[req.length];
+  std::string result;
+  if (Sandbox::read(sys, fds_[0], payload, req.length) ==
+      (ssize_t)req.length) {
+    result.assign(payload, req.length);
+  }
+  delete[] payload;
+  return result;
+}
+
+// Creates an iterator over maps->libs_. The underlying map iterator is not
+// set here; getIterator() resolves it from the at_beginning / at_end flags
+// each time it is called.
+Maps::Iterator::Iterator(Maps* maps, bool at_beginning, bool at_end)
+ : maps_(maps),
+ at_beginning_(at_beginning),
+ at_end_(at_end) {
+}
+
+// Returns the underlying map iterator. While one of the position flags is
+// set, the iterator is re-bound to libs_.begin() (at_beginning_ wins) or
+// libs_.end() on every call.
+Maps::LibraryMap::iterator& Maps::Iterator::getIterator() const {
+  if (at_beginning_ || at_end_) {
+    iter_ = at_beginning_ ? maps_->libs_.begin() : maps_->libs_.end();
+  }
+  return iter_;
+}
+
+// Returns a copy of the owning Maps object's begin iterator.
+Maps::Iterator Maps::Iterator::begin() {
+ return maps_->begin_iter_;
+}
+
+// Returns a copy of the owning Maps object's end iterator.
+Maps::Iterator Maps::Iterator::end() {
+ return maps_->end_iter_;
+}
+
+// Prefix increment: advances to the next library and clears the
+// "at beginning" flag so the new position is no longer re-bound.
+Maps::Iterator& Maps::Iterator::operator++() {
+  Maps::LibraryMap::iterator& pos = getIterator();
+  ++pos;
+  at_beginning_ = false;
+  return *this;
+}
+
+// Postfix increment: advances this iterator and returns a copy of the
+// position it had BEFORE the increment, as postfix ++ is expected to do.
+Maps::Iterator Maps::Iterator::operator++(int i) {
+  // Resolve the lazily-bound position first so the snapshot below carries
+  // a valid underlying iterator.
+  Maps::LibraryMap::iterator& pos = getIterator();
+  Iterator previous(*this);
+  pos.operator++(i);
+  at_beginning_ = false;
+  return previous;
+}
+
+// Dereference: yields a pointer to the Library stored at the current
+// position.
+Library* Maps::Iterator::operator*() const {
+  Maps::LibraryMap::iterator& pos = getIterator();
+  return &pos->second;
+}
+
+// Two iterators are equal when their resolved map iterators are equal.
+bool Maps::Iterator::operator==(const Maps::Iterator& iter) const {
+  return getIterator() == iter.getIterator();
+}
+
+// Negation of operator==.
+bool Maps::Iterator::operator!=(const Maps::Iterator& iter) const {
+  const bool same = (*this == iter);
+  return !same;
+}
+
+// Returns the map key of the current entry.
+std::string Maps::Iterator::name() const {
+  Maps::LibraryMap::iterator& pos = getIterator();
+  return pos->first;
+}
+
+// Maps "size" bytes (rounded up to a multiple of 4096) of anonymous private
+// memory with protection "prot" into an unused gap close to "addr". Gaps
+// are found by re-reading maps_file_ and comparing each mapping's start
+// with the end of the previous one. Returns the new mapping, or NULL if
+// the file cannot be opened or no suitable gap was mapped.
+char* Maps::allocNearAddr(char* addr, size_t size, int prot) const {
+ // We try to allocate memory within 1.5GB of a target address. This means,
+ // we will be able to perform relative 32bit jumps from the target address.
+ size = (size + 4095) & ~4095;
+ Sandbox::SysCalls sys;
+ int fd = sys.open(maps_file_.c_str(), O_RDONLY, 0);
+ if (fd < 0) {
+ return NULL;
+ }
+
+ char buf[256] = { 0 };
+ int len = 0, rc = 1;
+ bool long_line = false;
+ // The first candidate gap begins at 0x10000.
+ unsigned long gap_start = 0x10000;
+ char *new_addr;
+ do {
+ if (rc > 0) {
+ // Fill the buffer as far as possible before parsing the next line.
+ do {
+ rc = Sandbox::read(sys, fd, buf + len, sizeof(buf) - len - 1);
+ if (rc > 0) {
+ len += rc;
+ }
+ } while (rc > 0 && len < (int)sizeof(buf) - 1);
+ }
+ char *ptr = buf;
+ if (!long_line) {
+ long_line = true;
+ unsigned long start = strtoul(ptr, &ptr, 16);
+ unsigned long stop = strtoul(ptr + 1, &ptr, 16);
+ // The gap [gap_start, start) is large enough for the request.
+ if (start - gap_start >= size) {
+ if (reinterpret_cast<long>(addr) - static_cast<long>(start) >= 0) {
+ // The mapping starts at or below addr: place the block at the top
+ // of the gap, directly below "start".
+ if (reinterpret_cast<long>(addr) - (start - size) < (1536 << 20)) {
+ new_addr = reinterpret_cast<char *>(sys.MMAP
+ (reinterpret_cast<void *>(start - size), size, prot,
+ MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0));
+ if (new_addr != MAP_FAILED) {
+ goto done;
+ }
+ }
+ } else if (gap_start + size - reinterpret_cast<long>(addr) <
+ (1536 << 20)) {
+ // The mapping starts above addr: place the block at the bottom of
+ // the gap.
+ new_addr = reinterpret_cast<char *>(sys.MMAP
+ (reinterpret_cast<void *>(gap_start), size, prot,
+ MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1 ,0));
+ if (new_addr != MAP_FAILED) {
+ goto done;
+ }
+ }
+ }
+ // The next gap begins where this mapping ends.
+ gap_start = stop;
+ }
+ // Advance past the next newline (or to the end of the buffered data)
+ // and move the unconsumed remainder to the front of the buffer.
+ for (;;) {
+ if (!*ptr || *ptr++ == '\n') {
+ long_line = false;
+ memmove(buf, ptr, len - (ptr - buf));
+ memset(buf + len - (ptr - buf), 0, ptr - buf);
+ len -= (ptr - buf);
+ break;
+ }
+ }
+ } while (len || long_line);
+ new_addr = NULL;
+done:
+ sys.close(fd);
+ return new_addr;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/maps.h b/sandbox/linux/seccomp/maps.h
new file mode 100644
index 0000000..6b86555
--- /dev/null
+++ b/sandbox/linux/seccomp/maps.h
@@ -0,0 +1,105 @@
+#ifndef MAPS_H__
+#define MAPS_H__
+
+#include <elf.h>
+#include <sys/types.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#if defined(__x86_64__)
+typedef Elf64_Addr Elf_Addr;
+#elif defined(__i386__)
+typedef Elf32_Addr Elf_Addr;
+#else
+#error Undefined target platform
+#endif
+
+namespace playground {
+
+class Library;
+// Snapshot of the libraries found in a memory map listing. The constructor
+// parses the listing and forks a helper child; forwardGetRequest() lets
+// the parent ask that child for data of a given Library.
+class Maps {
+ friend class Library;
+ public:
+ Maps(const std::string& maps_file);
+ ~Maps();
+
+ protected:
+ // Requests "length" bytes at "offset" of "library" from the helper child.
+ // Returns "buf" on success, NULL (with "buf" zeroed) on failure.
+ char *forwardGetRequest(Library *library, Elf_Addr offset, char *buf,
+ size_t length) const;
+ // Requests the string at "offset" of "library" from the helper child.
+ // Returns an empty string on failure.
+ std::string forwardGetRequest(Library *library, Elf_Addr offset) const;
+
+ // A map with all the libraries currently loaded into the application.
+ // The key is a unique combination of device number, inode number, and
+ // file name. It should be treated as opaque.
+ typedef std::map<std::string, Library> LibraryMap;
+ friend class Iterator;
+ // Iterator over libs_. Positions created with at_beginning / at_end set
+ // are resolved against the map each time they are used.
+ class Iterator {
+ friend class Maps;
+
+ protected:
+ explicit Iterator(Maps* maps);
+ Iterator(Maps* maps, bool at_beginning, bool at_end);
+ Maps::LibraryMap::iterator& getIterator() const;
+
+ public:
+ Iterator begin();
+ Iterator end();
+ Iterator& operator++();
+ Iterator operator++(int i);
+ Library* operator*() const;
+ bool operator==(const Iterator& iter) const;
+ bool operator!=(const Iterator& iter) const;
+ std::string name() const;
+
+ protected:
+ // mutable because getIterator() is const but updates the position.
+ mutable LibraryMap::iterator iter_;
+ Maps *maps_;
+ bool at_beginning_;
+ bool at_end_;
+ };
+
+ public:
+ typedef class Iterator const_iterator;
+
+ const_iterator begin() {
+ return begin_iter_;
+ }
+
+ const_iterator end() {
+ return end_iter_;
+ }
+
+ // Maps anonymous memory near "addr"; returns NULL on failure.
+ char* allocNearAddr(char *addr, size_t size, int prot) const;
+
+ // Start address of the [vsyscall] mapping, or 0 if none was found.
+ char* vsyscall() const { return vsyscall_; }
+
+ private:
+ // Message sent from the parent to the helper child over the pipe.
+ struct Request {
+ enum Type { REQ_GET, REQ_GET_STR };
+
+ Request() { }
+
+ Request(enum Type t, Library* i, Elf_Addr o, ssize_t l) :
+ library(i), offset(o), length(l), type(t), padding(0) {
+ }
+
+ Library* library;
+ Elf_Addr offset;
+ ssize_t length;
+ enum Type type;
+ int padding; // for valgrind
+ };
+
+ protected:
+ const std::string maps_file_;
+ const Iterator begin_iter_;
+ const Iterator end_iter_;
+
+ LibraryMap libs_;
+ // pid of the helper child; -1 if none was started, 0 inside the child.
+ pid_t pid_;
+ // fds_[0] is read from and fds_[1] is written to; -1 when unset.
+ int fds_[2];
+ char* vsyscall_;
+};
+
+} // namespace
+
+#endif // MAPS_H__
diff --git a/sandbox/linux/seccomp/mmap.cc b/sandbox/linux/seccomp/mmap.cc
new file mode 100644
index 0000000..9ffd110
--- /dev/null
+++ b/sandbox/linux/seccomp/mmap.cc
@@ -0,0 +1,69 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+void* Sandbox::sandbox_mmap(void *start, size_t length, int prot, int flags,
+                            int fd, off_t offset) {
+  // Sandbox-side handler: packs the mmap() arguments into a request,
+  // writes it to the process file descriptor and reads back the result
+  // from the thread file descriptor.
+  Debug::syscall(__NR_mmap, "Executing handler");
+  struct {
+    int       sysnum;
+    long long cookie;
+    MMap      mmap_req;
+  } __attribute__((packed)) msg;
+  msg.sysnum          = __NR_MMAP;
+  msg.cookie          = cookie();
+  msg.mmap_req.start  = start;
+  msg.mmap_req.length = length;
+  msg.mmap_req.prot   = prot;
+  msg.mmap_req.flags  = flags;
+  msg.mmap_req.fd     = fd;
+  msg.mmap_req.offset = offset;
+
+  void* reply;
+  SysCalls sys;
+  const bool sent =
+      write(sys, processFdPub(), &msg, sizeof(msg)) == sizeof(msg);
+  const bool answered =
+      sent && read(sys, threadFdPub(), &reply, sizeof(reply)) == sizeof(reply);
+  if (!answered) {
+    die("Failed to forward mmap() request [sandbox]");
+  }
+  return reply;
+}
+
+bool Sandbox::process_mmap(int parentProc, int sandboxFd, int threadFdPub,
+                           int threadFd, SecureMem::Args* mem) {
+  // Trusted-side handler for a forwarded mmap(). Returns true when the call
+  // was sent on for execution and false when it was abandoned.
+  SysCalls sys;
+  MMap mmap_req;
+  if (read(sys, sandboxFd, &mmap_req, sizeof(mmap_req)) != sizeof(mmap_req)) {
+    die("Failed to read parameters for mmap() [process]");
+  }
+
+  const bool fixed_placement = (mmap_req.flags & MAP_FIXED) != 0;
+  if (fixed_placement) {
+    // Cannot map a memory area that was part of the original memory mappings.
+    void *stop = reinterpret_cast<void *>(
+        (char *)mmap_req.start + mmap_req.length);
+    ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+        (void *)mmap_req.start);
+    if (iter != protectedMap_.begin()) {
+      --iter;
+    }
+    while (iter != protectedMap_.end() && iter->first < stop) {
+      void *region_end = reinterpret_cast<void *>(
+          reinterpret_cast<char *>(iter->first) + iter->second);
+      if (mmap_req.start < region_end && stop > iter->first) {
+        int rc = -EINVAL;
+        SecureMem::abandonSystemCall(threadFd, rc);
+        return false;
+      }
+      ++iter;
+    }
+  }
+
+  // All other mmap() requests are OK
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_MMAP,
+                            mmap_req.start, mmap_req.length, mmap_req.prot,
+                            mmap_req.flags, mmap_req.fd, mmap_req.offset);
+  return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/mprotect.cc b/sandbox/linux/seccomp/mprotect.cc
new file mode 100644
index 0000000..1852b7d
--- /dev/null
+++ b/sandbox/linux/seccomp/mprotect.cc
@@ -0,0 +1,66 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_mprotect(const void *addr, size_t len, int prot) {
+  // Sandbox-side handler: packs the mprotect() arguments into a request,
+  // writes it to the process file descriptor and reads back the result
+  // from the thread file descriptor.
+  Debug::syscall(__NR_mprotect, "Executing handler");
+  struct {
+    int       sysnum;
+    long long cookie;
+    MProtect  mprotect_req;
+  } __attribute__((packed)) msg;
+  msg.sysnum            = __NR_mprotect;
+  msg.cookie            = cookie();
+  msg.mprotect_req.addr = addr;
+  msg.mprotect_req.len  = len;
+  msg.mprotect_req.prot = prot;
+
+  long reply;
+  SysCalls sys;
+  const bool sent =
+      write(sys, processFdPub(), &msg, sizeof(msg)) == sizeof(msg);
+  const bool answered =
+      sent && read(sys, threadFdPub(), &reply, sizeof(reply)) == sizeof(reply);
+  if (!answered) {
+    die("Failed to forward mprotect() request [sandbox]");
+  }
+  return static_cast<int>(reply);
+}
+
+bool Sandbox::process_mprotect(int parentProc, int sandboxFd, int threadFdPub,
+                               int threadFd, SecureMem::Args* mem) {
+  // Trusted-side handler for a forwarded mprotect(). Returns true when the
+  // call was sent on for execution and false when it was abandoned.
+  SysCalls sys;
+  MProtect mprotect_req;
+  if (read(sys, sandboxFd, &mprotect_req, sizeof(mprotect_req)) !=
+      sizeof(mprotect_req)) {
+    die("Failed to read parameters for mprotect() [process]");
+  }
+
+  // Cannot change permissions on any memory region that was part of the
+  // original memory mappings.
+  int rc = -EINVAL;
+  void *stop = reinterpret_cast<void *>(
+      (char *)mprotect_req.addr + mprotect_req.len);
+  ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+      (void *)mprotect_req.addr);
+  if (iter != protectedMap_.begin()) {
+    --iter;
+  }
+  while (iter != protectedMap_.end() && iter->first < stop) {
+    void *region_end = reinterpret_cast<void *>(
+        reinterpret_cast<char *>(iter->first) + iter->second);
+    if (mprotect_req.addr < region_end && stop > iter->first) {
+      SecureMem::abandonSystemCall(threadFd, rc);
+      return false;
+    }
+    ++iter;
+  }
+
+  // Changing permissions on memory regions that were newly mapped inside of
+  // the sandbox is OK.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_mprotect,
+                            mprotect_req.addr, mprotect_req.len,
+                            mprotect_req.prot);
+  return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/munmap.cc b/sandbox/linux/seccomp/munmap.cc
new file mode 100644
index 0000000..ddab897
--- /dev/null
+++ b/sandbox/linux/seccomp/munmap.cc
@@ -0,0 +1,64 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_munmap(void* start, size_t length) {
+  // Sandbox-side handler: packs the munmap() arguments into a request,
+  // writes it to the process file descriptor and reads back the result
+  // from the thread file descriptor.
+  Debug::syscall(__NR_munmap, "Executing handler");
+  struct {
+    int       sysnum;
+    long long cookie;
+    MUnmap    munmap_req;
+  } __attribute__((packed)) msg;
+  msg.sysnum            = __NR_munmap;
+  msg.cookie            = cookie();
+  msg.munmap_req.start  = start;
+  msg.munmap_req.length = length;
+
+  long reply;
+  SysCalls sys;
+  const bool sent =
+      write(sys, processFdPub(), &msg, sizeof(msg)) == sizeof(msg);
+  const bool answered =
+      sent && read(sys, threadFdPub(), &reply, sizeof(reply)) == sizeof(reply);
+  if (!answered) {
+    die("Failed to forward munmap() request [sandbox]");
+  }
+  return static_cast<int>(reply);
+}
+
+bool Sandbox::process_munmap(int parentProc, int sandboxFd, int threadFdPub,
+                             int threadFd, SecureMem::Args* mem) {
+  // Trusted-side handler for a forwarded munmap(). Returns true when the
+  // call was sent on for execution and false when it was abandoned.
+  SysCalls sys;
+  MUnmap munmap_req;
+  if (read(sys, sandboxFd, &munmap_req, sizeof(munmap_req)) !=
+      sizeof(munmap_req)) {
+    die("Failed to read parameters for munmap() [process]");
+  }
+
+  // Cannot unmap any memory region that was part of the original memory
+  // mappings.
+  int rc = -EINVAL;
+  void *stop = reinterpret_cast<void *>(
+      reinterpret_cast<char *>(munmap_req.start) + munmap_req.length);
+  ProtectedMap::const_iterator iter = protectedMap_.lower_bound(
+      munmap_req.start);
+  if (iter != protectedMap_.begin()) {
+    --iter;
+  }
+  while (iter != protectedMap_.end() && iter->first < stop) {
+    void *region_end = reinterpret_cast<void *>(
+        reinterpret_cast<char *>(iter->first) + iter->second);
+    if (munmap_req.start < region_end && stop > iter->first) {
+      SecureMem::abandonSystemCall(threadFd, rc);
+      return false;
+    }
+    ++iter;
+  }
+
+  // Unmapping memory regions that were newly mapped inside of the sandbox
+  // is OK.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_munmap,
+                            munmap_req.start, munmap_req.length);
+  return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/mutex.h b/sandbox/linux/seccomp/mutex.h
new file mode 100644
index 0000000..7729be6
--- /dev/null
+++ b/sandbox/linux/seccomp/mutex.h
@@ -0,0 +1,149 @@
+#ifndef MUTEX_H__
+#define MUTEX_H__
+
+#include "sandbox_impl.h"
+
+namespace playground {
+
+class Mutex {
+  // Futex-based lock kept in a single 32-bit word.
+  //
+  // Bit 31 is the "locked" flag. The remaining bits count the threads that
+  // are currently inside lockMutex() or waitForUnlock(); each of them
+  // increments the word on entry and decrements it again before returning.
+  // A value of zero therefore means "unlocked, nobody waiting".
+  //
+  // Every atomic instruction below carries a "memory" clobber so that the
+  // compiler treats it as a barrier and does not move accesses to the data
+  // guarded by the mutex across the lock or unlock operation.
+ public:
+  typedef int mutex_t;
+
+  enum { kInitValue = 0 };
+
+  // Puts "mutex" into its initial state.
+  static void initMutex(mutex_t* mutex) {
+    // Mutex is unlocked, and nobody is waiting for it
+    *mutex = kInitValue;
+  }
+
+  // Releases "mutex" and wakes one waiter if any thread is still counted in
+  // the word. Assumes the lock is currently held (bit 31 set): adding
+  // 0x80000000 then carries out of bit 31 and clears it.
+  static void unlockMutex(mutex_t* mutex) {
+    char status;
+    #if defined(__x86_64__) || defined(__i386__)
+    asm volatile(
+        "lock; addl %2, %0\n"
+        "setz %1"
+        : "=m"(*mutex), "=qm"(status)
+        : "ir"(0x80000000), "m"(*mutex)
+        : "memory");
+    #else
+    #error Unsupported target platform
+    #endif
+    if (status) {
+      // Mutex is zero now. No other waiters. So, we can return.
+      return;
+    }
+    // We unlocked the mutex, but still need to wake up other waiters.
+    Sandbox::SysCalls sys;
+    sys.futex(mutex, FUTEX_WAKE, 1, NULL);
+  }
+
+  // Acquires "mutex". "timeout" is converted to a timespec as milliseconds
+  // and bounds each individual futex wait. Returns true once the lock has
+  // been taken, or false if a wait ended with ETIMEDOUT.
+  //
+  // NOTE(review): with the default timeout of 0 a zero timespec (not NULL)
+  // is handed to FUTEX_WAIT, which presumably times out immediately, so the
+  // default would not block on a contended lock -- confirm this is intended.
+  static bool lockMutex(mutex_t* mutex, int timeout = 0) {
+    bool rc = true;
+    // Increment mutex to add ourselves to the list of waiters
+    #if defined(__x86_64__) || defined(__i386__)
+    asm volatile(
+        "lock; incl %0\n"
+        : "=m"(*mutex)
+        : "m"(*mutex)
+        : "memory");
+    #else
+    #error Unsupported target platform
+    #endif
+    for (;;) {
+      // Atomically check whether the mutex is available and if so, acquire it
+      // (bit-test-and-set on bit 31; the carry flag holds the previous value)
+      char status;
+      #if defined(__x86_64__) || defined(__i386__)
+      asm volatile(
+          "lock; btsl %3, %1\n"
+          "setc %0"
+          : "=q"(status), "=m"(*mutex)
+          : "m"(*mutex), "ir"(31)
+          : "memory");
+      #else
+      #error Unsupported target platform
+      #endif
+      if (!status) {
+     done:
+        // If the mutex was available, remove ourselves from list of waiters.
+        // The timeout path below also jumps here so that the waiter count is
+        // dropped before reporting failure.
+        #if defined(__x86_64__) || defined(__i386__)
+        asm volatile(
+            "lock; decl %0\n"
+            : "=m"(*mutex)
+            : "m"(*mutex)
+            : "memory");
+        #else
+        #error Unsupported target platform
+        #endif
+        return rc;
+      }
+      int value = *mutex;
+      if (value >= 0) {
+        // Mutex has just become available, no need to call kernel
+        continue;
+      }
+      Sandbox::SysCalls sys;
+      Sandbox::SysCalls::kernel_timespec tm;
+      if (timeout) {
+        tm.tv_sec  = timeout / 1000;
+        tm.tv_nsec = (timeout % 1000) * 1000 * 1000;
+      } else {
+        tm.tv_sec  = 0;
+        tm.tv_nsec = 0;
+      }
+      // Sleep only while the word still holds the value sampled above.
+      if (NOINTR_SYS(sys.futex(mutex, FUTEX_WAIT, value, &tm)) &&
+          sys.my_errno == ETIMEDOUT) {
+        rc = false;
+        goto done;
+      }
+    }
+  }
+
+  // Waits until "mutex" is observed unlocked, without acquiring it. Registers
+  // as a waiter for the duration, and on the way out wakes one further
+  // waiter. "timeout" is handled as in lockMutex(). Returns false if a wait
+  // ended with ETIMEDOUT, true otherwise.
+  static bool waitForUnlock(mutex_t* mutex, int timeout = 0) {
+    bool rc = true;
+    // Increment mutex to add ourselves to the list of waiters
+    #if defined(__x86_64__) || defined(__i386__)
+    asm volatile(
+        "lock; incl %0\n"
+        : "=m"(*mutex)
+        : "m"(*mutex)
+        : "memory");
+    #else
+    #error Unsupported target platform
+    #endif
+    Sandbox::SysCalls sys;
+    for (;;) {
+      mutex_t value = *mutex;
+      if (value >= 0) {
+     done:
+        // Mutex was not locked. Remove ourselves from list of waiters, notify
+        // any other waiters (if any), and return.
+        #if defined(__x86_64__) || defined(__i386__)
+        asm volatile(
+            "lock; decl %0\n"
+            : "=m"(*mutex)
+            : "m"(*mutex)
+            : "memory");
+        #else
+        #error Unsupported target platform
+        #endif
+        NOINTR_SYS(sys.futex(mutex, FUTEX_WAKE, 1, 0));
+        return rc;
+      }
+
+      // Wait for mutex to become unlocked
+      Sandbox::SysCalls::kernel_timespec tm;
+      if (timeout) {
+        tm.tv_sec  = timeout / 1000;
+        tm.tv_nsec = (timeout % 1000) * 1000 * 1000;
+      } else {
+        tm.tv_sec  = 0;
+        tm.tv_nsec = 0;
+      }
+
+      if (NOINTR_SYS(sys.futex(mutex, FUTEX_WAIT, value, &tm)) &&
+          sys.my_errno == ETIMEDOUT) {
+        rc = false;
+        goto done;
+      }
+    }
+  }
+
+};
+
+} // namespace
+
+#endif // MUTEX_H__
diff --git a/sandbox/linux/seccomp/open.cc b/sandbox/linux/seccomp/open.cc
new file mode 100644
index 0000000..9b4786b
--- /dev/null
+++ b/sandbox/linux/seccomp/open.cc
@@ -0,0 +1,92 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+int Sandbox::sandbox_open(const char *pathname, int flags, mode_t mode) {
+  // Sandbox-side handler for open(): sends a fixed header followed by the
+  // path bytes (without the trailing NUL) to the trusted process and returns
+  // the result read back from the per-thread descriptor.
+  Debug::syscall(__NR_open, "Executing handler");
+
+  struct Request {
+    int       sysnum;
+    long long cookie;
+    Open      open_req;
+    char      pathname[0];
+  } __attribute__((packed));
+
+  // The message is assembled on the stack: header plus variable-length path.
+  const size_t pathLen = strlen(pathname);
+  char storage[sizeof(struct Request) + pathLen];
+  struct Request* const msg = reinterpret_cast<struct Request*>(storage);
+  msg->sysnum               = __NR_open;
+  msg->cookie               = cookie();
+  msg->open_req.path_length = pathLen;
+  msg->open_req.flags       = flags;
+  msg->open_req.mode        = mode;
+  memcpy(msg->pathname, pathname, pathLen);
+
+  SysCalls sys;
+  long result;
+  const bool sent = write(sys, processFdPub(), msg, sizeof(storage)) ==
+                    (int)sizeof(storage);
+  if (!sent ||
+      read(sys, threadFdPub(), &result, sizeof(result)) != sizeof(result)) {
+    die("Failed to forward open() request [sandbox]");
+  }
+  return static_cast<int>(result);
+}
+
+bool Sandbox::process_open(int parentProc, int sandboxFd, int threadFdPub,
+                           int threadFd, SecureMem::Args* mem) {
+  // Trusted-process handler for a forwarded open() request.
+  //
+  // Reads the Open header and the path that follows it from "sandboxFd".
+  //  - Paths that do not fit into mem->pathname are drained and answered
+  //    with -ENAMETOOLONG; returns false.
+  //  - Anything other than O_RDONLY is answered with -EACCES; returns false.
+  //  - Otherwise the path is copied into the secure memory and the open() is
+  //    passed on through SecureMem::sendSystemCall(); returns true.
+  // Any failure to read the request is fatal.
+  static const char kReadFailed[] =
+      "Failed to read parameters for open() [process]";
+
+  // Read request
+  SysCalls sys;
+  Open open_req;
+  if (read(sys, sandboxFd, &open_req, sizeof(open_req)) != sizeof(open_req)) {
+    die(kReadFailed);
+  }
+
+  if (open_req.path_length >= sizeof(mem->pathname)) {
+    // Consume the oversized path so that the request stream stays in sync.
+    char buf[32];
+    while (open_req.path_length > 0) {
+      size_t len   = open_req.path_length > sizeof(buf) ?
+                     sizeof(buf) : open_req.path_length;
+      ssize_t i    = read(sys, sandboxFd, buf, len);
+      if (i <= 0) {
+        die(kReadFailed);
+      }
+      open_req.path_length -= i;
+    }
+    // sandbox_open() reads sizeof(long) bytes for the result, so the reply
+    // has to be a long; writing an int would leave it short on 64-bit.
+    const long rc = -ENAMETOOLONG;
+    if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) {
+      die("Failed to return data from open() [process]");
+    }
+    return false;
+  }
+
+  if ((open_req.flags & O_ACCMODE) != O_RDONLY) {
+    // After locking the mutex, we can no longer abandon the system call. So,
+    // perform checks before clobbering the securely shared memory.
+    //
+    // One extra byte for the terminator: the path arrives without a NUL, and
+    // this also keeps the array non-empty for a zero-length path.
+    char tmp[open_req.path_length + 1];
+    if (read(sys, sandboxFd, tmp, open_req.path_length) !=
+        (ssize_t)open_req.path_length) {
+      die(kReadFailed);
+    }
+    tmp[open_req.path_length] = '\000';
+    Debug::message(("Denying access to \"" + std::string(tmp) + "\"").c_str());
+    SecureMem::abandonSystemCall(threadFd, -EACCES);
+    return false;
+  }
+
+  SecureMem::lockSystemCall(parentProc, mem);
+  if (read(sys, sandboxFd, mem->pathname, open_req.path_length) !=
+      (ssize_t)open_req.path_length) {
+    die(kReadFailed);
+  }
+  mem->pathname[open_req.path_length] = '\000';
+
+  // TODO(markus): Implement sandboxing policy. For now, we allow read
+  // access to everything. That's probably not correct.
+  Debug::message(("Allowing access to \"" + std::string(mem->pathname) +
+                  "\"").c_str());
+
+  // Tell trusted thread to open the file. The path argument is the offset of
+  // "pathname" inside "mem", rebased onto mem->self.
+  SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, __NR_open,
+                            mem->pathname - (char*)mem + (char*)mem->self,
+                            open_req.flags, open_req.mode);
+  return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc
new file mode 100644
index 0000000..0c3e499
--- /dev/null
+++ b/sandbox/linux/seccomp/sandbox.cc
@@ -0,0 +1,421 @@
+#include "library.h"
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+namespace playground {
+
+// Global variables
+int Sandbox::pid_;
+int Sandbox::processFdPub_;
+int Sandbox::cloneFdPub_;
+Sandbox::ProtectedMap Sandbox::protectedMap_;
+std::vector<SecureMem::Args*> Sandbox::secureMemPool_;
+
+
+bool Sandbox::sendFd(int transport, int fd0, int fd1, const void* buf,
+                     size_t len) {
+  // Sends up to two file descriptors over "transport" as SCM_RIGHTS
+  // ancillary data. The payload is a four-byte zero followed by the optional
+  // "len" bytes of "buf". Returns false when no valid descriptor was given
+  // or when sendmsg() did not transmit the whole payload.
+  int handles[2];
+  int count = 0;
+  if (fd0 >= 0) {
+    handles[count++] = fd0;
+  }
+  if (fd1 >= 0) {
+    handles[count++] = fd1;
+  }
+  if (count == 0) {
+    return false;
+  }
+
+  const bool hasPayload = buf && len > 0;
+  int dummy = 0;
+
+  // Data part: leading status word, then the caller's buffer if there is one.
+  struct SysCalls::kernel_iovec iov[2] = { { 0 } };
+  iov[0].iov_base = &dummy;
+  iov[0].iov_len  = sizeof(dummy);
+  if (hasPayload) {
+    iov[1].iov_base = const_cast<void *>(buf);
+    iov[1].iov_len  = len;
+  }
+
+  // Control part: one SCM_RIGHTS record carrying the descriptors.
+  char cmsg_buf[CMSG_SPACE(count*sizeof(int))];
+  memset(cmsg_buf, 0, sizeof(cmsg_buf));
+  struct SysCalls::kernel_msghdr msg = { 0 };
+  msg.msg_iov        = iov;
+  msg.msg_iovlen     = hasPayload ? 2 : 1;
+  msg.msg_control    = cmsg_buf;
+  msg.msg_controllen = CMSG_LEN(count*sizeof(int));
+  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+  cmsg->cmsg_level = SOL_SOCKET;
+  cmsg->cmsg_type  = SCM_RIGHTS;
+  cmsg->cmsg_len   = CMSG_LEN(count*sizeof(int));
+  memcpy(CMSG_DATA(cmsg), handles, count*sizeof(int));
+
+  SysCalls sys;
+  const ssize_t expected = (ssize_t)(sizeof(dummy) + (hasPayload ? len : 0));
+  return NOINTR_SYS(sys.sendmsg(transport, &msg, 0)) == expected;
+}
+
+bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) {
+  // Receives up to two file descriptors sent by sendFd(), plus an optional
+  // payload of at most "*len" bytes into "buf". On return "*len" holds the
+  // number of payload bytes received. On failure the first non-NULL
+  // descriptor slot carries the status word instead of a descriptor.
+  int expected = 0;
+  int* status = NULL;
+  if (fd0) {
+    *fd0 = -1;
+    status = fd0;
+    ++expected;
+  }
+  if (fd1) {
+    *fd1 = -1;
+    if (!status) {
+      status = fd1;
+    }
+    ++expected;
+  }
+  if (expected == 0) {
+    return false;
+  }
+
+  // The leading four bytes of the message land in the status slot.
+  const bool wantPayload = buf && len && *len > 0;
+  struct SysCalls::kernel_iovec iov[2] = { { 0 } };
+  iov[0].iov_base = status;
+  iov[0].iov_len  = sizeof(int);
+  if (wantPayload) {
+    iov[1].iov_base = buf;
+    iov[1].iov_len  = *len;
+  }
+
+  char cmsg_buf[CMSG_SPACE(expected*sizeof(int))];
+  memset(cmsg_buf, 0, sizeof(cmsg_buf));
+  struct SysCalls::kernel_msghdr msg = { 0 };
+  msg.msg_iov        = iov;
+  msg.msg_iovlen     = wantPayload ? 2 : 1;
+  msg.msg_control    = cmsg_buf;
+  msg.msg_controllen = CMSG_LEN(expected*sizeof(int));
+
+  SysCalls sys;
+  const ssize_t bytes = NOINTR_SYS(sys.recvmsg(transport, &msg, 0));
+  if (len) {
+    // Report how many payload bytes followed the status word.
+    if (bytes > (int)sizeof(int)) {
+      *len = bytes - sizeof(int);
+    } else {
+      *len = 0;
+    }
+  }
+
+  // Evaluated against the updated "*len", exactly as the size check needs it.
+  const size_t payload = (buf && len && *len > 0) ? *len : 0;
+  if (bytes != (ssize_t)(sizeof(int) + payload)) {
+    *status = bytes >= 0 ? 0 : -EBADF;
+    return false;
+  }
+  if (*status) {
+    // "status" is the first four bytes of the payload. If these are non-zero,
+    // the sender on the other side of the socketpair sent us an errno value.
+    // We don't expect to get any file handles in this case.
+    return false;
+  }
+
+  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+  const bool truncated = (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) != 0;
+  if (truncated ||
+      !cmsg ||
+      cmsg->cmsg_level != SOL_SOCKET ||
+      cmsg->cmsg_type  != SCM_RIGHTS ||
+      cmsg->cmsg_len   != CMSG_LEN(expected*sizeof(int))) {
+    *status = -EBADF;
+    return false;
+  }
+
+  // Descriptors are handed out back to front: fd1 takes the last one.
+  int slot = expected;
+  if (fd1) {
+    *fd1 = ((int *)CMSG_DATA(cmsg))[--slot];
+  }
+  if (fd0) {
+    *fd0 = ((int *)CMSG_DATA(cmsg))[--slot];
+  }
+  return true;
+}
+
+void Sandbox::setupSignalHandlers() {
+  // Installs the signal dispositions and the signal mask that the sandbox
+  // runs with.
+  SysCalls sys;
+
+  // SIGCHLD goes back to its default disposition, which waitpid() needs.
+  struct SysCalls::kernel_sigaction childAction;
+  memset(&childAction, 0, sizeof(childAction));
+  childAction.sa_handler_ = SIG_DFL;
+  sys.sigaction(SIGCHLD, &childAction, NULL);
+
+  // Set up SEGV handler for dealing with RDTSC instructions
+  struct SysCalls::kernel_sigaction segvAction;
+  memset(&segvAction, 0, sizeof(segvAction));
+  segvAction.sa_handler_ = segv();
+  sys.sigaction(SIGSEGV, &segvAction, NULL);
+
+  // Block every signal except the ones listed here: SIGSEGV for the handler
+  // above, SIGCHLD for waitpid(), and the usual termination signals.
+  static const int kUnblocked[] = {
+    SIGSEGV, SIGINT, SIGTERM, SIGQUIT, SIGHUP, SIGABRT, SIGCHLD
+  };
+  SysCalls::kernel_sigset_t mask;
+  memset(&mask, 0xFF, sizeof(mask));
+  for (size_t i = 0; i < sizeof(kUnblocked)/sizeof(kUnblocked[0]); ++i) {
+    mask.sig[0] &= ~(1 << (kUnblocked[i] - 1));
+  }
+  sys.sigprocmask(SIG_SETMASK, &mask, 0);
+}
+
+void (*Sandbox::segv())(int signo) {
+  // Returns the address of the SIGSEGV handler, which is written entirely in
+  // assembly inside this function. The leading "call 999f" pushes the address
+  // of the handler's first instruction and jumps past the handler body; the
+  // "pop" at label 999 then stores that address in "fnc". The handler itself
+  // only ever runs when it is invoked as a signal handler.
+  //
+  // The handler looks at the instruction that faulted:
+  //  - RDTSC / RDTSCP: forwards the request over the thread's descriptor,
+  //    stores the reply in the saved registers (zeros if that fails) and
+  //    resumes after the instruction.
+  //  - INT $0x0: calls syscallWrapper() with the saved registers and stores
+  //    the result in the saved accumulator.
+  //  - anything else: prints "Segmentation fault", marks the signal as
+  //    blocked in the saved signal mask and returns.
+  void (*fnc)(int signo);
+  asm volatile(
+      "call 999f\n"
+#if defined(__x86_64__)
+      // Inspect instruction at the point where the segmentation fault
+      // happened. If it is RDTSC, forward the request to the trusted
+      // thread.
+      "mov $-3, %%r14\n" // request for RDTSC
+      "mov 0xB0(%%rsp), %%r15\n" // %rip at time of segmentation fault
+      "cmpw $0x310F, (%%r15)\n" // RDTSC
+      "jz 0f\n"
+      "cmpw $0x010F, (%%r15)\n" // RDTSCP
+      "jnz 8f\n"
+      "cmpb $0xF9, 2(%%r15)\n"
+      "jnz 8f\n"
+      "mov $-4, %%r14\n" // request for RDTSCP
+    "0:"
+#ifndef NDEBUG
+      "lea 100f(%%rip), %%rdi\n"
+      "call playground$debugMessage\n"
+#endif
+      "sub $4, %%rsp\n"
+      "push %%r14\n"
+      "mov %%gs:16, %%edi\n" // fd = threadFdPub
+      "mov %%rsp, %%rsi\n" // buf = %rsp
+      "mov $4, %%edx\n" // len = sizeof(int)
+    "1:mov $1, %%eax\n" // NR_write
+      "syscall\n"
+      "cmp %%rax, %%rdx\n"
+      "jz 5f\n"
+      "cmp $-4, %%eax\n" // EINTR
+      "jz 1b\n"
+    "2:add $12, %%rsp\n"
+      "movq $0, 0x98(%%rsp)\n" // %rax at time of segmentation fault
+      "movq $0, 0x90(%%rsp)\n" // %rdx at time of segmentation fault
+      "cmpw $0x310F, (%%r15)\n" // RDTSC
+      "jz 3f\n"
+      "movq $0, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault
+    "3:addq $2, 0xB0(%%rsp)\n" // %rip at time of segmentation fault
+      "cmpw $0x010F, (%%r15)\n" // RDTSCP (three bytes long)
+      "jnz 4f\n"
+      "addq $1, 0xB0(%%rsp)\n" // %rip at time of segmentation fault
+    "4:ret\n"
+    "5:mov $12, %%edx\n" // len = 3*sizeof(int)
+    "6:mov $0, %%eax\n" // NR_read
+      "syscall\n"
+      "cmp $-4, %%eax\n" // EINTR
+      "jz 6b\n"
+      "cmp %%rax, %%rdx\n"
+      "jnz 2b\n"
+      "mov 0(%%rsp), %%eax\n"
+      "mov 4(%%rsp), %%edx\n"
+      "mov 8(%%rsp), %%ecx\n"
+      "add $12, %%rsp\n"
+      "mov %%rdx, 0x90(%%rsp)\n" // %rdx at time of segmentation fault
+      "cmpw $0x310F, (%%r15)\n" // RDTSC
+      "jz 7f\n"
+      "mov %%rcx, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault
+    "7:mov %%rax, 0x98(%%rsp)\n" // %rax at time of segmentation fault
+      "jmp 3b\n"
+
+      // If the instruction is INT 0, then this was probably the result
+      // of playground::Library being unable to find a way to safely
+      // rewrite the system call instruction. Retrieve the CPU register
+      // at the time of the segmentation fault and invoke syscallWrapper().
+    "8:cmpw $0xCD, (%%r15)\n" // INT $0x0
+      "jnz 9f\n"
+#ifndef NDEBUG
+      "lea 200f(%%rip), %%rdi\n"
+      "call playground$debugMessage\n"
+#endif
+      "mov 0x98(%%rsp), %%rax\n" // %rax at time of segmentation fault
+      "mov 0x70(%%rsp), %%rdi\n" // %rdi at time of segmentation fault
+      "mov 0x78(%%rsp), %%rsi\n" // %rsi at time of segmentation fault
+      "mov 0x90(%%rsp), %%rdx\n" // %rdx at time of segmentation fault
+      "mov 0x40(%%rsp), %%r10\n" // %r10 at time of segmentation fault
+      "mov 0x30(%%rsp), %%r8\n" // %r8 at time of segmentation fault
+      "mov 0x38(%%rsp), %%r9\n" // %r9 at time of segmentation fault
+      "lea 7b(%%rip), %%rcx\n"
+      "push %%rcx\n"
+      "push 0xB8(%%rsp)\n" // %rip at time of segmentation fault
+      "lea playground$syscallWrapper(%%rip), %%rcx\n"
+      "jmp *%%rcx\n"
+
+      // This was a genuine segmentation fault. Trigger the kernel's default
+      // signal disposition. The only way we can do this from seccomp mode
+      // is by blocking the signal and retriggering it.
+    "9:mov $2, %%edi\n" // stderr
+      "lea 300f(%%rip), %%rsi\n" // "Segmentation fault\n"
+      "mov $301f-300f, %%edx\n"
+      "mov $1, %%eax\n" // NR_write
+      "syscall\n"
+      "orb $4, 0x131(%%rsp)\n" // signal mask at time of segmentation fault
+      "ret\n"
+#elif defined(__i386__)
+      // Inspect instruction at the point where the segmentation fault
+      // happened. If it is RDTSC, forward the request to the trusted
+      // thread.
+      "mov $-3, %%ebx\n" // request for RDTSC
+      "mov 0x40(%%esp), %%ebp\n" // %eip at time of segmentation fault
+      "cmpw $0x310F, (%%ebp)\n" // RDTSC
+      "jz 0f\n"
+      "cmpw $0x010F, (%%ebp)\n" // RDTSCP
+      "jnz 8f\n"
+      "cmpb $0xF9, 2(%%ebp)\n"
+      "jnz 8f\n"
+      "mov $-4, %%ebx\n" // request for RDTSCP
+    "0:"
+#ifndef NDEBUG
+      "lea 100f, %%eax\n"
+      "push %%eax\n"
+      "call playground$debugMessage\n"
+      "sub $4, %%esp\n"
+#else
+      "sub $8, %%esp\n"
+#endif
+      "push %%ebx\n"
+      "mov %%fs:16, %%ebx\n" // fd = threadFdPub
+      "mov %%esp, %%ecx\n" // buf = %esp
+      "mov $4, %%edx\n" // len = sizeof(int)
+    "1:mov %%edx, %%eax\n" // NR_write
+      "int $0x80\n"
+      "cmp %%eax, %%edx\n"
+      "jz 5f\n"
+      "cmp $-4, %%eax\n" // EINTR
+      "jz 1b\n"
+    "2:add $12, %%esp\n"
+      "movl $0, 0x34(%%esp)\n" // %eax at time of segmentation fault
+      "movl $0, 0x2C(%%esp)\n" // %edx at time of segmentation fault
+      "cmpw $0x310F, (%%ebp)\n" // RDTSC
+      "jz 3f\n"
+      "movl $0, 0x30(%%esp)\n" // %ecx at time of segmentation fault
+    "3:addl $2, 0x40(%%esp)\n" // %eip at time of segmentation fault
+      // %ebp no longer holds %eip when we get here from the INT 0 path, so
+      // it is reloaded. The saved %eip has already been advanced by two at
+      // this point; the faulting opcode therefore sits at -2(%ebp).
+      "mov 0x40(%%esp), %%ebp\n" // %eip, already advanced by two
+      "cmpw $0x010F, -2(%%ebp)\n" // RDTSCP (three bytes long)
+      "jnz 4f\n"
+      "addl $1, 0x40(%%esp)\n" // %eip at time of segmentation fault
+    "4:ret\n"
+    "5:mov $12, %%edx\n" // len = 3*sizeof(int)
+    "6:mov $3, %%eax\n" // NR_read
+      "int $0x80\n"
+      "cmp $-4, %%eax\n" // EINTR
+      "jz 6b\n"
+      "cmp %%eax, %%edx\n"
+      "jnz 2b\n"
+      "pop %%eax\n"
+      "pop %%edx\n"
+      "pop %%ecx\n"
+      "mov %%edx, 0x2C(%%esp)\n" // %edx at time of segmentation fault
+      "cmpw $0x310F, (%%ebp)\n" // RDTSC
+      "jz 7f\n"
+      "mov %%ecx, 0x30(%%esp)\n" // %ecx at time of segmentation fault
+    "7:mov %%eax, 0x34(%%esp)\n" // %eax at time of segmentation fault
+      "jmp 3b\n"
+
+      // If the instruction is INT 0, then this was probably the result
+      // of playground::Library being unable to find a way to safely
+      // rewrite the system call instruction. Retrieve the CPU register
+      // at the time of the segmentation fault and invoke syscallWrapper().
+    "8:cmpw $0xCD, (%%ebp)\n" // INT $0x0
+      "jnz 9f\n"
+#ifndef NDEBUG
+      "lea 200f, %%eax\n"
+      "push %%eax\n"
+      "call playground$debugMessage\n"
+      "add $0x4, %%esp\n"
+#endif
+      "mov 0x34(%%esp), %%eax\n" // %eax at time of segmentation fault
+      "mov 0x28(%%esp), %%ebx\n" // %ebx at time of segmentation fault
+      "mov 0x30(%%esp), %%ecx\n" // %ecx at time of segmentation fault
+      "mov 0x2C(%%esp), %%edx\n" // %edx at time of segmentation fault
+      "mov 0x1C(%%esp), %%esi\n" // %esi at time of segmentation fault
+      "mov 0x18(%%esp), %%edi\n" // %edi at time of segmentation fault
+      "mov 0x20(%%esp), %%ebp\n" // %ebp at time of segmentation fault
+      "call playground$syscallWrapper\n"
+      "jmp 7b\n"
+
+      // This was a genuine segmentation fault. Trigger the kernel's default
+      // signal disposition. The only way we can do this from seccomp mode
+      // is by blocking the signal and retriggering it.
+    "9:mov $2, %%ebx\n" // stderr
+      "lea 300f, %%ecx\n" // "Segmentation fault\n"
+      "mov $301f-300f, %%edx\n"
+      "mov $4, %%eax\n" // NR_write
+      "int $0x80\n"
+      "orb $4, 0x59(%%esp)\n" // signal mask at time of segmentation fault
+      "ret\n"
+#else
+#error Unsupported target platform
+#endif
+      ".pushsection \".rodata\"\n"
+#ifndef NDEBUG
+  "100:.asciz \"RDTSC(P): Executing handler\\n\"\n"
+  "200:.asciz \"INT $0x0: Executing handler\\n\"\n"
+#endif
+  "300:.ascii \"Segmentation fault\\n\"\n"
+  "301:\n"
+      ".popsection\n"
+  "999:pop %0\n"
+      : "=g"(fnc)
+  );
+  return fnc;
+}
+
+void Sandbox::snapshotMemoryMappings(int processFd) {
+  // Opens /proc/self/maps, passes the descriptor to the peer on "processFd",
+  // and then blocks until that peer has sent back one int as acknowledgement.
+  // Any failure along the way is fatal.
+  static const char kFailure[] = "Cannot access /proc/self/maps";
+
+  SysCalls sys;
+  int mapsFd = sys.open("/proc/self/maps", O_RDONLY, 0);
+  // No payload accompanies the descriptor: NULL buffer, zero length.
+  if (mapsFd < 0 || !sendFd(processFd, mapsFd, -1, NULL, 0)) {
+    die(kFailure);
+  }
+  NOINTR_SYS(sys.close(mapsFd));
+
+  // The value of the acknowledgement is not inspected, only its arrival.
+  int dummy;
+  if (read(sys, processFd, &dummy, sizeof(dummy)) != sizeof(dummy)) {
+    die(kFailure);
+  }
+}
+
+void Sandbox::startSandbox() {
+  // Public entry point. Sets up signal handling and the socketpairs used to
+  // reach the trusted process, patches the system calls found in the VDSO
+  // and in a fixed list of libraries, snapshots the current memory mappings,
+  // and finally creates the trusted thread, which enables sandboxing.
+  SysCalls sys;
+
+  // The pid is unchanged for the entire program, so we can retrieve it once
+  // and store it in a global variable.
+  pid_ = sys.getpid();
+
+  // Block all signals, except for the RDTSC handler
+  setupSignalHandlers();
+
+  // Get socketpairs for talking to the trusted process
+  int pair[4];
+  if (socketpair(AF_UNIX, SOCK_STREAM, 0, pair) ||
+      socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) {
+    die("Failed to create trusted thread");
+  }
+  processFdPub_ = pair[0];
+  cloneFdPub_ = pair[2];
+  // Spelled out as SecureMem::Args: going through the SecureMemArgs macro
+  // here would expand to "SecureMem::Args::Args", which does not name the
+  // type on current compilers.
+  SecureMem::Args* secureMem = createTrustedProcess(pair[0], pair[1],
+                                                    pair[2], pair[3]);
+
+  // We find all libraries that have system calls and redirect the system
+  // calls to the sandbox. If we miss any system calls, the application will be
+  // terminated by the kernel's seccomp code. So, from a security point of
+  // view, if this code fails to identify system calls, we are still behaving
+  // correctly.
+  {
+    Maps maps("/proc/self/maps");
+    const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL };
+
+    // Intercept system calls in the VDSO segment (if any). This has to happen
+    // before intercepting system calls in any of the other libraries, as
+    // the main kernel entry point might be inside of the VDSO and we need to
+    // determine its address before we can compare it to jumps from inside
+    // other libraries.
+    for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){
+      Library* library = *iter;
+      if (library->isVDSO()) {
+        library->makeWritable(true);
+        library->patchSystemCalls();
+        library->makeWritable(false);
+        break;
+      }
+    }
+
+    // Intercept system calls in libraries that are known to have them.
+    for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){
+      Library* library = *iter;
+      // Keep the name alive for as long as we point into it. Binding to a
+      // const reference is correct whether name() returns a reference or a
+      // temporary string.
+      const std::string& path = iter.name();
+      for (const char **ptr = libs; *ptr; ptr++) {
+        const char *name = strstr(path.c_str(), *ptr);
+        if (name) {
+          // Only accept the match if the library name is not followed by
+          // another letter (e.g. "libc-2.9.so" but not "libcrypto.so").
+          char ch = name[strlen(*ptr)];
+          if (ch < 'A' || (ch > 'Z' && ch < 'a') || ch > 'z') {
+            library->makeWritable(true);
+            library->patchSystemCalls();
+            library->makeWritable(false);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Take a snapshot of the current memory mappings. These mappings will be
+  // off-limits to all future mmap(), munmap(), mremap(), and mprotect() calls.
+  snapshotMemoryMappings(processFdPub_);
+
+  // Creating the trusted thread enables sandboxing
+  createTrustedThread(processFdPub_, cloneFdPub_, secureMem);
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/sandbox.h b/sandbox/linux/seccomp/sandbox.h
new file mode 100644
index 0000000..959156b
--- /dev/null
+++ b/sandbox/linux/seccomp/sandbox.h
@@ -0,0 +1,6 @@
+#ifndef SANDBOX_H__
+#define SANDBOX_H__
+/* Public C entry point of the seccomp sandbox. Sandbox::startSandbox() is
+ * bound to this symbol name in sandbox_impl.h.
+ */
+extern "C" void StartSeccompSandbox();
+
+#endif // SANDBOX_H__
diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h
new file mode 100644
index 0000000..3edb8c9
--- /dev/null
+++ b/sandbox/linux/seccomp/sandbox_impl.h
@@ -0,0 +1,621 @@
+#ifndef SANDBOX_IMPL_H__
+#define SANDBOX_IMPL_H__
+
+#include <asm/ldt.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/futex.h>
+#include <linux/prctl.h>
+#include <linux/unistd.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+/* NOINTR_SYS(x) evaluates the expression "x" again and again for as long as
+ * it yields a negative value while sys.my_errno equals EINTR, and then yields
+ * the last value of "x". It expects a variable named "sys" with a "my_errno"
+ * member to be in scope at the point of use, and relies on the GNU statement
+ * expression and typeof extensions.
+ */
+#define NOINTR_SYS(x) \
+ ({ typeof(x) i__; while ((i__ = (x)) < 0 && sys.my_errno == EINTR); i__;})
+
+#ifdef __cplusplus
+#include <iostream>
+#include <map>
+#include <vector>
+#include "sandbox.h"
+#include "securemem.h"
+#include "tls.h"
+
+namespace playground {
+
+class Sandbox {
+ // TODO(markus): restrict access to our private file handles
+ public:
+ enum { kMaxThreads = 100 };
+
+ // This is the main public entry point. It finds all system calls that
+ // need rewriting, sets up the resources needed by the sandbox, and
+ // enters Seccomp mode.
+ static void startSandbox() asm("StartSeccompSandbox");
+
+ private:
+// syscall_table.c has to be implemented in C, as C++ does not support
+// designated initializers for arrays. The only other alternative would be
+// to have a source code generator for this table.
+//
+// We would still like the C source file to include our header file. This
+// requires some define statements to transform C++ specific constructs to
+// something that is palatable to a C compiler.
+#define STATIC static
+#define SecureMemArgs SecureMem::Args
+ // Clone() is special as it has a wrapper in syscall_table.c. The wrapper
+ // adds one extra argument (the pointer to the saved registers) and then
+ // calls playground$sandbox__clone().
+ static int sandbox_clone(int flags, void* stack, int* pid, int* ctid,
+ void* tls, void* wrapper_sp)
+ asm("playground$sandbox__clone");
+#else
+#define STATIC
+#define bool int
+#define SecureMemArgs void
+ // This is the wrapper entry point that is found in the syscall_table.
+ int sandbox_clone(int flags, void* stack, int* pid, int* ctid, void* tls)
+ asm("playground$sandbox_clone");
+#endif
+
+ // Entry points for sandboxed code that is attempting to make system calls
+ STATIC int sandbox_access(const char*, int)
+ asm("playground$sandbox_access");
+ STATIC int sandbox_exit(int status) asm("playground$sandbox_exit");
+ STATIC int sandbox_getpid() asm("playground$sandbox_getpid");
+ #if defined(__NR_getsockopt)
+ STATIC int sandbox_getsockopt(int, int, int, void*, socklen_t*)
+ asm("playground$sandbox_getsockopt");
+ #endif
+ STATIC int sandbox_gettid() asm("playground$sandbox_gettid");
+ STATIC int sandbox_ioctl(int d, int req, void* arg)
+ asm("playground$sandbox_ioctl");
+ #if defined(__NR_ipc)
+ STATIC int sandbox_ipc(unsigned, int, int, int, void*, long)
+ asm("playground$sandbox_ipc");
+ #endif
+ STATIC int sandbox_madvise(void*, size_t, int)
+ asm("playground$sandbox_madvise");
+ STATIC void *sandbox_mmap(void* start, size_t length, int prot, int flags,
+ int fd, off_t offset)
+ asm("playground$sandbox_mmap");
+ STATIC int sandbox_mprotect(const void*, size_t, int)
+ asm("playground$sandbox_mprotect");
+ STATIC int sandbox_munmap(void* start, size_t length)
+ asm("playground$sandbox_munmap");
+ STATIC int sandbox_open(const char*, int, mode_t)
+ asm("playground$sandbox_open");
+ #if defined(__NR_recvfrom)
+ STATIC ssize_t sandbox_recvfrom(int, void*, size_t, int, void*, socklen_t*)
+ asm("playground$sandbox_recvfrom");
+ STATIC ssize_t sandbox_recvmsg(int, struct msghdr*, int)
+ asm("playground$sandbox_recvmsg");
+ STATIC size_t sandbox_sendmsg(int, const struct msghdr*, int)
+ asm("playground$sandbox_sendmsg");
+ STATIC ssize_t sandbox_sendto(int, const void*, size_t, int, const void*,
+ socklen_t)asm("playground$sandbox_sendto");
+ #if defined(__NR_shmat)
+ STATIC void* sandbox_shmat(int, const void*, int)
+ asm("playground$sandbox_shmat");
+ STATIC int sandbox_shmctl(int, int, void*)
+ asm("playground$sandbox_shmctl");
+ STATIC int sandbox_shmdt(const void*) asm("playground$sandbox_shmdt");
+ STATIC int sandbox_shmget(int, size_t, int)
+ asm("playground$sandbox_shmget");
+ #endif
+ STATIC int sandbox_setsockopt(int, int, int, const void*, socklen_t)
+ asm("playground$sandbox_setsockopt");
+ #endif
+ #if defined(__NR_socketcall)
+ STATIC int sandbox_socketcall(int call, void* args)
+ asm("playground$sandbox_socketcall");
+ #endif
+ STATIC int sandbox_stat(const char* path, void* buf)
+ asm("playground$sandbox_stat");
+ #if defined(__NR_stat64)
+ STATIC int sandbox_stat64(const char *path, void* b)
+ asm("playground$sandbox_stat64");
+ #endif
+
+ // Functions for system calls that need to be handled in the trusted process
+ STATIC bool process_access(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_access");
+ STATIC bool process_clone(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_clone");
+ STATIC bool process_exit(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_exit");
+ #if defined(__NR_getsockopt)
+ STATIC bool process_getsockopt(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_getsockopt");
+ #endif
+ STATIC bool process_ioctl(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_ioctl");
+ #if defined(__NR_ipc)
+ STATIC bool process_ipc(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_ipc");
+ #endif
+ STATIC bool process_madvise(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_madvise");
+ STATIC bool process_mmap(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_mmap");
+ STATIC bool process_mprotect(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_mprotect");
+ STATIC bool process_munmap(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_munmap");
+ STATIC bool process_open(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_open");
+ #if defined(__NR_recvfrom)
+ STATIC bool process_recvfrom(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_recvfrom");
+ STATIC bool process_recvmsg(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_recvmsg");
+ STATIC bool process_sendmsg(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_sendmsg");
+ STATIC bool process_sendto(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_sendto");
+ STATIC bool process_setsockopt(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_setsockopt");
+ #endif
+ #if defined(__NR_shmat)
+ STATIC bool process_shmat(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_shmat");
+ STATIC bool process_shmctl(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_shmctl");
+ STATIC bool process_shmdt(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_shmdt");
+ STATIC bool process_shmget(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_shmget");
+ #endif
+ #if defined(__NR_socketcall)
+ STATIC bool process_socketcall(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_socketcall");
+ #endif
+ STATIC bool process_stat(int, int, int, int, SecureMemArgs*)
+ asm("playground$process_stat");
+
+#ifdef __cplusplus
+ friend class Debug;
+ friend class Library;
+ friend class Maps;
+ friend class Mutex;
+ friend class SecureMem;
+ friend class TLS;
+
+ // Define our own inline system calls. These calls will not be rewritten
+ // to point to the sandboxed wrapper functions. They thus allow us to
+ // make actual system calls (e.g. in the sandbox initialization code, and
+ // in the trusted process)
+ class SysCalls {
+ public:
+ #define SYS_CPLUSPLUS
+ #define SYS_ERRNO my_errno
+ #define SYS_INLINE inline
+ #define SYS_PREFIX -1
+ #undef SYS_LINUX_SYSCALL_SUPPORT_H
+ #include "linux_syscall_support.h"
+ SysCalls() : my_errno(0) { }
+ int my_errno;
+ };
+ #ifdef __NR_mmap2
+ #define MMAP mmap2
+ #define __NR_MMAP __NR_mmap2
+ #else
+ #define MMAP mmap
+ #define __NR_MMAP __NR_mmap
+ #endif
+
+ // Print an error message and terminate the program. Used for fatal errors.
+ static void die(const char *msg = 0) __attribute__((noreturn)) {
+ SysCalls sys;
+ if (msg) {
+ sys.write(2, msg, strlen(msg));
+ sys.write(2, "\n", 1);
+ }
+ for (;;) {
+ sys.exit_group(1);
+ sys._exit(1);
+ }
+ }
+
+ // Wrapper around "read()" that can deal with partial and interrupted reads
+ // and that does not modify the global errno variable.
+ //
+ // Keeps reading until "len" bytes have arrived or end-of-file is reached.
+ // Returns the number of bytes stored in "buf" (less than "len" only at
+ // end-of-file), or the negative result of the failing read with the error
+ // code left in sys.my_errno.
+ static ssize_t read(SysCalls& sys, int fd, void* buf, size_t len) {
+   // "len" is unsigned, so there is no negative length to reject here.
+   size_t offset = 0;
+   while (offset < len) {
+     ssize_t partial =
+       NOINTR_SYS(sys.read(fd, reinterpret_cast<char*>(buf) + offset,
+                           len - offset));
+     if (partial < 0) {
+       return partial;
+     } else if (!partial) {
+       // End of file
+       break;
+     }
+     offset += partial;
+   }
+   return offset;
+ }
+
+ // Counterpart to "read()" for writing: restarts the write when it gets
+ // interrupted and leaves the global errno variable untouched. The result of
+ // the underlying write is handed back unchanged.
+ static ssize_t write(SysCalls& sys, int fd, const void* buf, size_t len){
+   const ssize_t written = NOINTR_SYS(sys.write(fd, buf, len));
+   return written;
+ }
+
+ // Sends a file handle to another process.
+ static bool sendFd(int transport, int fd0, int fd1, const void* buf,
+ size_t len) asm("playground$sendFd");
+
+ // If getFd() fails, it will set the first valid fd slot (e.g. fd0) to
+ // -errno.
+ static bool getFd(int transport, int* fd0, int* fd1, void* buf,
+ size_t* len);
+
+ // Data structures used to forward system calls to the trusted process.
+ struct Accept {
+ int sockfd;
+ void* addr;
+ socklen_t* addrlen;
+ } __attribute__((packed));
+
+ struct Accept4 {
+ int sockfd;
+ void* addr;
+ socklen_t* addrlen;
+ int flags;
+ } __attribute__((packed));
+
+ struct Access {
+ size_t path_length;
+ int mode;
+ } __attribute__((packed));
+
+ struct Bind {
+ int sockfd;
+ void* addr;
+ socklen_t addrlen;
+ } __attribute__((packed));
+
+ struct Clone {
+ int flags;
+ void* stack;
+ int* pid;
+ int* ctid;
+ void* tls;
+ #if defined(__x86_64__)
+ struct {
+ void* r15;
+ void* r14;
+ void* r13;
+ void* r12;
+ void* r11;
+ void* r10;
+ void* r9;
+ void* r8;
+ void* rdi;
+ void* rsi;
+ void* rdx;
+ void* rcx;
+ void* rbx;
+ void* rbp;
+ void* fake_ret;
+ } regs64 __attribute__((packed));
+ #elif defined(__i386__)
+ struct {
+ void* ebp;
+ void* edi;
+ void* esi;
+ void* edx;
+ void* ecx;
+ void* ebx;
+ void* ret2;
+ } regs32 __attribute__((packed));
+ #else
+ #error Unsupported target platform
+ #endif
+ void* ret;
+ } __attribute__((packed));
+
+ struct Connect {
+ int sockfd;
+ void* addr;
+ socklen_t addrlen;
+ } __attribute__((packed));
+
+ struct GetSockName {
+ int sockfd;
+ void* name;
+ socklen_t* namelen;
+ } __attribute__((packed));
+
+ struct GetPeerName {
+ int sockfd;
+ void* name;
+ socklen_t* namelen;
+ } __attribute__((packed));
+
+ struct GetSockOpt {
+ int sockfd;
+ int level;
+ int optname;
+ void* optval;
+ socklen_t* optlen;
+ } __attribute__((packed));
+
+ // Forwarded arguments of ioctl(): descriptor, request code and argument.
+ struct IOCtl {
+ int d;
+ int req;
+ void *arg;
+ } __attribute__((packed));
+
+ #if defined(__NR_ipc)
+ // Forwarded arguments of the multiplexed ipc() system call. Only defined on
+ // platforms where __NR_ipc exists.
+ struct IPC {
+ unsigned call;
+ int first;
+ int second;
+ int third;
+ void* ptr;
+ long fifth;
+ } __attribute__((packed));
+ #endif
+
+ // Forwarded arguments of listen().
+ struct Listen {
+ int sockfd;
+ int backlog;
+ } __attribute__((packed));
+
+ // Forwarded arguments of madvise().
+ struct MAdvise {
+ const void* start;
+ size_t len;
+ int advice;
+ } __attribute__((packed));
+
+ // Forwarded arguments of mmap().
+ struct MMap {
+ void* start;
+ size_t length;
+ int prot;
+ int flags;
+ int fd;
+ off_t offset;
+ } __attribute__((packed));
+
+ // Forwarded arguments of mprotect(). Packed like the other request
+ // structures so that no padding bytes are part of the forwarded request.
+ struct MProtect {
+ const void* addr;
+ size_t len;
+ int prot;
+ } __attribute__((packed));
+
+ // Forwarded arguments of munmap().
+ struct MUnmap {
+ void* start;
+ size_t length;
+ } __attribute__((packed));
+
+ // Forwarded arguments of open().
+ // NOTE(review): path_length presumably is the number of path bytes that
+ // follow this header in the request -- confirm against open.cc.
+ struct Open {
+ size_t path_length;
+ int flags;
+ mode_t mode;
+ } __attribute__((packed));
+
+ // Forwarded arguments of recv().
+ struct Recv {
+ int sockfd;
+ void* buf;
+ size_t len;
+ int flags;
+ } __attribute__((packed));
+
+ // Forwarded arguments of recvfrom(): the Recv fields followed by the
+ // sender address buffer and a pointer to its length.
+ struct RecvFrom {
+ int sockfd;
+ void* buf;
+ size_t len;
+ int flags;
+ void* from;
+ socklen_t *fromlen;
+ } __attribute__((packed));
+
+ // Forwarded arguments of recvmsg(). "msg" is the caller's msghdr pointer.
+ struct RecvMsg {
+ int sockfd;
+ struct msghdr* msg;
+ int flags;
+ } __attribute__((packed));
+
+ // Forwarded arguments of send().
+ struct Send {
+ int sockfd;
+ const void* buf;
+ size_t len;
+ int flags;
+ } __attribute__((packed));
+
+ // Forwarded arguments of sendmsg(). "msg" is the caller's msghdr pointer.
+ struct SendMsg {
+ int sockfd;
+ const struct msghdr* msg;
+ int flags;
+ } __attribute__((packed));
+
+ // Forwarded arguments of sendto(): the Send fields followed by the
+ // destination address and its length.
+ struct SendTo {
+ int sockfd;
+ const void* buf;
+ size_t len;
+ int flags;
+ const void* to;
+ socklen_t tolen;
+ } __attribute__((packed));
+
+ // Forwarded arguments of setsockopt(). Unlike GetSockOpt, optlen is passed
+ // by value.
+ struct SetSockOpt {
+ int sockfd;
+ int level;
+ int optname;
+ const void* optval;
+ socklen_t optlen;
+ } __attribute__((packed));
+
+ #if defined(__NR_shmat)
+ // Forwarded arguments of shmat(). Only defined where __NR_shmat exists.
+ struct ShmAt {
+ int shmid;
+ const void* shmaddr;
+ int shmflg;
+ } __attribute__((packed));
+
+ // Forwarded arguments of shmctl().
+ struct ShmCtl {
+ int shmid;
+ int cmd;
+ void *buf;
+ } __attribute__((packed));
+
+ // Forwarded argument of shmdt().
+ struct ShmDt {
+ const void *shmaddr;
+ } __attribute__((packed));
+
+ // Forwarded arguments of shmget().
+ struct ShmGet {
+ int key;
+ size_t size;
+ int shmflg;
+ } __attribute__((packed));
+ #endif
+
+ // Forwarded arguments of shutdown().
+ struct ShutDown {
+ int sockfd;
+ int how;
+ } __attribute__((packed));
+
+ // Forwarded arguments of socket().
+ struct Socket {
+ int domain;
+ int type;
+ int protocol;
+ } __attribute__((packed));
+
+ // Forwarded arguments of socketpair(): the Socket fields followed by the
+ // caller's pointer to the resulting descriptor pair.
+ struct SocketPair {
+ int domain;
+ int type;
+ int protocol;
+ int* pair;
+ } __attribute__((packed));
+
+ #if defined(__NR_socketcall)
+ // Forwarded arguments of the multiplexed socketcall() system call. "call"
+ // is the sub-call opcode and "args" is a union holding the argument block
+ // of whichever socket operation it selects. Only defined where
+ // __NR_socketcall exists.
+ struct SocketCall {
+ int call;
+ void* arg_ptr;
+ union {
+ Socket socket;
+ Bind bind;
+ Connect connect;
+ Listen listen;
+ Accept accept;
+ GetSockName getsockname;
+ GetPeerName getpeername;
+ SocketPair socketpair;
+ Send send;
+ Recv recv;
+ SendTo sendto;
+ RecvFrom recvfrom;
+ ShutDown shutdown;
+ SetSockOpt setsockopt;
+ GetSockOpt getsockopt;
+ SendMsg sendmsg;
+ RecvMsg recvmsg;
+ Accept4 accept4;
+ } args;
+ } __attribute__((packed));
+ #endif
+
+ // Forwarded arguments of the stat() family; "sysnum" records which system
+ // call number was requested.
+ // NOTE(review): path_length presumably is the number of path bytes that
+ // follow this header in the request -- confirm against stat.cc.
+ struct Stat {
+ int sysnum;
+ size_t path_length;
+ void* buf;
+ } __attribute__((packed));
+
+ // Thread local data available from each sandboxed thread.
+ // The enumerators are the slot indices handed to TLS::getTLSValue<>().
+ enum { TLS_COOKIE, TLS_TID, TLS_THREAD_FD };
+ // Per-thread cookie; request handlers copy it into the requests they
+ // forward.
+ static long long cookie() { return TLS::getTLSValue<long long>(TLS_COOKIE); }
+ // Value stored in the TLS_TID slot for the calling thread.
+ static int tid() { return TLS::getTLSValue<int>(TLS_TID); }
+ // Per-thread descriptor from which request handlers read the reply.
+ static int threadFdPub() { return TLS::getTLSValue<int>(TLS_THREAD_FD); }
+ // Process-wide descriptor to which request handlers write their requests.
+ static int processFdPub() { return processFdPub_; }
+
+ // The SEGV handler knows how to handle RDTSC instructions
+ static void setupSignalHandlers();
+ static void (*segv())(int signo);
+
+ // If no specific handler has been registered for a system call, call this
+ // function which asks the trusted thread to perform the call. This is used
+ // for system calls that are not restricted.
+ static void* defaultSystemCallHandler(int syscallNum, void* arg0,
+ void* arg1, void* arg2, void* arg3,
+ void* arg4, void* arg5)
+ asm("playground$defaultSystemCallHandler");
+
+ // Return a secure memory structure that can be used by a newly created
+ // thread.
+ static SecureMem::Args* getSecureMem();
+
+ // This function runs in the trusted process at startup and finds all the
+ // memory mappings that existed when the sandbox was first enabled. Going
+ // forward, all these mappings are off-limits for operations such as
+ // mmap(), munmap(), and mprotect().
+ static void initializeProtectedMap(int fd);
+
+ // Helper function that allows the trusted process to get access to
+ // "/proc/self/maps" in the sandbox.
+ static void snapshotMemoryMappings(int processFd);
+
+ // Main loop for the trusted process.
+ static void trustedProcess(int parentProc, int processFdPub, int sandboxFd,
+ int cloneFd, SecureMem::Args* secureArena)
+ __attribute__((noreturn));
+
+ // Fork()s off the trusted process.
+ static SecureMem::Args* createTrustedProcess(int processFdPub, int sandboxFd,
+ int cloneFdPub, int cloneFd);
+
+ // Creates the trusted thread for the initial thread, then enables
+ // Seccomp mode.
+ static void createTrustedThread(int processFdPub, int cloneFdPub,
+ SecureMem::Args* secureMem);
+
+ static int pid_;
+ static int processFdPub_;
+ static int cloneFdPub_;
+
+ #ifdef __i386__
+ struct SocketCallArgInfo;
+ static const struct SocketCallArgInfo socketCallArgInfo[];
+ #endif
+
+ // The syscall_mutex_ can only be directly accessed by the trusted process.
+ // It can be accessed by the trusted thread after fork()ing and calling
+ // mprotect(PROT_READ|PROT_WRITE). The mutex is used for system calls that
+ // require passing additional data, and that require the trusted process to
+ // wait until the trusted thread is done processing (e.g. exit(), clone(),
+ // open(), stat())
+ static int syscall_mutex_ asm("playground$syscall_mutex");
+
+ // Available in trusted process, only
+ typedef std::map<void *, long> ProtectedMap;
+ static ProtectedMap protectedMap_;
+ static std::vector<SecureMem::Args*> secureMemPool_;
+};
+
+} // namespace
+
+using playground::Sandbox;
+#endif // __cplusplus
+
+#endif // SANDBOX_IMPL_H__
diff --git a/sandbox/linux/seccomp/securemem.cc b/sandbox/linux/seccomp/securemem.cc
new file mode 100644
index 0000000..c8e59f9
--- /dev/null
+++ b/sandbox/linux/seccomp/securemem.cc
@@ -0,0 +1,97 @@
+#include "debug.h"
+#include "mutex.h"
+#include "sandbox_impl.h"
+#include "securemem.h"
+
+namespace playground {
+
+// Tells the waiting thread that its system call was not forwarded. The
+// error code is converted to a pointer-sized value and written to "fd";
+// failure to deliver it is fatal.
+void SecureMem::abandonSystemCall(int fd, int err) {
+  if (err != 0) {
+    Debug::message("System call failed\n");
+  }
+
+  // The reader expects a pointer-sized result, so convert before sending.
+  void* const result = reinterpret_cast<void *>(err);
+  Sandbox::SysCalls sys;
+  const bool delivered =
+      Sandbox::write(sys, fd, &result, sizeof(result)) == sizeof(result);
+  if (!delivered) {
+    Sandbox::die("Failed to send system call");
+  }
+}
+
+// Terminates the trusted process if "/proc/self/fd/<parentProc>/status" can
+// no longer be stat()ed.
+void SecureMem::dieIfParentDied(int parentProc) {
+  // Contention on syscall_mutex_ is not expected. When it does happen, the
+  // sandbox is either under an unusual system call load that it was never
+  // tuned for or, far more likely, the sandboxed process went away while the
+  // trusted process was still waiting for the mutex. The latter case is
+  // detected here and ends the trusted process.
+  char statusPath[80];
+  sprintf(statusPath, "/proc/self/fd/%d/status", parentProc);
+
+  struct stat info;
+  if (stat(statusPath, &info) != 0) {
+    Sandbox::die();
+  }
+}
+
+// Acquires Sandbox::syscall_mutex_ and then atomically increments
+// mem->sequence.
+//
+// parentProc: passed to dieIfParentDied() whenever a lock attempt fails.
+// mem:        secure memory block whose sequence counter is bumped.
+void SecureMem::lockSystemCall(int parentProc, Args* mem) {
+ // Each attempt is bounded by the value 500 (NOTE(review): presumably a
+ // timeout in milliseconds -- confirm against Mutex::lockMutex). Between
+ // attempts, check that the parent is still around so that we do not wait
+ // forever.
+ while (!Mutex::lockMutex(&Sandbox::syscall_mutex_, 500)) {
+ dieIfParentDied(parentProc);
+ }
+ // Locked increment of the sequence counter; the "memory" clobber keeps the
+ // compiler from reordering memory accesses across this point.
+ asm volatile(
+ #if defined(__x86_64__)
+ "lock; incq (%0)\n"
+ #elif defined(__i386__)
+ "lock; incl (%0)\n"
+ #else
+ #error Unsupported target platform
+ #endif
+ :
+ : "q"(&mem->sequence)
+ : "memory");
+}
+
+// Publishes a system call request in "mem" and notifies the reader of "fd".
+//
+// The sequence counter is incremented once before the arguments are written
+// and once afterwards. In locked mode the first increment has already been
+// performed by lockSystemCall(), so it is skipped here.
+//
+// fd:         descriptor on which the notification word is written.
+// locked:     true if the caller already holds syscall_mutex_ via
+//             lockSystemCall().
+// parentProc: if >= 0, wait for syscall_mutex_ to be unlocked before
+//             returning, checking dieIfParentDied(parentProc) on each
+//             failed wait.
+// mem:        secure memory block receiving syscallNum and arg1..arg6.
+void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentProc,
+ Args* mem, int syscallNum, void* arg1,
+ void* arg2, void* arg3, void* arg4,
+ void* arg5, void* arg6) {
+ if (!locked) {
+ // First increment: performed here only when lockSystemCall() has not
+ // already done it.
+ asm volatile(
+ #if defined(__x86_64__)
+ "lock; incq (%0)\n"
+ #elif defined(__i386__)
+ "lock; incl (%0)\n"
+ #else
+ #error Unsupported target platform
+ #endif
+ :
+ : "q"(&mem->sequence)
+ : "memory");
+ }
+ mem->syscallNum = syscallNum;
+ mem->arg1 = arg1;
+ mem->arg2 = arg2;
+ mem->arg3 = arg3;
+ mem->arg4 = arg4;
+ mem->arg5 = arg5;
+ mem->arg6 = arg6;
+ // Second increment: the arguments above are now completely written.
+ asm volatile(
+ #if defined(__x86_64__)
+ "lock; incq (%0)\n"
+ #elif defined(__i386__)
+ "lock; incl (%0)\n"
+ #else
+ #error Unsupported target platform
+ #endif
+ :
+ : "q"(&mem->sequence)
+ : "memory");
+ // The notification word tells the reader which mode was used: -2 for a
+ // locked call, -1 for an unlocked one.
+ int data = locked ? -2 : -1;
+ Sandbox::SysCalls sys;
+ if (Sandbox::write(sys, fd, &data, sizeof(data)) != sizeof(data)) {
+ Sandbox::die("Failed to send system call");
+ }
+ if (parentProc >= 0) {
+ // Block until the mutex has been released, bailing out if the parent
+ // has gone away in the meantime.
+ while (!Mutex::waitForUnlock(&Sandbox::syscall_mutex_, 500)) {
+ dieIfParentDied(parentProc);
+ }
+ }
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h
new file mode 100644
index 0000000..4c208ce
--- /dev/null
+++ b/sandbox/linux/seccomp/securemem.h
@@ -0,0 +1,179 @@
+#ifndef SECURE_MEM_H__
+#define SECURE_MEM_H__
+
+#include <stdlib.h>
+
+namespace playground {
+
+// Static helpers and the memory layout that the trusted process uses to hand
+// system call requests to a trusted thread.
+class SecureMem {
+ public:
+ // Each thread is associated with two memory pages (i.e. 8192 bytes). This
+ // memory is fully accessible by the trusted process, but in the trusted
+ // thread and the sandboxed thread, the first page is only mapped PROT_READ,
+ // and the second one is PROT_READ|PROT_WRITE.
+ //
+ // The first page can be modified by the trusted process and this is the
+ // main mechanism by which it communicates with the trusted thread. After
+ // each update, it updates the "sequence" number. The trusted process must
+ // check the "sequence" number has the expected value, and only then can
+ // it trust the data in this page.
+ // NOTE(review): the last sentence presumably means the trusted *thread*
+ // (the reader of the page) -- confirm.
+ //
+ // Layout: the first union overlays a 512 byte header and a pathname buffer
+ // onto the 4096 byte securePage; the second union overlays the scratch
+ // fields onto the 4096 byte scratchPage.
+ typedef struct Args {
+ union {
+ struct {
+ union {
+ struct {
+ struct Args* self;
+ long sequence;
+ long syscallNum;
+ void* arg1;
+ void* arg2;
+ void* arg3;
+ void* arg4;
+ void* arg5;
+ void* arg6;
+
+ // Used by clone() to allow return from the syscall wrapper.
+ void* ret;
+ #if defined(__x86_64__)
+ void* rbp;
+ void* rbx;
+ void* rcx;
+ void* rdx;
+ void* rsi;
+ void* rdi;
+ void* r8;
+ void* r9;
+ void* r10;
+ void* r11;
+ void* r12;
+ void* r13;
+ void* r14;
+ void* r15;
+ #elif defined(__i386__)
+ void* ret2;
+ void* ebp;
+ void* edi;
+ void* esi;
+ void* edx;
+ void* ecx;
+ void* ebx;
+ #else
+ #error Unsupported target platform
+ #endif
+
+ // Used by clone() to set up data for the new thread.
+ struct Args* newSecureMem;
+ int processFdPub;
+ int cloneFdPub;
+
+ // Set to non-zero, if in debugging mode
+ int allowAllSystemCalls;
+
+ // The most recent SysV SHM identifier returned by
+ // shmget(IPC_PRIVATE)
+ int shmId;
+
+ // The following entries make up the sandboxed thread's TLS
+ long long cookie;
+ long long threadId;
+ long long threadFdPub;
+ } __attribute__((packed));
+ // Reserves 512 bytes for the fields above.
+ char header[512];
+ };
+ // Used for calls such as open() and stat().
+ char pathname[4096 - 512];
+ } __attribute__((packed));
+ char securePage[4096];
+ };
+ union {
+ // This scratch space is used by the trusted thread to read parameters
+ // for unrestricted system calls.
+ struct {
+ long tmpSyscallNum;
+ void* tmpArg1;
+ void* tmpArg2;
+ void* tmpArg3;
+ void* tmpArg4;
+ void* tmpArg5;
+ void* tmpArg6;
+ void* tmpReturnValue;
+ } __attribute__((packed));
+ char scratchPage[4096];
+ };
+ } __attribute__((packed)) Args;
+
+ // Allows the trusted process to check whether the parent process still
+ // exists. If it doesn't, kill the trusted process.
+ static void dieIfParentDied(int parentProc);
+
+ // The trusted process received a system call that it intends to deny.
+ // "err" is the result that gets written to "fd".
+ static void abandonSystemCall(int fd, int err);
+
+ // Acquires the syscall_mutex_ prior to making changes to the parameters in
+ // the secure memory page. Used by calls such as exit(), clone(), open(),
+ // socketcall(), and stat().
+ // After locking the mutex, it is no longer valid to abandon the system
+ // call!
+ static void lockSystemCall(int parentProc, Args* mem);
+
+ // Sends a system call to the trusted thread. If "locked" is true, the
+ // caller must first call lockSystemCall() and must also provide
+ // "parentProc". In locked mode, sendSystemCall() won't return until the
+ // trusted thread has completed processing.
+ // Use sparingly as it serializes the operation of the trusted process.
+ //
+ // The overloads below accept zero to six system call arguments of
+ // arbitrary type. Each argument is converted to void* with a C-style cast
+ // and handed to sendSystemCallInternal(); omitted arguments default to 0.
+ static void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum);
+ }
+ template<class T1> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1);
+ }
+ template<class T1, class T2> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2);
+ }
+ template<class T1, class T2, class T3> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2, T3 arg3) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2, (void*)arg3);
+ }
+ template<class T1, class T2, class T3, class T4> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4);
+ }
+ template<class T1, class T2, class T3, class T4, class T5> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4,
+ T5 arg5) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4,
+ (void*)arg5);
+ }
+ template<class T1, class T2, class T3, class T4, class T5, class T6> static
+ void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
+ int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4,
+ T5 arg5, T6 arg6) {
+ sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
+ (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4,
+ (void*)arg5, (void*)arg6);
+ }
+
+ private:
+ // Common implementation behind all sendSystemCall() overloads.
+ static void sendSystemCallInternal(int fd, bool locked, int parentProc,
+ Args* mem, int syscallNum, void* arg1 = 0,
+ void* arg2 = 0, void* arg3 = 0,
+ void* arg4 = 0, void* arg5 = 0,
+ void* arg6 = 0);
+};
+
+} // namespace
+
+#endif // SECURE_MEM_H__
diff --git a/sandbox/linux/seccomp/socketcall.cc b/sandbox/linux/seccomp/socketcall.cc
new file mode 100644
index 0000000..d51431d
--- /dev/null
+++ b/sandbox/linux/seccomp/socketcall.cc
@@ -0,0 +1,1013 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+#if defined(__NR_socket)
+
+// Sandboxed-thread handler for recvfrom(). A call without sender address
+// and without flags is served locally through read(); everything else is
+// serialized and forwarded, and the reply is returned to the caller.
+ssize_t Sandbox::sandbox_recvfrom(int sockfd, void* buf, size_t len, int flags,
+                                  void* from, socklen_t* fromlen) {
+  Debug::syscall(__NR_recvfrom, "Executing handler");
+
+  SysCalls sys;
+  const bool plainRead = !from && !flags;
+  if (plainRead) {
+    // With neither a sender buffer nor flags, recv() behaves like read(),
+    // and read() is unrestricted in seccomp mode.
+    Debug::message("Replaced recv() with call to read()");
+    const ssize_t nread = sys.read(sockfd, buf, len);
+    if (nread >= 0) {
+      return nread;
+    }
+    return -sys.my_errno;
+  }
+
+  // Wire format: system call number, thread cookie, then the arguments.
+  struct {
+    int       sysnum;
+    long long cookie;
+    RecvFrom  recvfrom_req;
+  } __attribute__((packed)) fwd;
+  fwd.sysnum               = __NR_recvfrom;
+  fwd.cookie               = cookie();
+  fwd.recvfrom_req.sockfd  = sockfd;
+  fwd.recvfrom_req.buf     = buf;
+  fwd.recvfrom_req.len     = len;
+  fwd.recvfrom_req.flags   = flags;
+  fwd.recvfrom_req.from    = from;
+  fwd.recvfrom_req.fromlen = fromlen;
+
+  // Send the request and wait for the result; either step failing is fatal.
+  long reply;
+  const bool sent =
+      write(sys, processFdPub(), &fwd, sizeof(fwd)) == sizeof(fwd);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward recvfrom() request [sandbox]");
+  }
+  return static_cast<int>(reply);
+}
+
+// Sandboxed-thread handler for recvmsg(). The call is always serialized and
+// forwarded; the reply is returned to the caller.
+ssize_t Sandbox::sandbox_recvmsg(int sockfd, struct msghdr* msg, int flags) {
+  Debug::syscall(__NR_recvmsg, "Executing handler");
+
+  // No shortcut to recvfrom(), recv() or read() is attempted here: the
+  // caller might depend on msg->msg_flags being filled in, and we cannot
+  // know whether it does.
+  struct {
+    int       sysnum;
+    long long cookie;
+    RecvMsg   recvmsg_req;
+  } __attribute__((packed)) fwd;
+  fwd.sysnum             = __NR_recvmsg;
+  fwd.cookie             = cookie();
+  fwd.recvmsg_req.sockfd = sockfd;
+  fwd.recvmsg_req.msg    = msg;
+  fwd.recvmsg_req.flags  = flags;
+
+  // Send the request and wait for the result; either step failing is fatal.
+  SysCalls sys;
+  long reply;
+  const bool sent =
+      write(sys, processFdPub(), &fwd, sizeof(fwd)) == sizeof(fwd);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward recvmsg() request [sandbox]");
+  }
+  return static_cast<int>(reply);
+}
+
+// Sandboxed-thread handler for sendmsg().
+//
+// A message with exactly one iovec and no ancillary data is handed to
+// sandbox_sendto(). Otherwise the request header, a copy of the msghdr and
+// the bytes referenced by msg_name and msg_control are packed into one
+// buffer, sent with a single write(), and the reply is returned.
+size_t Sandbox::sandbox_sendmsg(int sockfd, const struct msghdr* msg,
+                                int flags) {
+  Debug::syscall(__NR_sendmsg, "Executing handler");
+
+  if (msg->msg_iovlen == 1 && msg->msg_controllen == 0) {
+    // sendmsg() can sometimes be simplified as sendto(). The data to send is
+    // the buffer described by the single iovec entry, not the iovec array
+    // itself.
+    return sandbox_sendto(sockfd, msg->msg_iov->iov_base,
+                          msg->msg_iov->iov_len, flags, msg->msg_name,
+                          msg->msg_namelen);
+  }
+
+  struct Request {
+    int           sysnum;
+    long long     cookie;
+    SendMsg       sendmsg_req;
+    struct msghdr msg;
+  } __attribute__((packed));
+  char data[sizeof(struct Request) + msg->msg_namelen + msg->msg_controllen];
+  struct Request *request = reinterpret_cast<struct Request *>(data);
+  request->sysnum             = __NR_sendmsg;
+  request->cookie             = cookie();
+  request->sendmsg_req.sockfd = sockfd;
+  request->sendmsg_req.msg    = msg;
+  request->sendmsg_req.flags  = flags;
+  request->msg                = *msg;
+
+  // The name bytes directly follow the fixed-size header, and the control
+  // bytes directly follow the name bytes.
+  char* extra = reinterpret_cast<char *>(request + 1);
+  memcpy(extra, msg->msg_name, msg->msg_namelen);
+  memcpy(extra + msg->msg_namelen, msg->msg_control, msg->msg_controllen);
+
+  long rc;
+  SysCalls sys;
+  if (write(sys, processFdPub(), &data, sizeof(data)) !=
+      (ssize_t)sizeof(data) ||
+      read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+    die("Failed to forward sendmsg() request [sandbox]");
+  }
+  return static_cast<int>(rc);
+}
+
+// Sandboxed-thread handler for sendto(). A call without recipient address
+// and without flags is served locally through write(); everything else is
+// serialized and forwarded, and the reply is returned to the caller.
+ssize_t Sandbox::sandbox_sendto(int sockfd, const void* buf, size_t len,
+                                int flags, const void* to, socklen_t tolen) {
+  Debug::syscall(__NR_sendto, "Executing handler");
+
+  SysCalls sys;
+  const bool plainWrite = !to && !flags;
+  if (plainWrite) {
+    // With neither a recipient nor flags, sendto() behaves like write(),
+    // and write() is unrestricted in seccomp mode.
+    Debug::message("Replaced sendto() with call to write()");
+    const ssize_t nwritten = sys.write(sockfd, buf, len);
+    if (nwritten >= 0) {
+      return nwritten;
+    }
+    return -sys.my_errno;
+  }
+
+  // Wire format: system call number, thread cookie, then the arguments.
+  struct {
+    int       sysnum;
+    long long cookie;
+    SendTo    sendto_req;
+  } __attribute__((packed)) fwd;
+  fwd.sysnum            = __NR_sendto;
+  fwd.cookie            = cookie();
+  fwd.sendto_req.sockfd = sockfd;
+  fwd.sendto_req.buf    = buf;
+  fwd.sendto_req.len    = len;
+  fwd.sendto_req.flags  = flags;
+  fwd.sendto_req.to     = to;
+  fwd.sendto_req.tolen  = tolen;
+
+  // Send the request and wait for the result; either step failing is fatal.
+  long reply;
+  const bool sent =
+      write(sys, processFdPub(), &fwd, sizeof(fwd)) == sizeof(fwd);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward sendto() request [sandbox]");
+  }
+  return static_cast<int>(reply);
+}
+
+// Sandboxed-thread handler for setsockopt(). The call is always serialized
+// and forwarded; the reply is returned to the caller.
+int Sandbox::sandbox_setsockopt(int sockfd, int level, int optname,
+                                const void* optval, socklen_t optlen) {
+  Debug::syscall(__NR_setsockopt, "Executing handler");
+
+  // Wire format: system call number, thread cookie, then the arguments.
+  struct {
+    int        sysnum;
+    long long  cookie;
+    SetSockOpt setsockopt_req;
+  } __attribute__((packed)) fwd;
+  fwd.sysnum                 = __NR_setsockopt;
+  fwd.cookie                 = cookie();
+  fwd.setsockopt_req.sockfd  = sockfd;
+  fwd.setsockopt_req.level   = level;
+  fwd.setsockopt_req.optname = optname;
+  fwd.setsockopt_req.optval  = optval;
+  fwd.setsockopt_req.optlen  = optlen;
+
+  // Send the request and wait for the result; either step failing is fatal.
+  SysCalls sys;
+  long reply;
+  const bool sent =
+      write(sys, processFdPub(), &fwd, sizeof(fwd)) == sizeof(fwd);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward setsockopt() request [sandbox]");
+  }
+  return static_cast<int>(reply);
+}
+
+// Sandboxed-thread handler for getsockopt(). The call is always serialized
+// and forwarded; the reply is returned to the caller.
+int Sandbox::sandbox_getsockopt(int sockfd, int level, int optname,
+                                void* optval, socklen_t* optlen) {
+  Debug::syscall(__NR_getsockopt, "Executing handler");
+
+  // Wire format: system call number, thread cookie, then the arguments.
+  struct {
+    int        sysnum;
+    long long  cookie;
+    GetSockOpt getsockopt_req;
+  } __attribute__((packed)) fwd;
+  fwd.sysnum                 = __NR_getsockopt;
+  fwd.cookie                 = cookie();
+  fwd.getsockopt_req.sockfd  = sockfd;
+  fwd.getsockopt_req.level   = level;
+  fwd.getsockopt_req.optname = optname;
+  fwd.getsockopt_req.optval  = optval;
+  fwd.getsockopt_req.optlen  = optlen;
+
+  // Send the request and wait for the result; either step failing is fatal.
+  SysCalls sys;
+  long reply;
+  const bool sent =
+      write(sys, processFdPub(), &fwd, sizeof(fwd)) == sizeof(fwd);
+  if (!sent ||
+      read(sys, threadFdPub(), &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward getsockopt() request [sandbox]");
+  }
+  return static_cast<int>(reply);
+}
+
+// Trusted-process handler for a forwarded recvfrom() request. Returns true
+// if the call was passed on, false if it was denied with -EINVAL.
+bool Sandbox::process_recvfrom(int parentProc, int sandboxFd, int threadFdPub,
+                               int threadFd, SecureMem::Args* mem) {
+  // Fetch the serialized arguments.
+  RecvFrom args;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &args, sizeof(args)) != sizeof(args)) {
+    die("Failed to read parameters for recvfrom() [process]");
+  }
+
+  // Any flag outside of this set is unsupported and gets the call denied.
+  const int kSupportedFlags =
+      MSG_DONTWAIT | MSG_OOB | MSG_PEEK | MSG_TRUNC | MSG_WAITALL;
+  if ((args.flags & ~kSupportedFlags) != 0) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  // Receiving on unconnected sockets is not something we expect to need,
+  // but it carries no particular risk either, so the call is passed on.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_recvfrom,
+                            args.sockfd, args.buf, args.len, args.flags,
+                            args.from, args.fromlen);
+  return true;
+}
+
+// Trusted-process handler for a forwarded recvmsg() request. Returns true
+// if the call was passed on, false if it was denied with -EINVAL.
+bool Sandbox::process_recvmsg(int parentProc, int sandboxFd, int threadFdPub,
+                              int threadFd, SecureMem::Args* mem) {
+  // Fetch the serialized arguments.
+  RecvMsg args;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &args, sizeof(args)) != sizeof(args)) {
+    die("Failed to read parameters for recvmsg() [process]");
+  }
+
+  // Any flag outside of this set is unsupported and gets the call denied.
+  const int kSupportedFlags =
+      MSG_DONTWAIT | MSG_OOB | MSG_PEEK | MSG_TRUNC | MSG_WAITALL;
+  if ((args.flags & ~kSupportedFlags) != 0) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  // Receiving messages is generally not security critical.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_recvmsg,
+                            args.sockfd, args.msg, args.flags);
+  return true;
+}
+
+// Trusted-process handler for a forwarded sendmsg() request.
+//
+// Reads the SendMsg arguments, the copy of the caller's msghdr and the
+// trailing msg_name/msg_control bytes from sandboxFd. The request is denied
+// with -EINVAL if it does not fit into mem->pathname, names a destination
+// address, uses an unsupported flag, or carries ancillary data other than
+// SOL_SOCKET/SCM_RIGHTS. Otherwise the msghdr and its payload are copied
+// into the secure memory page and the call is issued as a locked system
+// call. Returns true if the call was passed on, false if it was denied.
+bool Sandbox::process_sendmsg(int parentProc, int sandboxFd, int threadFdPub,
+                              int threadFd, SecureMem::Args* mem) {
+  // Read request
+  struct {
+    SendMsg       sendmsg_req;
+    struct msghdr msg;
+  } __attribute__((packed)) data;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &data, sizeof(data)) != sizeof(data)) {
+    die("Failed to read parameters for sendmsg() [process]");
+  }
+
+  if (data.msg.msg_namelen < 0 || data.msg.msg_namelen > 4096 ||
+      data.msg.msg_controllen < 0 || data.msg.msg_controllen > 4096) {
+    die("Unexpected size for socketcall() payload [process]");
+  }
+  char extra[data.msg.msg_namelen + data.msg.msg_controllen];
+  if (read(sys, sandboxFd, &extra, sizeof(extra)) != (ssize_t)sizeof(extra)) {
+    die("Failed to read parameters for sendmsg() [process]");
+  }
+  if (sizeof(struct msghdr) + sizeof(extra) > sizeof(mem->pathname)) {
+    goto deny;
+  }
+
+  if (data.msg.msg_namelen ||
+      (data.sendmsg_req.flags &
+       ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB))) {
+ deny:
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  // The trusted process receives file handles when a new untrusted thread
+  // gets created. We have security checks in place that prevent any
+  // critical information from being tampered with during thread creation.
+  // But if we disallowed passing of file handles, this would add an extra
+  // hurdle for an attacker.
+  // Unfortunately, for now, this is not possible as Chrome's
+  // base::SendRecvMsg() needs the ability to pass file handles.
+  if (data.msg.msg_controllen) {
+    data.msg.msg_control = extra + data.msg.msg_namelen;
+    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&data.msg);
+    if (cmsg == NULL) {
+      // The control buffer is too short to hold even a single header.
+      // Deny the call rather than dereference a NULL pointer below.
+      goto deny;
+    }
+    do {
+      if (cmsg->cmsg_level != SOL_SOCKET ||
+          cmsg->cmsg_type  != SCM_RIGHTS) {
+        goto deny;
+      }
+    } while ((cmsg = CMSG_NXTHDR(&data.msg, cmsg)) != NULL);
+  }
+
+  // This must be a locked system call, because we have to ensure that the
+  // untrusted code does not tamper with the msghdr after we have examined it.
+  SecureMem::lockSystemCall(parentProc, mem);
+  if (sizeof(extra) > 0) {
+    // Point the msghdr at the copies that live behind it in mem->pathname.
+    if (data.msg.msg_namelen > 0) {
+      data.msg.msg_name = mem->pathname + sizeof(struct msghdr);
+    }
+    if (data.msg.msg_controllen > 0) {
+      data.msg.msg_control = mem->pathname + sizeof(struct msghdr) +
+                             data.msg.msg_namelen;
+    }
+    memcpy(mem->pathname + sizeof(struct msghdr), extra, sizeof(extra));
+  }
+  memcpy(mem->pathname, &data.msg, sizeof(struct msghdr));
+  // The msghdr address is expressed relative to mem->self rather than to
+  // this process's own pointer to "mem".
+  SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem,
+                            __NR_sendmsg, data.sendmsg_req.sockfd,
+                            mem->pathname - (char*)mem + (char*)mem->self,
+                            data.sendmsg_req.flags);
+  return true;
+}
+
+// Trusted-process handler for a forwarded sendto() request. Returns true
+// if the call was passed on, false if it was denied with -EINVAL.
+bool Sandbox::process_sendto(int parentProc, int sandboxFd, int threadFdPub,
+                             int threadFd, SecureMem::Args* mem) {
+  // Fetch the serialized arguments.
+  SendTo args;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &args, sizeof(args)) != sizeof(args)) {
+    die("Failed to read parameters for sendto() [process]");
+  }
+
+  // Two reasons to deny the call: the sandbox does not allow sending to
+  // arbitrary addresses, and flags outside of the supported set are
+  // rejected.
+  const int kSupportedFlags = MSG_CONFIRM | MSG_DONTWAIT | MSG_EOR |
+                              MSG_MORE | MSG_NOSIGNAL | MSG_OOB;
+  const bool hasDestination = args.to != 0;
+  const bool hasBadFlags    = (args.flags & ~kSupportedFlags) != 0;
+  if (hasDestination || hasBadFlags) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+
+  // What remains is a send on a connected socket, which is similar to
+  // calling write(). Allow it.
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_sendto,
+                            args.sockfd, args.buf, args.len, args.flags,
+                            args.to, args.tolen);
+  return true;
+}
+
+// Trusted-process handler for a forwarded setsockopt() request. Only a
+// fixed list of SOL_SOCKET and IPPROTO_TCP options is passed on; anything
+// else is denied with -EINVAL. Returns true if the call was passed on.
+bool Sandbox::process_setsockopt(int parentProc, int sandboxFd,
+                                 int threadFdPub, int threadFd,
+                                 SecureMem::Args* mem) {
+  // Fetch the serialized arguments.
+  SetSockOpt args;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &args, sizeof(args)) != sizeof(args)) {
+    die("Failed to read parameters for setsockopt() [process]");
+  }
+
+  // Decide whether the (level, optname) pair is on the list of permitted
+  // options.
+  bool permitted = false;
+  if (args.level == SOL_SOCKET) {
+    switch (args.optname) {
+      case SO_KEEPALIVE:
+      case SO_LINGER:
+      case SO_OOBINLINE:
+      case SO_RCVBUF:
+      case SO_RCVLOWAT:
+      case SO_SNDLOWAT:
+      case SO_RCVTIMEO:
+      case SO_SNDTIMEO:
+      case SO_REUSEADDR:
+      case SO_SNDBUF:
+      case SO_TIMESTAMP:
+        permitted = true;
+        break;
+      default:
+        break;
+    }
+  } else if (args.level == IPPROTO_TCP) {
+    switch (args.optname) {
+      case TCP_CORK:
+      case TCP_DEFER_ACCEPT:
+      case TCP_INFO:
+      case TCP_KEEPCNT:
+      case TCP_KEEPIDLE:
+      case TCP_KEEPINTVL:
+      case TCP_LINGER2:
+      case TCP_MAXSEG:
+      case TCP_NODELAY:
+      case TCP_QUICKACK:
+      case TCP_SYNCNT:
+      case TCP_WINDOW_CLAMP:
+        permitted = true;
+        break;
+      default:
+        break;
+    }
+  }
+
+  if (!permitted) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_setsockopt,
+                            args.sockfd, args.level, args.optname,
+                            args.optval, args.optlen);
+  return true;
+}
+
+// Trusted-process handler for a forwarded getsockopt() request. Only a
+// fixed list of SOL_SOCKET and IPPROTO_TCP options is passed on; anything
+// else is denied with -EINVAL. Returns true if the call was passed on.
+bool Sandbox::process_getsockopt(int parentProc, int sandboxFd,
+                                 int threadFdPub, int threadFd,
+                                 SecureMem::Args* mem) {
+  // Fetch the serialized arguments.
+  GetSockOpt args;
+  SysCalls sys;
+  if (read(sys, sandboxFd, &args, sizeof(args)) != sizeof(args)) {
+    die("Failed to read parameters for getsockopt() [process]");
+  }
+
+  // Decide whether the (level, optname) pair is on the list of permitted
+  // options.
+  bool permitted = false;
+  if (args.level == SOL_SOCKET) {
+    switch (args.optname) {
+      case SO_ACCEPTCONN:
+      case SO_ERROR:
+      case SO_KEEPALIVE:
+      case SO_LINGER:
+      case SO_OOBINLINE:
+      case SO_RCVBUF:
+      case SO_RCVLOWAT:
+      case SO_SNDLOWAT:
+      case SO_RCVTIMEO:
+      case SO_SNDTIMEO:
+      case SO_REUSEADDR:
+      case SO_SNDBUF:
+      case SO_TIMESTAMP:
+      case SO_TYPE:
+        permitted = true;
+        break;
+      default:
+        break;
+    }
+  } else if (args.level == IPPROTO_TCP) {
+    switch (args.optname) {
+      case TCP_CORK:
+      case TCP_DEFER_ACCEPT:
+      case TCP_INFO:
+      case TCP_KEEPCNT:
+      case TCP_KEEPIDLE:
+      case TCP_KEEPINTVL:
+      case TCP_LINGER2:
+      case TCP_MAXSEG:
+      case TCP_NODELAY:
+      case TCP_QUICKACK:
+      case TCP_SYNCNT:
+      case TCP_WINDOW_CLAMP:
+        permitted = true;
+        break;
+      default:
+        break;
+    }
+  }
+
+  if (!permitted) {
+    SecureMem::abandonSystemCall(threadFd, -EINVAL);
+    return false;
+  }
+  SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_getsockopt,
+                            args.sockfd, args.level, args.optname,
+                            args.optval, args.optlen);
+  return true;
+}
+
+#endif
+#if defined(__NR_socketcall)
+
+// Sub-call opcodes passed as the "call" argument of socketcall().
+// sandbox_socketcall() rejects values outside [SYS_SOCKET, SYS_ACCEPT4] and
+// uses the opcode as an index into socketCallArgInfo[].
+enum {
+ SYS_SOCKET = 1,
+ SYS_BIND = 2,
+ SYS_CONNECT = 3,
+ SYS_LISTEN = 4,
+ SYS_ACCEPT = 5,
+ SYS_GETSOCKNAME = 6,
+ SYS_GETPEERNAME = 7,
+ SYS_SOCKETPAIR = 8,
+ SYS_SEND = 9,
+ SYS_RECV = 10,
+ SYS_SENDTO = 11,
+ SYS_RECVFROM = 12,
+ SYS_SHUTDOWN = 13,
+ SYS_SETSOCKOPT = 14,
+ SYS_GETSOCKOPT = 15,
+ SYS_SENDMSG = 16,
+ SYS_RECVMSG = 17,
+ SYS_ACCEPT4 = 18
+};
+
+// Describes the argument block of one socketcall() sub-call.
+//   len:       size in bytes of the argument structure.
+//   addrOff:   offset within that structure of the address/value pointer
+//              field, or 0 if there is none.
+//   lengthOff: offset of the matching length field; sandbox_socketcall()
+//              treats 0 as "no extra data to append".
+struct Sandbox::SocketCallArgInfo {
+ size_t len;
+ off_t addrOff;
+ off_t lengthOff;
+};
+// Argument descriptions indexed by socketcall() opcode. Entry 0 is a
+// placeholder because the opcodes start at SYS_SOCKET == 1. Only bind,
+// connect, sendto and setsockopt list address/length offsets; the remaining
+// members of all other entries are zero-initialized.
+const struct Sandbox::SocketCallArgInfo Sandbox::socketCallArgInfo[] = {
+ // STRUCT(s) names union member "s" of SocketCall::args without needing an
+ // object; SIZE() and OFF() derive that member's size and field offsets.
+ #define STRUCT(s) reinterpret_cast<SocketCall *>(0)->args.s
+ #define SIZE(s) sizeof(STRUCT(s))
+ #define OFF(s, f) offsetof(typeof STRUCT(s), f)
+ { 0 },
+ { SIZE(socket) },
+ { SIZE(bind), OFF(bind, addr), OFF(bind, addrlen) },
+ { SIZE(connect), OFF(connect, addr), OFF(connect, addrlen) },
+ { SIZE(listen) },
+ { SIZE(accept) },
+ { SIZE(getsockname) },
+ { SIZE(getpeername) },
+ { SIZE(socketpair) },
+ { SIZE(send) },
+ { SIZE(recv) },
+ { SIZE(sendto), OFF(sendto, to), OFF(sendto, tolen) },
+ { SIZE(recvfrom) },
+ { SIZE(shutdown) },
+ { SIZE(setsockopt), OFF(setsockopt, optval), OFF(setsockopt, optlen) },
+ { SIZE(getsockopt) },
+ { SIZE(sendmsg) },
+ { SIZE(recvmsg) },
+ { SIZE(accept4) }
+ #undef STRUCT
+ #undef SIZE
+ #undef OFF
+};
+
+// Untrusted-side handler for the multiplexed socketcall() system call.
+// "call" is the SYS_* opcode and "args" points at the caller's argument block.
+// Requests that reduce to a flag-less send()/recv() are executed locally via
+// write()/read(); everything else is serialized and written to processFdPub(),
+// and the result is then read back from threadFdPub().
+// Returns -ENOSYS for an out-of-range opcode, the negated errno if a local
+// read()/write() fails, and otherwise the forwarded result.
+int Sandbox::sandbox_socketcall(int call, void* args) {
+ Debug::syscall(__NR_socketcall, "Executing handler", call);
+
+ // When demultiplexing socketcall(), only accept calls that have a valid
+ // "call" opcode.
+ if (call < SYS_SOCKET || call > SYS_ACCEPT4) {
+ return -ENOSYS;
+ }
+
+ // Some type of calls include a pointer to an address or name, which cannot
+ // be accessed by the trusted process, as it lives in a separate address
+ // space. For these calls, append the extra data to the serialized request.
+ // This requires some copying of data, as we have to make sure there is
+ // only a single atomic call to write().
+ socklen_t numExtraData = 0;
+ const void* extraDataAddr = NULL;
+ // NOTE(review): extraDataAddr below is computed as an offset into "args"
+ // itself; if addrOff is the offset of a pointer-typed field, this is the
+ // address of that field rather than of the data it points to -- confirm.
+ if (socketCallArgInfo[call].lengthOff) {
+ memcpy(&numExtraData,
+ reinterpret_cast<char *>(args) + socketCallArgInfo[call].lengthOff,
+ sizeof(socklen_t));
+ extraDataAddr = reinterpret_cast<char *>(args) +
+ socketCallArgInfo[call].addrOff;
+ }
+
+ // sendmsg() and recvmsg() have more complicated requirements for computing
+ // the amount of extra data that needs to be sent to the trusted process.
+ if (call == SYS_SENDMSG) {
+ SendMsg *sendmsg_args = reinterpret_cast<SendMsg *>(args);
+ if (sendmsg_args->msg->msg_iovlen == 1 &&
+ !sendmsg_args->msg->msg_control) {
+ // Further down in the code, this sendmsg() call will be simplified to
+ // a sendto() call. Make sure we already compute the correct value for
+ // numExtraData, as it is needed when we allocate "data[]" on the stack.
+ numExtraData = sendmsg_args->msg->msg_namelen;
+ extraDataAddr = sendmsg_args->msg->msg_name;
+ } else {
+ // sendmsg() needs to include some of the extra data so that we can
+ // inspect it in process_socketcall()
+ numExtraData = sizeof(*sendmsg_args->msg) +
+ sendmsg_args->msg->msg_namelen +
+ sendmsg_args->msg->msg_controllen;
+ extraDataAddr = NULL;
+ }
+ }
+ if (call == SYS_RECVMSG) {
+ RecvMsg *recvmsg_args = reinterpret_cast<RecvMsg *>(args);
+ numExtraData = sizeof(*recvmsg_args->msg);
+ extraDataAddr = recvmsg_args->msg;
+ }
+
+ // Set up storage for the request header and copy the data from "args"
+ // into it. "data" is a variable-length stack buffer that holds the header
+ // immediately followed by numExtraData bytes of payload.
+ struct Request {
+ int sysnum;
+ long long cookie;
+ SocketCall socketcall_req;
+ } __attribute__((packed)) *request;
+ char data[sizeof(struct Request) + numExtraData];
+ request = reinterpret_cast<struct Request *>(data);
+ memcpy(&request->socketcall_req.args, args, socketCallArgInfo[call].len);
+
+ // Simplify send(), sendto() and sendmsg(), if there are simpler equivalent
+ // calls. This allows us to occasionally replace them with calls to write(),
+ // which don't have to be forwarded to the trusted process.
+ SysCalls sys;
+ if (call == SYS_SENDMSG &&
+ request->socketcall_req.args.sendmsg.msg->msg_iovlen == 1 &&
+ !request->socketcall_req.args.sendmsg.msg->msg_control) {
+ // Ordering of these assignments is important, as we are reshuffling
+ // fields inside of a union.
+ call = SYS_SENDTO;
+ request->socketcall_req.args.sendto.flags =
+ request->socketcall_req.args.sendmsg.flags;
+ request->socketcall_req.args.sendto.to =
+ request->socketcall_req.args.sendmsg.msg->msg_name;
+ request->socketcall_req.args.sendto.tolen =
+ request->socketcall_req.args.sendmsg.msg->msg_namelen;
+ request->socketcall_req.args.sendto.len =
+ request->socketcall_req.args.sendmsg.msg->msg_iov->iov_len;
+ request->socketcall_req.args.sendto.buf =
+ request->socketcall_req.args.sendmsg.msg->msg_iov->iov_base;
+ }
+ if (call == SYS_SENDTO && !request->socketcall_req.args.sendto.to) {
+ // sendto() with a NULL address is the same as send()
+ call = SYS_SEND;
+ numExtraData = 0;
+ }
+ if (call == SYS_SEND && !request->socketcall_req.args.send.flags) {
+ // send() with no flags is the same as write(), which is unrestricted
+ // in seccomp mode.
+ Debug::message("Replaced socketcall() with call to write()");
+ ssize_t rc = sys.write(request->socketcall_req.args.send.sockfd,
+ request->socketcall_req.args.send.buf,
+ request->socketcall_req.args.send.len);
+ if (rc < 0) {
+ return -sys.my_errno;
+ } else {
+ return rc;
+ }
+ }
+
+ // Simplify recv(), and recvfrom(), if there are simpler equivalent calls.
+ // This allows us to occasionally replace them with calls to read(), which
+ // don't have to be forwarded to the trusted process.
+ // We cannot simplify recvmsg() to recvfrom(), recv() or read(), as we do
+ // not know whether the caller needs us to set msg->msg_flags.
+ if (call == SYS_RECVFROM && !request->socketcall_req.args.recvfrom.from) {
+ // recvfrom() with a NULL address buffer is the same as recv()
+ call = SYS_RECV;
+ }
+ if (call == SYS_RECV && !request->socketcall_req.args.recv.flags) {
+ // recv() with no flags is the same as read(), which is unrestricted
+ // in seccomp mode.
+ Debug::message("Replaced socketcall() with call to read()");
+ ssize_t rc = sys.read(request->socketcall_req.args.recv.sockfd,
+ request->socketcall_req.args.recv.buf,
+ request->socketcall_req.args.recv.len);
+ if (rc < 0) {
+ return -sys.my_errno;
+ } else {
+ return rc;
+ }
+ }
+
+ // Fill in the rest of the request header.
+ request->sysnum = __NR_socketcall;
+ request->cookie = cookie();
+ request->socketcall_req.call = call;
+ request->socketcall_req.arg_ptr = args;
+ // Zero the tail of the args union that the (possibly simplified) call does
+ // not use, so that no stale bytes are sent along.
+ int padding = sizeof(request->socketcall_req.args) -
+ socketCallArgInfo[call].len;
+ if (padding > 0) {
+ memset((char *)(&request->socketcall_req.args + 1) - padding, 0, padding);
+ }
+ if (call == SYS_SENDMSG) {
+ // for sendmsg() we include the (optional) destination address, and the
+ // (optional) control data in the payload.
+ // memcpy() returns its destination, so the nested calls below place the
+ // msghdr, then the name, then the control data back to back after the
+ // request header.
+ SendMsg *sendmsg_args = reinterpret_cast<SendMsg *>(args);
+ memcpy(reinterpret_cast<char *>(
+ memcpy(reinterpret_cast<char *>(
+ memcpy(request + 1, sendmsg_args->msg, sizeof(*sendmsg_args->msg))) +
+ sizeof(*sendmsg_args->msg),
+ sendmsg_args->msg->msg_name, sendmsg_args->msg->msg_namelen)) +
+ sendmsg_args->msg->msg_namelen,
+ sendmsg_args->msg->msg_control, sendmsg_args->msg->msg_controllen);
+ } else if (extraDataAddr) {
+ memcpy(request + 1, extraDataAddr, numExtraData);
+ }
+
+ // Send request to trusted process and collect response from trusted thread.
+ long rc;
+ ssize_t len = sizeof(struct Request) + numExtraData;
+ if (write(sys, processFdPub(), data, len) != len ||
+ read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward socketcall() request [sandbox]");
+ }
+ return static_cast<int>(rc);
+}
+
+// Trusted-process side of socketcall(). Reads the SocketCall request (and any
+// payload that sandbox_socketcall() appended) from sandboxFd, applies the
+// policy below and either hands the call to the trusted thread via
+// SecureMem::sendSystemCall() (returns true) or rejects it with -EINVAL via
+// SecureMem::abandonSystemCall() (returns false). Requests with an unexpected
+// opcode or payload size are treated as fatal.
+bool Sandbox::process_socketcall(int parentProc, int sandboxFd,
+ int threadFdPub, int threadFd,
+ SecureMem::Args* mem) {
+ // Read request
+ SocketCall socketcall_req;
+ SysCalls sys;
+ if (read(sys, sandboxFd, &socketcall_req, sizeof(socketcall_req)) !=
+ sizeof(socketcall_req)) {
+ die("Failed to read parameters for socketcall() [process]");
+ }
+
+ // sandbox_socketcall() should never send us an unexpected "call" opcode.
+ // If it did, something went very wrong and we better terminate the process.
+ if (socketcall_req.call < SYS_SOCKET || socketcall_req.call > SYS_ACCEPT4) {
+ die("Unexpected socketcall() [process]");
+ }
+
+ // Check if this particular operation carries an extra payload.
+ // NOTE(review): sandbox_socketcall() applies lengthOff relative to the raw
+ // argument block, whereas here it is applied relative to the start of
+ // socketcall_req -- confirm that both agree on the same field.
+ socklen_t numExtraData = 0;
+ if (socketCallArgInfo[socketcall_req.call].lengthOff) {
+ memcpy(&numExtraData,
+ reinterpret_cast<char *>(&socketcall_req) +
+ socketCallArgInfo[socketcall_req.call].lengthOff,
+ sizeof(socklen_t));
+ } else if (socketcall_req.call == SYS_SENDMSG) {
+ numExtraData = sizeof(*socketcall_req.args.sendmsg.msg);
+ } else if (socketcall_req.call == SYS_RECVMSG) {
+ numExtraData = sizeof(*socketcall_req.args.recvmsg.msg);
+ }
+
+ // Verify that the length for the payload is reasonable. We don't want to
+ // blow up our stack, and excessive (or negative) buffer sizes are almost
+ // certainly a bug.
+ if (numExtraData < 0 || numExtraData > 4096) {
+ die("Unexpected size for socketcall() payload [process]");
+ }
+
+ // Read the extra payload, if any.
+ char extra[numExtraData];
+ if (numExtraData) {
+ if (read(sys, sandboxFd, extra, numExtraData) != (ssize_t)numExtraData) {
+ die("Failed to read socketcall() payload [process]");
+ }
+ }
+
+ // sendmsg() has another level of indirection and can carry even more payload
+ ssize_t numSendmsgExtra = 0;
+ if (socketcall_req.call == SYS_SENDMSG) {
+ struct msghdr* msg = reinterpret_cast<struct msghdr*>(extra);
+ if (msg->msg_namelen < 0 || msg->msg_namelen > 4096 ||
+ msg->msg_controllen < 0 || msg->msg_controllen > 4096) {
+ die("Unexpected size for socketcall() payload [process]");
+ }
+ numSendmsgExtra = msg->msg_namelen + msg->msg_controllen;
+ }
+ char sendmsgExtra[numSendmsgExtra];
+ if (numSendmsgExtra) {
+ if (read(sys, sandboxFd, sendmsgExtra, numSendmsgExtra) !=
+ numSendmsgExtra) {
+ die("Failed to read socketcall() payload [process]");
+ }
+ }
+
+ // Result reported to the sandboxed thread whenever a call is denied.
+ int rc = -EINVAL;
+ switch (socketcall_req.call) {
+ case SYS_SOCKET:
+ // The sandbox does not allow creation of any new sockets.
+ goto deny;
+ case SYS_BIND:
+ // The sandbox does not allow binding an address to a socket.
+ goto deny;
+ case SYS_CONNECT:
+ // The sandbox does not allow connecting a socket.
+ goto deny;
+ case SYS_LISTEN:
+ // The sandbox does not allow a socket to enter listening state.
+ goto deny;
+ case SYS_ACCEPT4:
+ case SYS_ACCEPT:
+ // If the sandbox obtained a socket that is already in the listening
+ // state (e.g. because somebody sent it a suitable file descriptor), it
+ // is permissible to call accept().
+
+ accept_simple:
+ // None of the parameters need to be checked, so it is OK to refer
+ // to the parameter block created by the untrusted code.
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_socketcall,
+ socketcall_req.call, socketcall_req.arg_ptr);
+ return true;
+ case SYS_GETSOCKNAME:
+ case SYS_GETPEERNAME:
+ // Querying the local and the remote name is not considered security
+ // sensitive for the purposes of the sandbox.
+ goto accept_simple;
+ case SYS_SOCKETPAIR:
+ // Socket pairs are connected to each other and not considered
+ // security sensitive.
+ goto accept_simple;
+ case SYS_SENDTO:
+ if (socketcall_req.args.sendto.to) {
+ // The sandbox does not allow sending to arbitrary addresses.
+ goto deny;
+ }
+ // Fall through
+ case SYS_SEND:
+ if (socketcall_req.args.send.flags &
+ ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB)) {
+ // Unsupported flag encountered. Deny the call.
+ goto deny;
+ }
+ // Sending data on a connected socket is similar to calling write().
+ // Allow it.
+
+ accept_complex:
+ // The parameter block contains potentially security critical information
+ // that should not be tampered with after it has been inspected. Copy it
+ // into the write-protected securely shared memory before telling the
+ // trusted thread to execute the socket call.
+ SecureMem::lockSystemCall(parentProc, mem);
+ memcpy(mem->pathname, &socketcall_req.args, sizeof(socketcall_req.args));
+ SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem,
+ __NR_socketcall, socketcall_req.call,
+ mem->pathname - (char*)mem + (char*)mem->self);
+ return true;
+ case SYS_RECVFROM:
+ // While we do not anticipate any particular need to receive data on
+ // unconnected sockets, there is no particular risk in doing so.
+ // Fall through
+ case SYS_RECV:
+ if (socketcall_req.args.recv.flags &
+ ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) {
+ // Unsupported flag encountered. Deny the call.
+ goto deny;
+ }
+ // Receiving data on a connected socket is similar to calling read().
+ // Allow it.
+ goto accept_complex;
+ case SYS_SHUTDOWN:
+ // Shutting down a socket is always OK.
+ goto accept_simple;
+ case SYS_SETSOCKOPT:
+ // Only an explicit whitelist of socket-level and TCP-level options may
+ // be set; everything else is denied.
+ switch (socketcall_req.args.setsockopt.level) {
+ case SOL_SOCKET:
+ switch (socketcall_req.args.setsockopt.optname) {
+ case SO_KEEPALIVE:
+ case SO_LINGER:
+ case SO_OOBINLINE:
+ case SO_RCVBUF:
+ case SO_RCVLOWAT:
+ case SO_SNDLOWAT:
+ case SO_RCVTIMEO:
+ case SO_SNDTIMEO:
+ case SO_REUSEADDR:
+ case SO_SNDBUF:
+ case SO_TIMESTAMP:
+ goto accept_complex;
+ default:
+ break;
+ }
+ break;
+ case IPPROTO_TCP:
+ switch (socketcall_req.args.setsockopt.optname) {
+ case TCP_CORK:
+ case TCP_DEFER_ACCEPT:
+ case TCP_INFO:
+ case TCP_KEEPCNT:
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_LINGER2:
+ case TCP_MAXSEG:
+ case TCP_NODELAY:
+ case TCP_QUICKACK:
+ case TCP_SYNCNT:
+ case TCP_WINDOW_CLAMP:
+ goto accept_complex;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ goto deny;
+ case SYS_GETSOCKOPT:
+ // Same whitelist approach as for setsockopt(), plus a few read-only
+ // socket-level options.
+ switch (socketcall_req.args.getsockopt.level) {
+ case SOL_SOCKET:
+ switch (socketcall_req.args.getsockopt.optname) {
+ case SO_ACCEPTCONN:
+ case SO_ERROR:
+ case SO_KEEPALIVE:
+ case SO_LINGER:
+ case SO_OOBINLINE:
+ case SO_RCVBUF:
+ case SO_RCVLOWAT:
+ case SO_SNDLOWAT:
+ case SO_RCVTIMEO:
+ case SO_SNDTIMEO:
+ case SO_REUSEADDR:
+ case SO_SNDBUF:
+ case SO_TIMESTAMP:
+ case SO_TYPE:
+ goto accept_complex;
+ default:
+ break;
+ }
+ break;
+ case IPPROTO_TCP:
+ switch (socketcall_req.args.getsockopt.optname) {
+ case TCP_CORK:
+ case TCP_DEFER_ACCEPT:
+ case TCP_INFO:
+ case TCP_KEEPCNT:
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_LINGER2:
+ case TCP_MAXSEG:
+ case TCP_NODELAY:
+ case TCP_QUICKACK:
+ case TCP_SYNCNT:
+ case TCP_WINDOW_CLAMP:
+ goto accept_complex;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ goto deny;
+ case SYS_SENDMSG: {
+ struct msghdr* msg = reinterpret_cast<struct msghdr*>(extra);
+
+ // Everything we forward has to fit into the secure memory buffer.
+ if (sizeof(socketcall_req.args) + sizeof(*msg) + numSendmsgExtra >
+ sizeof(mem->pathname)) {
+ goto deny;
+ }
+
+ // No explicit destination address and no unsupported flags.
+ if (msg->msg_namelen ||
+ (socketcall_req.args.sendmsg.flags &
+ ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB))){
+ goto deny;
+ }
+
+ // The trusted process receives file handles when a new untrusted thread
+ // gets created. We have security checks in place that prevent any
+ // critical information from being tampered with during thread creation.
+ // But if we disallowed passing of file handles, this would add an extra
+ // hurdle for an attacker.
+ // Unfortunately, for now, this is not possible as Chrome's
+ // base::SendRecvMsg() needs the ability to pass file handles.
+ if (msg->msg_controllen) {
+ msg->msg_control = sendmsgExtra + msg->msg_namelen;
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+ if (!cmsg) {
+ // msg_controllen is non-zero but too small to hold a single control
+ // message header. CMSG_FIRSTHDR() returns NULL in that case, so deny
+ // the call rather than dereferencing a NULL pointer below.
+ goto deny;
+ }
+ do {
+ if (cmsg->cmsg_level != SOL_SOCKET ||
+ cmsg->cmsg_type != SCM_RIGHTS) {
+ goto deny;
+ }
+ } while ((cmsg = CMSG_NXTHDR(msg, cmsg)) != NULL);
+ }
+
+ // This must be a locked system call, because we have to ensure that
+ // the untrusted code does not tamper with the msghdr after we have
+ // examined it.
+ // Layout in mem->pathname: argument block, then the msghdr, then the
+ // name and control data. Pointers are rewritten to the addresses that
+ // this memory has when accessed through mem->self.
+ SecureMem::lockSystemCall(parentProc, mem);
+ socketcall_req.args.sendmsg.msg =
+ reinterpret_cast<struct msghdr*>(mem->pathname +
+ sizeof(socketcall_req.args) -
+ (char*)mem + (char*)mem->self);
+ memcpy(mem->pathname, &socketcall_req.args, sizeof(socketcall_req.args));
+ if (numSendmsgExtra) {
+ if (msg->msg_namelen > 0) {
+ msg->msg_name = const_cast<struct msghdr*>(
+ socketcall_req.args.sendmsg.msg) + 1;
+ }
+ if (msg->msg_controllen > 0) {
+ msg->msg_control = (char *)(
+ socketcall_req.args.sendmsg.msg + 1) + msg->msg_namelen;
+ }
+ memcpy(mem->pathname + sizeof(socketcall_req.args) + sizeof(*msg),
+ sendmsgExtra, numSendmsgExtra);
+ }
+ memcpy(mem->pathname + sizeof(socketcall_req.args), msg, sizeof(*msg));
+ SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem,
+ __NR_socketcall, socketcall_req.call,
+ mem->pathname - (char*)mem + (char*)mem->self);
+ return true;
+ }
+ case SYS_RECVMSG:
+ // Receiving messages is in general not security critical.
+ if (socketcall_req.args.recvmsg.flags &
+ ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) {
+ goto deny;
+ }
+ goto accept_complex;
+ default:
+ deny:
+ SecureMem::abandonSystemCall(threadFd, rc);
+ return false;
+ }
+}
+
+#endif
+
+} // namespace
diff --git a/sandbox/linux/seccomp/stat.cc b/sandbox/linux/seccomp/stat.cc
new file mode 100644
index 0000000..8634fdf
--- /dev/null
+++ b/sandbox/linux/seccomp/stat.cc
@@ -0,0 +1,110 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+
+namespace playground {
+
+// Untrusted-side handler for stat(). Packs the request header and the path
+// (without its terminating NUL) into one buffer, sends it to the trusted
+// process with a single write() and returns the result that is reported back
+// on the thread's file descriptor.
+int Sandbox::sandbox_stat(const char *path, void *buf) {
+ Debug::syscall(__NR_stat, "Executing handler");
+
+ // Wire format: fixed header immediately followed by the path bytes.
+ struct Request {
+ int sysnum;
+ long long cookie;
+ Stat stat_req;
+ char pathname[0];
+ } __attribute__((packed));
+
+ const size_t pathLen = strlen(path);
+ const size_t total = sizeof(struct Request) + pathLen;
+ char buffer[total];
+ struct Request *req = reinterpret_cast<struct Request*>(buffer);
+ req->sysnum = __NR_stat;
+ req->cookie = cookie();
+ req->stat_req.sysnum = __NR_stat;
+ req->stat_req.path_length = pathLen;
+ req->stat_req.buf = buf;
+ memcpy(req->pathname, path, pathLen);
+
+ // Forward the request, then wait for the reply; either step failing is fatal.
+ SysCalls sys;
+ long result;
+ bool ok = write(sys, processFdPub(), req, total) == (int)total;
+ if (ok) {
+ ok = read(sys, threadFdPub(), &result, sizeof(result)) == sizeof(result);
+ }
+ if (!ok) {
+ die("Failed to forward stat() request [sandbox]");
+ }
+ return static_cast<int>(result);
+}
+
+#if defined(__NR_stat64)
+// Untrusted-side handler for stat64(). Identical to the stat() handler except
+// that the request is tagged with __NR_stat64, which the trusted process uses
+// to pick the system call to execute.
+int Sandbox::sandbox_stat64(const char *path, void *buf) {
+ Debug::syscall(__NR_stat64, "Executing handler");
+
+ // Wire format: fixed header immediately followed by the path bytes.
+ struct Request {
+ int sysnum;
+ long long cookie;
+ Stat stat_req;
+ char pathname[0];
+ } __attribute__((packed));
+
+ const size_t pathLen = strlen(path);
+ const size_t total = sizeof(struct Request) + pathLen;
+ char buffer[total];
+ struct Request *req = reinterpret_cast<struct Request*>(buffer);
+ req->sysnum = __NR_stat64;
+ req->cookie = cookie();
+ req->stat_req.sysnum = __NR_stat64;
+ req->stat_req.path_length = pathLen;
+ req->stat_req.buf = buf;
+ memcpy(req->pathname, path, pathLen);
+
+ // Forward the request, then wait for the reply; either step failing is fatal.
+ SysCalls sys;
+ long result;
+ bool ok = write(sys, processFdPub(), req, total) == (int)total;
+ if (ok) {
+ ok = read(sys, threadFdPub(), &result, sizeof(result)) == sizeof(result);
+ }
+ if (!ok) {
+ die("Failed to forward stat64() request [sandbox]");
+ }
+ return static_cast<int>(result);
+}
+#endif
+
+// Trusted-process side of stat()/stat64(). Reads the Stat request from
+// sandboxFd. Over-long paths are drained from the descriptor and answered
+// with -ENAMETOOLONG on threadFd (returns false). Otherwise the path is read
+// into mem->pathname, NUL-terminated, and the trusted thread is asked to
+// execute the call (returns true).
+bool Sandbox::process_stat(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMem::Args* mem) {
+ // Read request
+ SysCalls sys;
+ Stat stat_req;
+ if (read(sys, sandboxFd, &stat_req, sizeof(stat_req)) != sizeof(stat_req)) {
+ // Shared failure path; the later reads jump back here via goto.
+ read_parm_failed:
+ die("Failed to read parameters for stat() [process]");
+ }
+ int rc = -ENAMETOOLONG;
+ // NOTE(review): path_length is supplied by the sandboxed side; if its type
+ // is signed, a negative value would pass this check -- confirm.
+ if (stat_req.path_length >= (int)sizeof(mem->pathname)) {
+ // The path does not fit (including its terminating NUL). Consume it in
+ // small chunks so that the stream stays in sync, then report the error.
+ char buf[32];
+ while (stat_req.path_length > 0) {
+ size_t len = stat_req.path_length > sizeof(buf) ?
+ sizeof(buf) : stat_req.path_length;
+ ssize_t i = read(sys, sandboxFd, buf, len);
+ if (i <= 0) {
+ goto read_parm_failed;
+ }
+ stat_req.path_length -= i;
+ }
+ if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to return data from stat() [process]");
+ }
+ return false;
+ }
+ // Take the lock before writing the path into the secure memory area.
+ SecureMem::lockSystemCall(parentProc, mem);
+ if (read(sys, sandboxFd, mem->pathname, stat_req.path_length) !=
+ (ssize_t)stat_req.path_length) {
+ goto read_parm_failed;
+ }
+ // The sender does not transmit the terminating NUL; add it here.
+ mem->pathname[stat_req.path_length] = '\000';
+
+ // TODO(markus): Implement sandboxing policy
+ Debug::message(("Allowing access to \"" + std::string(mem->pathname) +
+ "\"").c_str());
+
+ // Tell trusted thread to stat the file. On i386 the request's sysnum field
+ // selects between stat64() and stat(); elsewhere it is always stat(). The
+ // path argument is passed as its address relative to mem->self.
+ SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem,
+ #if defined(__i386__)
+ stat_req.sysnum == __NR_stat64 ? __NR_stat64 :
+ #endif
+ __NR_stat,
+ mem->pathname - (char*)mem + (char*)mem->self,
+ stat_req.buf);
+ return true;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc
new file mode 100644
index 0000000..b25146b
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall.cc
@@ -0,0 +1,258 @@
+#include "debug.h"
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+namespace playground {
+
+// TODO(markus): change this into a function that returns the address of the assembly code. If that isn't possible for sandbox_clone, then move that function into a *.S file
+// File-scope assembly that defines two symbols in .text:
+// playground$sandbox_clone - thin trampoline in front of sandbox__clone()
+// playground$syscallWrapper - entry point that dispatches an intercepted
+// system call through playground$syscallTable
+asm(
+ ".pushsection .text, \"ax\", @progbits\n"
+
+ // This is the special wrapper for the clone() system call. The code
+ // relies on the stack layout of the system call wrapper (c.f. below). It
+ // passes the stack pointer as an additional argument to sandbox__clone(),
+ // so that upon starting the child, register values can be restored and
+ // the child can start executing at the correct IP, instead of trying to
+ // run in the trusted thread.
+ "playground$sandbox_clone:"
+ ".globl playground$sandbox_clone\n"
+ ".type playground$sandbox_clone, @function\n"
+ #if defined(__x86_64__)
+ // Skip the 8 byte return address into the system call wrapper. The
+ // following bytes are the saved register values that we need to restore
+ // upon return from clone() in the new thread.
+ "lea 8(%rsp), %r9\n"
+ "jmp playground$sandbox__clone\n"
+ #elif defined(__i386__)
+ // As i386 passes function arguments on the stack, we need to skip a few
+ // more values before we can get to the saved registers.
+ "lea 28(%esp), %eax\n"
+ "mov %eax, 24(%esp)\n"
+ "jmp playground$sandbox__clone\n"
+ #else
+ #error Unsupported target platform
+ #endif
+ ".size playground$sandbox_clone, .-playground$sandbox_clone\n"
+
+
+ // This is the wrapper which is called by the untrusted code, trying to
+ // make a system call.
+ "playground$syscallWrapper:"
+ ".globl playground$syscallWrapper\n"
+ ".type playground$syscallWrapper, @function\n"
+ #if defined(__x86_64__)
+ // Save all registers
+ "push %rbp\n"
+ "mov %rsp, %rbp\n"
+ "push %rbx\n"
+ "push %rcx\n"
+ "push %rdx\n"
+ "push %rsi\n"
+ "push %rdi\n"
+ "push %r8\n"
+ "push %r9\n"
+ "push %r10\n"
+ "push %r11\n"
+ "push %r12\n"
+ "push %r13\n"
+ "push %r14\n"
+ "push %r15\n"
+
+ // Convert from syscall calling conventions to C calling conventions.
+ // System calls have a subtly different register ordering than the user-
+ // space x86-64 ABI.
+ "mov %r10, %rcx\n"
+
+ // Check range of system call
+ "cmp playground$maxSyscall(%rip), %eax\n"
+ "ja 1f\n"
+
+ // Retrieve function call from system call table (c.f. syscall_table.c).
+ // We have three different types of entries; zero for denied system calls,
+ // that should be handled by the defaultSystemCallHandler(); one
+ // (UNRESTRICTED_SYSCALL) for unrestricted system calls that need to be
+ // forwarded to the trusted thread; and function pointers to specific
+ // handler functions.
+ // Each table entry is 16 bytes here (two pointers), hence the shift by 4;
+ // the handler is the first field of the entry.
+ "mov %rax, %r10\n"
+ "shl $4, %r10\n"
+ "lea playground$syscallTable(%rip), %r11\n"
+ "add %r11, %r10\n"
+ "mov 0(%r10), %r10\n"
+
+ // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
+ // jump to fallback handler. The unsigned "below or equal" test catches
+ // both special values (0 and 1) with a single comparison.
+ "cmp $1, %r10\n"
+ "jbe 1f\n"
+ "call *%r10\n"
+ "0:"
+
+ // Restore CPU registers, except for %rax which was set by the system call.
+ "pop %r15\n"
+ "pop %r14\n"
+ "pop %r13\n"
+ "pop %r12\n"
+ "pop %r11\n"
+ "pop %r10\n"
+ "pop %r9\n"
+ "pop %r8\n"
+ "pop %rdi\n"
+ "pop %rsi\n"
+ "pop %rdx\n"
+ "pop %rcx\n"
+ "pop %rbx\n"
+ "pop %rbp\n"
+
+ // Remove fake return address. This is added in the patching code in
+ // library.cc and it makes stack traces a little cleaner.
+ "add $8, %rsp\n"
+
+ // Return to caller
+ "ret\n"
+
+ "1:"
+ // If we end up calling a specific handler, we don't need to know the
+ // system call number. However, in the generic case, we do. Shift
+ // registers so that the system call number becomes visible as the
+ // first function argument. The sixth system call argument no longer
+ // fits into a register and is passed on the stack instead.
+ "push %r9\n"
+ "mov %r8, %r9\n"
+ "mov %rcx, %r8\n"
+ "mov %rdx, %rcx\n"
+ "mov %rsi, %rdx\n"
+ "mov %rdi, %rsi\n"
+ "mov %rax, %rdi\n"
+
+ // Call default handler.
+ "call playground$defaultSystemCallHandler\n"
+ "pop %r9\n"
+ "jmp 0b\n"
+ #elif defined(__i386__)
+ // Preserve all registers
+ "push %ebx\n"
+ "push %ecx\n"
+ "push %edx\n"
+ "push %esi\n"
+ "push %edi\n"
+ "push %ebp\n"
+
+ // Convert from syscall calling conventions to C calling conventions
+ // (arguments on the stack, system call number pushed last).
+ "push %ebp\n"
+ "push %edi\n"
+ "push %esi\n"
+ "push %edx\n"
+ "push %ecx\n"
+ "push %ebx\n"
+ "push %eax\n"
+
+ // Check range of system call
+ "cmp playground$maxSyscall, %eax\n"
+ "ja 1f\n"
+
+ // Retrieve function call from system call table (c.f. syscall_table.c).
+ // We have three different types of entries; zero for denied system calls,
+ // that should be handled by the defaultSystemCallHandler(); one
+ // (UNRESTRICTED_SYSCALL) for unrestricted system calls that need to be
+ // forwarded to the trusted thread; and function pointers to specific
+ // handler functions.
+ // Each table entry is 8 bytes here (two pointers), hence the shift by 3.
+ "shl $3, %eax\n"
+ "lea playground$syscallTable, %ebx\n"
+ "add %ebx, %eax\n"
+ "mov 0(%eax), %eax\n"
+
+ // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
+ // jump to fallback handler.
+ // A specific handler does not take the system call number, so drop it
+ // from the stack (4 bytes) before the call and remove the remaining six
+ // arguments (24 bytes) afterwards.
+ "cmp $1, %eax\n"
+ "jbe 1f\n"
+ "add $4, %esp\n"
+ "call *%eax\n"
+ "add $24, %esp\n"
+ "0:"
+
+ // Restore CPU registers, except for %eax which was set by the system call.
+ "pop %ebp\n"
+ "pop %edi\n"
+ "pop %esi\n"
+ "pop %edx\n"
+ "pop %ecx\n"
+ "pop %ebx\n"
+
+ // Return to caller
+ "ret\n"
+
+ "1:"
+ // Call default handler. The call is emulated by pushing the return
+ // address (label 2) and the target, then executing "ret". Afterwards all
+ // seven pushed arguments (28 bytes) are removed.
+ "push $2f\n"
+ "push $playground$defaultSystemCallHandler\n"
+ "ret\n"
+ "2:add $28, %esp\n"
+ "jmp 0b\n"
+
+ #else
+ #error Unsupported target platform
+ #endif
+ ".size playground$syscallWrapper, .-playground$syscallWrapper\n"
+ ".popsection\n"
+);
+
+
+// Fallback handler used by the system call wrapper for every call that has no
+// dedicated handler function in syscallTable[].
+// - read(), write() and rt_sigreturn() are executed directly.
+// - Calls marked UNRESTRICTED_SYSCALL are forwarded over threadFdPub() and the
+// reply read from there is returned.
+// - Everything else yields -ENOSYS, except in debug mode, where the call is
+// logged and forwarded anyway (and closing fd 2 is silently skipped).
+// Returns the system call result, or a negated errno value, cast to void*.
+void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,
+ void* arg2, void* arg3, void* arg4,
+ void* arg5) {
+ // TODO(markus): The following comment is currently not true, we do intercept these system calls. Try to fix that.
+
+ // We try to avoid intercepting read(), write(), and sigreturn(), as
+ // these system calls are not restricted in Seccomp mode. But depending on
+ // the exact instruction sequence in libc, we might not be able to reliably
+ // filter out these system calls at the time when we instrument the code.
+ SysCalls sys;
+ // This must be a signed type: a failed call is reported as a negative
+ // value, and the check after the switch converts it into -errno. With an
+ // unsigned type that check could never trigger.
+ long rc;
+ switch (syscallNum) {
+ case __NR_read:
+ Debug::syscall(syscallNum, "Allowing unrestricted system call");
+ rc = sys.read((long)arg0, arg1, (size_t)arg2);
+ break;
+ case __NR_write:
+ Debug::syscall(syscallNum, "Allowing unrestricted system call");
+ rc = sys.write((long)arg0, arg1, (size_t)arg2);
+ break;
+ case __NR_rt_sigreturn:
+ Debug::syscall(syscallNum, "Allowing unrestricted system call");
+ rc = sys.rt_sigreturn((unsigned long)arg0);
+ break;
+ default:
+ if (Debug::isEnabled()) {
+ // In debug mode, prevent stderr from being closed
+ if (syscallNum == __NR_close && arg0 == (void *)2)
+ return 0;
+ }
+
+ if ((unsigned)syscallNum <= maxSyscall &&
+ syscallTable[syscallNum].handler == UNRESTRICTED_SYSCALL) {
+ Debug::syscall(syscallNum, "Allowing unrestricted system call");
+ perform_unrestricted:
+ // Serialize the call number and its six arguments.
+ struct {
+ int sysnum;
+ void* unrestricted_req[6];
+ } __attribute__((packed)) request = {
+ syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } };
+
+ // The same descriptor is used for sending the request and for
+ // receiving the reply.
+ int thread = threadFdPub();
+ void* rc;
+ if (write(sys, thread, &request, sizeof(request)) != sizeof(request) ||
+ read(sys, thread, &rc, sizeof(rc)) != sizeof(rc)) {
+ die("Failed to forward unrestricted system call");
+ }
+ return rc;
+ } else if (Debug::isEnabled()) {
+ Debug::syscall(syscallNum,
+ "In production mode, this call would be disallowed");
+ goto perform_unrestricted;
+ } else {
+ return (void *)-ENOSYS;
+ }
+ }
+ // Only reached for the directly executed calls above.
+ if (rc < 0) {
+ rc = -sys.my_errno;
+ }
+ return (void *)rc;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/syscall.h b/sandbox/linux/seccomp/syscall.h
new file mode 100644
index 0000000..e4390c2
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall.h
@@ -0,0 +1,14 @@
+#ifndef SYSCALL_H__
+#define SYSCALL_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Declaration of the system call wrapper that is implemented in assembly; the
+// asm label binds this name to the symbol "playground$syscallWrapper".
+// NOTE(review): presumably only its address is used (as a jump target for
+// intercepted system calls) and it is never called like a C function -- confirm.
+void syscallWrapper() asm("playground$syscallWrapper");
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // SYSCALL_H__
diff --git a/sandbox/linux/seccomp/syscall_table.c b/sandbox/linux/seccomp/syscall_table.c
new file mode 100644
index 0000000..79b281e
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall_table.c
@@ -0,0 +1,118 @@
+#include <asm/unistd.h>
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+#if defined(__x86_64__)
+#ifndef __NR_set_robust_list
+#define __NR_set_robust_list 273
+#endif
+#ifndef __NR_accept4
+#define __NR_accept4 288
+#endif
+#elif defined(__i386__)
+#ifndef __NR_set_robust_list
+#define __NR_set_robust_list 311
+#endif
+#else
+#error Unsupported target platform
+#endif
+
+// TODO(markus): This is an incredibly dirty hack to make the syscallTable
+// live in r/o memory.
+// Unfortunately, gcc doesn't give us a clean option to do
+// this. Ultimately, we should probably write some code that
+// parses /usr/include/asm/unistd*.h and generates a *.S file.
+// But we then need to figure out how to integrate this code
+// with our build system.
+
+// System call dispatch table, indexed by system call number through C
+// designated initializers. Each entry holds the untrusted-side handler (or
+// UNRESTRICTED_SYSCALL) and the matching trusted-process callback. Numbers
+// that are not listed are zero-initialized.
+// NOTE(review): the section name appears to smuggle its own flags into the
+// generated ".section" directive, with the trailing "\n#" commenting out
+// whatever the compiler appends -- confirm against the generated assembly.
+const struct SyscallTable syscallTable[] __attribute__((
+ section(".rodata, \"a\", @progbits\n#"))) ={
+
+ #if defined(__NR_accept)
+ [ __NR_accept ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_accept4 ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_access ] = { (void*)&sandbox_access, process_access },
+ [ __NR_brk ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_clock_gettime ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_clone ] = { (void*)&sandbox_clone, process_clone },
+ [ __NR_close ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_epoll_create ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_epoll_ctl ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_epoll_wait ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_exit ] = { (void*)&sandbox_exit, process_exit },
+ [ __NR_exit_group ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_fcntl ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_fcntl64)
+ [ __NR_fcntl64 ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_fstat ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_fstat64)
+ [ __NR_fstat64 ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_futex ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_getdents ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_getdents64 ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_getpeername)
+ [ __NR_getpeername ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_getpid ] = { (void*)&sandbox_getpid, 0 },
+ #if defined(__NR_getsockname)
+ [ __NR_getsockname ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_getsockopt ] = { (void*)&sandbox_getsockopt,process_getsockopt },
+ #endif
+ [ __NR_gettid ] = { (void*)&sandbox_gettid, 0 },
+ [ __NR_gettimeofday ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_ioctl ] = { (void*)&sandbox_ioctl, process_ioctl },
+ #if defined(__NR_ipc)
+ [ __NR_ipc ] = { (void*)&sandbox_ipc, process_ipc },
+ #endif
+ #if defined(__NR__llseek)
+ [ __NR__llseek ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ [ __NR_lseek ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_madvise ] = { (void*)&sandbox_madvise, process_madvise },
+ // Exactly one of mmap2/mmap is intercepted, depending on which one the
+ // platform defines; both map to the same handler pair.
+ #if defined(__NR_mmap2)
+ [ __NR_mmap2 ] =
+ #else
+ [ __NR_mmap ] =
+ #endif
+ { (void*)&sandbox_mmap, process_mmap },
+ [ __NR_mprotect ] = { (void*)&sandbox_mprotect, process_mprotect },
+ [ __NR_munmap ] = { (void*)&sandbox_munmap, process_munmap },
+ [ __NR_open ] = { (void*)&sandbox_open, process_open },
+ [ __NR_pipe ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_poll ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_recvfrom)
+ [ __NR_recvfrom ] = { (void*)&sandbox_recvfrom, process_recvfrom },
+ [ __NR_recvmsg ] = { (void*)&sandbox_recvmsg, process_recvmsg },
+ [ __NR_sendmsg ] = { (void*)&sandbox_sendmsg, process_sendmsg },
+ [ __NR_sendto ] = { (void*)&sandbox_sendto, process_sendto },
+ #endif
+ [ __NR_set_robust_list ] = { UNRESTRICTED_SYSCALL, 0 },
+ #if defined(__NR_setsockopt)
+ [ __NR_setsockopt ] = { (void*)&sandbox_setsockopt,process_setsockopt },
+ #if defined(__NR_shmat)
+ [ __NR_shmat ] = { (void*)&sandbox_shmat, process_shmat },
+ [ __NR_shmctl ] = { (void*)&sandbox_shmctl, process_shmctl },
+ [ __NR_shmdt ] = { (void*)&sandbox_shmdt, process_shmdt },
+ [ __NR_shmget ] = { (void*)&sandbox_shmget, process_shmget },
+ #endif
+ [ __NR_shutdown ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_socketpair ] = { UNRESTRICTED_SYSCALL, 0 },
+ #endif
+ // Platforms with a multiplexed socketcall() use this single entry instead
+ // of the individual socket system calls above.
+ #if defined(__NR_socketcall)
+ [ __NR_socketcall ] = { (void*)&sandbox_socketcall,process_socketcall },
+ #endif
+ [ __NR_stat ] = { (void*)&sandbox_stat, process_stat },
+ #if defined(__NR_stat64)
+ [ __NR_stat64 ] = { (void*)&sandbox_stat64, process_stat },
+ #endif
+ [ __NR_time ] = { UNRESTRICTED_SYSCALL, 0 },
+ [ __NR_uname ] = { UNRESTRICTED_SYSCALL, 0 },
+};
+// Highest system call number that may be used to index syscallTable[]. The
+// range checks in the assembly system call wrapper ("ja") and in
+// defaultSystemCallHandler() ("<=") accept values up to and including
+// maxSyscall, so this has to be the last valid index rather than the number
+// of entries; otherwise a lookup could read one element past the table.
+const unsigned maxSyscall __attribute__((section(".rodata"))) =
+ sizeof(syscallTable)/sizeof(struct SyscallTable) - 1;
+
+// A 4096-byte, 4096-byte-aligned block placed in .rodata and exported under
+// the assembler name "playground$syscall_mutex". Only the first word is
+// non-zero (0x80000000).
+// NOTE(review): the meaning of that initial value is defined by the mutex
+// code elsewhere in the project -- confirm before changing it.
+const int syscall_mutex_[4096/sizeof(int)] asm("playground$syscall_mutex")
+ __attribute__((section(".rodata"),aligned(4096))) = { 0x80000000 };
diff --git a/sandbox/linux/seccomp/syscall_table.h b/sandbox/linux/seccomp/syscall_table.h
new file mode 100644
index 0000000..d678c0b
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall_table.h
@@ -0,0 +1,30 @@
+#ifndef SYSCALL_TABLE_H__
+#define SYSCALL_TABLE_H__
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+#include "securemem.h"
+extern "C" {
+namespace playground {
+#define SecureMemArgs SecureMem::Args
+#else
+#define SecureMemArgs void
+#define bool int
+#endif
+ // Marker stored in SyscallTable::handler for system calls that need no
+ // dedicated handler; the value 1 can never be a valid function address.
+ #define UNRESTRICTED_SYSCALL ((void *)1)
+
+ // One entry of the system call dispatch table.
+ struct SyscallTable {
+ // NULL, UNRESTRICTED_SYSCALL, or the address of the handler function that
+ // the system call wrapper calls for this system call.
+ void *handler;
+ // Optional callback with the signature of the process_*() functions.
+ // NOTE(review): presumably invoked by the trusted process to vet a
+ // forwarded request -- confirm against the caller.
+ bool (*trustedProcess)(int parentProc, int sandboxFd, int threadFdPub,
+ int threadFd, SecureMemArgs* mem);
+ };
+ // The explicit asm names let the hand-written assembly refer to these
+ // symbols independently of C++ name mangling.
+ extern const struct SyscallTable syscallTable[]
+ asm("playground$syscallTable");
+ extern const unsigned maxSyscall asm("playground$maxSyscall");
+#ifdef __cplusplus
+} // namespace
+}
+#endif
+
+#endif // SYSCALL_TABLE_H__
diff --git a/sandbox/linux/seccomp/tls.h b/sandbox/linux/seccomp/tls.h
new file mode 100644
index 0000000..8eae697
--- /dev/null
+++ b/sandbox/linux/seccomp/tls.h
@@ -0,0 +1,151 @@
+#ifndef TLS_H__
+#define TLS_H__
+
+#include <asm/ldt.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+namespace playground {
+
+// Minimal thread local storage for sandboxed threads.
+//
+// Each thread owns one 4096 byte page that is addressed through a segment
+// register (%gs on x86-64, %fs on i386). The page is treated as an array of
+// 8 byte slots; setTLSValue() and getTLSValue() access slot "idx" at byte
+// offset 8*idx on both architectures.
+class TLS {
+ private:
+  // Private instantiation of the system call wrappers. Errors are reported
+  // through the "my_errno" member rather than the global errno.
+  class SysCalls {
+   public:
+    #define SYS_CPLUSPLUS
+    #define SYS_ERRNO my_errno
+    #define SYS_INLINE inline
+    #define SYS_PREFIX -1
+    #undef SYS_LINUX_SYSCALL_SUPPORT_H
+    #include "linux_syscall_support.h"
+    SysCalls() : my_errno(0) { }
+    int my_errno;
+  };
+
+ public:
+  // Maps a fresh anonymous page and points the TLS segment register at it.
+  // Returns the address of the page, or NULL if the page could not be mapped
+  // or the segment could not be set up. On failure no mapping is left behind.
+  static void *allocateTLS() {
+    SysCalls sys;
+    #if defined(__x86_64__)
+    void *addr = sys.mmap(0, 4096, PROT_READ|PROT_WRITE,
+                          MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+    if (addr == MAP_FAILED) {
+      return NULL;
+    }
+    if (sys.arch_prctl(ARCH_SET_GS, addr) < 0) {
+      // Don't leak the page if we cannot use it
+      sys.munmap(addr, 4096);
+      return NULL;
+    }
+    #elif defined(__i386__)
+    void *addr = sys.mmap2(0, 4096, PROT_READ|PROT_WRITE,
+                           MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+    if (addr == MAP_FAILED) {
+      return NULL;
+    }
+    struct user_desc u;
+    u.entry_number = (typeof u.entry_number)-1;  // let kernel pick an entry
+    u.base_addr = (int)addr;
+    u.limit = 0xfffff;
+    u.seg_32bit = 1;
+    u.contents = 0;
+    u.read_exec_only = 0;
+    u.limit_in_pages = 1;
+    u.seg_not_present = 0;
+    u.useable = 1;
+    if (sys.set_thread_area(&u) < 0) {
+      // Don't leak the page if we cannot use it
+      sys.munmap(addr, 4096);
+      return NULL;
+    }
+    // Load the selector for the newly allocated entry into %fs
+    asm volatile(
+        "movw %w0, %%fs"
+        :
+        : "q"(8*u.entry_number+3));
+    #else
+    #error Unsupported target platform
+    #endif
+    return addr;
+  }
+
+  // Unmaps the page that the TLS segment register currently refers to. Does
+  // nothing if the base address of the segment cannot be determined.
+  static void freeTLS() {
+    SysCalls sys;
+    void *addr = NULL;
+    #if defined(__x86_64__)
+    if (sys.arch_prctl(ARCH_GET_GS, &addr) < 0) {
+      return;
+    }
+    #elif defined(__i386__)
+    // get_thread_area() looks up the entry named by "entry_number", so this
+    // field must be initialized. allocateTLS() loaded %fs with the selector
+    // 8*entry_number+3; recover the entry number from there.
+    unsigned selector;
+    asm volatile(
+        "mov %%fs, %0"
+        : "=r"(selector));
+    struct user_desc u;
+    u.entry_number = (selector & 0xFFFF) >> 3;
+    if (sys.get_thread_area(&u) < 0) {
+      return;
+    }
+    addr = (void *)u.base_addr;
+    #else
+    #error Unsupported target platform
+    #endif
+    if (addr) {
+      sys.munmap(addr, 4096);
+    }
+  }
+
+  // Stores "val" in TLS slot "idx". Returns false, without storing anything,
+  // if "idx" is outside of the range 0 .. 4096/8-1; returns true otherwise.
+  template<class T> static inline bool setTLSValue(int idx, T val) {
+    #if defined(__x86_64__)
+    if (idx < 0 || idx >= 4096/8) {
+      return false;
+    }
+    asm volatile(
+        "movq %0, %%gs:(%1)\n"
+        :
+        : "q"((void *)val), "q"(8ll * idx));
+    #elif defined(__i386__)
+    if (idx < 0 || idx >= 4096/8) {
+      return false;
+    }
+    if (sizeof(T) == 8) {
+      // 64 bit values are written as two 32 bit halves, low word first
+      asm volatile(
+          "movl %0, %%fs:(%1)\n"
+          :
+          : "r"((unsigned)val), "r"(8 * idx));
+      asm volatile(
+          "movl %0, %%fs:(%1)\n"
+          :
+          : "r"((unsigned)((unsigned long long)val >> 32)), "r"(8 * idx + 4));
+    } else {
+      asm volatile(
+          "movl %0, %%fs:(%1)\n"
+          :
+          : "r"(val), "r"(8 * idx));
+    }
+    #else
+    #error Unsupported target platform
+    #endif
+    return true;
+  }
+
+  // Returns the value stored in TLS slot "idx", converted to T. Returns 0 if
+  // "idx" is outside of the range 0 .. 4096/8-1.
+  template<class T> static inline T getTLSValue(int idx) {
+    #if defined(__x86_64__)
+    long long rc;
+    if (idx < 0 || idx >= 4096/8) {
+      return 0;
+    }
+    asm volatile(
+        "movq %%gs:(%1), %0\n"
+        : "=q"(rc)
+        : "q"(8ll * idx));
+    return (T)rc;
+    #elif defined(__i386__)
+    if (idx < 0 || idx >= 4096/8) {
+      return 0;
+    }
+    if (sizeof(T) == 8) {
+      // 64 bit values are read as two 32 bit halves and then recombined
+      unsigned lo, hi;
+      asm volatile(
+          "movl %%fs:(%1), %0\n"
+          : "=r"(lo)
+          : "r"(8 * idx));
+      asm volatile(
+          "movl %%fs:(%1), %0\n"
+          : "=r"(hi)
+          : "r"(8 * idx + 4));
+      return (T)((unsigned long long)lo + ((unsigned long long)hi << 32));
+    } else {
+      long rc;
+      asm volatile(
+          "movl %%fs:(%1), %0\n"
+          : "=r"(rc)
+          : "r"(8 * idx));
+      return (T)rc;
+    }
+    #else
+    #error Unsupported target platform
+    #endif
+  }
+
+};
+
+} // namespace
+#endif
diff --git a/sandbox/linux/seccomp/trusted_process.cc b/sandbox/linux/seccomp/trusted_process.cc
new file mode 100644
index 0000000..b4bee94
--- /dev/null
+++ b/sandbox/linux/seccomp/trusted_process.cc
@@ -0,0 +1,258 @@
+#include <dirent.h>
+#include <map>
+
+#include "debug.h"
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+namespace playground {
+
+// Per-thread record kept by the trusted process. Records are stored in a map
+// that is keyed by the cookie assigned to the thread.
+struct Thread {
+  // File descriptors received from the newly created thread. Both are handed
+  // to the system call handlers and are closed when the thread exits.
+  int fdPub;
+  int fd;
+
+  // Secure memory entry that has been assigned to this thread
+  SecureMem::Args* mem;
+};
+
+// Removes the last entry from the pool of unused secure memory entries and
+// returns it. Returns NULL if the pool is empty.
+SecureMem::Args* Sandbox::getSecureMem() {
+  if (secureMemPool_.empty()) {
+    // Nothing left to hand out
+    return NULL;
+  }
+  SecureMem::Args* const entry = secureMemPool_.back();
+  secureMemPool_.pop_back();
+  return entry;
+}
+
+// Main loop of the trusted process.
+//
+// Alternates between two states: registering a newly created thread (by
+// receiving its file descriptors and a small data record over "cloneFd"),
+// and dispatching system call requests that are read from "sandboxFd" to the
+// handler functions listed in syscallTable[].
+//
+//   parentProc    passed through unchanged to the system call handlers
+//   processFdPub  not used by this function
+//   sandboxFd     descriptor from which request headers are read
+//   cloneFd       descriptor on which information about new threads arrives
+//   secureArena   array of kMaxThreads secure memory entries; the first one
+//                 belongs to the initial thread
+//
+// Every error path calls die(); this function has no regular return path.
+void Sandbox::trustedProcess(int parentProc, int processFdPub, int sandboxFd,
+                             int cloneFd, SecureMem::Args* secureArena) {
+  std::map<long long, struct Thread> threads;
+  SysCalls sys;
+  long long cookie = 0;
+
+  // The very first entry in the secure memory arena has been assigned to the
+  // initial thread. The remaining entries are available for allocation.
+  SecureMem::Args* startAddress = secureArena;
+  SecureMem::Args* nextThread = startAddress;
+  for (int i = 0; i < kMaxThreads-1; i++) {
+    secureMemPool_.push_back(++startAddress);
+  }
+
+newThreadCreated:
+  // Receive information from newly created thread
+  Thread *newThread = &threads[++cookie];
+  memset(newThread, 0, sizeof(Thread));
+  struct {
+    SecureMem::Args* self;
+    int tid;
+    int fdPub;
+  } __attribute__((packed)) data;
+
+  size_t dataLen = sizeof(data);
+  if (!getFd(cloneFd, &newThread->fdPub, &newThread->fd, &data, &dataLen) ||
+      dataLen != sizeof(data)) {
+    // We get here either because the sandbox got corrupted, or because our
+    // parent process has terminated.
+    if (newThread->fdPub || dataLen) {
+      die("Failed to receive new thread information");
+    }
+    die();
+  }
+  if (data.self != nextThread) {
+    // The only potentially security critical information received from the
+    // newly created thread is "self". The "tid" is for informational purposes
+    // (and for use in the new thread's TLS), and "fdPub" is uncritical as all
+    // file descriptors are considered untrusted.
+    // Thus, we only use "self" for a sanity check, but don't actually trust
+    // it beyond that.
+    die("Received corrupted thread information");
+  }
+  newThread->mem = nextThread;
+
+  // Set up TLS area and let thread know that the data is now ready
+  nextThread->cookie = cookie;
+  nextThread->threadId = data.tid;
+  nextThread->threadFdPub = data.fdPub;
+  write(sys, newThread->fd, "", 1);
+
+  // Dispatch system calls that have been forwarded from the trusted thread(s).
+  for (;;) {
+    struct {
+      unsigned int sysnum;
+      long long cookie;
+    } __attribute__((packed)) header;
+
+    int rc;
+    if ((rc = read(sys, sandboxFd, &header, sizeof(header))) !=sizeof(header)){
+      // A return value of zero means that the other end has been closed;
+      // exit without printing a message in that case.
+      if (rc) {
+        die("Failed to read system call number and thread id");
+      }
+      die();
+    }
+    std::map<long long, struct Thread>::iterator iter =
+        threads.find(header.cookie);
+    if (iter == threads.end()) {
+      die("Received request from unknown thread");
+    }
+    struct Thread* currentThread = &iter->second;
+
+    // "maxSyscall" is the number of entries in syscallTable[], so the largest
+    // valid index is maxSyscall-1. The system call number comes from the
+    // sandbox and must be range checked before it is used as an index.
+    if (header.sysnum >= maxSyscall ||
+        !syscallTable[header.sysnum].trustedProcess) {
+      die("Trusted process encountered unexpected system call");
+    }
+
+    // Dispatch system call to handler function. Treat both exit() and clone()
+    // specially.
+    if (syscallTable[header.sysnum].trustedProcess(parentProc,
+                                                   sandboxFd,
+                                                   currentThread->fdPub,
+                                                   currentThread->fd,
+                                                   currentThread->mem) &&
+        header.sysnum == __NR_clone) {
+      // The handler reported success for clone(). Go back and wait for the
+      // new thread to register itself.
+      nextThread = currentThread->mem->newSecureMem;
+      goto newThreadCreated;
+    } else if (header.sysnum == __NR_exit) {
+      // Thread has exited. Release its file descriptors and return its
+      // secure memory to the pool.
+      NOINTR_SYS(sys.close(iter->second.fdPub));
+      NOINTR_SYS(sys.close(iter->second.fd));
+      SecureMem::Args* secureMem = currentThread->mem;
+      threads.erase(iter);
+      secureMemPool_.push_back(secureMem);
+    }
+  }
+}
+
+// Records the memory mappings that exist before the sandbox takes effect.
+//
+// Receives a file descriptor for the memory map over "fd", parses the
+// "start-stop" address range at the beginning of each line, and stores
+// start -> length in protectedMap_. When done, writes an acknowledgment to
+// "fd". Calls die() if the descriptor cannot be obtained, a line cannot be
+// parsed, or the acknowledgment cannot be written.
+void Sandbox::initializeProtectedMap(int fd) {
+  int mapsFd;
+  if (!getFd(fd, &mapsFd, NULL, NULL, NULL)) {
+    die("Cannot access /proc/self/maps");
+  }
+
+  // Read the memory mappings as they were before the sandbox takes effect.
+  // These mappings cannot be changed by the sandboxed process.
+  char line[80];
+  FILE *fp = fdopen(mapsFd, "r");
+  if (fp == NULL) {
+    die("Cannot access /proc/self/maps");
+  }
+  for (bool truncated = false;;) {
+    if (fgets(line, sizeof(line), fp) == NULL) {
+      if (feof(fp) || errno != EINTR) {
+        break;
+      }
+      continue;
+    }
+    // Lines can be longer than our buffer. Only the first chunk of each line
+    // carries the address range; continuation chunks are skipped.
+    if (!truncated) {
+      unsigned long start, stop;
+      char *ptr = line;
+      errno = 0;
+      start = strtoul(ptr, &ptr, 16);
+      if (errno || *ptr++ != '-') {
+        die("Failed to parse /proc/self/maps");
+      }
+      stop = strtoul(ptr, &ptr, 16);
+      if (errno || *ptr++ != ' ') {
+        die("Failed to parse /proc/self/maps");
+      }
+      protectedMap_[reinterpret_cast<void *>(start)] = stop - start;
+    }
+    truncated = strchr(line, '\n') == NULL;
+  }
+
+  // Closing the stream releases its buffer and also closes "mapsFd".
+  fclose(fp);
+
+  // Prevent low address memory allocations. Some buggy kernels allow those
+  if (protectedMap_[0] < (64 << 10)) {
+    protectedMap_[0] = 64 << 10;
+  }
+
+  // Let the sandbox know that we are done parsing the memory map.
+  SysCalls sys;
+  if (write(sys, fd, &mapsFd, sizeof(mapsFd)) != sizeof(mapsFd)) {
+    die("Cannot access /proc/self/maps");
+  }
+}
+
+// Forks off the trusted process and sets up the memory that it shares with
+// the sandbox.
+//
+// The child closes all file descriptors that it does not need, initializes
+// the secure memory arena, and then runs initializeProtectedMap() followed by
+// trustedProcess(); it never returns from this function.
+//
+// The parent revokes its own access to the secure memory arena and to the
+// system call mutex, closes "sandboxFd", and returns the address of the
+// arena. "cloneFdPub" is not used within this function.
+//
+// Calls die() if any of the required resources cannot be set up.
+SecureMem::Args* Sandbox::createTrustedProcess(int processFdPub, int sandboxFd,
+                                               int cloneFdPub, int cloneFd) {
+  // Allocate memory that will be used by an arena for storing the secure
+  // memory. While we allow this memory area to be empty at times (e.g. when
+  // not all threads are in use), we make sure that it never gets overwritten
+  // by user-allocated memory. This happens in initializeProtectedMap() and
+  // snapshotMemoryMappings().
+  SecureMem::Args* secureArena = reinterpret_cast<SecureMem::Args*>(
+      mmap(NULL, 8192*kMaxThreads, PROT_READ|PROT_WRITE,
+           MAP_SHARED|MAP_ANONYMOUS, -1, 0));
+  if (secureArena == MAP_FAILED) {
+    die("Failed to allocate secure memory arena");
+  }
+
+  // Set up the mutex to be accessible from the trusted process and from
+  // children of the trusted thread(s)
+  if (mmap(&syscall_mutex_, 4096, PROT_READ|PROT_WRITE,
+           MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, -1, 0) != &syscall_mutex_) {
+    die("Failed to initialize secure mutex");
+  }
+  syscall_mutex_ = 0x80000000;
+
+
+  // Hold on to a file handle in the parent's process directory. We can use
+  // this later to reliably tell if the parent died.
+  int parentProc = open("/proc/self/", O_RDONLY|O_DIRECTORY);
+  if (parentProc < 0) {
+    die("Failed to access /proc/self");
+  }
+
+  // Create a trusted process that can evaluate system call parameters and
+  // decide whether a system call should execute. This process runs outside of
+  // the seccomp sandbox. It communicates with the sandbox'd process through
+  // a socketpair() and through securely shared memory.
+  pid_t pid = fork();
+  if (pid < 0) {
+    die("Failed to create trusted process");
+  }
+  if (!pid) {
+    // Close all file handles except for sandboxFd, cloneFd, and stdio
+    DIR *dir = opendir("/proc/self/fd");
+    if (dir == 0) {
+      // If we don't know the list of our open file handles, just try closing
+      // all valid ones.
+      for (int fd = sysconf(_SC_OPEN_MAX); --fd > 2; ) {
+        if (fd != parentProc && fd != sandboxFd && fd != cloneFd) {
+          close(fd);
+        }
+      }
+    } else {
+      // If available, it is much more efficient to just close the file
+      // handles that show up in /proc/self/fd/
+      // We use readdir() rather than the deprecated readdir_r(); the latter
+      // copies each entry into a caller-provided buffer of fixed size. Nobody
+      // else is using this directory stream.
+      struct dirent *res;
+      while ((res = readdir(dir)) != NULL) {
+        // Skip "." and ".."
+        if (res->d_name[0] < '0')
+          continue;
+        int fd = atoi(res->d_name);
+        if (fd > 2 &&
+            fd != parentProc && fd != sandboxFd && fd != cloneFd &&
+            fd != dirfd(dir)) {
+          close(fd);
+        }
+      }
+      closedir(dir);
+    }
+
+    // Initialize secure memory used for threads
+    for (int i = 0; i < kMaxThreads; i++) {
+      SecureMem::Args* args = secureArena + i;
+      args->self = args;
+      #ifndef NDEBUG
+      args->allowAllSystemCalls= Debug::isEnabled();
+      #endif
+    }
+
+    initializeProtectedMap(sandboxFd);
+    trustedProcess(parentProc, processFdPub, sandboxFd, cloneFd, secureArena);
+    die();
+  }
+
+  // We are still in the untrusted code. Deny access to restricted resources.
+  // The sandbox relies on these pages being inaccessible from here on, so we
+  // must not continue if the protection could not be changed.
+  if (mprotect(secureArena, 8192*kMaxThreads, PROT_NONE) ||
+      mprotect(&syscall_mutex_, 4096, PROT_NONE)) {
+    die("Failed to protect secure memory");
+  }
+  close(parentProc);
+  close(sandboxFd);
+
+  return secureArena;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc
new file mode 100644
index 0000000..985d053
--- /dev/null
+++ b/sandbox/linux/seccomp/trusted_thread.cc
@@ -0,0 +1,1207 @@
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+namespace playground {
+
+void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
+ SecureMem::Args* secureMem) {
+ SecureMem::Args args = { { { { { 0 } } } } };
+ args.self = &args;
+ args.newSecureMem = secureMem;
+ args.processFdPub = processFdPub;
+ args.cloneFdPub = cloneFdPub;
+#if defined(__x86_64__)
+ asm volatile(
+ "push %%rbx\n"
+ "push %%rbp\n"
+ "mov %0, %%rbp\n" // %rbp = args
+ "xor %%rbx, %%rbx\n" // initial sequence number
+ "lea 999f(%%rip), %%r15\n" // continue in same thread
+ "jmp 19f\n" // create trusted thread
+
+ // TODO(markus): Coalesce the read() operations by reading into a bigger
+ // buffer.
+
+ // Parameters:
+ // *%fs: secure memory region
+ // the page following this one contains the scratch space
+ // %r13: thread's side of threadFd
+ // %r15: processFdPub
+
+ // Local variables:
+ // %rbx: sequence number for trusted calls
+
+ // Temporary variables:
+ // %r9: system call number
+ // %rbp: secure memory of previous thread
+
+ // Layout of secure shared memory region (c.f. securemem.h):
+ // 0x00: pointer to the secure shared memory region (i.e. self)
+ // 0x08: sequence number; must match %rbx
+ // 0x10: system call number; passed to syscall in %rax
+ // 0x18: first argument; passed to syscall in %rdi
+ // 0x20: second argument; passed to syscall in %rsi
+ // 0x28: third argument; passed to syscall in %rdx
+ // 0x30: fourth argument; passed to syscall in %r10
+ // 0x38: fifth argument; passed to syscall in %r8
+ // 0x40: sixth argument; passed to syscall in %r9
+ // 0x48: stored return address for clone() system call
+ // 0x50: stored %rbp value for clone() system call
+ // 0x58: stored %rbx value for clone() system call
+ // 0x60: stored %rcx value for clone() system call
+ // 0x68: stored %rdx value for clone() system call
+ // 0x70: stored %rsi value for clone() system call
+ // 0x78: stored %rdi value for clone() system call
+ // 0x80: stored %r8 value for clone() system call
+ // 0x88: stored %r9 value for clone() system call
+ // 0x90: stored %r10 value for clone() system call
+ // 0x98: stored %r11 value for clone() system call
+ // 0xA0: stored %r12 value for clone() system call
+ // 0xA8: stored %r13 value for clone() system call
+ // 0xB0: stored %r14 value for clone() system call
+ // 0xB8: stored %r15 value for clone() system call
+ // 0xC0: new shared memory for clone()
+ // 0xC8: processFdPub for talking to trusted process
+ // 0xCC: cloneFdPub for talking to trusted process
+ // 0xD0: set to non-zero, if in debugging mode
+ // 0xD4: most recent SHM id returned by shmget(IPC_PRIVATE)
+ // 0xD8: cookie assigned to us by the trusted process (TLS_COOKIE)
+ // 0xE0: thread id (TLS_TID)
+ // 0xE8: threadFdPub (TLS_THREAD_FD)
+ // 0x200-0x1000: securely passed verified file name(s)
+
+ // Layout of (untrusted) scratch space:
+ // 0x00: syscall number; passed in %rax
+ // 0x04: first argument; passed in %rdi
+ // 0x0C: second argument; passed in %rsi
+ // 0x14: third argument; passed in %rdx
+ // 0x1C: fourth argument; passed in %r10
+ // 0x24: fifth argument; passed in %r8
+ // 0x2C: sixth argument; passed in %r9
+ // 0x34: return value
+ // 0x3C: RDTSCP result (%eax)
+ // 0x40: RDTSCP result (%edx)
+ // 0x44: RDTSCP result (%ecx)
+
+ // We use the %fs register for accessing the secure read-only page, and
+ // the untrusted scratch space immediately following it. The segment
+ // register and the local descriptor table is set up by passing
+ // appropriate arguments to clone().
+
+ "0:xor %%rsp, %%rsp\n"
+ "mov $2, %%ebx\n" // %rbx = initial sequence number
+
+ // Read request from untrusted thread, or from trusted process. In either
+ // case, the data that we read has to be considered untrusted.
+ // read(threadFd, &scratch, 4)
+ "1:xor %%rax, %%rax\n" // NR_read
+ "mov %%r13, %%rdi\n" // fd = threadFd
+ "mov %%fs:0x0, %%rsi\n"
+ "add $0x1000, %%rsi\n" // buf = &scratch
+ "mov $4, %%edx\n" // len = 4
+ "2:syscall\n"
+ "cmp $-4, %%rax\n" // EINTR
+ "jz 2b\n"
+ "cmp %%rdx, %%rax\n"
+ "jnz 25f\n" // exit process
+
+ // Retrieve system call number. It is crucial that we only dereference
+ // %fs:0x1000 exactly once. Afterwards, memory becomes untrusted and
+ // we must use the value that we have read the first time.
+ "mov 0(%%rsi), %%eax\n"
+
+ // If syscall number is -1, execute an unlocked system call from the
+ // secure memory area
+ "cmp $-1, %%eax\n"
+ "jnz 5f\n"
+ "3:cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+ "mov %%fs:0x10, %%rax\n"
+ "mov %%fs:0x18, %%rdi\n"
+ "mov %%fs:0x20, %%rsi\n"
+ "mov %%fs:0x28, %%rdx\n"
+ "mov %%fs:0x30, %%r10\n"
+ "mov %%fs:0x38, %%r8\n"
+ "mov %%fs:0x40, %%r9\n"
+ "cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+ "add $2, %%rbx\n"
+
+ // shmget() gets some special treatment. Whenever we return from this
+ // system call, we remember the most recently returned SysV shm id.
+ "cmp $29, %%eax\n" // NR_shmget
+ "jnz 4f\n"
+ "syscall\n"
+ "mov %%rax, %%r8\n"
+ "mov $56, %%eax\n" // NR_clone
+ "mov $17, %%edi\n" // flags = SIGCHLD
+ "mov $1, %%esi\n" // stack = 1
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "js 25f\n" // exit process
+ "mov %%rax, %%rdi\n"
+ "jnz 7f\n" // wait for child, then return result
+ "mov %%fs:0x0, %%rdi\n" // start = secure_mem
+ "mov $4096, %%esi\n" // len = 4096
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "mov $10, %%eax\n" // NR_mprotect
+ "syscall\n"
+ "mov %%r8d, 0xD4(%%rdi)\n" // set most recently returned SysV shm id
+ "xor %%rdi, %%rdi\n"
+ "jmp 26f\n" // exit program, no message
+ "4:syscall\n"
+ "jmp 14f\n" // return result
+
+ // If syscall number is -2, execute locked system call from the
+ // secure memory area
+ "5:jg 11f\n"
+ "cmp $-2, %%eax\n"
+ "jnz 8f\n"
+ "cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+ "mov %%fs:0x10, %%rax\n"
+ "mov %%fs:0x18, %%rdi\n"
+ "mov %%fs:0x20, %%rsi\n"
+ "mov %%fs:0x28, %%rdx\n"
+ "mov %%fs:0x30, %%r10\n"
+ "mov %%fs:0x38, %%r8\n"
+ "mov %%fs:0x40, %%r9\n"
+ "cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+
+ // clone() has unusual calling conventions and must be handled specially
+ "cmp $56, %%rax\n" // NR_clone
+ "jz 18f\n"
+
+ // exit() terminates trusted thread
+ "cmp $60, %%eax\n" // NR_exit
+ "jz 17f\n"
+
+ // Perform requested system call
+ "syscall\n"
+
+ // Unlock mutex
+ "6:cmp %%rbx, %%fs:0x8\n"
+ "jne 25f\n" // exit process
+ "add $2, %%rbx\n"
+ "mov %%rax, %%r8\n"
+ "mov $56, %%eax\n" // NR_clone
+ "mov $17, %%rdi\n" // flags = SIGCHLD
+ "mov $1, %%rsi\n" // stack = 1
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "js 25f\n" // exit process
+ "jz 22f\n" // unlock and exit
+ "mov %%rax, %%rdi\n"
+ "7:xor %%rsi, %%rsi\n"
+ "xor %%rdx, %%rdx\n"
+ "xor %%r10, %%r10\n"
+ "mov $61, %%eax\n" // NR_wait4
+ "syscall\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 7b\n"
+ "mov %%r8, %%rax\n"
+ "jmp 14f\n" // return result
+
+ // If syscall number is -3, read the time stamp counter
+ "8:cmp $-3, %%eax\n"
+ "jnz 9f\n"
+ "rdtsc\n" // sets %edx:%eax
+ "xor %%rcx, %%rcx\n"
+ "jmp 10f\n"
+ "9:cmp $-4, %%eax\n"
+ "jnz 11f\n"
+ "rdtscp\n" // sets %edx:%eax and %ecx
+ "10:add $0x3C, %%rsi\n"
+ "mov %%eax, 0(%%rsi)\n"
+ "mov %%edx, 4(%%rsi)\n"
+ "mov %%ecx, 8(%%rsi)\n"
+ "mov $12, %%edx\n"
+ "jmp 15f\n" // return result
+
+ // Check in syscallTable whether this system call is unrestricted
+ "11:mov %%rax, %%r9\n"
+ #ifndef NDEBUG
+ "cmpw $0, %%fs:0xD0\n" // debug mode
+ "jnz 12f\n"
+ #endif
+ "cmp playground$maxSyscall(%%rip), %%eax\n"
+ "ja 25f\n" // exit process
+ "shl $4, %%rax\n"
+ "lea playground$syscallTable(%%rip), %%rdi\n"
+ "add %%rdi, %%rax\n"
+ "mov 0(%%rax), %%rax\n"
+ "cmp $1, %%rax\n"
+ "jne 25f\n" // exit process
+
+ // Default behavior for unrestricted system calls is to just execute
+ // them. Read the remaining arguments first.
+ "12:mov %%rsi, %%r8\n"
+ "xor %%rax, %%rax\n" // NR_read
+ "mov %%r13, %%rdi\n" // fd = threadFd
+ "add $4, %%rsi\n" // buf = &scratch + 4
+ "mov $48, %%edx\n" // len = 6*sizeof(void *)
+ "13:syscall\n"
+ "cmp $-4, %%rax\n" // EINTR
+ "jz 13b\n"
+ "cmp %%rdx, %%rax\n"
+ "jnz 25f\n" // exit process
+ "mov %%r9, %%rax\n"
+ "mov 0x04(%%r8), %%rdi\n"
+ "mov 0x0C(%%r8), %%rsi\n"
+ "mov 0x14(%%r8), %%rdx\n"
+ "mov 0x1C(%%r8), %%r10\n"
+ "mov 0x2C(%%r8), %%r9\n"
+ "mov 0x24(%%r8), %%r8\n"
+ "cmp $231, %%rax\n" // NR_exit_group
+ "jz 26f\n" // exit program, no message
+ "syscall\n"
+
+ // Return result of system call to sandboxed thread
+ "14:mov %%fs:0x0, %%rsi\n"
+ "add $0x1034, %%rsi\n" // buf = &scratch + 52
+ "mov %%rax, (%%rsi)\n"
+ "mov $8, %%edx\n" // len = 8
+ "15:mov %%r13, %%rdi\n" // fd = threadFd
+ "mov $1, %%eax\n" // NR_write
+ "16:syscall\n"
+ "cmp %%rdx, %%rax\n"
+ "jz 1b\n"
+ "cmp $-4, %%rax\n" // EINTR
+ "jz 16b\n"
+ "jmp 25f\n" // exit process
+
+ // NR_exit:
+ // Exit trusted thread after cleaning up resources
+ "17:mov %%fs:0x0, %%rsi\n"
+ "mov 0xE8(%%rsi), %%rdi\n" // fd = threadFdPub
+ "mov $3, %%eax\n" // NR_close
+ "syscall\n"
+ "mov %%rsi, %%rdi\n" // start = secure_mem
+ "mov $8192, %%esi\n" // length = 4096
+ "xor %%rdx, %%rdx\n" // prot = PROT_NONE
+ "mov $10, %%eax\n" // NR_mprotect
+ "syscall\n"
+ "mov %%r13, %%rdi\n" // fd = threadFd
+ "mov $3, %%eax\n" // NR_close
+ "syscall\n"
+ "mov $56, %%eax\n" // NR_clone
+ "mov $17, %%rdi\n" // flags = SIGCHLD
+ "mov $1, %%rsi\n" // stack = 1
+ "syscall\n"
+ "mov %%rax, %%rdi\n"
+ "test %%rax, %%rax\n"
+ "jne 21f\n" // reap helper, exit thread
+ "jmp 22f\n" // unlock mutex
+
+ // NR_clone:
+ // Original trusted thread calls clone() to create new nascent
+ // thread. This thread is (typically) fully privileged and shares all
+ // resources with the caller (i.e. the previous trusted thread),
+ // and by extension it shares all resources with the sandbox'd
+ // threads.
+ // N.B. It is possible to make the thread creation code crash before
+ // it releases seccomp privileges. This is generally OK, as it just
+ // terminates the program. But if we ever support signal handling,
+ // we have to be careful that the user cannot install a SIGSEGV
+ // handler that gets executed with elevated privileges.
+ "18:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem
+ "syscall\n" // calls NR_clone
+ "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values
+ "jae 6b\n"
+ "add $2, %%rbx\n"
+ "test %%rax, %%rax\n"
+ "jne 14b\n" // return result
+
+ // In nascent thread, now.
+ "sub $2, %%rbx\n"
+ "xor %%r15, %%r15\n" // Request to return from clone() when done
+
+ // Get thread id of nascent thread
+ "19:mov $186, %%eax\n" // NR_gettid
+ "syscall\n"
+ "mov %%rax, %%r14\n"
+
+ // Nascent thread creates socketpair() for sending requests to
+ // trusted thread.
+ // We can create the filehandles on the stack. Filehandles are
+ // always treated as untrusted.
+ // socketpair(AF_UNIX, SOCK_STREAM, 0, fds)
+ "push %%r15\n"
+ "mov $53, %%eax\n" // NR_socketpair
+ "mov $1, %%edi\n" // domain = AF_UNIX
+ "mov $1, %%esi\n" // type = SOCK_STREAM
+ "xor %%rdx, %%rdx\n" // protocol = 0
+ "sub $8, %%rsp\n" // sv = %rsp
+ "mov %%rsp, %%r10\n"
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "jz 27f\n"
+
+ // If things went wrong, we don't have an (easy) way of signaling
+ // the parent. For our purposes, it is sufficient to fail with a
+ // fatal error.
+ "jmp 25f\n" // exit process
+ "20:mov $56, %%eax\n" // NR_clone
+ "mov $17, %%rdi\n" // flags = SIGCHLD
+ "mov $1, %%rsi\n" // stack = 1
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "js 25f\n" // exit process
+ "jz 22f\n" // unlock and exit
+ "mov %%rax, %%rdi\n"
+ "21:xor %%rsi, %%rsi\n"
+ "xor %%rdx, %%rdx\n"
+ "xor %%r10, %%r10\n"
+ "mov $61, %%eax\n" // NR_wait4
+ "syscall\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 21b\n"
+ "jmp 23f\n" // exit thread (no message)
+ "22:lea playground$syscall_mutex(%%rip), %%rdi\n"
+ "mov $4096, %%esi\n"
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "mov $10, %%eax\n" // NR_mprotect
+ "syscall\n"
+ "lock; addl $0x80000000, (%%rdi)\n"
+ "jz 23f\n" // exit thread
+ "mov $1, %%edx\n"
+ "mov %%rdx, %%rsi\n" // FUTEX_WAKE
+ "mov $202, %%eax\n" // NR_futex
+ "syscall\n"
+ "23:mov $60, %%eax\n" // NR_exit
+ "mov $1, %%edi\n" // status = 1
+ "24:syscall\n"
+ "25:mov $1, %%eax\n" // NR_write
+ "mov $2, %%edi\n" // fd = stderr
+ "lea 100f(%%rip), %%rsi\n"
+ "mov $101f-100f, %%edx\n" // len = strlen(msg)
+ "syscall\n"
+ "mov $1, %%edi\n"
+ "26:mov $231, %%eax\n" // NR_exit_group
+ "jmp 24b\n"
+
+ // The first page is mapped read-only for use as securely shared memory
+ "27:mov 0xC0(%%rbp), %%r12\n" // %r12 = secure shared memory
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
+ "mov $10, %%eax\n" // NR_mprotect
+ "mov %%r12, %%rdi\n" // addr = secure_mem
+ "mov $4096, %%esi\n" // len = 4096
+ "mov $1, %%edx\n" // prot = PROT_READ
+ "syscall\n"
+
+ // The second page is used as scratch space by the trusted thread.
+ // Make it writable.
+ "mov $10, %%eax\n" // NR_mprotect
+ "add $4096, %%rdi\n" // addr = secure_mem + 4096
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "syscall\n"
+
+ // Call clone() to create new trusted thread().
+ // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+ // CLONE_SYSVSEM|CLONE_UNTRACED|CLONE_SETTLS, stack, NULL, NULL,
+ // tls)
+ "mov 4(%%rsp), %%r13d\n" // %r13 = threadFd
+ "mov $56, %%eax\n" // NR_clone
+ "mov $0x8D0F00, %%edi\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR|TLS
+ "mov $1, %%rsi\n" // stack = 1
+ "mov %%r12, %%r8\n" // tls = new_secure_mem
+ "mov 0xC8(%%rbp), %%r15d\n" // %r15 = processFdPub
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "js 25b\n" // exit process
+ "jz 0b\n" // invoke trustedThreadFnc()
+
+ // Done creating trusted thread. We can now get ready to return to caller
+ "mov 0(%%rsp), %%r9d\n" // %r9 = threadFdPub
+ "add $8, %%rsp\n"
+
+ // Set up thread local storage with information on how to talk to
+ // trusted thread and trusted process.
+ "lea 0xD8(%%r12), %%rsi\n" // args = &secure_mem.TLS;
+ "mov $158, %%eax\n" // NR_arch_prctl
+ "mov $0x1001, %%edi\n" // option = ARCH_SET_GS
+ "syscall\n"
+ "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values
+ "jae 20b\n" // exit thread, unlock global mutex
+
+ // Check whether this is the initial thread, or a newly created one.
+ // At startup we run the same code as when we create a new thread. At
+ // the very top of this function, you will find that we push 999(%rip)
+ // on the stack. That is the signal that we should return on the same
+ // stack rather than return to where clone was called.
+ "pop %%r15\n"
+ "test %%r15, %%r15\n"
+ "jne 28f\n"
+
+ // Returning from clone() into the newly created thread is special. We
+ // cannot unroll the stack, as we just set up a new stack for this
+ // thread. We have to explicitly restore CPU registers to the values
+ // that they had when the program originally called clone().
+ "sub $0x80, %%rsp\n" // redzone compensation
+ "mov 0x48(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x50(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x58(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x60(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x68(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x70(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x78(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x80(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x88(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x90(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0x98(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0xA0(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0xA8(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0xB0(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "mov 0xB8(%%rbp), %%rax\n"
+ "push %%rax\n"
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
+
+ // Nascent thread launches a helper that doesn't share any of our
+ // resources, except for pages mapped as MAP_SHARED.
+ // clone(0, %rsp)
+ "28:mov $56, %%eax\n" // NR_clone
+ "mov $17, %%rdi\n" // flags = SIGCHLD
+ "mov %%rsp, %%rsi\n" // stack = %rsp
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "js 25b\n" // exit process
+ "jne 29f\n"
+
+ // Use sendmsg() to send to the trusted process the file handles for
+ // communicating with the new trusted thread. We also send the address
+ // of the secure memory area (for sanity checks) and the thread id.
+ "mov 0xCC(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub()
+ "cmp %%rbx, 8(%%rbp)\n"
+ "jne 25b\n" // exit process
+ "mov %%r9, %%rsi\n" // fd0 = threadFdPub
+ "mov %%r13, %%rdx\n" // fd1 = threadFd
+ "push %%r14\n" // threadId
+ "mov %%esi, 4(%%rsp)\n" // threadFdPub
+ "push %%r12\n" // secure_mem
+ "mov %%rsp, %%rcx\n" // buf = &data
+ "mov $16, %%r8\n" // len = sizeof(void*) + 2*sizeof(int)
+ "call playground$sendFd\n"
+
+ // Release syscall_mutex_. This signals the trusted process that
+ // it can write into the original thread's secure memory again.
+ "mov $10, %%eax\n" // NR_mprotect
+ "lea playground$syscall_mutex(%%rip), %%rdi\n"
+ "mov $4096, %%esi\n"
+ "mov $3, %%edx\n" // PROT_READ | PROT_WRITE
+ "syscall\n"
+ "lock; addl $0x80000000, (%%rdi)\n"
+ "jz 26b\n" // exit process (no error message)
+ "mov $1, %%edx\n"
+ "mov %%rdx, %%rsi\n" // FUTEX_WAKE
+ "mov $202, %%eax\n" // NR_futex
+ "syscall\n"
+ "jmp 26b\n" // exit process (no error message)
+
+ // Reap helper
+ "29:mov %%rax, %%rdi\n"
+ "30:xor %%rsi, %%rsi\n"
+ "xor %%rdx, %%rdx\n"
+ "xor %%r10, %%r10\n"
+ "mov $61, %%eax\n" // NR_wait4
+ "syscall\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 30\n"
+
+ // Release privileges by entering seccomp mode.
+ "mov $157, %%eax\n" // NR_prctl
+ "mov $22, %%edi\n" // PR_SET_SECCOMP
+ "mov $1, %%esi\n"
+ "syscall\n"
+ "test %%rax, %%rax\n"
+ "jnz 25b\n" // exit process
+
+ // Back in the newly created sandboxed thread, wait for trusted process
+ // to receive request. It is possible for an attacker to make us
+ // continue even before the trusted process is done. This is OK. It'll
+ // result in us putting stale values into the new thread's TLS. But that
+ // data is considered untrusted anyway.
+ "push %%rax\n"
+ "mov $1, %%edx\n" // len = 1
+ "mov %%rsp, %%rsi\n" // buf = %rsp
+ "mov %%r9, %%rdi\n" // fd = threadFdPub
+ "31:xor %%rax, %%rax\n" // NR_read
+ "syscall\n"
+ "cmp $-4, %%rax\n" // EINTR
+ "jz 31b\n"
+ "cmp %%rdx, %%rax\n"
+ "jne 25b\n" // exit process
+ "pop %%rax\n"
+
+ // Return to caller. We are in the new thread, now.
+ "xor %%rax, %%rax\n"
+ "test %%r15, %%r15\n"
+
+ // Returning to createTrustedThread()
+ "jz 32f\n"
+ "jmp *%%r15\n"
+
+ // Returning to the place where clone() had been called
+ "32:pop %%r15\n"
+ "pop %%r14\n"
+ "pop %%r13\n"
+ "pop %%r12\n"
+ "pop %%r11\n"
+ "pop %%r10\n"
+ "pop %%r9\n"
+ "pop %%r8\n"
+ "pop %%rdi\n"
+ "pop %%rsi\n"
+ "pop %%rdx\n"
+ "pop %%rcx\n"
+ "pop %%rbx\n"
+ "pop %%rbp\n"
+ "ret\n"
+
+ ".pushsection \".rodata\"\n"
+ "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n"
+ "101:\n"
+ ".popsection\n"
+
+ "999:pop %%rbp\n"
+ "pop %%rbx\n"
+ :
+ : "g"(&args)
+ : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
+ "r13", "r14", "r15"
+#elif defined(__i386__)
+ struct user_desc u;
+ u.entry_number = (typeof u.entry_number)-1;
+ u.base_addr = NULL;
+ u.limit = 0xfffff;
+ u.seg_32bit = 1;
+ u.contents = 0;
+ u.read_exec_only = 0;
+ u.limit_in_pages = 1;
+ u.seg_not_present = 0;
+ u.useable = 1;
+ SysCalls sys;
+ if (sys.set_thread_area(&u) < 0) {
+ die("Cannot set up thread local storage");
+ }
+ asm volatile("movw %w0, %%fs"
+ :
+ : "q"(8*u.entry_number+3));
+ asm volatile(
+ "push %%ebx\n"
+ "push %%ebp\n"
+ "movd %0, %%mm6\n" // %mm6 = args
+ "lea 999f, %%ebx\n" // continue in same thread
+ "movd %%ebx, %%mm3\n"
+ "xor %%ebx, %%ebx\n" // initial sequence number
+ "movd %%ebx, %%mm2\n"
+ "jmp 19f\n" // create trusted thread
+
+ // TODO(markus): Coalesce the read() operations by reading into a bigger
+ // buffer.
+
+ // Parameters:
+ // %mm5: secure memory region
+ // the page following this one contains the scratch space
+ // %mm0: thread's side of threadFd
+ // %mm1: processFdPub
+ // %mm3: return address after creation of new trusted thread
+
+ // Local variables:
+ // %mm2: sequence number for trusted calls
+ // %mm4: thread id
+
+ // Temporary variables:
+ // %ebp: system call number
+ // %mm6: secure memory of previous thread
+ // %mm7: temporary variable for spilling data
+
+ // Layout of secure shared memory region (c.f. securemem.h):
+ // 0x00: pointer to the secure shared memory region (i.e. self)
+ // 0x04: sequence number; must match %mm2
+ // 0x08: system call number; passed to syscall in %eax
+ // 0x0C: first argument; passed to syscall in %ebx
+ // 0x10: second argument; passed to syscall in %ecx
+ // 0x14: third argument; passed to syscall in %edx
+ // 0x18: fourth argument; passed to syscall in %esi
+ // 0x1C: fifth argument; passed to syscall in %edi
+ // 0x20: sixth argument; passed to syscall in %ebp
+ // 0x24: stored return address for clone() system call
+ // 0x28: second stored return address for clone() system call
+ // 0x2C: stored %ebp value for clone() system call
+ // 0x30: stored %edi value for clone() system call
+ // 0x34: stored %esi value for clone() system call
+ // 0x38: stored %edx value for clone() system call
+ // 0x3C: stored %ecx value for clone() system call
+ // 0x40: stored %ebx value for clone() system call
+ // 0x44: new shared memory for clone()
+ // 0x48: processFdPub for talking to trusted process
+ // 0x4C: cloneFdPub for talking to trusted process
+ // 0x50: set to non-zero, if in debugging mode
+ // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE)
+ // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE)
+ // 0x60: thread id (TLS_TID)
+ // 0x68: threadFdPub (TLS_THREAD_FD)
+ // 0x200-0x1000: securely passed verified file name(s)
+
+ // Layout of (untrusted) scratch space:
+ // 0x00: syscall number; passed in %eax
+ // 0x04: first argument; passed in %ebx
+ // 0x08: second argument; passed in %ecx
+ // 0x0C: third argument; passed in %edx
+ // 0x10: fourth argument; passed in %esi
+ // 0x14: fifth argument; passed in %edi
+ // 0x18: sixth argument; passed in %ebp
+ // 0x1C: return value
+ // 0x20: RDTSCP result (%eax)
+ // 0x24: RDTSCP result (%edx)
+ // 0x28: RDTSCP result (%ecx)
+
+ "0:xor %%esp, %%esp\n"
+ "mov $2, %%eax\n" // %mm2 = initial sequence number
+ "movd %%eax, %%mm2\n"
+
+ // Read request from untrusted thread, or from trusted process. In either
+ // case, the data that we read has to be considered untrusted.
+ // read(threadFd, &scratch, 4)
+ "1:mov $3, %%eax\n" // NR_read
+ "movd %%mm0, %%ebx\n" // fd = threadFd
+ "movd %%mm5, %%ecx\n"
+ "add $0x1000, %%ecx\n" // buf = &scratch
+ "mov $4, %%edx\n" // len = 4
+ "2:int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 2b\n"
+ "cmp %%edx, %%eax\n"
+ "jnz 25f\n" // exit process
+
+ // Retrieve system call number. It is crucial that we only dereference
+ // 0x1000(%mm5) exactly once. Afterwards, memory becomes untrusted and
+ // we must use the value that we have read the first time.
+ "mov 0(%%ecx), %%eax\n"
+
+ // If syscall number is -1, execute an unlocked system call from the
+ // secure memory area
+ "cmp $-1, %%eax\n"
+ "jnz 5f\n"
+ "3:movd %%mm2, %%ebp\n"
+ "cmp %%ebp, 0x4-0x1000(%%ecx)\n"
+ "jne 25f\n" // exit process
+ "mov 0x08-0x1000(%%ecx), %%eax\n"
+ "mov 0x0C-0x1000(%%ecx), %%ebx\n"
+ "mov 0x14-0x1000(%%ecx), %%edx\n"
+ "mov 0x18-0x1000(%%ecx), %%esi\n"
+ "mov 0x1C-0x1000(%%ecx), %%edi\n"
+ "mov 0x20-0x1000(%%ecx), %%ebp\n"
+ "mov 0x10-0x1000(%%ecx), %%ecx\n"
+ "movd %%edi, %%mm4\n"
+ "movd %%ebp, %%mm7\n"
+ "movd %%mm2, %%ebp\n"
+ "movd %%mm5, %%edi\n"
+ "cmp %%ebp, 4(%%edi)\n"
+ "jne 25f\n" // exit process
+ "add $2, %%ebp\n"
+ "movd %%ebp, %%mm2\n"
+ "movd %%mm4, %%edi\n"
+ "movd %%mm7, %%ebp\n"
+
+ // shmget() gets some special treatment. Whenever we return from this
+ // system call, we remember the most recently returned SysV shm id.
+ "cmp $117, %%eax\n" // NR_ipc
+ "jnz 4f\n"
+ "cmp $23, %%ebx\n" // shmget()
+ "jnz 4f\n"
+ "int $0x80\n"
+ "mov %%eax, %%ebp\n"
+ "mov $120, %%eax\n" // NR_clone
+ "mov $17, %%ebx\n" // flags = SIGCHLD
+ "mov $1, %%ecx\n" // stack = 1
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "js 25f\n" // exit process
+ "mov %%eax, %%ebx\n"
+ "jnz 7f\n" // wait for child, then return result
+ "movd %%mm5, %%ebx\n" // start = secure_mem
+ "mov $4096, %%ecx\n" // len = 4096
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "mov $125, %%eax\n" // NR_mprotect
+ "int $0x80\n"
+ "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id
+ "xor %%ebx, %%ebx\n"
+ "jmp 26f\n" // exit program, no message
+ "4:int $0x80\n"
+ "jmp 14f\n" // return result
+
+ // If syscall number is -2, execute locked system call from the
+ // secure memory area
+ "5:jg 11f\n"
+ "cmp $-2, %%eax\n"
+ "jnz 8f\n"
+ "movd %%mm2, %%ebp\n"
+ "cmp %%ebp, 0x4-0x1000(%%ecx)\n"
+ "jne 25f\n" // exit process
+ "mov 0x08-0x1000(%%ecx), %%eax\n"
+ "mov 0x0C-0x1000(%%ecx), %%ebx\n"
+ "mov 0x14-0x1000(%%ecx), %%edx\n"
+ "mov 0x18-0x1000(%%ecx), %%esi\n"
+ "mov 0x1C-0x1000(%%ecx), %%edi\n"
+ "mov 0x20-0x1000(%%ecx), %%ebp\n"
+ "mov 0x10-0x1000(%%ecx), %%ecx\n"
+ "movd %%edi, %%mm4\n"
+ "movd %%ebp, %%mm7\n"
+ "movd %%mm2, %%ebp\n"
+ "movd %%mm5, %%edi\n"
+ "cmp %%ebp, 4(%%edi)\n"
+ "jne 25f\n" // exit process
+
+ // clone() has unusual calling conventions and must be handled specially
+ "cmp $120, %%eax\n" // NR_clone
+ "jz 18f\n"
+
+ // exit() terminates trusted thread
+ "cmp $1, %%eax\n" // NR_exit
+ "jz 17f\n"
+
+ // Perform requested system call
+ "movd %%mm4, %%edi\n"
+ "movd %%mm7, %%ebp\n"
+ "int $0x80\n"
+
+ // Unlock mutex
+ "6:movd %%mm2, %%ebp\n"
+ "movd %%mm5, %%edi\n"
+ "cmp %%ebp, 4(%%edi)\n"
+ "jne 25f\n" // exit process
+ "add $2, %%ebp\n"
+ "movd %%ebp, %%mm2\n"
+ "mov %%eax, %%ebp\n"
+ "mov $120, %%eax\n" // NR_clone
+ "mov $17, %%ebx\n" // flags = SIGCHLD
+ "mov $1, %%ecx\n" // stack = 1
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "js 25f\n" // exit process
+ "jz 22f\n" // unlock and exit
+ "mov %%eax, %%ebx\n"
+ "7:xor %%ecx, %%ecx\n"
+ "xor %%edx, %%edx\n"
+ "mov $7, %%eax\n" // NR_waitpid
+ "int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 6\n"
+ "mov %%ebp, %%eax\n"
+ "jmp 14f\n" // return result
+
+ // If syscall number is -3, read the time stamp counter
+ "8:cmp $-3, %%eax\n"
+ "jnz 9f\n"
+ "rdtsc\n" // sets %edx:%eax
+ "xor %%ecx, %%ecx\n"
+ "jmp 10f\n"
+ "9:cmp $-4, %%eax\n"
+ "jnz 11f\n"
+ "rdtscp\n" // sets %edx:%eax and %ecx
+ "10:movd %%mm5, %%ebx\n"
+ "add $0x1020, %%ebx\n"
+ "mov %%eax, 0(%%ebx)\n"
+ "mov %%edx, 4(%%ebx)\n"
+ "mov %%ecx, 8(%%ebx)\n"
+ "mov %%ebx, %%ecx\n"
+ "mov $12, %%edx\n"
+ "jmp 15f\n" // return result
+
+ // Check in syscallTable whether this system call is unrestricted
+ "11:mov %%eax, %%ebp\n"
+ #ifndef NDEBUG
+ "cmpw $0, 0x50-0x1000(%%ecx)\n"
+ "jnz 12f\n" // debug mode
+ #endif
+ "cmp playground$maxSyscall, %%eax\n"
+ "ja 25f\n" // exit process
+ "shl $3, %%eax\n"
+ "add $playground$syscallTable, %%eax\n"
+ "mov 0(%%eax), %%eax\n"
+ "cmp $1, %%eax\n"
+ "jne 25f\n" // exit process
+
+ // Default behavior for unrestricted system calls is to just execute
+ // them. Read the remaining arguments first.
+ "12:mov $3, %%eax\n" // NR_read
+ "movd %%mm0, %%ebx\n" // fd = threadFd
+ "add $4, %%ecx\n" // buf = &scratch + 4
+ "mov $24, %%edx\n" // len = 6*sizeof(void *)
+ "13:int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 13b\n"
+ "cmp %%edx, %%eax\n"
+ "jnz 25f\n" // exit process
+ "mov %%ebp, %%eax\n"
+ "mov 0x00(%%ecx), %%ebx\n"
+ "mov 0x08(%%ecx), %%edx\n"
+ "mov 0x0C(%%ecx), %%esi\n"
+ "mov 0x10(%%ecx), %%edi\n"
+ "mov 0x14(%%ecx), %%ebp\n"
+ "mov 0x04(%%ecx), %%ecx\n"
+ "cmp $252, %%eax\n" // NR_exit_group
+ "jz 26f\n" // exit program, no message
+ "int $0x80\n"
+
+ // Return result of system call to sandboxed thread
+ "14:movd %%mm5, %%ecx\n"
+ "add $0x101C, %%ecx\n" // buf = &scratch + 28
+ "mov %%eax, (%%ecx)\n"
+ "mov $4, %%edx\n" // len = 4
+ "15:movd %%mm0, %%ebx\n" // fd = threadFd
+ "mov $4, %%eax\n" // NR_write
+ "16:int $0x80\n"
+ "cmp %%edx, %%eax\n"
+ "jz 1b\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 16b\n"
+ "jmp 25f\n" // exit process
+
+ // NR_exit:
+ // Exit trusted thread after cleaning up resources
+ "17:mov %%edi, %%ecx\n"
+ "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub
+ "mov $6, %%eax\n" // NR_close
+ "int $0x80\n"
+ "mov %%ecx, %%ebx\n" // start = secure_mem
+ "mov $8192, %%ecx\n" // length = 4096
+ "xor %%edx, %%edx\n" // prot = PROT_NONE
+ "mov $125, %%eax\n" // NR_mprotect
+ "int $0x80\n"
+ "movd %%mm0, %%ebx\n" // fd = threadFd
+ "mov $6, %%eax\n" // NR_close
+ "int $0x80\n"
+ "mov $120, %%eax\n" // NR_clone
+ "mov $17, %%ebx\n" // flags = SIGCHLD
+ "mov $1, %%ecx\n" // stack = 1
+ "int $0x80\n"
+ "mov %%eax, %%ebx\n"
+ "test %%eax, %%eax\n"
+ "jne 21f\n" // reap helper, exit thread
+ "jmp 22f\n" // unlock mutex
+
+ // NR_clone:
+ // Original trusted thread calls clone() to create new nascent
+ // thread. This thread is (typically) fully privileged and shares all
+ // resources with the caller (i.e. the previous trusted thread),
+ // and by extension it shares all resources with the sandbox'd
+ // threads.
+ // N.B. It is possible to make the thread creation code crash before
+ // it releases seccomp privileges. This is generally OK, as it just
+ // terminates the program. But if we ever support signal handling,
+ // we have to be careful that the user cannot install a SIGSEGV
+ // handler that gets executed with elevated privileges.
+ "18:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem
+ "movd %%mm4, %%edi\n"
+ "movd %%mm7, %%ebp\n"
+ "int $0x80\n" // calls NR_clone
+ "cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values
+ "jae 6b\n"
+ "movd %%mm2, %%edi\n"
+ "add $2, %%edi\n"
+ "movd %%edi, %%mm2\n"
+ "test %%eax, %%eax\n"
+ "jne 14b\n" // return result
+
+ // In nascent thread, now.
+ "sub $2, %%edi\n"
+ "movd %%edi, %%mm2\n"
+ "movd %%eax, %%mm3\n" // Request to return from clone() when done
+
+ // Get thread id of nascent thread
+ "19:mov $224, %%eax\n" // NR_gettid
+ "int $0x80\n"
+ "movd %%eax, %%mm4\n"
+
+ // Nascent thread creates socketpair() for sending requests to
+ // trusted thread.
+ // We can create the filehandles on the stack. Filehandles are
+ // always treated as untrusted.
+ // socketpair(AF_UNIX, SOCK_STREAM, 0, fds)
+ "mov $102, %%eax\n" // NR_socketcall
+ "mov $8, %%ebx\n" // socketpair
+ "sub $8, %%esp\n" // sv = %rsp
+ "push %%esp\n"
+ "xor %%ecx, %%ecx\n" // protocol = 0
+ "push %%ecx\n"
+ "mov $1, %%ecx\n" // type = SOCK_STREAM
+ "push %%ecx\n"
+ "push %%ecx\n" // domain = AF_UNIX
+ "mov %%esp, %%ecx\n"
+ "int $0x80\n"
+ "add $0x10, %%esp\n"
+ "test %%eax, %%eax\n"
+ "jz 27f\n"
+
+ // If things went wrong, we don't have an (easy) way of signaling
+ // the parent. For our purposes, it is sufficient to fail with a
+ // fatal error.
+ "jmp 25f\n" // exit process
+ "20:mov $120, %%eax\n" // NR_clone
+ "mov $17, %%ebx\n" // flags = SIGCHLD
+ "mov $1, %%ecx\n" // stack = 1
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "js 25f\n" // exit process
+ "jz 22f\n" // unlock and exit
+ "mov %%eax, %%ebx\n"
+ "21:xor %%ecx, %%ecx\n"
+ "xor %%edx, %%edx\n"
+ "mov $7, %%eax\n" // NR_waitpid
+ "int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 21b\n"
+ "jmp 23f\n" // exit thread (no message)
+ "22:lea playground$syscall_mutex, %%ebx\n"
+ "mov $4096, %%ecx\n"
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "mov $125, %%eax\n" // NR_mprotect
+ "int $0x80\n"
+ "lock; addl $0x80000000, (%%ebx)\n"
+ "jz 23f\n" // exit thread
+ "mov $1, %%edx\n"
+ "mov %%edx, %%ecx\n" // FUTEX_WAKE
+ "mov $240, %%eax\n" // NR_futex
+ "int $0x80\n"
+ "23:mov $1, %%eax\n" // NR_exit
+ "mov $1, %%ebx\n" // status = 1
+ "24:int $0x80\n"
+ "25:mov $4, %%eax\n" // NR_write
+ "mov $2, %%ebx\n" // fd = stderr
+ "lea 100f, %%ecx\n"
+ "mov $101f-100f, %%edx\n" // len = strlen(msg)
+ "int $0x80\n"
+ "mov $1, %%ebx\n"
+ "26:mov $252, %%eax\n" // NR_exit_group
+ "jmp 24b\n"
+
+ // The first page is mapped read-only for use as securely shared memory
+ "27:movd %%mm6, %%ebp\n"
+ "mov 0x44(%%ebp), %%esi\n"
+ "movd %%esi, %%mm5\n" // %mm5 = secure shared memory
+ "movd %%mm2, %%edi\n"
+ "cmp %%edi, 4(%%ebp)\n"
+ "jne 25b\n" // exit process
+ "mov $125, %%eax\n" // NR_mprotect
+ "mov %%esi, %%ebx\n"
+ "mov $4096, %%ecx\n" // len = 4096
+ "mov $1, %%edx\n" // prot = PROT_READ
+ "int $0x80\n"
+
+ // The second page is used as scratch space by the trusted thread.
+ // Make it writable.
+ "mov $125, %%eax\n" // NR_mprotect
+ "add $4096, %%ebx\n" // addr = secure_mem + 4096
+ "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
+ "int $0x80\n"
+
+ // Call clone() to create new trusted thread().
+ // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+ // CLONE_SYSVSEM|CLONE_UNTRACED, stack, NULL, NULL, NULL)
+ "mov 4(%%esp), %%eax\n"
+ "movd %%eax, %%mm0\n" // %mm0 = threadFd
+ "mov $120, %%eax\n" // NR_clone
+ "mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR
+ "mov $1, %%ecx\n" // stack = 1
+ "movd 0x48(%%ebp), %%mm1\n" // %mm1 = processFdPub
+ "cmp %%edi, 4(%%ebp)\n"
+ "jne 25b\n" // exit process
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "js 25b\n" // exit process
+ "jz 0b\n" // invoke trustedThreadFnc()
+
+ // Set up thread local storage
+ "mov $0x51, %%eax\n" // seg_32bit, limit_in_pages, useable
+ "push %%eax\n"
+ "mov $0xFFFFF, %%eax\n" // limit
+ "push %%eax\n"
+ "add $0x58, %%esi\n"
+ "push %%esi\n" // base_addr = &secure_mem.TLS
+ "mov %%fs, %%eax\n"
+ "shr $3, %%eax\n"
+ "push %%eax\n" // entry_number
+ "mov $243, %%eax\n" // NR_set_thread_area
+ "mov %%esp, %%ebx\n"
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "jnz 25b\n" // exit process
+ "add $16, %%esp\n"
+
+ // Done creating trusted thread. We can now get ready to return to caller
+ "mov 0(%%esp), %%esi\n" // %esi = threadFdPub
+ "add $8, %%esp\n"
+
+ // Check whether this is the initial thread, or a newly created one.
+ // At startup we run the same code as when we create a new thread. At
+ // the very top of this function, you will find that we store 999(%rip)
+ // in %%mm3. That is the signal that we should return on the same
+ // stack rather than return to where clone was called.
+ "movd %%mm3, %%eax\n"
+ "test %%eax, %%eax\n"
+ "jne 28f\n"
+
+ // Returning from clone() into the newly created thread is special. We
+ // cannot unroll the stack, as we just set up a new stack for this
+ // thread. We have to explicitly restore CPU registers to the values
+ // that they had when the program originally called clone().
+ "mov 0x24(%%ebp), %%eax\n"
+ "push %%eax\n"
+ "mov 0x28(%%ebp), %%eax\n"
+ "push %%eax\n"
+ "mov 0x2C(%%ebp), %%eax\n"
+ "push %%eax\n"
+ "mov 0x30(%%ebp), %%eax\n"
+ "push %%eax\n"
+ "mov 0x34(%%ebp), %%eax\n"
+ "push %%eax\n"
+ "mov 0x38(%%ebp), %%eax\n"
+ "push %%eax\n"
+ "mov 0x3C(%%ebp), %%eax\n"
+ "push %%eax\n"
+ "mov 0x40(%%ebp), %%eax\n"
+ "push %%eax\n"
+ "cmp %%edi, 4(%%ebp)\n"
+ "jne 25b\n" // exit process
+
+ // Nascent thread launches a helper that doesn't share any of our
+ // resources, except for pages mapped as MAP_SHARED.
+ // clone(0, %esp)
+ "28:mov $120, %%eax\n" // NR_clone
+ "mov $17, %%ebx\n" // flags = SIGCHLD
+ "mov %%esp, %%ecx\n" // stack = %esp
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "js 25b\n" // exit process
+ "jne 29f\n"
+
+ // Use sendmsg() to send to the trusted process the file handles for
+ // communicating with the new trusted thread. We also send the address
+ // of the secure memory area (for sanity checks) and the thread id.
+ "push %%esi\n" // threadFdPub
+ "movd %%mm4, %%eax\n" // threadId
+ "push %%eax\n"
+ "movd %%mm5, %%eax\n" // secure_mem
+ "push %%eax\n"
+ "mov %%esp, %%ebx\n" // buf = &data
+ "mov $12, %%eax\n" // len = sizeof(void*) + 2*sizeof(int)
+ "push %%eax\n"
+ "push %%ebx\n"
+ "movd %%mm0, %%eax\n" // fd1 = threadFd
+ "push %%eax\n"
+ "push %%esi\n" // fd0 = threadFdPub
+ "mov 0x4C(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub()
+ "cmp %%edi, 4(%%ebp)\n"
+ "jne 25b\n" // exit process
+ "push %%eax\n"
+ "call playground$sendFd\n"
+
+ // Release syscall_mutex_. This signals the trusted process that
+ // it can write into the original thread's secure memory again.
+ "mov $125, %%eax\n" // NR_mprotect
+ "lea playground$syscall_mutex, %%ebx\n"
+ "mov $4096, %%ecx\n"
+ "mov $3, %%edx\n" // PROT_READ | PROT_WRITE
+ "int $0x80\n"
+ "lock; addl $0x80000000, (%%ebx)\n"
+ "jz 26b\n" // exit process (no error message)
+ "mov $1, %%edx\n"
+ "mov %%edx, %%ecx\n" // FUTEX_WAKE
+ "mov $240, %%eax\n" // NR_futex
+ "int $0x80\n"
+ "jmp 26b\n" // exit process (no error message)
+
+ // Reap helper
+ "29:mov %%eax, %%ebx\n"
+ "30:xor %%ecx, %%ecx\n"
+ "xor %%edx, %%edx\n"
+ "mov $7, %%eax\n" // NR_waitpid
+ "int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 30\n"
+
+ // Release privileges by entering seccomp mode.
+ "mov $172, %%eax\n" // NR_prctl
+ "mov $22, %%ebx\n" // PR_SET_SECCOMP
+ "mov $1, %%ecx\n"
+ "int $0x80\n"
+ "test %%eax, %%eax\n"
+ "jnz 25b\n" // exit process
+
+ // Back in the newly created sandboxed thread, wait for trusted process
+ // to receive request. It is possible for an attacker to make us
+ // continue even before the trusted process is done. This is OK. It'll
+ // result in us putting stale values into the new thread's TLS. But that
+ // data is considered untrusted anyway.
+ "push %%eax\n"
+ "mov $1, %%edx\n" // len = 1
+ "mov %%esp, %%ecx\n" // buf = %rsp
+ "mov %%esi, %%ebx\n" // fd = threadFdPub
+ "31:mov $3, %%eax\n" // NR_read
+ "int $0x80\n"
+ "cmp $-4, %%eax\n" // EINTR
+ "jz 31b\n"
+ "cmp %%edx, %%eax\n"
+ "jne 25b\n" // exit process
+ "pop %%eax\n"
+
+ // Return to caller. We are in the new thread, now.
+ "xor %%eax, %%eax\n"
+ "movd %%mm3, %%ebx\n"
+
+ // Release MMX registers, so that they can be used for floating point
+ // operations.
+ "emms\n"
+
+ // Returning to createTrustedThread()
+ "test %%ebx, %%ebx\n"
+ "jz 32f\n"
+ "jmp *%%ebx\n"
+
+ // Returning to the place where clone() had been called
+ "32:pop %%ebx\n"
+ "pop %%ecx\n"
+ "pop %%edx\n"
+ "pop %%esi\n"
+ "pop %%edi\n"
+ "pop %%ebp\n"
+ "ret\n"
+
+ ".pushsection \".rodata\"\n"
+ "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n"
+ "101:\n"
+ ".popsection\n"
+
+ "999:pop %%ebp\n"
+ "pop %%ebx\n"
+ :
+ : "g"(&args)
+ : "eax", "ecx", "edx", "edi", "esi"
+#else
+#error Unsupported target platform
+#endif
+);
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/x86_decode.cc b/sandbox/linux/seccomp/x86_decode.cc
new file mode 100644
index 0000000..c28b579
--- /dev/null
+++ b/sandbox/linux/seccomp/x86_decode.cc
@@ -0,0 +1,306 @@
+#include "x86_decode.h"
+
+namespace playground {
+
+#if defined(__x86_64__) || defined(__i386__)
+// Decodes the single x86 instruction located at *ip and advances *ip past it.
+//
+// The decoder only determines instruction boundaries and the location of the
+// REX, ModR/M and SIB bytes; it does not interpret operands.
+//
+// Parameters:
+//   ip          in/out; on return points at the first byte after whatever
+//               part of the instruction was decoded.
+//   is64bit     decode for 64-bit mode (0x40-0x4F are REX prefixes and
+//               addresses default to 8 bytes) rather than 32-bit mode.
+//   has_prefix  optional out; true if at least one prefix byte was skipped.
+//   rex_ptr     optional out; address of the most recently seen REX prefix,
+//               or 0 if there was none.
+//   mod_rm_ptr  optional out; address of the ModR/M byte, or 0.
+//   sib_ptr     optional out; address of the SIB byte, or 0.
+//   is_group    optional out; true if the opcode belongs to an opcode group,
+//               i.e. the operation is selected by the ModR/M "reg" field.
+//
+// Returns the opcode byte, or 0x0F00 | second byte for two-byte opcodes.
+// Opcodes that are missing from the tables are returned as well, but only
+// their prefixes and opcode byte(s) are consumed.
+unsigned short next_inst(const char **ip, bool is64bit, bool *has_prefix,
+                         char **rex_ptr, char **mod_rm_ptr, char **sib_ptr,
+                         bool *is_group) {
+  // Flag bits used by the entries of opcode_types[] and group_table[]. An
+  // entry of zero marks an opcode that the decoder does not know about.
+  enum {
+    BYTE_OP     = (1<<1),    // 0x02
+    IMM         = (1<<2),    // 0x04
+    IMM_BYTE    = (2<<2),    // 0x08
+    MEM_ABS     = (3<<2),    // 0x0C
+    MODE_MASK   = (7<<2),    // 0x1C
+    MOD_RM      = (1<<5),    // 0x20
+    STACK       = (1<<6),    // 0x40
+    GROUP       = (1<<7),    // 0x80
+    GROUP_MASK  = 0x7F,
+  };
+
+  // Entries 0-255 describe one-byte opcodes, entries 256-511 describe the
+  // two-byte opcodes that follow a 0x0F escape byte. If GROUP is set, the
+  // remaining bits are the offset of the group's first row in group_table[].
+  static const unsigned char opcode_types[512] = {
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x00 - 0x07
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x00, // 0x08 - 0x0F
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x10 - 0x17
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x18 - 0x1F
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x20 - 0x27
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x28 - 0x2F
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x30 - 0x37
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x38 - 0x3F
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x40 - 0x47
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x48 - 0x4F
+    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0x50 - 0x57
+    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0x58 - 0x5F
+    0x01, 0x01, 0x21, 0x21, 0x00, 0x00, 0x00, 0x00, // 0x60 - 0x67
+    0x45, 0x25, 0x49, 0x29, 0x03, 0x01, 0x03, 0x01, // 0x68 - 0x6F
+    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0x70 - 0x77
+    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0x78 - 0x7F
+    0x27, 0x25, 0x27, 0x29, 0x23, 0x21, 0x23, 0x21, // 0x80 - 0x87
+    0x23, 0x21, 0x23, 0x21, 0x21, 0x21, 0x21, 0x80, // 0x88 - 0x8F
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x90 - 0x97
+    0x01, 0x01, 0x05, 0x01, 0x41, 0x41, 0x01, 0x01, // 0x98 - 0x9F
+    0x0F, 0x0D, 0x0F, 0x0D, 0x03, 0x01, 0x03, 0x01, // 0xA0 - 0xA7
+    0x09, 0x05, 0x03, 0x01, 0x03, 0x01, 0x03, 0x01, // 0xA8 - 0xAF
+    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // 0xB0 - 0xB7
+    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0xB8 - 0xBF
+    0x27, 0x29, 0x01, 0x01, 0x21, 0x21, 0x27, 0x25, // 0xC0 - 0xC7
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x01, // 0xC8 - 0xCF
+    0x23, 0x21, 0x23, 0x21, 0x09, 0x09, 0x01, 0x01, // 0xD0 - 0xD7
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xD8 - 0xDF
+    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0xE0 - 0xE7
+    0x05, 0x05, 0x05, 0x09, 0x03, 0x01, 0x03, 0x01, // 0xE8 - 0xEF
+    0x00, 0x01, 0x00, 0x00, 0x01, 0x01, 0x88, 0x90, // 0xF0 - 0xF7
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x98, 0xA0, // 0xF8 - 0xFF
+    0x00, 0xA8, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, // 0xF00 - 0xF07
+    0x01, 0x01, 0x00, 0x01, 0x00, 0x21, 0x01, 0x00, // 0xF08 - 0xF0F
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF10 - 0xF17
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF18 - 0xF1F
+    0x21, 0x21, 0x21, 0x21, 0x00, 0x00, 0x00, 0x00, // 0xF20 - 0xF27
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF28 - 0xF2F
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, // 0xF30 - 0xF37
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF38 - 0xF3F
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF40 - 0xF47
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF48 - 0xF4F
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF50 - 0xF57
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF58 - 0xF5F
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF60 - 0xF67
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF68 - 0xF6F
+    0x21, 0x00, 0x00, 0x00, 0x21, 0x21, 0x21, 0x00, // 0xF70 - 0xF77
+    0x21, 0x21, 0x00, 0x00, 0x21, 0x21, 0x21, 0x21, // 0xF78 - 0xF7F
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xF80 - 0xF87
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xF88 - 0xF8F
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF90 - 0xF97
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF98 - 0xF9F
+    0x01, 0x01, 0x01, 0x21, 0x29, 0x21, 0x00, 0x00, // 0xFA0 - 0xFA7
+    0x01, 0x01, 0x01, 0x21, 0x29, 0x21, 0x21, 0x21, // 0xFA8 - 0xFAF
+    0x23, 0x21, 0x00, 0x21, 0x00, 0x00, 0x23, 0x21, // 0xFB0 - 0xFB7
+    0x21, 0x00, 0x29, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFB8 - 0xFBF
+    0x21, 0x21, 0x00, 0x21, 0x00, 0x00, 0x00, 0x21, // 0xFC0 - 0xFC7
+    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xFC8 - 0xFCF
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFD0 - 0xFD7
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFD8 - 0xFDF
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFE0 - 0xFE7
+    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFE8 - 0xFEF
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xFF0 - 0xFF7
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xFF8 - 0xFFF
+  };
+
+  // One row of eight entries per opcode group, indexed by the "reg" field of
+  // the ModR/M byte. The last row is used instead of the "Group 7" row when
+  // the ModR/M byte selects a register operand (mod == 3).
+  static const unsigned char group_table[56] = {
+    0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Group 1A
+    0x27, 0x27, 0x23, 0x23, 0x23, 0x23, 0x23, 0x23, // Group 3 (Byte)
+    0x25, 0x25, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // Group 3
+    0x23, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Group 4
+    0x21, 0x21, 0x61, 0x21, 0x61, 0x21, 0x61, 0x00, // Group 5
+    0x00, 0x00, 0x21, 0x21, 0x21, 0x00, 0x21, 0x23, // Group 7
+    0x21, 0x00, 0x00, 0x21, 0x21, 0x00, 0x21, 0x00, // Group 7 (Alternate)
+  };
+
+  const unsigned char *insn_ptr = reinterpret_cast<const unsigned char *>(*ip);
+  int operand_width = 4;
+  int address_width = 4;
+  if (is64bit) {
+    address_width = 8;
+  }
+  unsigned char byte, rex = 0;
+  bool found_prefix = false;
+  if (rex_ptr) {
+    *rex_ptr = 0;
+  }
+  if (mod_rm_ptr) {
+    *mod_rm_ptr = 0;
+  }
+  if (sib_ptr) {
+    *sib_ptr = 0;
+  }
+
+  // Skip over all prefix bytes. On leaving the loop, "byte" holds the first
+  // opcode byte and insn_ptr points at the byte following it.
+  for (;; ++insn_ptr) {
+    switch (byte = *insn_ptr) {
+      case 0x66: // Operand width prefix
+        operand_width ^= 6;               // toggles between 4 and 2
+        break;
+      case 0x67: // Address width prefix
+        address_width ^= is64bit ? 12 : 6; // toggles 8 <-> 4, or 4 <-> 2
+        break;
+      case 0x26: // Segment selector prefixes
+      case 0x2e:
+      case 0x36:
+      case 0x3e:
+      case 0x64:
+      case 0x65:
+      case 0xF0:
+      case 0xF2:
+      case 0xF3:
+        break;
+      case 0x40: case 0x41: case 0x42: case 0x43: // 64 bit REX prefixes
+      case 0x44: case 0x45: case 0x46: case 0x47:
+      case 0x48: case 0x49: case 0x4A: case 0x4B:
+      case 0x4C: case 0x4D: case 0x4E: case 0x4F:
+        if (is64bit) {
+          if (rex_ptr) {
+            *rex_ptr = (char *)insn_ptr;
+          }
+          rex = byte;
+          found_prefix = true;
+          continue;
+        }
+        // In 32-bit mode these bytes are ordinary opcodes.
+        // fall through
+      default:
+        ++insn_ptr;
+        goto no_more_prefixes;
+    }
+    // Only reached for non-REX prefixes: a REX prefix is only honored if it
+    // immediately precedes the opcode, so forget any REX byte seen earlier.
+    rex = 0;
+    found_prefix = true;
+  }
+no_more_prefixes:
+  if (has_prefix) {
+    *has_prefix = found_prefix;
+  }
+  if (rex & REX_W) {
+    operand_width = 8;
+  }
+  unsigned char type;
+  unsigned short insn = byte;
+  unsigned int idx = 0;
+  if (byte == 0x0F) {
+    // Two-byte opcode; switch to the second half of opcode_types[]
+    byte = *insn_ptr++;
+    insn = (insn << 8) | byte;
+    idx = 256;
+  }
+  type = opcode_types[idx + byte];
+  bool found_mod_rm = false;
+  bool found_group = false;
+  bool found_sib = false;
+  unsigned char mod_rm = 0;
+  unsigned char sib = 0;
+  if (type & GROUP) {
+    // Peek at the ModR/M byte (without consuming it) in order to pick the
+    // actual instruction type from group_table[].
+    found_mod_rm = true;
+    found_group = true;
+    mod_rm = *insn_ptr;
+    if (mod_rm_ptr) {
+      *mod_rm_ptr = (char *)insn_ptr;
+    }
+    unsigned char group = (type & GROUP_MASK) + ((mod_rm >> 3) & 0x7);
+    if ((type & GROUP_MASK) == 40 && (mod_rm >> 6) == 3) {
+      // Group 7 with a register operand uses the alternate row
+      group += 8;
+    }
+    type = group_table[group];
+  }
+  if (!type) {
+    // We know that we still don't decode some of the more obscure
+    // instructions, but for all practical purposes that doesn't matter.
+    // Compilers are unlikely to output them, and even if we encounter
+    // hand-coded assembly, we will soon synchronize to the instruction
+    // stream again.
+    //
+    // std::cerr << "Unsupported instruction at 0x" << std::hex <<
+    //     std::uppercase << reinterpret_cast<long>(*ip) << " [ ";
+    // for (const unsigned char *ptr =
+    //          reinterpret_cast<const unsigned char *>(*ip);
+    //      ptr < insn_ptr; ) {
+    //   std::cerr << std::hex << std::uppercase << std::setw(2) <<
+    //       std::setfill('0') << (unsigned int)*ptr++ << ' ';
+    // }
+    // std::cerr << "]" << std::endl;
+  } else {
+    if (is64bit && (type & STACK)) {
+      operand_width = 8;
+    }
+    if (type & MOD_RM) {
+      found_mod_rm = true;
+      if (mod_rm_ptr) {
+        *mod_rm_ptr = (char *)insn_ptr;
+      }
+      mod_rm = *insn_ptr++;
+      int mod = (mod_rm >> 6) & 0x3;
+      int rm = 8*(rex & REX_B) + (mod_rm & 0x7);
+      if (mod != 3) {
+        if (address_width == 2) {
+          // 16-bit addressing: no SIB byte, displacements are 0, 1 or 2 bytes
+          switch (mod) {
+            case 0:
+              if (rm != 6 /* disp16 */) {
+                break;
+              }
+              // fall through
+            case 2:
+              insn_ptr++;
+              // fall through
+            case 1:
+              insn_ptr++;
+              break;
+          }
+        } else {
+          if ((rm & 0x7) == 4) {
+            found_sib = true;
+            if (sib_ptr) {
+              *sib_ptr = (char *)insn_ptr;
+            }
+            sib = *insn_ptr++;
+            if (!mod && (sib & 0x7) == 5 /* BP */) {
+              insn_ptr += 4;
+            }
+          }
+          switch (mod) {
+            case 0:
+              // mod == 0 with r/m == 101b has no base register and is
+              // followed by a disp32 (RIP-relative in 64-bit mode). REX.B
+              // is not decoded for this encoding, so only the low three
+              // bits of "rm" must be compared. Otherwise the displacement
+              // would be mistaken for the next instruction whenever REX.B
+              // happens to be set.
+              if ((rm & 0x7) != 5 /* BP */) {
+                break;
+              }
+              // fall through
+            case 2:
+              insn_ptr += 3;
+              // fall through
+            case 1:
+              insn_ptr++;
+              break;
+          }
+        }
+      }
+    }
+    // Operand bytes that the flags in "type" cannot express
+    switch (insn) {
+      case 0xC8: // ENTER
+        insn_ptr++;
+        // fall through
+      case 0x9A: // CALL (far)
+      case 0xC2: // RET (near)
+      case 0xCA: // LRET
+      case 0xEA: // JMP (far)
+        insn_ptr += 2;
+        break;
+      case 0xF80: case 0xF81: case 0xF82: case 0xF83: // Jcc (rel)
+      case 0xF84: case 0xF85: case 0xF86: case 0xF87:
+      case 0xF88: case 0xF89: case 0xF8A: case 0xF8B:
+      case 0xF8C: case 0xF8D: case 0xF8E: case 0xF8F:
+        // The displacement is a rel16 or rel32; it never grows to eight
+        // bytes, even if a REX.W prefix is present.
+        insn_ptr += (operand_width == 8) ? 4 : operand_width;
+        break;
+    }
+    switch (type & MODE_MASK) {
+      case IMM:
+        if (!(type & BYTE_OP)) {
+          switch (insn) {
+            case 0xB8: case 0xB9: case 0xBA: case 0xBB:
+            case 0xBC: case 0xBD: case 0xBE: case 0xBF:
+              // Allow MOV to/from 64bit addresses
+              insn_ptr += operand_width;
+              break;
+            default:
+              // All other immediates are at most 32 bits wide
+              insn_ptr += (operand_width == 8) ? 4 : operand_width;
+              break;
+          }
+          break;
+        }
+        // fall through
+      case IMM_BYTE:
+        insn_ptr++;
+        break;
+      case MEM_ABS:
+        insn_ptr += address_width;
+        break;
+    }
+  }
+  if (is_group) {
+    *is_group = found_group;
+  }
+  *ip = reinterpret_cast<const char *>(insn_ptr);
+  return insn;
+}
+#endif
+
+} // namespace
diff --git a/sandbox/linux/seccomp/x86_decode.h b/sandbox/linux/seccomp/x86_decode.h
new file mode 100644
index 0000000..6db26ab
--- /dev/null
+++ b/sandbox/linux/seccomp/x86_decode.h
@@ -0,0 +1,15 @@
+#ifndef X86_DECODE_H__
+#define X86_DECODE_H__
+namespace playground {
+enum { // Flag bits of an x86-64 REX prefix byte (0x40 - 0x4F)
+ REX_B = 0x01, // next_inst() adds 8 to the ModR/M "r/m" value if this is set
+ REX_X = 0x02, // per the x86-64 REX encoding: extends the SIB index field
+ REX_R = 0x04, // per the x86-64 REX encoding: extends the ModR/M reg field
+ REX_W = 0x08 // next_inst() switches to 8 byte operands if this is set
+};
+/* Decodes the instruction at *ip and advances *ip past the decoded bytes.
+ * "is64bit" selects 64-bit decoding; 0x40 - 0x4F are then REX prefixes.
+ * All remaining parameters are optional outputs and may be left at 0:
+ *   has_prefix   set to true if any prefix byte preceded the opcode
+ *   rex_ptr      set to the address of the REX prefix, or 0 if none
+ *   mod_rm_ptr   set to the address of the ModR/M byte, or 0 if none
+ *   sib_ptr      set to the address of the SIB byte, or 0 if none
+ *   is_group     set to true if the opcode is decoded through the group
+ *                table, i.e. the ModR/M reg field selects the operation
+ * Returns the opcode; two-byte opcodes are returned as 0x0F00 | second byte.
+ */
+unsigned short next_inst(const char **ip, bool is64bit, bool *has_prefix = 0,
+ char **rex_ptr = 0, char **mod_rm_ptr = 0,
+ char **sib_ptr = 0, bool *is_group = 0);
+} // namespace
+#endif // X86_DECODE_H__