diff options
author | markus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-08-11 21:46:07 +0000 |
---|---|---|
committer | markus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-08-11 21:46:07 +0000 |
commit | 0fb2bd939380e4d46bad10eb597bff4980ca7db2 (patch) | |
tree | 79d017b24dfb4d91059b856da7b8ad43764d76e6 /sandbox/linux | |
parent | 135b165d2bca7a9a7302eb4f771dc713c8100edb (diff) | |
download | chromium_src-0fb2bd939380e4d46bad10eb597bff4980ca7db2.zip chromium_src-0fb2bd939380e4d46bad10eb597bff4980ca7db2.tar.gz chromium_src-0fb2bd939380e4d46bad10eb597bff4980ca7db2.tar.bz2 |
Initial version of the Seccomp sandbox. Imported from http://code.google.com/p/seccompsandbox/
Make the seccomp sandbox dependent on the --enable-seccomp-sandbox flag
Review URL: http://codereview.chromium.org/165310
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@23087 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'sandbox/linux')
36 files changed, 11365 insertions, 0 deletions
diff --git a/sandbox/linux/seccomp/access.cc b/sandbox/linux/seccomp/access.cc new file mode 100644 index 0000000..0a0d0e5 --- /dev/null +++ b/sandbox/linux/seccomp/access.cc @@ -0,0 +1,77 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_access(const char *pathname, int mode) { + Debug::syscall(__NR_access, "Executing handler"); + size_t len = strlen(pathname); + struct Request { + int sysnum; + long long cookie; + Access access_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_access; + request->cookie = cookie(); + request->access_req.path_length = len; + request->access_req.mode = mode; + memcpy(request->pathname, pathname, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward access() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_access(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SysCalls sys; + Access access_req; + if (read(sys, sandboxFd, &access_req, sizeof(access_req)) != + sizeof(access_req)) { + read_parm_failed: + die("Failed to read parameters for access() [process]"); + } + int rc = -ENAMETOOLONG; + if (access_req.path_length >= sizeof(mem->pathname)) { + char buf[32]; + while (access_req.path_length > 0) { + size_t len = access_req.path_length > sizeof(buf) ? 
+ sizeof(buf) : access_req.path_length; + ssize_t i = read(sys, sandboxFd, buf, len); + if (i <= 0) { + goto read_parm_failed; + } + access_req.path_length -= i; + } + if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to return data from access() [process]"); + } + return false; + } + SecureMem::lockSystemCall(parentProc, mem); + if (read(sys, sandboxFd, mem->pathname, access_req.path_length) != + (ssize_t)access_req.path_length) { + goto read_parm_failed; + } + mem->pathname[access_req.path_length] = '\000'; + + // TODO(markus): Implement sandboxing policy + Debug::message(("Allowing access to \"" + std::string(mem->pathname) + + "\"").c_str()); + + // Tell trusted thread to access the file. + SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, __NR_access, + mem->pathname - (char*)mem + (char*)mem->self, + access_req.mode); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc new file mode 100644 index 0000000..109e5c6 --- /dev/null +++ b/sandbox/linux/seccomp/clone.cc @@ -0,0 +1,111 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_clone(int flags, void* stack, int* pid, int* ctid, + void* tls, void *wrapper_sp) { + Debug::syscall(__NR_clone, "Executing handler"); + struct { + int sysnum; + long long cookie; + Clone clone_req; + } __attribute__((packed)) request; + request.sysnum = __NR_clone; + request.cookie = cookie(); + request.clone_req.flags = flags; + request.clone_req.stack = stack; + request.clone_req.pid = pid; + request.clone_req.ctid = ctid; + request.clone_req.tls = tls; + + // Pass along the address on the stack where syscallWrapper() stored the + // original CPU registers. These registers will be restored in the newly + // created thread prior to returning from the wrapped system call. 
+ #if defined(__x86_64__) + memcpy(&request.clone_req.regs64, wrapper_sp, + sizeof(request.clone_req.regs64) + sizeof(void *)); + #elif defined(__i386__) + memcpy(&request.clone_req.regs32, wrapper_sp, + sizeof(request.clone_req.regs32) + sizeof(void *)); + #else + #error Unsupported target platform + #endif + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward clone() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_clone(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + Clone clone_req; + SysCalls sys; + if (read(sys, sandboxFd, &clone_req, sizeof(clone_req)) !=sizeof(clone_req)){ + die("Failed to read parameters for clone() [process]"); + } + + // TODO(markus): add policy restricting parameters for clone + if ((clone_req.flags & ~CLONE_DETACHED) != (CLONE_VM|CLONE_FS|CLONE_FILES| + CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS| + CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID)) { + SecureMem::abandonSystemCall(threadFd, -EPERM); + return false; + } else { + SecureMem::Args* newMem = getSecureMem(); + if (!newMem) { + SecureMem::abandonSystemCall(threadFd, -ENOMEM); + return false; + } else { + // clone() has unusual semantics. We don't want to return back into the + // trusted thread, but instead we need to continue execution at the IP + // where we got called initially. 
+ SecureMem::lockSystemCall(parentProc, mem); + mem->ret = clone_req.ret; + #if defined(__x86_64__) + mem->rbp = clone_req.regs64.rbp; + mem->rbx = clone_req.regs64.rbx; + mem->rcx = clone_req.regs64.rcx; + mem->rdx = clone_req.regs64.rdx; + mem->rsi = clone_req.regs64.rsi; + mem->rdi = clone_req.regs64.rdi; + mem->r8 = clone_req.regs64.r8; + mem->r9 = clone_req.regs64.r9; + mem->r10 = clone_req.regs64.r10; + mem->r11 = clone_req.regs64.r11; + mem->r12 = clone_req.regs64.r12; + mem->r13 = clone_req.regs64.r13; + mem->r14 = clone_req.regs64.r14; + mem->r15 = clone_req.regs64.r15; + #elif defined(__i386__) + mem->ret2 = clone_req.regs32.ret2; + mem->ebp = clone_req.regs32.ebp; + mem->edi = clone_req.regs32.edi; + mem->esi = clone_req.regs32.esi; + mem->edx = clone_req.regs32.edx; + mem->ecx = clone_req.regs32.ecx; + mem->ebx = clone_req.regs32.ebx; + #else + #error Unsupported target platform + #endif + newMem->sequence = 0; + newMem->shmId = -1; + mem->newSecureMem = newMem; + mem->processFdPub = processFdPub_; + mem->cloneFdPub = cloneFdPub_; + + SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, __NR_clone, + clone_req.flags, clone_req.stack, + clone_req.pid, clone_req.ctid, clone_req.tls); + return true; + } + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/debug.cc b/sandbox/linux/seccomp/debug.cc new file mode 100644 index 0000000..b4f30a4 --- /dev/null +++ b/sandbox/linux/seccomp/debug.cc @@ -0,0 +1,225 @@ +#ifndef NDEBUG + +#include "debug.h" + +namespace playground { + +bool Debug::enabled_; +int Debug::numSyscallNames_; +const char **Debug::syscallNames_; +std::map<int, std::string> Debug::syscallNamesMap_; + +Debug Debug::debug_; + +Debug::Debug() { + // Logging is disabled by default, but can be turned on by setting an + // appropriate environment variable. Initialize this code from a global + // constructor, so that it runs before the sandbox is turned on. 
+ enabled_ = !!getenv("SECCOMP_SANDBOX_DEBUGGING"); + + // Read names of system calls from header files, if available. Symbolic + // names make debugging so much nicer. + if (enabled_) { + static const char *filenames[] = { + #if __WORDSIZE == 64 + "/usr/include/asm/unistd_64.h", + #elif __WORDSIZE == 32 + "/usr/include/asm/unistd_32.h", + #endif + "/usr/include/asm/unistd.h", + NULL }; + numSyscallNames_ = 0; + for (const char **fn = filenames; *fn; ++fn) { + FILE *fp = fopen(*fn, "r"); + if (fp) { + std::string baseName; + int baseNum = -1; + char buf[80]; + while (fgets(buf, sizeof(buf), fp)) { + // Check if the line starts with "#define" + static const char* whitespace = " \t\r\n"; + char *token, *save; + token = strtok_r(buf, whitespace, &save); + if (token && !strcmp(token, "#define")) { + + // Only parse identifiers that start with "__NR_" + token = strtok_r(NULL, whitespace, &save); + if (token) { + if (strncmp(token, "__NR_", 5)) { + continue; + } + std::string syscallName(token + 5); + + // Parse the value of the symbol. Try to be forgiving in what + // we accept, as the file format might change over time. + token = strtok_r(NULL, "\r\n", &save); + if (token) { + // Some values are defined relative to previous values, we + // detect these examples by finding an earlier symbol name + // followed by a '+' plus character. + bool isRelative = false; + char *base = strstr(token, baseName.c_str()); + if (baseNum >= 0 && base) { + base += baseName.length(); + while (*base == ' ' || *base == '\t') { + ++base; + } + if (*base == '+') { + isRelative = true; + token = base; + } + } + + // Skip any characters that are not part of the syscall number. + while (*token < '0' || *token > '9') { + token++; + } + + // If we now have a valid datum, enter it into our map. + if (*token) { + int sysnum = atoi(token); + + // Deal with symbols that are defined relative to earlier + // ones. 
+ if (isRelative) { + sysnum += baseNum; + } else { + baseNum = sysnum; + baseName = syscallName; + } + + // Keep track of the highest syscall number that we know + // about. + if (sysnum >= numSyscallNames_) { + numSyscallNames_ = sysnum + 1; + } + + syscallNamesMap_[sysnum] = syscallName; + } + } + } + } + } + fclose(fp); + break; + } + } + if (numSyscallNames_) { + // We cannot make system calls at the time, when we are looking up + // the names. So, copy them into a data structure that can be + // accessed without having to allocated memory (i.e. no more STL). + syscallNames_ = reinterpret_cast<const char **>( + calloc(sizeof(char *), numSyscallNames_)); + for (std::map<int, std::string>::const_iterator iter = + syscallNamesMap_.begin(); + iter != syscallNamesMap_.end(); + ++iter) { + syscallNames_[iter->first] = iter->second.c_str(); + } + } + } +} + +void Debug::message(const char* msg) { + if (enabled_) { + Sandbox::SysCalls sys; + size_t len = strlen(msg); + if (len && msg[len-1] != '\n') { + // Write operations should be atomic, so that we don't interleave + // messages from multiple threads. Append a newline, if it is not + // already there. + char copy[len + 1]; + memcpy(copy, msg, len); + copy[len] = '\n'; + Sandbox::write(sys, 2, copy, len + 1); + } else { + Sandbox::write(sys, 2, msg, len); + } + } +} + +void Debug::syscall(int sysnum, const char* msg, int call) { + // This function gets called from the system call wrapper. Avoid calling + // any library functions that themselves need system calls. 
+ if (enabled_) { + const char *sysname = NULL; + if (sysnum >= 0 && sysnum < numSyscallNames_) { + sysname = syscallNames_[sysnum]; + } + char unnamed[40] = "Unnamed syscall #"; + if (!sysname) { + itoa(strrchr(sysname = unnamed, '\000'), sysnum); + } + #if defined(__NR_socketcall) || defined(__NR_ipc) + char extra[40]; + *extra = '\000'; + #if defined(__NR_socketcall) + if (sysnum == __NR_socketcall) { + static const char* socketcall_name[] = { + 0, "socket", "bind", "connect", "listen", "accept", "getsockname", + "getpeername", "socketpair", "send", "recv", "sendto","recvfrom", + "shutdown", "setsockopt", "getsockopt", "sendmsg", "recvmsg", + "accept4" + }; + if (call >= 1 && call < (int)(sizeof(socketcall_name)/sizeof(char *))) { + strcat(strcpy(extra, " "), socketcall_name[call]); + } else { + itoa(strcpy(extra, " #") + 2, call); + } + } + #endif + #if defined(__NR_ipc) + if (sysnum == __NR_ipc) { + static const char* ipc_name[] = { + 0, "semop", "semget", "semctl", "semtimedop", 0, 0, 0, 0, 0, 0, + "msgsnd", "msgrcv", "msgget", "msgctl", 0, 0, 0, 0, 0, 0, + "shmat", "shmdt", "shmget", "shmctl" }; + if (call >= 1 && call < (int)(sizeof(ipc_name)/sizeof(char *)) && + ipc_name[call]) { + strcat(strcpy(extra, " "), ipc_name[call]); + } else { + itoa(strcpy(extra, " #") + 2, call); + } + } + #endif + #else + static const char *extra = ""; + #endif + char buf[strlen(sysname) + strlen(extra) + (msg ? strlen(msg) : 0) + 4]; + strcat(strcat(strcat(strcat(strcpy(buf, sysname), extra), ": "), + msg ? 
msg : ""), "\n"); + message(buf); + } +} + +char* Debug::itoa(char* s, int n) { + // Remember return value + char *ret = s; + + // Insert sign for negative numbers + if (n < 0) { + *s++ = '-'; + n = -n; + } + + // Convert to decimal (in reverse order) + char *start = s; + do { + *s++ = '0' + (n % 10); + n /= 10; + } while (n); + *s-- = '\000'; + + // Reverse order of digits + while (start < s) { + char ch = *s; + *s-- = *start; + *start++ = ch; + } + + return ret; +} + +} // namespace + +#endif // NDEBUG diff --git a/sandbox/linux/seccomp/debug.h b/sandbox/linux/seccomp/debug.h new file mode 100644 index 0000000..728c55c --- /dev/null +++ b/sandbox/linux/seccomp/debug.h @@ -0,0 +1,58 @@ +#ifndef DEBUG_H__ +#define DEBUG_H__ + +#include <map> +#include <stdio.h> +#include <stdlib.h> +#include <string> +#include <string.h> + +#include "sandbox_impl.h" + +namespace playground { + +class Debug { + public: + // If debugging is enabled, write a message to stderr. + static void message(const char* msg) + #ifndef NDEBUG + asm("playground$debugMessage"); + #else + { } + #endif + + // If debugging is enabled, write the name of the syscall and an optional + // message to stderr. + static void syscall(int sysnum, const char* msg, int call = -1) + #ifndef NDEBUG + ; + #else + { } + #endif + + // Check whether debugging is enabled. 
+ static bool isEnabled() { + #ifndef NDEBUG + return enabled_; + #else + return false; + #endif + } + + private: + #ifndef NDEBUG + Debug(); + static char* itoa(char* s, int n); + + static Debug debug_; + + static bool enabled_; + static int numSyscallNames_; + static const char **syscallNames_; + static std::map<int, std::string> syscallNamesMap_; + #endif +}; + +} // namespace + +#endif // DEBUG_H__ diff --git a/sandbox/linux/seccomp/exit.cc b/sandbox/linux/seccomp/exit.cc new file mode 100644 index 0000000..23ebc55 --- /dev/null +++ b/sandbox/linux/seccomp/exit.cc @@ -0,0 +1,32 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_exit(int status) { + Debug::syscall(__NR_exit, "Executing handler"); + struct { + int sysnum; + long long cookie; + } __attribute__((packed)) request; + request.sysnum = __NR_exit; + request.cookie = cookie(); + + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request)) { + die("Failed to forward exit() request [sandbox]"); + } + for (;;) { + sys._exit(status); + } +} + +bool Sandbox::process_exit(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + SecureMem::lockSystemCall(parentProc, mem); + SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, __NR_exit, 0); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/getpid.cc b/sandbox/linux/seccomp/getpid.cc new file mode 100644 index 0000000..5eb32b8 --- /dev/null +++ b/sandbox/linux/seccomp/getpid.cc @@ -0,0 +1,11 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_getpid() { + Debug::syscall(__NR_getpid, "Executing handler"); + return pid_; +} + +} // namespace diff --git a/sandbox/linux/seccomp/gettid.cc b/sandbox/linux/seccomp/gettid.cc new file mode 100644 index 0000000..5414510 --- /dev/null +++ b/sandbox/linux/seccomp/gettid.cc @@ -0,0 +1,11 @@ +#include "debug.h" +#include 
"sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_gettid() { + Debug::syscall(__NR_gettid, "Executing handler"); + return tid(); +} + +} // namespace diff --git a/sandbox/linux/seccomp/ioctl.cc b/sandbox/linux/seccomp/ioctl.cc new file mode 100644 index 0000000..ac630a7 --- /dev/null +++ b/sandbox/linux/seccomp/ioctl.cc @@ -0,0 +1,52 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_ioctl(int d, int req, void *arg) { + Debug::syscall(__NR_ioctl, "Executing handler"); + struct { + int sysnum; + long long cookie; + IOCtl ioctl_req; + } __attribute__((packed)) request; + request.sysnum = __NR_ioctl; + request.cookie = cookie(); + request.ioctl_req.d = d; + request.ioctl_req.req = req; + request.ioctl_req.arg = arg; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward ioctl() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_ioctl(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + IOCtl ioctl_req; + SysCalls sys; + if (read(sys, sandboxFd, &ioctl_req, sizeof(ioctl_req)) !=sizeof(ioctl_req)){ + die("Failed to read parameters for ioctl() [process]"); + } + int rc = -EINVAL; + switch (ioctl_req.req) { + case TCGETS: + case TIOCGWINSZ: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_ioctl, + ioctl_req.d, ioctl_req.req, ioctl_req.arg); + return true; + default: + std::cerr << "Unsupported ioctl: 0x" << std::hex << ioctl_req.req << + std::endl; + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/ipc.cc b/sandbox/linux/seccomp/ipc.cc new file mode 100644 index 0000000..f3ad9a2 --- /dev/null +++ b/sandbox/linux/seccomp/ipc.cc @@ -0,0 +1,337 @@ +#include "debug.h" +#include "sandbox_impl.h" + 
+namespace playground { + +#ifndef IPC_PRIVATE +#define IPC_PRIVATE 0 +#endif +#ifndef IPC_RMID +#define IPC_RMID 0 +#endif +#ifndef IPC_64 +#define IPC_64 256 +#endif + +#if defined(__NR_shmget) +void* Sandbox::sandbox_shmat(int shmid, const void* shmaddr, int shmflg) { + Debug::syscall(__NR_shmat, "Executing handler"); + + struct { + int sysnum; + long long cookie; + ShmAt shmat_req; + } __attribute__((packed)) request; + request.sysnum = __NR_shmat; + request.cookie = cookie(); + request.shmat_req.shmid = shmid; + request.shmat_req.shmaddr = shmaddr; + request.shmat_req.shmflg = shmflg; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward shmat() request [sandbox]"); + } + return reinterpret_cast<void *>(rc); +} + +int Sandbox::sandbox_shmctl(int shmid, int cmd, void* buf) { + Debug::syscall(__NR_shmctl, "Executing handler"); + + struct { + int sysnum; + long long cookie; + ShmCtl shmctl_req; + } __attribute__((packed)) request; + request.sysnum = __NR_shmctl; + request.cookie = cookie(); + request.shmctl_req.shmid = shmid; + request.shmctl_req.cmd = cmd; + request.shmctl_req.buf = buf; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward shmctl() request [sandbox]"); + } + return static_cast<int>(rc); +} + +int Sandbox::sandbox_shmdt(const void* shmaddr) { + Debug::syscall(__NR_shmdt, "Executing handler"); + + struct { + int sysnum; + long long cookie; + ShmDt shmdt_req; + } __attribute__((packed)) request; + request.sysnum = __NR_shmdt; + request.cookie = cookie(); + request.shmdt_req.shmaddr = shmaddr; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { 
+ die("Failed to forward shmdt() request [sandbox]"); + } + return static_cast<int>(rc); +} + +int Sandbox::sandbox_shmget(int key, size_t size, int shmflg) { + Debug::syscall(__NR_shmget, "Executing handler"); + + struct { + int sysnum; + long long cookie; + ShmGet shmget_req; + } __attribute__((packed)) request; + request.sysnum = __NR_shmget; + request.cookie = cookie(); + request.shmget_req.key = key; + request.shmget_req.size = size; + request.shmget_req.shmflg = shmflg; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward shmget() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_shmat(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + ShmAt shmat_req; + SysCalls sys; + if (read(sys, sandboxFd, &shmat_req, sizeof(shmat_req)) != + sizeof(shmat_req)) { + die("Failed to read parameters for shmat() [process]"); + } + + // We only allow attaching to the shm identifier that was returned by + // the most recent call to shmget(IPC_PRIVATE) + if (shmat_req.shmaddr || shmat_req.shmflg || shmat_req.shmid != mem->shmId) { + mem->shmId = -1; + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + mem->shmId = -1; + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_shmat, shmat_req.shmid, shmat_req.shmaddr, + shmat_req.shmflg); + return true; +} + +bool Sandbox::process_shmctl(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + ShmCtl shmctl_req; + SysCalls sys; + if (read(sys, sandboxFd, &shmctl_req, sizeof(shmctl_req)) != + sizeof(shmctl_req)) { + die("Failed to read parameters for shmctl() [process]"); + } + + // The only shmctl() operation that we need to support is removal. This + // operation is generally safe. 
+ if ((shmctl_req.cmd & ~(IPC_64 | IPC_RMID)) || shmctl_req.buf) { + mem->shmId = -1; + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + mem->shmId = -1; + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_shmctl, shmctl_req.shmid, shmctl_req.cmd, + shmctl_req.buf); + return true; +} + +bool Sandbox::process_shmdt(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + ShmDt shmdt_req; + SysCalls sys; + if (read(sys, sandboxFd, &shmdt_req, sizeof(shmdt_req)) != + sizeof(shmdt_req)) { + die("Failed to read parameters for shmdt() [process]"); + } + + // Detaching shared memory segments it generally safe, but just in case + // of a kernel bug, we make sure that the address does not fall into any + // of the reserved memory regions. + ProtectedMap::const_iterator iter = protectedMap_.lower_bound( + (void *)shmdt_req.shmaddr); + if (iter != protectedMap_.begin()) { + --iter; + } + for (; iter != protectedMap_.end() && iter->first <= shmdt_req.shmaddr; + ++iter){ + if (shmdt_req.shmaddr < reinterpret_cast<void *>( + reinterpret_cast<char *>(iter->first) + iter->second) && + shmdt_req.shmaddr >= iter->first) { + mem->shmId = -1; + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + } + + mem->shmId = -1; + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_shmdt, shmdt_req.shmaddr); + return true; +} + +bool Sandbox::process_shmget(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + ShmGet shmget_req; + SysCalls sys; + if (read(sys, sandboxFd, &shmget_req, sizeof(shmget_req)) != + sizeof(shmget_req)) { + die("Failed to read parameters for shmget() [process]"); + } + + // We do not want to allow the sandboxed application to access arbitrary + // shared memory regions. We only allow it to access regions that it + // created itself. 
+ if (shmget_req.key != IPC_PRIVATE || shmget_req.shmflg & ~0777) { + mem->shmId = -1; + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + mem->shmId = -1; + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_shmget, shmget_req.key, shmget_req.size, + shmget_req.shmflg); + return true; +} +#endif + +#if defined(__NR_ipc) +#ifndef SHMAT +#define SHMAT 21 +#endif +#ifndef SHMDT +#define SHMDT 22 +#endif +#ifndef SHMGET +#define SHMGET 23 +#endif +#ifndef SHMCTL +#define SHMCTL 24 +#endif + +int Sandbox::sandbox_ipc(unsigned call, int first, int second, int third, + void* ptr, long fifth) { + Debug::syscall(__NR_ipc, "Executing handler", call); + struct { + int sysnum; + long long cookie; + IPC ipc_req; + } __attribute__((packed)) request; + request.sysnum = __NR_ipc; + request.cookie = cookie(); + request.ipc_req.call = call; + request.ipc_req.first = first; + request.ipc_req.second = second; + request.ipc_req.third = third; + request.ipc_req.ptr = ptr; + request.ipc_req.fifth = fifth; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward ipc() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_ipc(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + IPC ipc_req; + SysCalls sys; + if (read(sys, sandboxFd, &ipc_req, sizeof(ipc_req)) != sizeof(ipc_req)) { + die("Failed to read parameters for ipc() [process]"); + } + + // We do not support all of the SysV IPC calls. In fact, we only support + // the minimum feature set necessary for Chrome's renderers to share memory + // with the X server. 
+ switch (ipc_req.call) { + case SHMAT: { + // We only allow attaching to the shm identifier that was returned by + // the most recent call to shmget(IPC_PRIVATE) + if (ipc_req.ptr || ipc_req.second || ipc_req.first != mem->shmId) { + goto deny; + } + accept: + mem->shmId = -1; + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_ipc, ipc_req.call, ipc_req.first, + ipc_req.second, ipc_req.third, ipc_req.ptr, + ipc_req.fifth); + return true; + } + case SHMCTL: + // The only shmctl() operation that we need to support is removal. This + // operation is generally safe. + if ((ipc_req.second & ~(IPC_64 | IPC_RMID)) || ipc_req.ptr) { + goto deny; + } else { + goto accept; + } + case SHMDT: { + // Detaching shared memory segments it generally safe, but just in case + // of a kernel bug, we make sure that the address does not fall into any + // of the reserved memory regions. + ProtectedMap::const_iterator iter = protectedMap_.lower_bound( + (void *)ipc_req.ptr); + if (iter != protectedMap_.begin()) { + --iter; + } + for (; iter != protectedMap_.end() && iter->first <=ipc_req.ptr; ++iter){ + if (ipc_req.ptr < reinterpret_cast<void *>( + reinterpret_cast<char *>(iter->first) + iter->second) && + ipc_req.ptr >= iter->first) { + goto deny; + } + } + goto accept; + } + case SHMGET: + // We do not want to allow the sandboxed application to access arbitrary + // shared memory regions. We only allow it to access regions that it + // created itself. + if (ipc_req.first != IPC_PRIVATE || ipc_req.third & ~0777) { + goto deny; + } else { + goto accept; + } + default: + // Other than SysV shared memory, we do not actually need to support any + // other SysV IPC calls. 
+ deny: + mem->shmId = -1; + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } +} +#endif + +} // namespace diff --git a/sandbox/linux/seccomp/library.cc b/sandbox/linux/seccomp/library.cc new file mode 100644 index 0000000..a6c406e --- /dev/null +++ b/sandbox/linux/seccomp/library.cc @@ -0,0 +1,1360 @@ +#define XOPEN_SOURCE 500 +#include <algorithm> +#include <elf.h> +#include <errno.h> +#include <errno.h> +#include <fcntl.h> +#include <iostream> +#include <linux/unistd.h> +#include <set> +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/ptrace.h> +#include <sys/resource.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include "debug.h" +#include "library.h" +#include "sandbox_impl.h" +#include "syscall.h" +#include "syscall_table.h" +#include "x86_decode.h" + +#if defined(__x86_64__) +typedef Elf64_Phdr Elf_Phdr; +typedef Elf64_Rela Elf_Rel; + +typedef Elf64_Half Elf_Half; +typedef Elf64_Word Elf_Word; +typedef Elf64_Sword Elf_Sword; +typedef Elf64_Xword Elf_Xword; +typedef Elf64_Sxword Elf_Sxword; +typedef Elf64_Off Elf_Off; +typedef Elf64_Section Elf_Section; +typedef Elf64_Versym Elf_Versym; + +#define ELF_ST_BIND ELF64_ST_BIND +#define ELF_ST_TYPE ELF64_ST_TYPE +#define ELF_ST_INFO ELF64_ST_INFO +#define ELF_R_SYM ELF64_R_SYM +#define ELF_R_TYPE ELF64_R_TYPE +#define ELF_R_INFO ELF64_R_INFO + +#define ELF_REL_PLT ".rela.plt" +#define ELF_JUMP_SLOT R_X86_64_JUMP_SLOT +#elif defined(__i386__) +typedef Elf32_Phdr Elf_Phdr; +typedef Elf32_Rel Elf_Rel; + +typedef Elf32_Half Elf_Half; +typedef Elf32_Word Elf_Word; +typedef Elf32_Sword Elf_Sword; +typedef Elf32_Xword Elf_Xword; +typedef Elf32_Sxword Elf_Sxword; +typedef Elf32_Off Elf_Off; +typedef Elf32_Section Elf_Section; +typedef Elf32_Versym Elf_Versym; + +#define ELF_ST_BIND ELF32_ST_BIND +#define ELF_ST_TYPE ELF32_ST_TYPE +#define ELF_ST_INFO ELF32_ST_INFO +#define ELF_R_SYM ELF32_R_SYM +#define ELF_R_TYPE ELF32_R_TYPE +#define ELF_R_INFO 
ELF32_R_INFO + +#define ELF_REL_PLT ".rel.plt" +#define ELF_JUMP_SLOT R_386_JMP_SLOT +#else +#error Unsupported target platform +#endif + +namespace playground { + +char* Library::__kernel_vsyscall; +char* Library::__kernel_sigreturn; +char* Library::__kernel_rt_sigreturn; + +char* Library::getBytes(char* dst, const char* src, ssize_t len) { + // Some kernels don't allow accessing the VDSO from write() + if (isVDSO_ && + src >= memory_ranges_.begin()->second.start && + src <= memory_ranges_.begin()->second.stop) { + ssize_t max = + reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) - src; + if (len > max) { + len = max; + } + memcpy(dst, src, len); + return dst; + } + + // Read up to "len" bytes from "src" and copy them to "dst". Short + // copies are possible, if we are at the end of a mapping. Returns + // NULL, if the operation failed completely. + static int helper_socket[2]; + Sandbox::SysCalls sys; + if (!helper_socket[0] && !helper_socket[1]) { + // Copy data through a socketpair, as this allows us to access it + // without incurring a segmentation fault. + sys.socketpair(AF_UNIX, SOCK_STREAM, 0, helper_socket); + } + char* ptr = dst; + int inc = 4096; + while (len > 0) { + ssize_t l = inc == 1 ? 
inc : 4096 - (reinterpret_cast<long>(src) & 0xFFF); + if (l > len) { + l = len; + } + l = NOINTR_SYS(sys.write(helper_socket[0], src, l)); + if (l == -1) { + if (sys.my_errno == EFAULT) { + if (inc == 1) { + if (ptr == dst) { + return NULL; + } + break; + } + inc = 1; + continue; + } else { + return NULL; + } + } + l = sys.read(helper_socket[1], ptr, l); + if (l <= 0) { + return NULL; + } + ptr += l; + src += l; + len -= l; + } + return dst; +} + +char *Library::get(Elf_Addr offset, char *buf, size_t len) { + if (!valid_) { + memset(buf, 0, len); + return NULL; + } + RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset); + if (iter == memory_ranges_.end()) { + memset(buf, 0, len); + return NULL; + } + offset -= iter->first; + long size = reinterpret_cast<char *>(iter->second.stop) - + reinterpret_cast<char *>(iter->second.start); + if (offset > size - len) { + if (!maps_ && memory_ranges_.size() == 1 && + !memory_ranges_.begin()->first && !isVDSO_) { + // We are in the child and have exactly one mapping covering the whole + // library. We are trying to read data past the end of what is currently + // mapped. 
Check if we can expand the memory mapping to recover the + // needed data + Sandbox::SysCalls sys; + long new_size = (offset + len + 4095) & ~4095; + void *new_start = sys.mremap(iter->second.start, size, new_size, + MREMAP_MAYMOVE); + if (new_start != MAP_FAILED) { + memory_ranges_.clear(); + memory_ranges_.insert(std::make_pair(0, + Range(new_start, reinterpret_cast<void *>( + reinterpret_cast<char *>(new_start) + new_size), + PROT_READ))); + iter = memory_ranges_.begin(); + goto ok; + } + } + memset(buf, 0, len); + return NULL; + } +ok: + char *src = reinterpret_cast<char *>(iter->second.start) + offset; + memset(buf, 0, len); + if (!getBytes(buf, src, len)) { + return NULL; + } + return buf; +} + +std::string Library::get(Elf_Addr offset) { + if (!valid_) { + return ""; + } + RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset); + if (iter == memory_ranges_.end()) { + return ""; + } + offset -= iter->first; + size_t size = reinterpret_cast<char *>(iter->second.stop) - + reinterpret_cast<char *>(iter->second.start); + if (offset > size - 4096) { + if (!maps_ && memory_ranges_.size() == 1 && + !memory_ranges_.begin()->first && !isVDSO_) { + // We are in the child and have exactly one mapping covering the whole + // library. We are trying to read data past the end of what is currently + // mapped. Check if we can expand the memory mapping to recover the + // needed data. We assume that strings are never longer than 4kB. 
+ Sandbox::SysCalls sys; + long new_size = (offset + 4096 + 4095) & ~4095; + void *new_start = sys.mremap(iter->second.start, size, new_size, + MREMAP_MAYMOVE); + if (new_start != MAP_FAILED) { + memory_ranges_.clear(); + memory_ranges_.insert(std::make_pair(0, + Range(new_start, reinterpret_cast<void *>( + reinterpret_cast<char *>(new_start) + new_size), + PROT_READ))); + iter = memory_ranges_.begin(); + goto ok; + } + } + } +ok: + const char *start = reinterpret_cast<char *>(iter->second.start) + offset; + const char *stop = reinterpret_cast<char *>(iter->second.stop) + offset; + char buf[4096] = { 0 }; + getBytes(buf, start, stop - start >= (int)sizeof(buf) ? + sizeof(buf) - 1 : stop - start); + start = buf; + stop = buf; + while (*stop) { + ++stop; + } + std::string s = stop > start ? std::string(start, stop - start) : ""; + return s; +} + +char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) { + if (!valid_) { + memset(buf, 0, len); + return NULL; + } + if (maps_) { + return maps_->forwardGetRequest(this, offset, buf, len); + } + return get(offset, buf, len); +} + +std::string Library::getOriginal(Elf_Addr offset) { + if (!valid_) { + return ""; + } + if (maps_) { + return maps_->forwardGetRequest(this, offset); + } + return get(offset); +} + +const Elf_Ehdr* Library::getEhdr() { + if (!valid_) { + return NULL; + } + return &ehdr_; +} + +const Elf_Shdr* Library::getSection(const std::string& section) { + if (!valid_) { + return NULL; + } + SectionTable::const_iterator iter = section_table_.find(section); + if (iter == section_table_.end()) { + return NULL; + } + return &iter->second.second; +} + +const int Library::getSectionIndex(const std::string& section) { + if (!valid_) { + return -1; + } + SectionTable::const_iterator iter = section_table_.find(section); + if (iter == section_table_.end()) { + return -1; + } + return iter->second.first; +} + +void **Library::getRelocation(const std::string& symbol) { + PltTable::const_iterator iter = 
plt_entries_.find(symbol); + if (iter == plt_entries_.end()) { + return NULL; + } + return reinterpret_cast<void **>(asr_offset_ + iter->second); +} + +void *Library::getSymbol(const std::string& symbol) { + SymbolTable::const_iterator iter = symbols_.find(symbol); + if (iter == symbols_.end() || !iter->second.st_value) { + return NULL; + } + return asr_offset_ + iter->second.st_value; +} + +void Library::makeWritable(bool state) const { + for (RangeMap::const_iterator iter = memory_ranges_.begin(); + iter != memory_ranges_.end(); ++iter) { + const Range& range = iter->second; + long length = reinterpret_cast<char *>(range.stop) - + reinterpret_cast<char *>(range.start); + Sandbox::SysCalls sys; + sys.mprotect(range.start, length, + range.prot | (state ? PROT_WRITE : 0)); + } +} + +bool Library::isSafeInsn(unsigned short insn) { + // Check if the instruction has no unexpected side-effects. If so, it can + // be safely relocated from the function that we are patching into the + // out-of-line scratch space that we are setting up. This is often necessary + // to make room for the JMP into the scratch space. 
+ return ((insn & 0x7) < 0x6 && (insn & 0xF0) < 0x40 + /* ADD, OR, ADC, SBB, AND, SUB, XOR, CMP */) || + #if defined(__x86_64__) + insn == 0x63 /* MOVSXD */ || + #endif + (insn >= 0x80 && insn <= 0x8E /* ADD, OR, ADC, + SBB, AND, SUB, XOR, CMP, TEST, XCHG, MOV, LEA */) || + (insn == 0x90) || /* NOP */ + (insn >= 0xA0 && insn <= 0xA9) /* MOV, TEST */ || + (insn >= 0xB0 && insn <= 0xBF /* MOV */) || + (insn >= 0xC0 && insn <= 0xC1) || /* Bit Shift */ + (insn >= 0xD0 && insn <= 0xD3) || /* Bit Shift */ + (insn >= 0xC6 && insn <= 0xC7 /* MOV */) || + (insn == 0xF7) /* TEST, NOT, NEG, MUL, IMUL, DIV, IDIV */; +} + +char* Library::getScratchSpace(const Maps* maps, char* near, int needed, + char** extraSpace, int* extraLength) { + if (needed > *extraLength || + labs(*extraSpace - reinterpret_cast<char *>(near)) > (1536 << 20)) { + if (*extraSpace) { + // Start a new scratch page and mark any previous page as write-protected + Sandbox::SysCalls sys; + sys.mprotect(*extraSpace, 4096, PROT_READ|PROT_EXEC); + } + // Our new scratch space is initially executable and writable. 
+ *extraLength = 4096; + *extraSpace = maps->allocNearAddr(near, *extraLength, + PROT_READ|PROT_WRITE|PROT_EXEC); + } + if (*extraSpace) { + *extraLength -= needed; + return *extraSpace + *extraLength; + } + Sandbox::die("Insufficient space to intercept system call"); +} + +void Library::patchSystemCallsInFunction(const Maps* maps, char *start, + char *end, char** extraSpace, + int* extraLength) { + std::set<char *> branch_targets; + for (char *ptr = start; ptr < end; ) { + unsigned short insn = next_inst((const char **)&ptr, __WORDSIZE == 64); + char *target; + if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ || insn == 0xEB /* JMP */) { + target = ptr + (reinterpret_cast<signed char *>(ptr))[-1]; + } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ || + (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) { + target = ptr + (reinterpret_cast<int *>(ptr))[-1]; + } else { + continue; + } + branch_targets.insert(target); + } + struct Code { + char* addr; + int len; + unsigned short insn; + bool is_ip_relative; + } code[5] = { { 0 } }; + int codeIdx = 0; + char* ptr = start; + while (ptr < end) { + // Keep a ring-buffer of the last few instruction in order to find the + // correct place to patch the code. + char *mod_rm; + code[codeIdx].addr = ptr; + code[codeIdx].insn = next_inst((const char **)&ptr, __WORDSIZE == 64, + 0, 0, &mod_rm, 0, 0); + code[codeIdx].len = ptr - code[codeIdx].addr; + code[codeIdx].is_ip_relative = mod_rm && (*mod_rm & 0xC7) == 0x5; + + // Whenever we find a system call, we patch it with a jump to out-of-line + // code that redirects to our system call wrapper. + bool is_syscall = true; + #if defined(__x86_64__) + bool is_indirect_call = false; + if (code[codeIdx].insn == 0x0F05 /* SYSCALL */ || + // In addition, on x86-64, we need to redirect all CALLs between the + // VDSO and the VSyscalls page. We want these to jump to our own + // modified copy of the VSyscalls. 
As we know that the VSyscalls are + // always more than 2GB away from the VDSO, the compiler has to + // generate some form of indirect jumps. We can find all indirect + // CALLs and redirect them to a separate scratch area, where we can + // inspect the destination address. If it indeed points to the + // VSyscall area, we then adjust the destination address accordingly. + (is_indirect_call = + (isVDSO_ && vsys_offset_ && code[codeIdx].insn == 0xFF && + !code[codeIdx].is_ip_relative && + mod_rm && (*mod_rm & 0x38) == 0x10 /* CALL (indirect) */))) { + is_syscall = !is_indirect_call; + #elif defined(__i386__) + bool is_gs_call = false; + if (code[codeIdx].len == 7 && + code[codeIdx].insn == 0xFF && + code[codeIdx].addr[2] == '\x15' /* CALL (indirect) */ && + code[codeIdx].addr[0] == '\x65' /* %gs prefix */) { + char* target; + asm volatile("mov %%gs:(%1), %0\n" + : "=a"(target) + : "c"(*reinterpret_cast<int *>(code[codeIdx].addr+3))); + if (target == __kernel_vsyscall) { + is_gs_call = true; + // TODO(markus): also handle the other vsyscalls + } + } + if (is_gs_call || + (code[codeIdx].insn == 0xCD && + code[codeIdx].addr[1] == '\x80' /* INT $0x80 */)) { + #else + #error Unsupported target platform + #endif + // Found a system call. Search backwards to figure out how to redirect + // the code. We will need to overwrite a couple of instructions and, + // of course, move these instructions somewhere else. + int startIdx = codeIdx; + int endIdx = codeIdx; + int length = code[codeIdx].len; + for (int idx = codeIdx; + (idx = (idx + (sizeof(code) / sizeof(struct Code)) - 1) % + (sizeof(code) / sizeof(struct Code))) != codeIdx; ) { + std::set<char *>::const_iterator iter = + std::upper_bound(branch_targets.begin(), branch_targets.end(), + code[idx].addr); + if (iter != branch_targets.end() && *iter < ptr) { + // Found a branch pointing to somewhere past our instruction. This + // instruction cannot be moved safely. Leave it in place. 
+ break; + } + if (code[idx].addr && !code[idx].is_ip_relative && + isSafeInsn(code[idx].insn)) { + // These are all benign instructions with no side-effects and no + // dependency on the program counter. We should be able to safely + // relocate them. + startIdx = idx; + length = ptr - code[startIdx].addr; + } else { + break; + } + } + // Search forward past the system call, too. Sometimes, we can only + // find relocatable instructions following the system call. + #if defined(__i386__) + findEndIdx: + #endif + char *next = ptr; + for (int i = codeIdx; + (i = (i + 1) % (sizeof(code) / sizeof(struct Code))) != startIdx; + ) { + std::set<char *>::const_iterator iter = + std::lower_bound(branch_targets.begin(), branch_targets.end(), + next); + if (iter != branch_targets.end() && *iter == next) { + // Found branch target pointing to our instruction + break; + } + char *tmp_rm; + code[i].addr = next; + code[i].insn = next_inst((const char **)&next, __WORDSIZE == 64, + 0, 0, &tmp_rm, 0, 0); + code[i].len = next - code[i].addr; + code[i].is_ip_relative = tmp_rm && (*tmp_rm & 0xC7) == 0x5; + if (!code[i].is_ip_relative && isSafeInsn(code[i].insn)) { + endIdx = i; + length = next - code[startIdx].addr; + } else { + break; + } + } + // We now know, how many instructions neighboring the system call we + // can safely overwrite. We need five bytes to insert a JMP/CALL and a + // 32bit address. We then jump to a code fragment that safely forwards + // to our system call wrapper. On x86-64, this is complicated by + // the fact that the API allows up to 128 bytes of red-zones below the + // current stack pointer. So, we cannot write to the stack until we + // have adjusted the stack pointer. + // + // .. .. .. .. ; any leading instructions copied from original code + // 48 81 EC 80 00 00 00 SUB $0x80, %rsp + // 50 PUSH %rax + // 48 8D 05 .. .. .. .. LEA ...(%rip), %rax + // 50 PUSH %rax + // 48 B8 .. .. .. .. MOV $syscallWrapper, %rax + // .. .. .. .. 
+ // 50 PUSH %rax + // 48 8D 05 06 00 00 00 LEA 6(%rip), %rax + // 48 87 44 24 10 XCHG %rax, 16(%rsp) + // C3 RETQ + // 48 81 C4 80 00 00 00 ADD $0x80, %rsp + // .. .. .. .. ; any trailing instructions copied from original code + // E9 .. .. .. .. JMPQ ... + // + // Total: 52 bytes + any bytes that were copied + // + // On x86-32, the stack is available and we can do: + // + // TODO(markus): Try to maintain frame pointers on x86-32 + // + // .. .. .. .. ; any leading instructions copied from original code + // 68 .. .. .. .. PUSH return_addr + // 68 .. .. .. .. PUSH $syscallWrapper + // C3 RET + // .. .. .. .. ; any trailing instructions copied from original code + // C3 RET + // + // Total: 12 bytes + any bytes that were copied + // + // For indirect jumps from the VDSO to the VSyscall page, we instead + // replace the following code (this is only necessary on x86-64). This + // time, we don't have to worry about red zones: + // + // .. .. .. .. ; any leading instructions copied from original code + // E8 00 00 00 00 CALL . + // 48 83 04 24 .. ADDQ $.., (%rsp) + // FF .. .. .. .. .. PUSH .. ; from original CALL instruction + // 48 81 3C 24 00 00 00 FF CMPQ $0xFFFFFFFFFF000000, 0(%rsp) + // 72 10 JB . + 16 + // 81 2C 24 .. .. .. .. SUBL ..., 0(%rsp) + // C7 44 24 04 00 00 00 00 MOVL $0, 4(%rsp) + // C3 RETQ + // 48 87 04 24 XCHG %rax,(%rsp) + // 48 89 44 24 08 MOV %rax,0x8(%rsp) + // 58 POP %rax + // C3 RETQ + // .. .. .. .. ; any trailing instructions copied from original code + // E9 .. .. .. .. JMPQ ... + // + // Total: 52 bytes + any bytes that were copied + + if (length < 5) { + // There are a very small number of instruction sequences that we + // cannot easily intercept, and that have been observed in real world + // examples. Handle them here: + #if defined(__i386__) + int diff; + if (!memcmp(code[codeIdx].addr, "\xCD\x80\xEB", 3) && + (diff = *reinterpret_cast<signed char *>( + code[codeIdx].addr + 3)) < 0 && diff >= -6) { + // We have seen... 
+ // for (;;) { + // _exit(0); + // } + // ..get compiled to: + // B8 01 00 00 00 MOV $__NR_exit, %eax + // 66 90 XCHG %ax, %ax + // 31 DB 0:XOR %ebx, %ebx + // CD 80 INT $0x80 + // EB FA JMP 0b + // The JMP is really superfluous as the system call never returns. + // And there are in fact no returning system calls that need to be + // unconditionally repeated in an infinite loop. + // If we replace the JMP with NOPs, the system call can successfully + // be intercepted. + *reinterpret_cast<unsigned short *>(code[codeIdx].addr + 2) = 0x9090; + goto findEndIdx; + } + #endif + // If we cannot figure out any other way to intercept this system call, + // we replace it with a call to INT0. This causes a SEGV which we then + // handle in the signal handler. That's a lot slower than rewriting the + // instruction with a jump, but it should only happen very rarely. + if (is_syscall) { + memcpy(code[codeIdx].addr, "\xCD", 2); + if (code[codeIdx].len > 2) { + memset(code[codeIdx].addr + 2, 0x90, code[codeIdx].len - 2); + } + goto replaced; + } else { + Sandbox::die("Cannot intercept system call"); + } + } + int needed = 5 - code[codeIdx].len; + int first = codeIdx; + while (needed > 0 && first != startIdx) { + first = (first + (sizeof(code) / sizeof(struct Code)) - 1) % + (sizeof(code) / sizeof(struct Code)); + needed -= code[first].len; + } + int second = codeIdx; + while (needed > 0) { + second = (second + 1) % (sizeof(code) / sizeof(struct Code)); + needed -= code[second].len; + } + int preamble = code[codeIdx].addr - code[first].addr; + int postamble = code[second].addr + code[second].len - + code[codeIdx].addr - code[codeIdx].len; + + // The following is all the code that construct the various bits of + // assembly code. 
+ #if defined(__x86_64__) + if (is_indirect_call) { + needed = 52 + preamble + code[codeIdx].len + postamble; + } else { + needed = 52 + preamble + postamble; + } + #elif defined(__i386__) + needed = 12 + preamble + postamble; + #else + #error Unsupported target platform + #endif + + // Allocate scratch space and copy the preamble of code that was moved + // from the function that we are patching. + char* dest = getScratchSpace(maps, code[first].addr, needed, + extraSpace, extraLength); + memcpy(dest, code[first].addr, preamble); + + // For indirect calls, we need to copy the actual CALL instruction and + // turn it into a PUSH instruction. + #if defined(__x86_64__) + if (is_indirect_call) { + memcpy(dest + preamble, "\xE8\x00\x00\x00\x00\x48\x83\x04\x24", 9); + dest[preamble + 9] = code[codeIdx].len + 42; + memcpy(dest + preamble + 10, code[codeIdx].addr, code[codeIdx].len); + + // Convert CALL -> PUSH + dest[preamble + 10 + (mod_rm - code[codeIdx].addr)] |= 0x20; + preamble += 10 + code[codeIdx].len; + } + #endif + + // Copy the static body of the assembly code. + memcpy(dest + preamble, + #if defined(__x86_64__) + is_indirect_call ? + "\x48\x81\x3C\x24\x00\x00\x00\xFF\x72\x10\x81\x2C\x24\x00\x00\x00" + "\x00\xC7\x44\x24\x04\x00\x00\x00\x00\xC3\x48\x87\x04\x24\x48\x89" + "\x44\x24\x08\x58\xC3" : + "\x48\x81\xEC\x80\x00\x00\x00\x50\x48\x8D\x05\x00\x00\x00\x00\x50" + "\x48\xB8\x00\x00\x00\x00\x00\x00\x00\x00\x50\x48\x8D\x05\x06\x00" + "\x00\x00\x48\x87\x44\x24\x10\xC3\x48\x81\xC4\x80\x00\x00", + is_indirect_call ? 37 : 47 + #elif defined(__i386__) + "\x68\x00\x00\x00\x00\x68\x00\x00\x00\x00\xC3", 11 + #else + #error Unsupported target platform + #endif + ); + + // Copy the postamble that was moved from the function that we are + // patching. + memcpy(dest + preamble + + #if defined(__x86_64__) + (is_indirect_call ? 
37 : 47), + #elif defined(__i386__) + 11, + #else + #error Unsupported target platform + #endif + code[codeIdx].addr + code[codeIdx].len, + postamble); + + // Patch up the various computed values + #if defined(__x86_64__) + int post = preamble + (is_indirect_call ? 37 : 47) + postamble; + dest[post] = '\xE9'; + *reinterpret_cast<int *>(dest + post + 1) = + (code[second].addr + code[second].len) - (dest + post + 5); + if (is_indirect_call) { + *reinterpret_cast<int *>(dest + preamble + 13) = vsys_offset_; + } else { + *reinterpret_cast<int *>(dest + preamble + 11) = + (code[second].addr + code[second].len) - (dest + preamble + 15); + *reinterpret_cast<void **>(dest + preamble + 18) = + reinterpret_cast<void *>(&syscallWrapper); + } + #elif defined(__i386__) + *(dest + preamble + 11 + postamble) = '\xC3'; + *reinterpret_cast<char **>(dest + preamble + 1) = + dest + preamble + 11; + *reinterpret_cast<void (**)()>(dest + preamble + 6) = syscallWrapper; + #else + #error Unsupported target platform + #endif + + // Pad unused space in the original function with NOPs + memset(code[first].addr, 0x90 /* NOP */, + code[second].addr + code[second].len - code[first].addr); + + // Replace the system call with an unconditional jump to our new code. + #if defined(__x86_64__) + *code[first].addr = '\xE9'; // JMPQ + #elif defined(__i386__) + *code[first].addr = '\xE8'; // CALL + #else + #error Unsupported target platform + #endif + *reinterpret_cast<int *>(code[first].addr + 1) = + dest - (code[first].addr + 5); + } + replaced: + codeIdx = (codeIdx + 1) % (sizeof(code) / sizeof(struct Code)); + } +} + +void Library::patchVDSO(char** extraSpace, int* extraLength){ + #if defined(__i386__) + Sandbox::SysCalls sys; + if (!__kernel_vsyscall || + sys.mprotect(reinterpret_cast<void *>( + reinterpret_cast<long>(__kernel_vsyscall) & ~0xFFF), + 4096, PROT_READ|PROT_WRITE|PROT_EXEC)) { + return; + } + + // x86-32 has a small number of well-defined functions in the VDSO library. 
+ // These functions do not easily lend themselves to be rewritten by the + // automatic code. Instead, we explicitly find new definitions for them. + // + // We don't bother with optimizing the syscall instruction instead always + // use INT $0x80, no matter whether the hardware supports more modern + // calling conventions. + // + // TODO(markus): Investigate whether it is worthwhile to optimize this + // code path and use the platform-specific entry code. + if (__kernel_vsyscall) { + // Replace the kernel entry point with: + // + // E9 .. .. .. .. JMP syscallWrapper + *__kernel_vsyscall = '\xE9'; + *reinterpret_cast<long *>(__kernel_vsyscall + 1) = + reinterpret_cast<char *>(&syscallWrapper) - + reinterpret_cast<char *>(__kernel_vsyscall + 5); + } + if (__kernel_sigreturn) { + // Replace the sigreturn() system call with a jump to code that does: + // + // 58 POP %eax + // B8 77 00 00 00 MOV $0x77, %eax + // E9 .. .. .. .. JMP syscallWrapper + char* dest = getScratchSpace(maps_, __kernel_sigreturn, 11, extraSpace, + extraLength); + memcpy(dest, "\x58\xB8\x77\x00\x00\x00\xE9", 7); + *reinterpret_cast<char *>(dest + 7) = + reinterpret_cast<char *>(&syscallWrapper) - + reinterpret_cast<char *>(dest + 11); + *__kernel_sigreturn = '\xE9'; + *reinterpret_cast<char *>(__kernel_sigreturn + 1) = + dest - reinterpret_cast<char *>(__kernel_sigreturn + 5); + } + if (__kernel_rt_sigreturn) { + // Replace the rt_sigreturn() system call with a jump to code that does: + // + // B8 AD 00 00 00 MOV $0xAD, %eax + // E9 .. .. .. .. 
JMP syscallWrapper + char* dest = getScratchSpace(maps_, __kernel_rt_sigreturn, 10, extraSpace, + extraLength); + memcpy(dest, "\xB8\xAD\x00\x00\x00\xE9", 6); + *reinterpret_cast<char *>(dest + 6) = + reinterpret_cast<char *>(&syscallWrapper) - + reinterpret_cast<char *>(dest + 10); + *__kernel_rt_sigreturn = '\xE9'; + *reinterpret_cast<char *>(__kernel_rt_sigreturn + 1) = + dest - reinterpret_cast<char *>(__kernel_rt_sigreturn + 5); + } + #endif +} + +int Library::patchVSystemCalls() { + #if defined(__x86_64__) + // VSyscalls live in a shared 4kB page at the top of the address space. This + // page cannot be unmapped nor remapped. We have to create a copy within + // 2GB of the page, and rewrite all IP-relative accesses to shared variables. + // As the top of the address space is not accessible by mmap(), this means + // that we need to wrap around addresses to the bottom 2GB of the address + // space. + // Only x86-64 has VSyscalls. + if (maps_->vsyscall()) { + char* copy = maps_->allocNearAddr(maps_->vsyscall(), 0x1000, + PROT_READ|PROT_WRITE); + char* extraSpace = copy; + int extraLength = 0x1000; + memcpy(copy, maps_->vsyscall(), 0x1000); + long adjust = (long)maps_->vsyscall() - (long)copy; + for (int vsys = 0; vsys < 0x1000; vsys += 0x400) { + char* start = copy + vsys; + char* end = start + 0x400; + + // There can only be up to four VSyscalls starting at an offset of + // n*0x1000, each. VSyscalls are invoked by functions in the VDSO + // and provide fast implementations of a time source. We don't exactly + // know where the code and where the data is in the VSyscalls page. + // So, we disassemble the code for each function and find all branch + // targets within the function in order to find the last address of + // function. 
+ for (char *last = start, *vars = end, *ptr = start; ptr < end; ) { + new_function: + char* mod_rm; + unsigned short insn = next_inst((const char **)&ptr, true, 0, 0, + &mod_rm, 0, 0); + if (mod_rm && (*mod_rm & 0xC7) == 0x5) { + // Instruction has IP relative addressing mode. Adjust to reference + // the variables in the original VSyscall segment. + long offset = *reinterpret_cast<int *>(mod_rm + 1); + char* var = ptr + offset; + if (var >= ptr && var < vars) { + // Variables are stored somewhere past all the functions. Remember + // the first variable in the VSyscall slot, so that we stop + // scanning for instructions once we reach that address. + vars = var; + } + offset += adjust; + if ((offset >> 32) && (offset >> 32) != -1) { + Sandbox::die("Cannot patch [vsystemcall]"); + } + *reinterpret_cast<int *>(mod_rm + 1) = offset; + } + + // Check for jump targets to higher addresses (but within our own + // VSyscall slot). They extend the possible end-address of this + // function. + char *target = 0; + if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ || + insn == 0xEB /* JMP */) { + target = ptr + (reinterpret_cast<signed char *>(ptr))[-1]; + } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ || + (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) { + target = ptr + (reinterpret_cast<int *>(ptr))[-1]; + } + + // The function end is found, once the loop reaches the last valid + // address in the VSyscall slot, or once it finds a RET instruction + // that is not followed by any jump targets. Unconditional jumps that + // point backwards are treated the same as a RET instruction. + if (insn == 0xC3 /* RET */ || + (target < ptr && + (insn == 0xEB /* JMP */ || insn == 0xE9 /* JMP */))) { + if (last >= ptr) { + continue; + } else { + // The function can optionally be followed by more functions in + // the same VSyscall slot. Allow for alignment to a 16 byte + // boundary. 
If we then find more non-zero bytes, and if this is + // not the known start of the variables, assume a new function + // started. + for (; ptr < vars; ++ptr) { + if ((long)ptr & 0xF) { + if (*ptr && *ptr != '\x90' /* NOP */) { + goto new_function; + } + *ptr = '\x90'; // NOP + } else { + if (*ptr && *ptr != '\x90' /* NOP */) { + goto new_function; + } + break; + } + } + + // Translate all SYSCALLs to jumps into our system call handler. + patchSystemCallsInFunction(NULL, start, ptr, + &extraSpace, &extraLength); + break; + } + } + + // Adjust assumed end address for this function, if a valid jump + // target has been found that originates from the current instruction. + if (target > last && target < start + 0x100) { + last = target; + } + } + } + + // We are done. Write-protect our code and make it executable. + Sandbox::SysCalls sys; + sys.mprotect(copy, 0x1000, PROT_READ|PROT_EXEC); + return maps_->vsyscall() - copy; + } + #endif + return 0; +} + +void Library::patchSystemCalls() { + if (!valid_) { + return; + } + int extraLength = 0; + char* extraSpace = NULL; + if (isVDSO_) { + // patchVDSO() calls patchSystemCallsInFunction() which needs vsys_offset_ + // iff processing the VDSO library. So, make sure we call + // patchVSystemCalls() first. 
+ vsys_offset_ = patchVSystemCalls(); + #if defined(__i386__) + patchVDSO(&extraSpace, &extraLength); + return; + #endif + } + SectionTable::const_iterator iter; + if ((iter = section_table_.find(".text")) == section_table_.end()) { + return; + } + const Elf_Shdr& shdr = iter->second.second; + char* start = reinterpret_cast<char *>(shdr.sh_addr + asr_offset_); + char* stop = start + shdr.sh_size; + char* func = start; + int nopcount = 0; + bool has_syscall = false; + for (char *ptr = start; ptr < stop; ptr++) { + #if defined(__x86_64__) + if ((*ptr == '\x0F' && ptr[1] == '\x05' /* SYSCALL */) || + (isVDSO_ && *ptr == '\xFF')) { + #elif defined(__i386__) + if ((*ptr == '\xCD' && ptr[1] == '\x80' /* INT $0x80 */) || + (*ptr == '\x65' && ptr[1] == '\xFF' && + ptr[2] == '\x15' /* CALL %gs:.. */)) { + #else + #error Unsupported target platform + #endif + ptr++; + has_syscall = true; + nopcount = 0; + } else if (*ptr == '\x90' /* NOP */) { + nopcount++; + } else if (!(reinterpret_cast<long>(ptr) & 0xF)) { + if (nopcount > 2) { + // This is very likely the beginning of a new function. Functions + // are aligned on 16 byte boundaries and the preceding function is + // padded out with NOPs. + // + // For performance reasons, we quickly scan the entire text segment + // for potential SYSCALLs, and then patch the code in increments of + // individual functions. + if (has_syscall) { + has_syscall = false; + // Our quick scan of the function found a potential system call. + // Do a more thorough scan, now. + patchSystemCallsInFunction(maps_, func, ptr, &extraSpace, + &extraLength); + } + func = ptr; + } + nopcount = 0; + } else { + nopcount = 0; + } + } + if (has_syscall) { + // Patch any remaining system calls that were in the last function before + // the loop terminated. + patchSystemCallsInFunction(maps_, func, stop, &extraSpace, &extraLength); + } + + // Mark our scratch space as write-protected and executable. 
+ if (extraSpace) { + Sandbox::SysCalls sys; + sys.mprotect(extraSpace, 4096, PROT_READ|PROT_EXEC); + } +} + +bool Library::parseElf() { + valid_ = true; + + // Verify ELF header + Elf_Shdr str_shdr; + if (!getOriginal(0, &ehdr_) || + ehdr_.e_ehsize < sizeof(Elf_Ehdr) || + ehdr_.e_phentsize < sizeof(Elf_Phdr) || + ehdr_.e_shentsize < sizeof(Elf_Shdr) || + !getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize, + &str_shdr)) { + // Not all memory mappings are necessarily ELF files. Skip memory + // mappings that we cannot identify. + valid_ = false; + return false; + } + + // Find PT_DYNAMIC segment. This is what our PLT entries and symbols will + // point to. This information is probably incorrect in the child, as it + // requires access to the original memory mappings. + for (int i = 0; i < ehdr_.e_phnum; i++) { + Elf_Phdr phdr; + if (getOriginal(ehdr_.e_phoff + i*ehdr_.e_phentsize, &phdr) && + phdr.p_type == PT_DYNAMIC) { + RangeMap::const_iterator iter = + memory_ranges_.lower_bound(phdr.p_offset); + if (iter != memory_ranges_.end()) { + asr_offset_ = reinterpret_cast<char *>(iter->second.start) - + (phdr.p_vaddr - (phdr.p_offset - iter->first)); + } + break; + } + } + + // Parse section table and find all sections in this ELF file + for (int i = 0; i < ehdr_.e_shnum; i++) { + Elf_Shdr shdr; + if (!getOriginal(ehdr_.e_shoff + i*ehdr_.e_shentsize, &shdr)) { + continue; + } + section_table_.insert( + std::make_pair(getOriginal(str_shdr.sh_offset + shdr.sh_name), + std::make_pair(i, shdr))); + } + + return !isVDSO_ || parseSymbols(); +} + +bool Library::parseSymbols() { + if (!valid_) { + return false; + } + + Elf_Shdr str_shdr; + getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize, &str_shdr); + + // Find PLT and symbol tables + const Elf_Shdr* plt = getSection(ELF_REL_PLT); + const Elf_Shdr* symtab = getSection(".dynsym"); + Elf_Shdr strtab = { 0 }; + if (symtab) { + if (symtab->sh_link >= ehdr_.e_shnum || + !getOriginal(ehdr_.e_shoff + 
symtab->sh_link * ehdr_.e_shentsize, + &strtab)) { + Debug::message("Cannot find valid symbol table\n"); + valid_ = false; + return false; + } + } + + if (plt && symtab) { + // Parse PLT table and add its entries + for (int i = plt->sh_size/sizeof(Elf_Rel); --i >= 0; ) { + Elf_Rel rel; + if (!getOriginal(plt->sh_offset + i * sizeof(Elf_Rel), &rel) || + ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym) >= symtab->sh_size) { + Debug::message("Encountered invalid plt entry\n"); + valid_ = false; + return false; + } + + if (ELF_R_TYPE(rel.r_info) != ELF_JUMP_SLOT) { + continue; + } + Elf_Sym sym; + if (!getOriginal(symtab->sh_offset + + ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym), &sym) || + sym.st_shndx >= ehdr_.e_shnum) { + Debug::message("Encountered invalid symbol for plt entry\n"); + valid_ = false; + return false; + } + std::string name = getOriginal(strtab.sh_offset + sym.st_name); + if (name.empty()) { + continue; + } + plt_entries_.insert(std::make_pair(name, rel.r_offset)); + } + } + + if (symtab) { + // Parse symbol table and add its entries + for (Elf_Addr addr = 0; addr < symtab->sh_size; addr += sizeof(Elf_Sym)) { + Elf_Sym sym; + if (!getOriginal(symtab->sh_offset + addr, &sym) || + (sym.st_shndx >= ehdr_.e_shnum && + sym.st_shndx < SHN_LORESERVE)) { + Debug::message("Encountered invalid symbol\n"); + valid_ = false; + return false; + } + std::string name = getOriginal(strtab.sh_offset + sym.st_name); + if (name.empty()) { + continue; + } + symbols_.insert(std::make_pair(name, sym)); + } + } + + SymbolTable::const_iterator iter = symbols_.find("__kernel_vsyscall"); + if (iter != symbols_.end() && iter->second.st_value) { + __kernel_vsyscall = asr_offset_ + iter->second.st_value; + } + iter = symbols_.find("__kernel_sigreturn"); + if (iter != symbols_.end() && iter->second.st_value) { + __kernel_sigreturn = asr_offset_ + iter->second.st_value; + } + iter = symbols_.find("__kernel_rt_sigreturn"); + if (iter != symbols_.end() && iter->second.st_value) { + 
__kernel_rt_sigreturn = asr_offset_ + iter->second.st_value; + } + + return true; +} + +void Library::recoverOriginalDataParent(Maps* maps) { + maps_ = maps; +} + +void Library::recoverOriginalDataChild(const std::string& filename) { + if (isVDSO_) { + valid_ = true; + return; + } + if (memory_ranges_.empty() || memory_ranges_.rbegin()->first) { + failed: + memory_ranges_.clear(); + } else { + const Range& range = memory_ranges_.rbegin()->second; + struct Args { + void* old_addr; + long old_length; + void* new_addr; + long new_length; + long prot; + } args = { + range.start, + (reinterpret_cast<long>(range.stop) - + reinterpret_cast<long>(range.start) + 4095) & ~4095, + 0, + (memory_ranges_.begin()->first + + (reinterpret_cast<long>(memory_ranges_.begin()->second.stop) - + reinterpret_cast<long>(memory_ranges_.begin()->second.start)) + + 4095) & ~4095, + range.prot + }; + // We find the memory mapping that starts at file offset zero and + // extend it to cover the entire file. This is a little difficult to + // do, as the mapping needs to be moved to a different address. But + // we are potentially running code that is inside of this mapping at the + // time when it gets moved. + // + // We have to write the code in assembly. We allocate temporary + // storage and copy the critical code into this page. We then execute + // from this page, while we relocate the mapping. Finally, we allocate + // memory at the original location and copy the original data into it. + // The program can now resume execution. 
+ #if defined(__x86_64__) + asm volatile( + // new_addr = 4096 + mmap(0, new_length + 4096, + // PROT_READ|PROT_WRITE|PROT_EXEC, + // MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + "mov $0, %%r9\n" + "mov $-1, %%r8\n" + "mov $0x22, %%r10\n" + "mov $7, %%rdx\n" + "mov 0x18(%0), %%rsi\n" + "add $4096, %%rsi\n" + "mov $0, %%rdi\n" + "mov $9, %%rax\n" + "syscall\n" + "cmp $-4096, %%rax\n" + "ja 6f\n" + "mov %%rax, %%r12\n" + "add $4096, %%r12\n" + + // memcpy(new_addr - 4096, &&asm, asm_length) + "lea 2f(%%rip), %%rsi\n" + "lea 6f(%%rip), %%rdi\n" + "sub %%rsi, %%rdi\n" + "0:sub $1, %%rdi\n" + "test %%rdi, %%rdi\n" + "js 1f\n" + "movzbl (%%rsi, %%rdi, 1), %%ebx\n" + "mov %%bl, (%%rax, %%rdi, 1)\n" + "jmp 0b\n" + "1:\n" + + // ((void (*)())new_addr - 4096)() + "lea 6f(%%rip), %%rbx\n" + "push %%rbx\n" + "jmp *%%rax\n" + + // mremap(old_addr, old_length, new_length, + // MREMAP_MAYMOVE|MREMAP_FIXED, new_addr) + "2:mov %%r12, %%r8\n" + "mov $3, %%r10\n" + "mov 0x18(%0), %%rdx\n" + "mov 0x8(%0), %%rsi\n" + "mov 0(%0), %%rdi\n" + "mov $25, %%rax\n" + "syscall\n" + "cmp $-4096, %%rax\n" + "ja 5f\n" + + // mmap(old_addr, old_length, PROT_WRITE, + // MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) + "mov $0, %%r9\n" + "mov $-1, %%r8\n" + "mov $0x32, %%r10\n" + "mov $2, %%rdx\n" + "mov 0x8(%0), %%rsi\n" + "mov 0(%0), %%rdi\n" + "mov $9, %%rax\n" + "syscall\n" + "cmp $-12, %%eax\n" + "jz 4f\n" + "cmp $-4096, %%rax\n" + "ja 5f\n" + + // memcpy(old_addr, new_addr, old_length) + "mov 0x8(%0), %%rdi\n" + "3:sub $1, %%rdi\n" + "test %%rdi, %%rdi\n" + "js 4f\n" + "movzbl (%%r12, %%rdi, 1), %%ebx\n" + "mov %%bl, (%%rax, %%rdi, 1)\n" + "jmp 3b\n" + "4:\n" + + // mprotect(old_addr, old_length, prot) + "mov 0x20(%0), %%rdx\n" + "mov 0x8(%0), %%rsi\n" + "mov %%rax, %%rdi\n" + "mov $10, %%rax\n" + "syscall\n" + + // args.new_addr = new_addr + "mov %%r12, 0x10(%0)\n" + "5:retq\n" + + // munmap(new_addr - 4096, 4096) + "6:mov $4096, %%rsi\n" + "mov %%r12, %%rdi\n" + "sub %%rsi, %%rdi\n" + "mov $11, 
%%rax\n" + "syscall\n" + : + : "q"(&args) + : "rax", "rbx", "rcx", "rdx", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "memory"); + #elif defined(__i386__) + asm volatile( + "push %%ebp\n" + "push %%ebx\n" + "push %%edi\n" + + // new_addr = 4096 + mmap(0, new_length + 4096, + // PROT_READ|PROT_WRITE|PROT_EXEC, + // MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + "mov $0, %%ebp\n" + "mov $0x22, %%esi\n" + "mov $7, %%edx\n" + "mov 12(%%edi), %%ecx\n" + "add $4096, %%ecx\n" + "mov $-1, %%edi\n" + "mov $0, %%ebx\n" + "mov $192, %%eax\n" + "int $0x80\n" + "cmp $-4096, %%eax\n" + "ja 6f\n" + "mov %%eax, %%ebp\n" + "add $4096, %%ebp\n" + + // memcpy(new_addr - 4096, &&asm, asm_length) + "lea 2f, %%ecx\n" + "lea 6f, %%ebx\n" + "sub %%ecx, %%ebx\n" + "0:dec %%ebx\n" + "test %%ebx, %%ebx\n" + "js 1f\n" + "movzbl (%%ecx, %%ebx, 1), %%edx\n" + "mov %%dl, (%%eax, %%ebx, 1)\n" + "jmp 0b\n" + "1:\n" + + // ((void (*)())new_addr - 4096)() + "lea 6f, %%ebx\n" + "push %%ebx\n" + "jmp *%%eax\n" + + // mremap(old_addr, old_length, new_length, + // MREMAP_MAYMOVE|MREMAP_FIXED, new_addr) + "2:push %%ebp\n" + "mov $3, %%esi\n" + "mov 8(%%esp), %%edi\n" + "mov 12(%%edi), %%edx\n" + "mov 4(%%edi), %%ecx\n" + "mov 0(%%edi), %%ebx\n" + "mov %%ebp, %%edi\n" + "mov $163, %%eax\n" + "int $0x80\n" + "cmp $-4096, %%eax\n" + "ja 5f\n" + + // mmap(old_addr, old_length, PROT_WRITE, + // MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) + "mov $0, %%ebp\n" + "mov $0x32, %%esi\n" + "mov $2, %%edx\n" + "mov 8(%%esp), %%edi\n" + "mov 4(%%edi), %%ecx\n" + "mov 0(%%edi), %%ebx\n" + "mov $-1, %%edi\n" + "mov $192, %%eax\n" + "int $0x80\n" + "cmp $-12, %%eax\n" + "jz 4f\n" + "cmp $-4096, %%eax\n" + "ja 5f\n" + + // memcpy(old_addr, new_addr, old_length) + "mov 0(%%esp), %%ecx\n" + "mov 8(%%esp), %%edi\n" + "mov 4(%%edi), %%ebx\n" + "3:dec %%ebx\n" + "test %%ebx, %%ebx\n" + "js 4f\n" + "movzbl (%%ecx, %%ebx, 1), %%edx\n" + "mov %%dl, (%%eax, %%ebx, 1)\n" + "jmp 3b\n" + "4:\n" + + // mprotect(old_addr, old_length, prot) + 
"mov 8(%%esp), %%edi\n" + "mov 16(%%edi), %%edx\n" + "mov 4(%%edi), %%ecx\n" + "mov %%eax, %%ebx\n" + "mov $125, %%eax\n" + "int $0x80\n" + + // args.new_addr = new_addr + "mov 8(%%esp), %%edi\n" + "mov 0(%%esp), %%ebp\n" + "mov %%ebp, 0x8(%%edi)\n" + + "5:pop %%ebx\n" + "ret\n" + + // munmap(new_addr - 4096, 4096) + "6:mov $4096, %%ecx\n" + "sub %%ecx, %%ebx\n" + "mov $91, %%eax\n" + "int $0x80\n" + "pop %%edi\n" + "pop %%ebx\n" + "pop %%ebp\n" + : + : "D"(&args) + : "eax", "ecx", "edx", "esi", "memory"); + #else + #error Unsupported target platform + #endif + if (!args.new_addr) { + goto failed; + } + + memory_ranges_.clear(); + memory_ranges_.insert(std::make_pair(0, Range(args.new_addr, + reinterpret_cast<char *>(args.new_addr) + args.new_length, + PROT_READ))); + valid_ = true; + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/library.h b/sandbox/linux/seccomp/library.h new file mode 100644 index 0000000..002992b --- /dev/null +++ b/sandbox/linux/seccomp/library.h @@ -0,0 +1,164 @@ +#ifndef LIBRARY_H__ +#define LIBRARY_H__ + +#include <elf.h> +#include <map> +#include <set> +#include <string> +#include <string.h> +#include <sys/mman.h> + +#include "maps.h" + +#if defined(__x86_64__) +typedef Elf64_Ehdr Elf_Ehdr; +typedef Elf64_Shdr Elf_Shdr; +typedef Elf64_Sym Elf_Sym; +typedef Elf64_Addr Elf_Addr; +#elif defined(__i386__) +typedef Elf32_Ehdr Elf_Ehdr; +typedef Elf32_Shdr Elf_Shdr; +typedef Elf32_Sym Elf_Sym; +typedef Elf32_Addr Elf_Addr; +#else +#error Unsupported target platform +#endif + +struct SyscallTable; +namespace playground { + +class Library { + friend class Maps; + public: + Library() : + valid_(false), + isVDSO_(false), + asr_offset_(0), + vsys_offset_(0), + maps_(0) { + } + + void addMemoryRange(void* start, void* stop, Elf_Addr offset, int prot, + int isVDSO) { + memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot))); + isVDSO_ = isVDSO; + } + + char *get(Elf_Addr offset, char *buf, size_t len); + std::string 
get(Elf_Addr offset); + char *getOriginal(Elf_Addr offset, char *buf, size_t len); + std::string getOriginal(Elf_Addr offset); + + // Delegates to the char* overload of get(), using *t as the sizeof(T)-byte + // buffer. If the library is not valid, *t is zero-filled and NULL is + // returned. + template<class T>T* get(Elf_Addr offset, T* t) { + if (!valid_) { + memset(t, 0, sizeof(T)); + return NULL; + } + return reinterpret_cast<T *>(get(offset, reinterpret_cast<char *>(t), + sizeof(T))); + } + + // Same contract as the get() template above, except that the request is + // sent through maps_->forwardGetRequest() whenever maps_ is set. + template<class T>T* getOriginal(Elf_Addr offset, T* t) { + if (!valid_) { + memset(t, 0, sizeof(T)); + // Return a real null pointer, as get() does. "false" is not a null + // pointer constant in C++11 and later. + return NULL; + } + if (maps_) { + return reinterpret_cast<T *>(maps_->forwardGetRequest( + this, offset, reinterpret_cast<char *>(t), sizeof(T))); + } + return get(offset, t); + } + + // Stores *value directly at addr. The only check performed is valid_. + template<class T>bool set(void *addr, T* value) { + if (!valid_) { + return false; + } + *reinterpret_cast<T *>(addr) = *value; + return true; + } + + // Stores *value at a file offset. The map is ordered by descending offset, + // so lower_bound() yields the range with the largest start <= offset. + // Returns false if no such range exists or if a T does not fit inside it. + template<class T>bool set(Elf_Addr offset, T* value) { + if (!valid_) { + return false; + } + RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset); + if (iter == memory_ranges_.end()) { + return false; + } + offset -= iter->first; + // Compute the range size first: subtracting sizeof(T) from a range that + // is smaller than T would wrap around in unsigned arithmetic and let the + // write below run past the end of the mapping. + const size_t size = reinterpret_cast<char *>(iter->second.stop) - + reinterpret_cast<char *>(iter->second.start); + if (size < sizeof(T) || offset > size - sizeof(T)) { + return false; + } + *reinterpret_cast<T *>( + reinterpret_cast<char *>(iter->second.start) + offset) = *value; + return true; + } + + const Elf_Ehdr* getEhdr(); + const Elf_Shdr* getSection(const std::string& section); + const int getSectionIndex(const std::string& section); + void **getRelocation(const std::string& symbol); + void *getSymbol(const std::string& symbol); + void makeWritable(bool state) const; + void patchSystemCalls(); + bool isVDSO() const { return isVDSO_; } + + protected: + bool parseElf(); + bool parseSymbols(); + void recoverOriginalDataParent(Maps* maps); + void recoverOriginalDataChild(const std::string& child); + + private: + class GreaterThan : public std::binary_function<Elf_Addr, Elf_Addr, bool> { + public: + bool operator() (Elf_Addr s1, Elf_Addr s2) const { + return s1 > s2; + } + }; + + struct Range { +
Range(void* start, void* stop, int prot) : + start(start), stop(stop), prot(prot) { } + void* start; + void* stop; + int prot; + }; + + typedef std::map<Elf_Addr, Range, GreaterThan> RangeMap; + typedef std::map<std::string, std::pair<int, Elf_Shdr> > SectionTable; + typedef std::map<std::string, Elf_Sym> SymbolTable; + typedef std::map<std::string, Elf_Addr> PltTable; + + char* getBytes(char* dst, const char* src, ssize_t len); + static bool isSafeInsn(unsigned short insn); + static int isSimpleSystemCall(char *start, char *end); + static char* getScratchSpace(const Maps* maps, char* near, int needed, + char** extraSpace, int* extraLength); + void patchSystemCallsInFunction(const Maps* maps, char *start, char *end, + char** extraSpace, int* extraLength); + int patchVSystemCalls(); + void patchVDSO(char** extraSpace, int* extraLength); + + RangeMap memory_ranges_; + bool valid_; + bool isVDSO_; + char* asr_offset_; + int vsys_offset_; + Maps* maps_; + Elf_Ehdr ehdr_; + SectionTable section_table_; + SymbolTable symbols_; + PltTable plt_entries_; + static char* __kernel_vsyscall; + static char* __kernel_sigreturn; + static char* __kernel_rt_sigreturn; +}; + +} // namespace + +#endif // LIBRARY_H__ diff --git a/sandbox/linux/seccomp/linux_syscall_support.h b/sandbox/linux/seccomp/linux_syscall_support.h new file mode 100644 index 0000000..876c279 --- /dev/null +++ b/sandbox/linux/seccomp/linux_syscall_support.h @@ -0,0 +1,3173 @@ +/* Copyright (c) 2005-2008, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Markus Gutschke + */ + +/* This file includes Linux-specific support functions common to the + * coredumper and the thread lister; primarily, this is a collection + * of direct system calls, and a couple of symbols missing from + * standard header files. + * There are a few options that the including file can set to control + * the behavior of this file: + * + * SYS_CPLUSPLUS: + * The entire header file will normally be wrapped in 'extern "C" { }", + * making it suitable for compilation as both C and C++ source. If you + * do not want to do this, you can set the SYS_CPLUSPLUS macro to inhibit + * the wrapping. N.B. doing so will suppress inclusion of all prerequisite + * system header files, too. 
It is the caller's responsibility to provide + * the necessary definitions. + * + * SYS_ERRNO: + * All system calls will update "errno" unless overridden by setting the + * SYS_ERRNO macro prior to including this file. SYS_ERRNO should be + * an l-value. + * + * SYS_INLINE: + * New symbols will be defined "static inline", unless overridden by + * the SYS_INLINE macro. + * + * SYS_LINUX_SYSCALL_SUPPORT_H: + * This macro is used to avoid multiple inclusions of this header file. + * If you need to include this file more than once, make sure to + * unset SYS_LINUX_SYSCALL_SUPPORT_H before each inclusion. + * + * SYS_PREFIX: + * New system calls will have a prefix of "sys_" unless overridden by + * the SYS_PREFIX macro. Valid values for this macro are [0..9] which + * results in prefixes "sys[0..9]_". It is also possible to set this + * macro to -1, which avoids all prefixes. + * + * This file defines a few internal symbols that all start with "LSS_". + * Do not access these symbols from outside this file. They are not part + * of the supported API. + */ +#ifndef SYS_LINUX_SYSCALL_SUPPORT_H +#define SYS_LINUX_SYSCALL_SUPPORT_H + +/* We currently only support x86-32, x86-64, ARM, MIPS, and PPC on Linux. + * Porting to other related platforms should not be difficult. + */ +#if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__) || \ + defined(__mips__) || defined(__PPC__)) && defined(__linux) + +#ifndef SYS_CPLUSPLUS +#ifdef __cplusplus +/* Some system header files in older versions of gcc neglect to properly + * handle being included from C++. As it appears to be harmless to have + * multiple nested 'extern "C"' blocks, just add another one here.
+ */ +extern "C" { +#endif + +#include <errno.h> +#include <signal.h> +#include <stdarg.h> +#include <string.h> +#include <sys/ptrace.h> +#include <sys/resource.h> +#include <sys/time.h> +#include <sys/types.h> +#include <syscall.h> +#include <unistd.h> +#include <linux/unistd.h> +#include <endian.h> + +#ifdef __mips__ +/* Include definitions of the ABI currently in use. */ +#include <sgidefs.h> +#endif + +#endif + +/* As glibc often provides subtly incompatible data structures (and implicit + * wrapper functions that convert them), we provide our own kernel data + * structures for use by the system calls. + * These structures have been developed by using Linux 2.6.23 headers for + * reference. Note though, we do not care about exact API compatibility + * with the kernel, and in fact the kernel often does not have a single + * API that works across architectures. Instead, we try to mimic the glibc + * API where reasonable, and only guarantee ABI compatibility with the + * kernel headers. + * Most notably, here are a few changes that were made to the structures + * defined by kernel headers: + * + * - we only define structures, but not symbolic names for kernel data + * types. For the latter, we directly use the native C datatype + * (i.e. "unsigned" instead of "mode_t"). + * - in a few cases, it is possible to define identical structures for + * both 32bit (e.g. i386) and 64bit (e.g. x86-64) platforms by + * standardizing on the 64bit version of the data types. In particular, + * this means that we use "unsigned" where the 32bit headers say + * "unsigned long". + * - overall, we try to minimize the number of cases where we need to + * conditionally define different structures. + * - the "struct kernel_sigaction" class of structures have been + * modified to more closely mimic glibc's API by introducing an + * anonymous union for the function pointer. 
+ * - a small number of field names had to have an underscore appended to + * them, because glibc defines a global macro by the same name. + */ + +/* include/linux/dirent.h */ +struct kernel_dirent64 { + unsigned long long d_ino; + long long d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[256]; +}; + +/* include/linux/dirent.h */ +struct kernel_dirent { + long d_ino; + long d_off; + unsigned short d_reclen; + char d_name[256]; +}; + +/* include/linux/uio.h */ +struct kernel_iovec { + void *iov_base; + unsigned long iov_len; +}; + +/* include/linux/socket.h */ +struct kernel_msghdr { + void *msg_name; + int msg_namelen; + struct kernel_iovec*msg_iov; + unsigned long msg_iovlen; + void *msg_control; + unsigned long msg_controllen; + unsigned msg_flags; +}; + +/* include/asm-generic/poll.h */ +struct kernel_pollfd { + int fd; + short events; + short revents; +}; + +/* include/linux/resource.h */ +struct kernel_rlimit { + unsigned long rlim_cur; + unsigned long rlim_max; +}; + +/* include/linux/time.h */ +struct kernel_timespec { + long tv_sec; + long tv_nsec; +}; + +/* include/linux/time.h */ +struct kernel_timeval { + long tv_sec; + long tv_usec; +}; + +/* include/linux/resource.h */ +struct kernel_rusage { + struct kernel_timeval ru_utime; + struct kernel_timeval ru_stime; + long ru_maxrss; + long ru_ixrss; + long ru_idrss; + long ru_isrss; + long ru_minflt; + long ru_majflt; + long ru_nswap; + long ru_inblock; + long ru_oublock; + long ru_msgsnd; + long ru_msgrcv; + long ru_nsignals; + long ru_nvcsw; + long ru_nivcsw; +}; + +struct siginfo; +#if defined(__i386__) || defined(__ARM_ARCH_3__) || defined(__PPC__) + +/* include/asm-{arm,i386,mips,ppc}/signal.h */ +struct kernel_old_sigaction { + union { + void (*sa_handler_)(int); + void (*sa_sigaction_)(int, struct siginfo *, void *); + }; + unsigned long sa_mask; + unsigned long sa_flags; + void (*sa_restorer)(void); +} __attribute__((packed,aligned(4))); +#elif (defined(__mips__) && _MIPS_SIM 
== _MIPS_SIM_ABI32) + #define kernel_old_sigaction kernel_sigaction +#endif + +/* Some kernel functions (e.g. sigaction() in 2.6.23) require that the + * size passed to them exactly match the size of the signal set, even + * though the API was intended to be extensible. We define our own + * KERNEL_NSIG to deal with this. + * Please note that glibc provides signals [1.._NSIG-1], whereas the + * kernel (and this header) provides the range [1..KERNEL_NSIG]. The + * actual number of signals is obviously the same, but the constants + * differ by one. + */ +#ifdef __mips__ +#define KERNEL_NSIG 128 +#else +#define KERNEL_NSIG 64 +#endif + +/* include/asm-{arm,i386,mips,x86_64}/signal.h */ +struct kernel_sigset_t { + unsigned long sig[(KERNEL_NSIG + 8*sizeof(unsigned long) - 1)/ + (8*sizeof(unsigned long))]; +}; + +/* include/asm-{arm,i386,mips,x86_64,ppc}/signal.h */ +struct kernel_sigaction { +#ifdef __mips__ + unsigned long sa_flags; + union { + void (*sa_handler_)(int); + void (*sa_sigaction_)(int, struct siginfo *, void *); + }; + struct kernel_sigset_t sa_mask; +#else + union { + void (*sa_handler_)(int); + void (*sa_sigaction_)(int, struct siginfo *, void *); + }; + unsigned long sa_flags; + void (*sa_restorer)(void); + struct kernel_sigset_t sa_mask; +#endif +}; + +/* include/linux/socket.h */ +struct kernel_sockaddr { + unsigned short sa_family; + char sa_data[14]; +}; + +/* include/asm-{arm,i386,mips,ppc}/stat.h */ +#ifdef __mips__ +#if _MIPS_SIM == _MIPS_SIM_ABI64 +struct kernel_stat { +#else +struct kernel_stat64 { +#endif + unsigned st_dev; + unsigned __pad0[3]; + unsigned long long st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned st_rdev; + unsigned __pad1[3]; + long long st_size; + unsigned st_atime_; + unsigned st_atime_nsec_; + unsigned st_mtime_; + unsigned st_mtime_nsec_; + unsigned st_ctime_; + unsigned st_ctime_nsec_; + unsigned st_blksize; + unsigned __pad2; + unsigned long long st_blocks; +}; +#elif defined __PPC__
+struct kernel_stat64 { + unsigned long long st_dev; + unsigned long long st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned long long st_rdev; + unsigned short int __pad2; + long long st_size; + long st_blksize; + long long st_blocks; + long st_atime_; + unsigned long st_atime_nsec_; + long st_mtime_; + unsigned long st_mtime_nsec_; + long st_ctime_; + unsigned long st_ctime_nsec_; + unsigned long __unused4; + unsigned long __unused5; +}; +#else +struct kernel_stat64 { + unsigned long long st_dev; + unsigned char __pad0[4]; + unsigned __st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned long long st_rdev; + unsigned char __pad3[4]; + long long st_size; + unsigned st_blksize; + unsigned long long st_blocks; + unsigned st_atime_; + unsigned st_atime_nsec_; + unsigned st_mtime_; + unsigned st_mtime_nsec_; + unsigned st_ctime_; + unsigned st_ctime_nsec_; + unsigned long long st_ino; +}; +#endif + +/* include/asm-{arm,i386,mips,x86_64,ppc}/stat.h */ +#if defined(__i386__) || defined(__ARM_ARCH_3__) +struct kernel_stat { + /* The kernel headers suggest that st_dev and st_rdev should be 32bit + * quantities encoding 12bit major and 20bit minor numbers in an interleaved + * format. In reality, we do not see useful data in the top bits. So, + * we'll leave the padding in here, until we find a better solution. 
+ */ + unsigned short st_dev; + short pad1; + unsigned st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + unsigned short st_rdev; + short pad2; + unsigned st_size; + unsigned st_blksize; + unsigned st_blocks; + unsigned st_atime_; + unsigned st_atime_nsec_; + unsigned st_mtime_; + unsigned st_mtime_nsec_; + unsigned st_ctime_; + unsigned st_ctime_nsec_; + unsigned __unused4; + unsigned __unused5; +}; +#elif defined(__x86_64__) +struct kernel_stat { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned st_mode; + unsigned st_uid; + unsigned st_gid; + unsigned __pad0; + unsigned long st_rdev; + long st_size; + long st_blksize; + long st_blocks; + unsigned long st_atime_; + unsigned long st_atime_nsec_; + unsigned long st_mtime_; + unsigned long st_mtime_nsec_; + unsigned long st_ctime_; + unsigned long st_ctime_nsec_; + long __unused[3]; +}; +#elif defined(__PPC__) +struct kernel_stat { + unsigned st_dev; + unsigned long st_ino; // ino_t + unsigned long st_mode; // mode_t + unsigned short st_nlink; // nlink_t + unsigned st_uid; // uid_t + unsigned st_gid; // gid_t + unsigned st_rdev; + long st_size; // off_t + unsigned long st_blksize; + unsigned long st_blocks; + unsigned long st_atime_; + unsigned long st_atime_nsec_; + unsigned long st_mtime_; + unsigned long st_mtime_nsec_; + unsigned long st_ctime_; + unsigned long st_ctime_nsec_; + unsigned long __unused4; + unsigned long __unused5; +}; +#elif (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64) +struct kernel_stat { + unsigned st_dev; + int st_pad1[3]; + unsigned st_ino; + unsigned st_mode; + unsigned st_nlink; + unsigned st_uid; + unsigned st_gid; + unsigned st_rdev; + int st_pad2[2]; + long st_size; + int st_pad3; + long st_atime_; + long st_atime_nsec_; + long st_mtime_; + long st_mtime_nsec_; + long st_ctime_; + long st_ctime_nsec_; + int st_blksize; + int st_blocks; + int st_pad4[14]; +}; +#endif + +/* 
include/asm-{arm,i386,mips,x86_64,ppc}/statfs.h */ +#ifdef __mips__ +#if _MIPS_SIM != _MIPS_SIM_ABI64 +struct kernel_statfs64 { + unsigned long f_type; + unsigned long f_bsize; + unsigned long f_frsize; + unsigned long __pad; + unsigned long long f_blocks; + unsigned long long f_bfree; + unsigned long long f_files; + unsigned long long f_ffree; + unsigned long long f_bavail; + struct { int val[2]; } f_fsid; + unsigned long f_namelen; + unsigned long f_spare[6]; +}; +#endif +#elif !defined(__x86_64__) +struct kernel_statfs64 { + unsigned long f_type; + unsigned long f_bsize; + unsigned long long f_blocks; + unsigned long long f_bfree; + unsigned long long f_bavail; + unsigned long long f_files; + unsigned long long f_ffree; + struct { int val[2]; } f_fsid; + unsigned long f_namelen; + unsigned long f_frsize; + unsigned long f_spare[5]; +}; +#endif + +/* include/asm-{arm,i386,mips,x86_64,ppc,generic}/statfs.h */ +#ifdef __mips__ +struct kernel_statfs { + long f_type; + long f_bsize; + long f_frsize; + long f_blocks; + long f_bfree; + long f_files; + long f_ffree; + long f_bavail; + struct { int val[2]; } f_fsid; + long f_namelen; + long f_spare[6]; +}; +#else +struct kernel_statfs { + /* x86_64 actually defines all these fields as signed, whereas all other */ + /* platforms define them as unsigned. Leaving them at unsigned should not */ + /* cause any problems. 
*/ + unsigned long f_type; + unsigned long f_bsize; + unsigned long f_blocks; + unsigned long f_bfree; + unsigned long f_bavail; + unsigned long f_files; + unsigned long f_ffree; + struct { int val[2]; } f_fsid; + unsigned long f_namelen; + unsigned long f_frsize; + unsigned long f_spare[5]; +}; +#endif + + +/* Definitions missing from the standard header files */ +#ifndef O_DIRECTORY +#if defined(__ARM_ARCH_3__) +#define O_DIRECTORY 0040000 +#else +#define O_DIRECTORY 0200000 +#endif +#endif +#ifndef NT_PRXFPREG +#define NT_PRXFPREG 0x46e62b7f +#endif +#ifndef PTRACE_GETFPXREGS +#define PTRACE_GETFPXREGS ((enum __ptrace_request)18) +#endif +#ifndef PR_GET_DUMPABLE +#define PR_GET_DUMPABLE 3 +#endif +#ifndef PR_SET_DUMPABLE +#define PR_SET_DUMPABLE 4 +#endif +#ifndef PR_GET_SECCOMP +#define PR_GET_SECCOMP 21 +#endif +#ifndef PR_SET_SECCOMP +#define PR_SET_SECCOMP 22 +#endif +#ifndef AT_FDCWD +#define AT_FDCWD (-100) +#endif +#ifndef AT_SYMLINK_NOFOLLOW +#define AT_SYMLINK_NOFOLLOW 0x100 +#endif +#ifndef AT_REMOVEDIR +#define AT_REMOVEDIR 0x200 +#endif +#ifndef MREMAP_FIXED +#define MREMAP_FIXED 2 +#endif +#ifndef SA_RESTORER +#define SA_RESTORER 0x04000000 +#endif +#ifndef CPUCLOCK_PROF +#define CPUCLOCK_PROF 0 +#endif +#ifndef CPUCLOCK_VIRT +#define CPUCLOCK_VIRT 1 +#endif +#ifndef CPUCLOCK_SCHED +#define CPUCLOCK_SCHED 2 +#endif +#ifndef CPUCLOCK_PERTHREAD_MASK +#define CPUCLOCK_PERTHREAD_MASK 4 +#endif +#ifndef MAKE_PROCESS_CPUCLOCK +#define MAKE_PROCESS_CPUCLOCK(pid, clock) \ + ((~(int)(pid) << 3) | (int)(clock)) +#endif +#ifndef MAKE_THREAD_CPUCLOCK +#define MAKE_THREAD_CPUCLOCK(tid, clock) \ + ((~(int)(tid) << 3) | (int)((clock) | CPUCLOCK_PERTHREAD_MASK)) +#endif + +#if defined(__x86_64__) +#ifndef ARCH_SET_GS +#define ARCH_SET_GS 0x1001 +#endif +#ifndef ARCH_GET_GS +#define ARCH_GET_GS 0x1004 +#endif +#endif + +#if defined(__i386__) +#ifndef __NR_quotactl +#define __NR_quotactl 131 +#endif +#ifndef __NR_setresuid +#define __NR_setresuid 164 +#define 
__NR_getresuid 165 +#define __NR_setresgid 170 +#define __NR_getresgid 171 +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn 173 +#define __NR_rt_sigaction 174 +#define __NR_rt_sigprocmask 175 +#define __NR_rt_sigpending 176 +#define __NR_rt_sigsuspend 179 +#endif +#ifndef __NR_pread64 +#define __NR_pread64 180 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 181 +#endif +#ifndef __NR_ugetrlimit +#define __NR_ugetrlimit 191 +#endif +#ifndef __NR_stat64 +#define __NR_stat64 195 +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 197 +#endif +#ifndef __NR_setresuid32 +#define __NR_setresuid32 208 +#define __NR_getresuid32 209 +#define __NR_setresgid32 210 +#define __NR_getresgid32 211 +#endif +#ifndef __NR_setfsuid32 +#define __NR_setfsuid32 215 +#define __NR_setfsgid32 216 +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 220 +#endif +#ifndef __NR_gettid +#define __NR_gettid 224 +#endif +#ifndef __NR_readahead +#define __NR_readahead 225 +#endif +#ifndef __NR_setxattr +#define __NR_setxattr 226 +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr 227 +#endif +#ifndef __NR_getxattr +#define __NR_getxattr 229 +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr 230 +#endif +#ifndef __NR_listxattr +#define __NR_listxattr 232 +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr 233 +#endif +#ifndef __NR_tkill +#define __NR_tkill 238 +#endif +#ifndef __NR_futex +#define __NR_futex 240 +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 241 +#define __NR_sched_getaffinity 242 +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address 258 +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime 265 +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres 266 +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 268 +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 269 +#endif +#ifndef __NR_fadvise64_64 +#define __NR_fadvise64_64 272 +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 
289 +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get 290 +#endif +#ifndef __NR_openat +#define __NR_openat 295 +#endif +#ifndef __NR_fstatat64 +#define __NR_fstatat64 300 +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat 301 +#endif +#ifndef __NR_move_pages +#define __NR_move_pages 317 +#endif +#ifndef __NR_getcpu +#define __NR_getcpu 318 +#endif +#ifndef __NR_fallocate +#define __NR_fallocate 324 +#endif +/* End of i386 definitions */ +#elif defined(__ARM_ARCH_3__) +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_SYSCALL_BASE + 164) +#define __NR_getresuid (__NR_SYSCALL_BASE + 165) +#define __NR_setresgid (__NR_SYSCALL_BASE + 170) +#define __NR_getresgid (__NR_SYSCALL_BASE + 171) +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn (__NR_SYSCALL_BASE + 173) +#define __NR_rt_sigaction (__NR_SYSCALL_BASE + 174) +#define __NR_rt_sigprocmask (__NR_SYSCALL_BASE + 175) +#define __NR_rt_sigpending (__NR_SYSCALL_BASE + 176) +#define __NR_rt_sigsuspend (__NR_SYSCALL_BASE + 179) +#endif +#ifndef __NR_pread64 +#define __NR_pread64 (__NR_SYSCALL_BASE + 180) +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 (__NR_SYSCALL_BASE + 181) +#endif +#ifndef __NR_ugetrlimit +#define __NR_ugetrlimit (__NR_SYSCALL_BASE + 191) +#endif +#ifndef __NR_stat64 +#define __NR_stat64 (__NR_SYSCALL_BASE + 195) +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 (__NR_SYSCALL_BASE + 197) +#endif +#ifndef __NR_setresuid32 +#define __NR_setresuid32 (__NR_SYSCALL_BASE + 208) +#define __NR_getresuid32 (__NR_SYSCALL_BASE + 209) +#define __NR_setresgid32 (__NR_SYSCALL_BASE + 210) +#define __NR_getresgid32 (__NR_SYSCALL_BASE + 211) +#endif +#ifndef __NR_setfsuid32 +#define __NR_setfsuid32 (__NR_SYSCALL_BASE + 215) +#define __NR_setfsgid32 (__NR_SYSCALL_BASE + 216) +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 (__NR_SYSCALL_BASE + 217) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_SYSCALL_BASE + 224) +#endif +#ifndef __NR_readahead +#define 
__NR_readahead (__NR_SYSCALL_BASE + 225) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_SYSCALL_BASE + 226) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_SYSCALL_BASE + 227) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_SYSCALL_BASE + 229) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_SYSCALL_BASE + 230) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_SYSCALL_BASE + 232) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_SYSCALL_BASE + 233) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_SYSCALL_BASE + 238) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_SYSCALL_BASE + 240) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_SYSCALL_BASE + 241) +#define __NR_sched_getaffinity (__NR_SYSCALL_BASE + 242) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_SYSCALL_BASE + 256) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_SYSCALL_BASE + 263) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_SYSCALL_BASE + 264) +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 (__NR_SYSCALL_BASE + 266) +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 (__NR_SYSCALL_BASE + 267) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_SYSCALL_BASE + 314) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_SYSCALL_BASE + 315) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_SYSCALL_BASE + 344) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_SYSCALL_BASE + 345) +#endif +/* End of ARM 3 definitions */ +#elif defined(__x86_64__) +#ifndef __NR_pread64 +#define __NR_pread64 17 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 18 +#endif +#ifndef __NR_setresuid +#define __NR_setresuid 117 +#define __NR_getresuid 118 +#define __NR_setresgid 119 +#define __NR_getresgid 120 +#endif +#ifndef __NR_quotactl +#define __NR_quotactl 179 +#endif +#ifndef __NR_gettid 
+#define __NR_gettid 186 +#endif +#ifndef __NR_readahead +#define __NR_readahead 187 +#endif +#ifndef __NR_setxattr +#define __NR_setxattr 188 +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr 189 +#endif +#ifndef __NR_getxattr +#define __NR_getxattr 191 +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr 192 +#endif +#ifndef __NR_listxattr +#define __NR_listxattr 194 +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr 195 +#endif +#ifndef __NR_tkill +#define __NR_tkill 200 +#endif +#ifndef __NR_futex +#define __NR_futex 202 +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 203 +#define __NR_sched_getaffinity 204 +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 217 +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address 218 +#endif +#ifndef __NR_fadvise64 +#define __NR_fadvise64 221 +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime 228 +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres 229 +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 251 +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get 252 +#endif +#ifndef __NR_openat +#define __NR_openat 257 +#endif +#ifndef __NR_newfstatat +#define __NR_newfstatat 262 +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat 263 +#endif +#ifndef __NR_move_pages +#define __NR_move_pages 279 +#endif +#ifndef __NR_fallocate +#define __NR_fallocate 285 +#endif +/* End of x86-64 definitions */ +#elif defined(__mips__) +#if _MIPS_SIM == _MIPS_SIM_ABI32 +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_Linux + 185) +#define __NR_getresuid (__NR_Linux + 186) +#define __NR_setresgid (__NR_Linux + 190) +#define __NR_getresgid (__NR_Linux + 191) +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn (__NR_Linux + 193) +#define __NR_rt_sigaction (__NR_Linux + 194) +#define __NR_rt_sigprocmask (__NR_Linux + 195) +#define __NR_rt_sigpending (__NR_Linux + 196) +#define __NR_rt_sigsuspend (__NR_Linux + 199) +#endif +#ifndef 
__NR_pread64 +#define __NR_pread64 (__NR_Linux + 200) +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 (__NR_Linux + 201) +#endif +#ifndef __NR_stat64 +#define __NR_stat64 (__NR_Linux + 213) +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 (__NR_Linux + 215) +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 (__NR_Linux + 219) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_Linux + 222) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_Linux + 223) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_Linux + 224) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_Linux + 225) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_Linux + 227) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_Linux + 228) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_Linux + 230) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_Linux + 231) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_Linux + 236) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_Linux + 238) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_Linux + 239) +#define __NR_sched_getaffinity (__NR_Linux + 240) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_Linux + 252) +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 (__NR_Linux + 255) +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 (__NR_Linux + 256) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_Linux + 263) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_Linux + 264) +#endif +#ifndef __NR_openat +#define __NR_openat (__NR_Linux + 288) +#endif +#ifndef __NR_fstatat +#define __NR_fstatat (__NR_Linux + 293) +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat (__NR_Linux + 294) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_Linux + 308) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_Linux + 312) +#endif +#ifndef 
__NR_ioprio_set +#define __NR_ioprio_set (__NR_Linux + 314) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_Linux + 315) +#endif +/* End of MIPS (old 32bit API) definitions */ +#elif _MIPS_SIM == _MIPS_SIM_ABI64 +#ifndef __NR_pread64 +#define __NR_pread64 (__NR_Linux + 16) +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 (__NR_Linux + 17) +#endif +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_Linux + 115) +#define __NR_getresuid (__NR_Linux + 116) +#define __NR_setresgid (__NR_Linux + 117) +#define __NR_getresgid (__NR_Linux + 118) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_Linux + 178) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_Linux + 179) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_Linux + 180) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_Linux + 181) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_Linux + 183) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_Linux + 184) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_Linux + 186) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_Linux + 187) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_Linux + 192) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_Linux + 194) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_Linux + 195) +#define __NR_sched_getaffinity (__NR_Linux + 196) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_Linux + 212) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_Linux + 222) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_Linux + 223) +#endif +#ifndef __NR_openat +#define __NR_openat (__NR_Linux + 247) +#endif +#ifndef __NR_fstatat +#define __NR_fstatat (__NR_Linux + 252) +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat (__NR_Linux + 253) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages (__NR_Linux + 267) +#endif +#ifndef 
__NR_getcpu +#define __NR_getcpu (__NR_Linux + 271) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_Linux + 273) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_Linux + 274) +#endif +/* End of MIPS (64bit API) definitions */ +#else +#ifndef __NR_setresuid +#define __NR_setresuid (__NR_Linux + 115) +#define __NR_getresuid (__NR_Linux + 116) +#define __NR_setresgid (__NR_Linux + 117) +#define __NR_getresgid (__NR_Linux + 118) +#endif +#ifndef __NR_gettid +#define __NR_gettid (__NR_Linux + 178) +#endif +#ifndef __NR_readahead +#define __NR_readahead (__NR_Linux + 179) +#endif +#ifndef __NR_setxattr +#define __NR_setxattr (__NR_Linux + 180) +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr (__NR_Linux + 181) +#endif +#ifndef __NR_getxattr +#define __NR_getxattr (__NR_Linux + 183) +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr (__NR_Linux + 184) +#endif +#ifndef __NR_listxattr +#define __NR_listxattr (__NR_Linux + 186) +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr (__NR_Linux + 187) +#endif +#ifndef __NR_tkill +#define __NR_tkill (__NR_Linux + 192) +#endif +#ifndef __NR_futex +#define __NR_futex (__NR_Linux + 194) +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity (__NR_Linux + 195) +#define __NR_sched_getaffinity (__NR_Linux + 196) +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address (__NR_Linux + 213) +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 (__NR_Linux + 217) +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 (__NR_Linux + 218) +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime (__NR_Linux + 226) +#endif +#ifndef __NR_clock_getres +#define __NR_clock_getres (__NR_Linux + 227) +#endif +#ifndef __NR_openat +#define __NR_openat (__NR_Linux + 251) +#endif +#ifndef __NR_fstatat +#define __NR_fstatat (__NR_Linux + 256) +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat (__NR_Linux + 257) +#endif +#ifndef __NR_move_pages +#define __NR_move_pages 
(__NR_Linux + 271) +#endif +#ifndef __NR_getcpu +#define __NR_getcpu (__NR_Linux + 275) +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set (__NR_Linux + 277) +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get (__NR_Linux + 278) +#endif +/* End of MIPS (new 32bit API) definitions */ +#endif +/* End of MIPS definitions */ +#elif defined(__PPC__) +#ifndef __NR_setfsuid +#define __NR_setfsuid 138 +#define __NR_setfsgid 139 +#endif +#ifndef __NR_setresuid +#define __NR_setresuid 164 +#define __NR_getresuid 165 +#define __NR_setresgid 169 +#define __NR_getresgid 170 +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigreturn 172 +#define __NR_rt_sigaction 173 +#define __NR_rt_sigprocmask 174 +#define __NR_rt_sigpending 175 +#define __NR_rt_sigsuspend 178 +#endif +#ifndef __NR_pread64 +#define __NR_pread64 179 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 180 +#endif +#ifndef __NR_ugetrlimit +#define __NR_ugetrlimit 190 +#endif +#ifndef __NR_readahead +#define __NR_readahead 191 +#endif +#ifndef __NR_stat64 +#define __NR_stat64 195 +#endif +#ifndef __NR_fstat64 +#define __NR_fstat64 197 +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 202 +#endif +#ifndef __NR_gettid +#define __NR_gettid 207 +#endif +#ifndef __NR_tkill +#define __NR_tkill 208 +#endif +#ifndef __NR_setxattr +#define __NR_setxattr 209 +#endif +#ifndef __NR_lsetxattr +#define __NR_lsetxattr 210 +#endif +#ifndef __NR_getxattr +#define __NR_getxattr 212 +#endif +#ifndef __NR_lgetxattr +#define __NR_lgetxattr 213 +#endif +#ifndef __NR_listxattr +#define __NR_listxattr 215 +#endif +#ifndef __NR_llistxattr +#define __NR_llistxattr 216 +#endif +#ifndef __NR_futex +#define __NR_futex 221 +#endif +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 222 +#define __NR_sched_getaffinity 223 +#endif +#ifndef __NR_set_tid_address +#define __NR_set_tid_address 232 +#endif +#ifndef __NR_clock_gettime +#define __NR_clock_gettime 246 +#endif +#ifndef __NR_clock_getres +#define 
__NR_clock_getres 247 +#endif +#ifndef __NR_statfs64 +#define __NR_statfs64 252 +#endif +#ifndef __NR_fstatfs64 +#define __NR_fstatfs64 253 +#endif +#ifndef __NR_fadvise64_64 +#define __NR_fadvise64_64 254 +#endif +#ifndef __NR_ioprio_set +#define __NR_ioprio_set 273 +#endif +#ifndef __NR_ioprio_get +#define __NR_ioprio_get 274 +#endif +#ifndef __NR_openat +#define __NR_openat 286 +#endif +#ifndef __NR_fstatat64 +#define __NR_fstatat64 291 +#endif +#ifndef __NR_unlinkat +#define __NR_unlinkat 292 +#endif +#ifndef __NR_move_pages +#define __NR_move_pages 301 +#endif +#ifndef __NR_getcpu +#define __NR_getcpu 302 +#endif +/* End of powerpc definitions */ +#endif + + +/* After forking, we must make sure to only call system calls. */ +#if __BOUNDED_POINTERS__ + #error "Need to port invocations of syscalls for bounded ptrs" +#else + /* The core dumper and the thread lister get executed after threads + * have been suspended. As a consequence, we cannot call any functions + * that acquire locks. Unfortunately, libc wraps most system calls + * (e.g. in order to implement pthread_atfork, and to make calls + * cancellable), which means we cannot call these functions. Instead, + * we have to call syscall() directly. + */ + #undef LSS_ERRNO + #ifdef SYS_ERRNO + /* Allow the including file to override the location of errno. This can + * be useful when using clone() with the CLONE_VM option. + */ + #define LSS_ERRNO SYS_ERRNO + #else + #define LSS_ERRNO errno + #endif + + #undef LSS_INLINE + #ifdef SYS_INLINE + #define LSS_INLINE SYS_INLINE + #else + #define LSS_INLINE static inline + #endif + + /* Allow the including file to override the prefix used for all new + * system calls. By default, it will be set to "sys_". 
+ */ + #undef LSS_NAME + #ifndef SYS_PREFIX + #define LSS_NAME(name) sys_##name + #elif SYS_PREFIX < 0 + #define LSS_NAME(name) name + #elif SYS_PREFIX == 0 + #define LSS_NAME(name) sys0_##name + #elif SYS_PREFIX == 1 + #define LSS_NAME(name) sys1_##name + #elif SYS_PREFIX == 2 + #define LSS_NAME(name) sys2_##name + #elif SYS_PREFIX == 3 + #define LSS_NAME(name) sys3_##name + #elif SYS_PREFIX == 4 + #define LSS_NAME(name) sys4_##name + #elif SYS_PREFIX == 5 + #define LSS_NAME(name) sys5_##name + #elif SYS_PREFIX == 6 + #define LSS_NAME(name) sys6_##name + #elif SYS_PREFIX == 7 + #define LSS_NAME(name) sys7_##name + #elif SYS_PREFIX == 8 + #define LSS_NAME(name) sys8_##name + #elif SYS_PREFIX == 9 + #define LSS_NAME(name) sys9_##name + #endif + + #undef LSS_RETURN + #if (defined(__i386__) || defined(__x86_64__) || defined(__ARM_ARCH_3__)) + /* Failing system calls return a negative result in the range of + * -1..-4095. These are "errno" values with the sign inverted. + * LSS_RETURN stores the positive value in LSS_ERRNO and returns -1. + */ + #define LSS_RETURN(type, res) \ + do { \ + if ((unsigned long)(res) >= (unsigned long)(-4095)) { \ + LSS_ERRNO = -(res); \ + res = -1; \ + } \ + return (type) (res); \ + } while (0) + #elif defined(__mips__) + /* On MIPS, a failing system call is flagged through a separate value + * (passed in as "err"). In that case, "res" holds the errno value; + * LSS_RETURN stores it in LSS_ERRNO and returns -1. + */ + #define LSS_RETURN(type, res, err) \ + do { \ + if (err) { \ + LSS_ERRNO = (res); \ + res = -1; \ + } \ + return (type) (res); \ + } while (0) + #elif defined(__PPC__) + /* On PPC, a failing system call is flagged by bit 0x10000000 of "err". + * In that case, "res" holds the errno value; LSS_RETURN stores it in + * LSS_ERRNO and returns -1. See linux/unistd.h. + */ + #define LSS_RETURN(type, res, err) \ + do { \ + if (err & 0x10000000 ) { \ + LSS_ERRNO = (res); \ + res = -1; \ + } \ + return (type) (res); \ + } while (0) + #endif + #if defined(__i386__) + /* In PIC mode (e.g. when building shared libraries), gcc for i386 + * reserves ebx. Unfortunately, most distribution ship with implementations + * of _syscallX() which clobber ebx. 
+ * Also, most definitions of _syscallX() neglect to mark "memory" as being + * clobbered. This causes problems with compilers, that do a better job + * at optimizing across __asm__ calls. + * So, we just have to redefine all of the _syscallX() macros. + */ + #undef LSS_BODY + #define LSS_BODY(type,args...) \ + long __res; \ + __asm__ __volatile__("push %%ebx\n" \ + "movl %2,%%ebx\n" \ + "int $0x80\n" \ + "pop %%ebx" \ + args \ + : "memory"); \ + LSS_RETURN(type,__res) + #undef _syscall0 + #define _syscall0(type,name) \ + type LSS_NAME(name)(void) { \ + long __res; \ + __asm__ volatile("int $0x80" \ + : "=a" (__res) \ + : "0" (__NR_##name) \ + : "memory"); \ + LSS_RETURN(type,__res); \ + } + #undef _syscall1 + #define _syscall1(type,name,type1,arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name), "ri" ((long)(arg1))); \ + } + #undef _syscall2 + #define _syscall2(type,name,type1,arg1,type2,arg2) \ + type LSS_NAME(name)(type1 arg1,type2 arg2) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name),"ri" ((long)(arg1)), "c" ((long)(arg2))); \ + } + #undef _syscall3 + #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \ + type LSS_NAME(name)(type1 arg1,type2 arg2,type3 arg3) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \ + "d" ((long)(arg3))); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_BODY(type, \ + : "=a" (__res) \ + : "0" (__NR_##name), "ri" ((long)(arg1)), "c" ((long)(arg2)), \ + "d" ((long)(arg3)),"S" ((long)(arg4))); \ + } + #undef _syscall5 + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + long __res; \ + __asm__ __volatile__("push %%ebx\n" \ + "movl %2,%%ebx\n" \ 
+ "movl %1,%%eax\n" \ + "int $0x80\n" \ + "pop %%ebx" \ + : "=a" (__res) \ + : "i" (__NR_##name), "ri" ((long)(arg1)), \ + "c" ((long)(arg2)), "d" ((long)(arg3)), \ + "S" ((long)(arg4)), "D" ((long)(arg5)) \ + : "memory"); \ + LSS_RETURN(type,__res); \ + } + #undef _syscall6 + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + long __res; \ + struct { long __a1; long __a6; } __s = { (long)arg1, (long) arg6 }; \ + __asm__ __volatile__("push %%ebp\n" \ + "push %%ebx\n" \ + "movl 4(%2),%%ebp\n" \ + "movl 0(%2), %%ebx\n" \ + "movl %1,%%eax\n" \ + "int $0x80\n" \ + "pop %%ebx\n" \ + "pop %%ebp" \ + : "=a" (__res) \ + : "i" (__NR_##name), "0" ((long)(&__s)), \ + "c" ((long)(arg2)), "d" ((long)(arg3)), \ + "S" ((long)(arg4)), "D" ((long)(arg5)) \ + : "memory"); \ + LSS_RETURN(type,__res); \ + } + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __res; + __asm__ __volatile__(/* if (fn == NULL) + * return -EINVAL; + */ + "movl %3,%%ecx\n" + "jecxz 1f\n" + + /* if (child_stack == NULL) + * return -EINVAL; + */ + "movl %4,%%ecx\n" + "jecxz 1f\n" + + /* Set up alignment of the child stack: + * child_stack = (child_stack & ~0xF) - 20; + */ + "andl $-16,%%ecx\n" + "subl $20,%%ecx\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. + */ + "movl %6,%%eax\n" + "movl %%eax,4(%%ecx)\n" + "movl %3,%%eax\n" + "movl %%eax,(%%ecx)\n" + + /* %eax = syscall(%eax = __NR_clone, + * %ebx = flags, + * %ecx = child_stack, + * %edx = parent_tidptr, + * %esi = newtls, + * %edi = child_tidptr) + * Also, make sure that %ebx gets preserved as it is + * used in PIC mode. 
+ */ + "movl %8,%%esi\n" + "movl %7,%%edx\n" + "movl %5,%%eax\n" + "movl %9,%%edi\n" + "pushl %%ebx\n" + "movl %%eax,%%ebx\n" + "movl %2,%%eax\n" + "int $0x80\n" + + /* In the parent: restore %ebx + * In the child: move "fn" into %ebx + */ + "popl %%ebx\n" + + /* if (%eax != 0) + * return %eax; + */ + "test %%eax,%%eax\n" + "jnz 1f\n" + + /* In the child, now. Terminate frame pointer chain. + */ + "movl $0,%%ebp\n" + + /* Call "fn". "arg" is already on the stack. + */ + "call *%%ebx\n" + + /* Call _exit(%ebx). Unfortunately older versions + * of gcc restrict the number of arguments that can + * be passed to asm(). So, we need to hard-code the + * system call number. + */ + "movl %%eax,%%ebx\n" + "movl $1,%%eax\n" + "int $0x80\n" + + /* Return to parent. + */ + "1:\n" + : "=a" (__res) + : "0"(-EINVAL), "i"(__NR_clone), + "m"(fn), "m"(child_stack), "m"(flags), "m"(arg), + "m"(parent_tidptr), "m"(newtls), "m"(child_tidptr) + : "memory", "ecx", "edx", "esi", "edi"); + LSS_RETURN(int, __res); + } + + #define __NR__fadvise64_64 __NR_fadvise64_64 + LSS_INLINE _syscall6(int, _fadvise64_64, int, fd, + unsigned, offset_lo, unsigned, offset_hi, + unsigned, len_lo, unsigned, len_hi, + int, advice) + + LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset, + loff_t len, int advice) { + return LSS_NAME(_fadvise64_64)(fd, + (unsigned)offset, (unsigned)(offset >>32), + (unsigned)len, (unsigned)(len >> 32), + advice); + } + + #define __NR__fallocate __NR_fallocate + LSS_INLINE _syscall6(int, _fallocate, int, fd, + int, mode, + unsigned, offset_lo, unsigned, offset_hi, + unsigned, len_lo, unsigned, len_hi) + + LSS_INLINE int LSS_NAME(fallocate)(int fd, int mode, + loff_t offset, loff_t len) { + union { loff_t off; unsigned w[2]; } o = { offset }, l = { len }; + return LSS_NAME(_fallocate)(fd, mode, o.w[0], o.w[1], l.w[0], l.w[1]); + } + + LSS_INLINE _syscall1(int, set_thread_area, void *, u) + LSS_INLINE _syscall1(int, get_thread_area, void *, u) + + LSS_INLINE void 
(*LSS_NAME(restore_rt)(void))(void) { + /* On i386, the kernel does not know how to return from a signal + * handler. Instead, it relies on user space to provide a + * restorer function that calls the {rt_,}sigreturn() system call. + * Unfortunately, we cannot just reference the glibc version of this + * function, as glibc goes out of its way to make it inaccessible. + */ + void (*res)(void); + __asm__ __volatile__("call 2f\n" + "0:.align 16\n" + "1:movl %1,%%eax\n" + "int $0x80\n" + "2:popl %0\n" + "addl $(1b-0b),%0\n" + : "=a" (res) + : "i" (__NR_rt_sigreturn)); + return res; + } + LSS_INLINE void (*LSS_NAME(restore)(void))(void) { + /* On i386, the kernel does not know how to return from a signal + * handler. Instead, it relies on user space to provide a + * restorer function that calls the {rt_,}sigreturn() system call. + * Unfortunately, we cannot just reference the glibc version of this + * function, as glibc goes out of its way to make it inaccessible. + */ + void (*res)(void); + __asm__ __volatile__("call 2f\n" + "0:.align 16\n" + "1:pop %%eax\n" + "movl %1,%%eax\n" + "int $0x80\n" + "2:popl %0\n" + "addl $(1b-0b),%0\n" + : "=a" (res) + : "i" (__NR_sigreturn)); + return res; + } + #elif defined(__x86_64__) + /* There are no known problems with any of the _syscallX() macros + * currently shipping for x86_64, but we still need to be able to define + * our own version so that we can override the location of the errno + * location (e.g. when using the clone() system call with the CLONE_VM + * option). + */ + #undef LSS_BODY + #define LSS_BODY(type,name, ...) 
\ + long __res; \ + __asm__ __volatile__("syscall" : "=a" (__res) : "0" (__NR_##name), \ + ##__VA_ARGS__ : "r11", "rcx", "memory"); \ + LSS_RETURN(type, __res) + #undef _syscall0 + #define _syscall0(type,name) \ + type LSS_NAME(name)() { \ + LSS_BODY(type, name); \ + } + #undef _syscall1 + #define _syscall1(type,name,type1,arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_BODY(type, name, "D" ((long)(arg1))); \ + } + #undef _syscall2 + #define _syscall2(type,name,type1,arg1,type2,arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2))); \ + } + #undef _syscall3 + #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2)), \ + "d" ((long)(arg3))); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + long __res; \ + __asm__ __volatile__("movq %5,%%r10; syscall" : \ + "=a" (__res) : "0" (__NR_##name), \ + "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \ + "g" ((long)(arg4)) : "r10", "r11", "rcx", "memory"); \ + LSS_RETURN(type, __res); \ + } + #undef _syscall5 + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + long __res; \ + __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; syscall" : \ + "=a" (__res) : "0" (__NR_##name), \ + "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \ + "g" ((long)(arg4)), "g" ((long)(arg5)) : \ + "r8", "r10", "r11", "rcx", "memory"); \ + LSS_RETURN(type, __res); \ + } + #undef _syscall6 + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 
arg5, type6 arg6) { \ + long __res; \ + __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; movq %7,%%r9;" \ + "syscall" : \ + "=a" (__res) : "0" (__NR_##name), \ + "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \ + "g" ((long)(arg4)), "g" ((long)(arg5)), "g" ((long)(arg6)) : \ + "r8", "r9", "r10", "r11", "rcx", "memory"); \ + LSS_RETURN(type, __res); \ + } + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __res; + { + register void *__tls __asm__("r8") = newtls; + register int *__ctid __asm__("r10") = child_tidptr; + __asm__ __volatile__(/* if (fn == NULL) + * return -EINVAL; + */ + "testq %4,%4\n" + "jz 1f\n" + + /* if (child_stack == NULL) + * return -EINVAL; + */ + "testq %5,%5\n" + "jz 1f\n" + + /* child_stack -= 2*sizeof(void *); + */ + "subq $16,%5\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. + */ + "movq %7,8(%5)\n" + "movq %4,0(%5)\n" + + /* %rax = syscall(%rax = __NR_clone, + * %rdi = flags, + * %rsi = child_stack, + * %rdx = parent_tidptr, + * %r8 = newtls, + * %r10 = child_tidptr) + */ + "movq %2,%%rax\n" + "syscall\n" + + /* if (%rax != 0) + * return %rax; + */ + "testq %%rax,%%rax\n" + "jnz 1f\n" + + /* In the child. Terminate frame pointer chain. + */ + "xorq %%rbp,%%rbp\n" + + /* Call "fn(arg)". + */ + "popq %%rax\n" + "popq %%rdi\n" + "call *%%rax\n" + + /* Call _exit(%rdi), passing on the value returned by "fn". + */ + "movq %%rax,%%rdi\n" + "movq %3,%%rax\n" + "syscall\n" + + /* Return to parent. 
+ */ + "1:\n" + : "=a" (__res) + : "0"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit), + "r"(fn), "S"(child_stack), "D"(flags), "r"(arg), + "d"(parent_tidptr), "r"(__tls), "r"(__ctid) + : "memory", "r11", "rcx"); + } + LSS_RETURN(int, __res); + } + LSS_INLINE _syscall2(int, arch_prctl, int, c, void *, a) + LSS_INLINE _syscall4(int, fadvise64, int, fd, loff_t, offset, loff_t, len, + int, advice) + + LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) { + /* On x86-64, the kernel does not know how to return from + * a signal handler. Instead, it relies on user space to provide a + * restorer function that calls the rt_sigreturn() system call. + * Unfortunately, we cannot just reference the glibc version of this + * function, as glibc goes out of its way to make it inaccessible. + */ + void (*res)(void); + __asm__ __volatile__("call 2f\n" + "0:.align 16\n" + "1:movq %1,%%rax\n" + "syscall\n" + "2:popq %0\n" + "addq $(1b-0b),%0\n" + : "=a" (res) + : "i" (__NR_rt_sigreturn)); + return res; + } + #elif defined(__ARM_ARCH_3__) + /* Most definitions of _syscallX() neglect to mark "memory" as being + * clobbered. This causes problems with compilers, that do a better job + * at optimizing across __asm__ calls. + * So, we just have to redefine all of the _syscallX() macros. + */ + #undef LSS_REG + #define LSS_REG(r,a) register long __r##r __asm__("r"#r) = (long)a + #undef LSS_BODY + #define LSS_BODY(type,name,args...) 
\ + register long __res_r0 __asm__("r0"); \ + long __res; \ + __asm__ __volatile__ (__syscall(name) \ + : "=r"(__res_r0) : args : "lr", "memory"); \ + __res = __res_r0; \ + LSS_RETURN(type, __res) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)() { \ + LSS_BODY(type, name); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_REG(0, arg1); LSS_BODY(type, name, "r"(__r0)); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1)); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2)); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \ + LSS_REG(3, arg4); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3)); \ + } + #undef _syscall5 + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \ + LSS_REG(3, arg4); LSS_REG(4, arg5); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \ + "r"(__r4)); \ + } + #undef _syscall6 + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \ + LSS_REG(3, arg4); LSS_REG(4, 
arg5); LSS_REG(5, arg6); \ + LSS_BODY(type, name, "r"(__r0), "r"(__r1), "r"(__r2), "r"(__r3), \ + "r"(__r4), "r"(__r5)); \ + } + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __res; + { + register int __flags __asm__("r0") = flags; + register void *__stack __asm__("r1") = child_stack; + register void *__ptid __asm__("r2") = parent_tidptr; + register void *__tls __asm__("r3") = newtls; + register int *__ctid __asm__("r4") = child_tidptr; + __asm__ __volatile__(/* if (fn == NULL || child_stack == NULL) + * return -EINVAL; + */ + "cmp %2,#0\n" + "cmpne %3,#0\n" + "moveq %0,%1\n" + "beq 1f\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. + */ + "str %5,[%3,#-4]!\n" + "str %2,[%3,#-4]!\n" + + /* %r0 = syscall(%r0 = flags, + * %r1 = child_stack, + * %r2 = parent_tidptr, + * %r3 = newtls, + * %r4 = child_tidptr) + */ + __syscall(clone)"\n" + + /* if (%r0 != 0) + * return %r0; + */ + "movs %0,r0\n" + "bne 1f\n" + + /* In the child, now. Call "fn(arg)". + */ + "ldr r0,[sp, #4]\n" + "mov lr,pc\n" + "ldr pc,[sp]\n" + + /* Call _exit(%r0). + */ + __syscall(exit)"\n" + "1:\n" + : "=r" (__res) + : "i"(-EINVAL), + "r"(fn), "r"(__stack), "r"(__flags), "r"(arg), + "r"(__ptid), "r"(__tls), "r"(__ctid) + : "lr", "memory"); + } + LSS_RETURN(int, __res); + } + #elif defined(__mips__) + #undef LSS_REG + #define LSS_REG(r,a) register unsigned long __r##r __asm__("$"#r) = \ + (unsigned long)(a) + #undef LSS_BODY + #define LSS_BODY(type,name,r7,...) 
\ + register unsigned long __v0 __asm__("$2") = __NR_##name; \ + __asm__ __volatile__ ("syscall\n" \ + : "=&r"(__v0), r7 (__r7) \ + : "0"(__v0), ##__VA_ARGS__ \ + : "$8", "$9", "$10", "$11", "$12", \ + "$13", "$14", "$15", "$24", "memory"); \ + LSS_RETURN(type, __v0, __r7) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)() { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_BODY(type, name, "=r"); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_REG(4, arg1); LSS_BODY(type, name, "=r", "r"(__r4)); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_REG(4, arg1); LSS_REG(5, arg2); \ + LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5)); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + register unsigned long __r7 __asm__("$7"); \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_BODY(type, name, "=r", "r"(__r4), "r"(__r5), "r"(__r6)); \ + } + #undef _syscall4 + #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); \ + LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6)); \ + } + #undef _syscall5 + #if _MIPS_SIM == _MIPS_SIM_ABI32 + /* The old 32bit MIPS system call API passes the fifth and sixth argument + * on the stack, whereas the new APIs use registers "r8" and "r9". 
+ */ + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); \ + register unsigned long __v0 __asm__("$2"); \ + __asm__ __volatile__ (".set noreorder\n" \ + "lw $2, %6\n" \ + "subu $29, 32\n" \ + "sw $2, 16($29)\n" \ + "li $2, %2\n" \ + "syscall\n" \ + "addiu $29, 32\n" \ + ".set reorder\n" \ + : "=&r"(__v0), "+r" (__r7) \ + : "i" (__NR_##name), "r"(__r4), "r"(__r5), \ + "r"(__r6), "m" ((unsigned long)arg5) \ + : "$8", "$9", "$10", "$11", "$12", \ + "$13", "$14", "$15", "$24", "memory"); \ + LSS_RETURN(type, __v0, __r7); \ + } + #else + #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); LSS_REG(8, arg5); \ + LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \ + "r"(__r8)); \ + } + #endif + #undef _syscall6 + #if _MIPS_SIM == _MIPS_SIM_ABI32 + /* The old 32bit MIPS system call API passes the fifth and sixth argument + * on the stack, whereas the new APIs use registers "r8" and "r9". 
+ */ + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); \ + register unsigned long __v0 __asm__("$2"); \ + __asm__ __volatile__ (".set noreorder\n" \ + "lw $2, %6\n" \ + "lw $8, %7\n" \ + "subu $29, 32\n" \ + "sw $2, 16($29)\n" \ + "sw $8, 20($29)\n" \ + "li $2, %2\n" \ + "syscall\n" \ + "addiu $29, 32\n" \ + ".set reorder\n" \ + : "=&r"(__v0), "+r" (__r7) \ + : "i" (__NR_##name), "r"(__r4), "r"(__r5), \ + "r"(__r6), "r" ((unsigned long)arg5), \ + "r" ((unsigned long)arg6) \ + : "$8", "$9", "$10", "$11", "$12", \ + "$13", "$14", "$15", "$24", "memory"); \ + LSS_RETURN(type, __v0, __r7); \ + } + #else + #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ + type5,arg5,type6,arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5,type6 arg6) { \ + LSS_REG(4, arg1); LSS_REG(5, arg2); LSS_REG(6, arg3); \ + LSS_REG(7, arg4); LSS_REG(8, arg5); LSS_REG(9, arg6); \ + LSS_BODY(type, name, "+r", "r"(__r4), "r"(__r5), "r"(__r6), \ + "r"(__r8), "r"(__r9)); \ + } + #endif + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + register unsigned long __v0 __asm__("$2"); + register unsigned long __r7 __asm__("$7") = (unsigned long)newtls; + { + register int __flags __asm__("$4") = flags; + register void *__stack __asm__("$5") = child_stack; + register void *__ptid __asm__("$6") = parent_tidptr; + register int *__ctid __asm__("$8") = child_tidptr; + __asm__ __volatile__( + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "subu $29,24\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "sub $29,16\n" + #else + "dsubu $29,16\n" + #endif + + /* if (fn == NULL || child_stack == NULL) + * return -EINVAL; 
+ */ + "li %0,%2\n" + "beqz %5,1f\n" + "beqz %6,1f\n" + + /* Push "arg" and "fn" onto the stack that will be + * used by the child. + */ + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "subu %6,32\n" + "sw %5,0(%6)\n" + "sw %8,4(%6)\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "sub %6,32\n" + "sw %5,0(%6)\n" + "sw %8,8(%6)\n" + #else + "dsubu %6,32\n" + "sd %5,0(%6)\n" + "sd %8,8(%6)\n" + #endif + + /* $7 = syscall($4 = flags, + * $5 = child_stack, + * $6 = parent_tidptr, + * $7 = newtls, + * $8 = child_tidptr) + */ + "li $2,%3\n" + "syscall\n" + + /* if ($7 != 0) + * return $2; + */ + "bnez $7,1f\n" + "bnez $2,1f\n" + + /* In the child, now. Call "fn(arg)". + */ + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "lw $25,0($29)\n" + "lw $4,4($29)\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "lw $25,0($29)\n" + "lw $4,8($29)\n" + #else + "ld $25,0($29)\n" + "ld $4,8($29)\n" + #endif + "jalr $25\n" + + /* Call _exit($2) + */ + "move $4,$2\n" + "li $2,%4\n" + "syscall\n" + + "1:\n" + #if _MIPS_SIM == _MIPS_SIM_ABI32 && _MIPS_SZPTR == 32 + "addu $29, 24\n" + #elif _MIPS_SIM == _MIPS_SIM_NABI32 + "add $29, 16\n" + #else + "daddu $29,16\n" + #endif + : "=&r" (__v0), "=r" (__r7) + : "i"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit), + "r"(fn), "r"(__stack), "r"(__flags), "r"(arg), + "r"(__ptid), "r"(__r7), "r"(__ctid) + : "$9", "$10", "$11", "$12", "$13", "$14", "$15", + "$24", "memory"); + } + LSS_RETURN(int, __v0, __r7); + } + #elif defined (__PPC__) + #undef LSS_LOADARGS_0 + #define LSS_LOADARGS_0(name, dummy...) 
\ + __sc_0 = __NR_##name + #undef LSS_LOADARGS_1 + #define LSS_LOADARGS_1(name, arg1) \ + LSS_LOADARGS_0(name); \ + __sc_3 = (unsigned long) (arg1) + #undef LSS_LOADARGS_2 + #define LSS_LOADARGS_2(name, arg1, arg2) \ + LSS_LOADARGS_1(name, arg1); \ + __sc_4 = (unsigned long) (arg2) + #undef LSS_LOADARGS_3 + #define LSS_LOADARGS_3(name, arg1, arg2, arg3) \ + LSS_LOADARGS_2(name, arg1, arg2); \ + __sc_5 = (unsigned long) (arg3) + #undef LSS_LOADARGS_4 + #define LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4) \ + LSS_LOADARGS_3(name, arg1, arg2, arg3); \ + __sc_6 = (unsigned long) (arg4) + #undef LSS_LOADARGS_5 + #define LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5) \ + LSS_LOADARGS_4(name, arg1, arg2, arg3, arg4); \ + __sc_7 = (unsigned long) (arg5) + #undef LSS_LOADARGS_6 + #define LSS_LOADARGS_6(name, arg1, arg2, arg3, arg4, arg5, arg6) \ + LSS_LOADARGS_5(name, arg1, arg2, arg3, arg4, arg5); \ + __sc_8 = (unsigned long) (arg6) + #undef LSS_ASMINPUT_0 + #define LSS_ASMINPUT_0 "0" (__sc_0) + #undef LSS_ASMINPUT_1 + #define LSS_ASMINPUT_1 LSS_ASMINPUT_0, "1" (__sc_3) + #undef LSS_ASMINPUT_2 + #define LSS_ASMINPUT_2 LSS_ASMINPUT_1, "2" (__sc_4) + #undef LSS_ASMINPUT_3 + #define LSS_ASMINPUT_3 LSS_ASMINPUT_2, "3" (__sc_5) + #undef LSS_ASMINPUT_4 + #define LSS_ASMINPUT_4 LSS_ASMINPUT_3, "4" (__sc_6) + #undef LSS_ASMINPUT_5 + #define LSS_ASMINPUT_5 LSS_ASMINPUT_4, "5" (__sc_7) + #undef LSS_ASMINPUT_6 + #define LSS_ASMINPUT_6 LSS_ASMINPUT_5, "6" (__sc_8) + #undef LSS_BODY + #define LSS_BODY(nr, type, name, args...) 
\ + long __sc_ret, __sc_err; \ + { \ + register unsigned long __sc_0 __asm__ ("r0"); \ + register unsigned long __sc_3 __asm__ ("r3"); \ + register unsigned long __sc_4 __asm__ ("r4"); \ + register unsigned long __sc_5 __asm__ ("r5"); \ + register unsigned long __sc_6 __asm__ ("r6"); \ + register unsigned long __sc_7 __asm__ ("r7"); \ + register unsigned long __sc_8 __asm__ ("r8"); \ + \ + LSS_LOADARGS_##nr(name, args); \ + __asm__ __volatile__ \ + ("sc\n\t" \ + "mfcr %0" \ + : "=&r" (__sc_0), \ + "=&r" (__sc_3), "=&r" (__sc_4), \ + "=&r" (__sc_5), "=&r" (__sc_6), \ + "=&r" (__sc_7), "=&r" (__sc_8) \ + : LSS_ASMINPUT_##nr \ + : "cr0", "ctr", "memory", \ + "r9", "r10", "r11", "r12"); \ + __sc_ret = __sc_3; \ + __sc_err = __sc_0; \ + } \ + LSS_RETURN(type, __sc_ret, __sc_err) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)(void) { \ + LSS_BODY(0, type, name); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_BODY(1, type, name, arg1); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + LSS_BODY(2, type, name, arg1, arg2); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + LSS_BODY(3, type, name, arg1, arg2, arg3); \ + } + #undef _syscall4 + #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ + LSS_BODY(4, type, name, arg1, arg2, arg3, arg4); \ + } + #undef _syscall5 + #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5) { \ + LSS_BODY(5, type, name, arg1, arg2, arg3, arg4, arg5); \ + } + #undef _syscall6 + #define _syscall6(type, name, type1, 
arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ + type5 arg5, type6 arg6) { \ + LSS_BODY(6, type, name, arg1, arg2, arg3, arg4, arg5, arg6); \ + } + /* clone function adapted from glibc 2.3.6 clone.S */ + /* TODO(csilvers): consider wrapping some args up in a struct, like we + * do for i386's _syscall6, so we can compile successfully on gcc 2.95 + */ + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __ret, __err; + { + register int (*__fn)(void *) __asm__ ("r8") = fn; + register void *__cstack __asm__ ("r4") = child_stack; + register int __flags __asm__ ("r3") = flags; + register void * __arg __asm__ ("r9") = arg; + register int * __ptidptr __asm__ ("r5") = parent_tidptr; + register void * __newtls __asm__ ("r6") = newtls; + register int * __ctidptr __asm__ ("r7") = child_tidptr; + __asm__ __volatile__( + /* check for fn == NULL + * and child_stack == NULL + */ + "cmpwi cr0, %6, 0\n\t" + "cmpwi cr1, %7, 0\n\t" + "cror cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t" + "beq- cr0, 1f\n\t" + + /* set up stack frame for child */ + "clrrwi %7, %7, 4\n\t" + "li 0, 0\n\t" + "stwu 0, -16(%7)\n\t" + + /* fn, arg, child_stack are saved across the syscall: r28-30 */ + "mr 28, %6\n\t" + "mr 29, %7\n\t" + "mr 27, %9\n\t" + + /* syscall */ + "li 0, %4\n\t" + /* flags already in r3 + * child_stack already in r4 + * ptidptr already in r5 + * newtls already in r6 + * ctidptr already in r7 + */ + "sc\n\t" + + /* Test if syscall was successful */ + "cmpwi cr1, 3, 0\n\t" + "crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t" + "bne- cr1, 1f\n\t" + + /* Do the function call */ + "mtctr 28\n\t" + "mr 3, 27\n\t" + "bctrl\n\t" + + /* Call _exit(r3) */ + "li 0, %5\n\t" + "sc\n\t" + + /* Return to parent */ + "1:\n" + "mfcr %1\n\t" + "mr %0, 3\n\t" + : "=r" (__ret), "=r" (__err) + : "0" (-1), "1" (EINVAL), + 
"i" (__NR_clone), "i" (__NR_exit), + "r" (__fn), "r" (__cstack), "r" (__flags), + "r" (__arg), "r" (__ptidptr), "r" (__newtls), + "r" (__ctidptr) + : "cr0", "cr1", "memory", "ctr", + "r0", "r29", "r27", "r28"); + } + LSS_RETURN(int, __ret, __err); + } + #endif + #define __NR__exit __NR_exit + #define __NR__gettid __NR_gettid + #define __NR__mremap __NR_mremap + LSS_INLINE _syscall1(int, brk, void *, e) + LSS_INLINE _syscall1(int, chdir, const char *,p) + LSS_INLINE _syscall1(int, close, int, f) + LSS_INLINE _syscall2(int, clock_getres, int, c, + struct kernel_timespec*, t) + LSS_INLINE _syscall2(int, clock_gettime, int, c, + struct kernel_timespec*, t) + LSS_INLINE _syscall1(int, dup, int, f) + LSS_INLINE _syscall2(int, dup2, int, s, + int, d) + LSS_INLINE _syscall3(int, execve, const char*, f, + const char*const*,a,const char*const*, e) + LSS_INLINE _syscall1(int, _exit, int, e) + LSS_INLINE _syscall1(int, exit_group, int, e) + LSS_INLINE _syscall3(int, fcntl, int, f, + int, c, long, a) + LSS_INLINE _syscall0(pid_t, fork) + LSS_INLINE _syscall2(int, fstat, int, f, + struct kernel_stat*, b) + LSS_INLINE _syscall2(int, fstatfs, int, f, + struct kernel_statfs*, b) + LSS_INLINE _syscall2(int, ftruncate, int, f, + off_t, l) + LSS_INLINE _syscall4(int, futex, int*, a, + int, o, int, v, + struct kernel_timespec*, t) + LSS_INLINE _syscall3(int, getdents, int, f, + struct kernel_dirent*, d, int, c) + LSS_INLINE _syscall3(int, getdents64, int, f, + struct kernel_dirent64*, d, int, c) + LSS_INLINE _syscall0(gid_t, getegid) + LSS_INLINE _syscall0(uid_t, geteuid) + LSS_INLINE _syscall0(pid_t, getpgrp) + LSS_INLINE _syscall0(pid_t, getpid) + LSS_INLINE _syscall0(pid_t, getppid) + LSS_INLINE _syscall2(int, getpriority, int, a, + int, b) + LSS_INLINE _syscall3(int, getresgid, gid_t *, r, + gid_t *, e, gid_t *, s) + LSS_INLINE _syscall3(int, getresuid, uid_t *, r, + uid_t *, e, uid_t *, s) + LSS_INLINE _syscall2(int, getrlimit, int, r, + struct kernel_rlimit*, l) + LSS_INLINE 
_syscall1(pid_t, getsid, pid_t, p) + LSS_INLINE _syscall0(pid_t, _gettid) + LSS_INLINE _syscall2(int, gettimeofday, struct timeval *, v, + struct timezone *, z) + LSS_INLINE _syscall5(int, setxattr, const char *,p, + const char *, n, const void *,v, + size_t, s, int, f) + LSS_INLINE _syscall5(int, lsetxattr, const char *,p, + const char *, n, const void *,v, + size_t, s, int, f) + LSS_INLINE _syscall4(ssize_t, getxattr, const char *,p, + const char *, n, void *, v, size_t, s) + LSS_INLINE _syscall4(ssize_t, lgetxattr, const char *,p, + const char *, n, void *, v, size_t, s) + LSS_INLINE _syscall3(ssize_t, listxattr, const char *,p, + char *, l, size_t, s) + LSS_INLINE _syscall3(ssize_t, llistxattr, const char *,p, + char *, l, size_t, s) + LSS_INLINE _syscall3(int, ioctl, int, d, + int, r, void *, a) + LSS_INLINE _syscall2(int, ioprio_get, int, which, + int, who) + LSS_INLINE _syscall3(int, ioprio_set, int, which, + int, who, int, ioprio) + LSS_INLINE _syscall2(int, kill, pid_t, p, + int, s) + LSS_INLINE _syscall3(off_t, lseek, int, f, + off_t, o, int, w) + LSS_INLINE _syscall2(int, munmap, void*, s, + size_t, l) + LSS_INLINE _syscall6(long, move_pages, pid_t, p, + unsigned long, n, void **,g, int *, d, + int *, s, int, f) + LSS_INLINE _syscall3(int, mprotect, const void *,a, + size_t, l, int, p) + LSS_INLINE _syscall5(void*, _mremap, void*, o, + size_t, os, size_t, ns, + unsigned long, f, void *, a) + LSS_INLINE _syscall3(int, open, const char*, p, + int, f, int, m) + LSS_INLINE _syscall3(int, poll, struct kernel_pollfd*, u, + unsigned int, n, int, t) + LSS_INLINE _syscall2(int, prctl, int, o, + long, a) + LSS_INLINE _syscall4(long, ptrace, int, r, + pid_t, p, void *, a, void *, d) + #if defined(__NR_quotactl) + // Defined on x86_64 / i386 only + LSS_INLINE _syscall4(int, quotactl, int, cmd, const char *, special, + int, id, caddr_t, addr) + #endif + LSS_INLINE _syscall3(ssize_t, read, int, f, + void *, b, size_t, c) + LSS_INLINE _syscall3(int, readlink, const 
char*, p, + char*, b, size_t, s) + LSS_INLINE _syscall4(int, rt_sigaction, int, s, + const struct kernel_sigaction*, a, + struct kernel_sigaction*, o, size_t, c) + LSS_INLINE _syscall2(int, rt_sigpending, struct kernel_sigset_t *, s, + size_t, c) + LSS_INLINE _syscall4(int, rt_sigprocmask, int, h, + const struct kernel_sigset_t*, s, + struct kernel_sigset_t*, o, size_t, c); + LSS_INLINE _syscall1(int, rt_sigreturn, unsigned long, u); + LSS_INLINE _syscall2(int, rt_sigsuspend, + const struct kernel_sigset_t*, s, size_t, c); + LSS_INLINE _syscall3(int, sched_getaffinity,pid_t, p, + unsigned int, l, unsigned long *, m) + LSS_INLINE _syscall3(int, sched_setaffinity,pid_t, p, + unsigned int, l, unsigned long *, m) + LSS_INLINE _syscall0(int, sched_yield) + LSS_INLINE _syscall1(long, set_tid_address, int *, t) + LSS_INLINE _syscall1(int, setfsgid, gid_t, g) + LSS_INLINE _syscall1(int, setfsuid, uid_t, u) + LSS_INLINE _syscall1(int, setuid, uid_t, u) + LSS_INLINE _syscall1(int, setgid, gid_t, g) + LSS_INLINE _syscall2(int, setpgid, pid_t, p, + pid_t, g) + LSS_INLINE _syscall3(int, setpriority, int, a, + int, b, int, p) + LSS_INLINE _syscall3(int, setresgid, gid_t, r, + gid_t, e, gid_t, s) + LSS_INLINE _syscall3(int, setresuid, uid_t, r, + uid_t, e, uid_t, s) + LSS_INLINE _syscall2(int, setrlimit, int, r, + const struct kernel_rlimit*, l) + LSS_INLINE _syscall0(pid_t, setsid) + LSS_INLINE _syscall2(int, sigaltstack, const stack_t*, s, + const stack_t*, o) + #if defined(__NR_sigreturn) + LSS_INLINE _syscall1(int, sigreturn, unsigned long, u); + #endif + LSS_INLINE _syscall2(int, stat, const char*, f, + struct kernel_stat*, b) + LSS_INLINE _syscall2(int, statfs, const char*, f, + struct kernel_statfs*, b) + LSS_INLINE _syscall3(int, tgkill, pid_t, p, + pid_t, t, int, s) + LSS_INLINE _syscall2(int, tkill, pid_t, p, + int, s) + LSS_INLINE _syscall3(ssize_t, write, int, f, + const void *, b, size_t, c) + LSS_INLINE _syscall3(ssize_t, writev, int, f, + const struct 
kernel_iovec*, v, size_t, c) + LSS_INLINE _syscall1(int, unlink, const char*, f) + #if defined(__NR_getcpu) + LSS_INLINE _syscall3(long, getcpu, unsigned *, cpu, + unsigned *, node, void *, unused); + #endif + #if defined(__x86_64__) || \ + (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32) + LSS_INLINE _syscall3(int, recvmsg, int, s, + struct kernel_msghdr*, m, int, f) + LSS_INLINE _syscall3(int, sendmsg, int, s, + const struct kernel_msghdr*, m, int, f) + LSS_INLINE _syscall6(int, sendto, int, s, + const void*, m, size_t, l, + int, f, + const struct kernel_sockaddr*, a, int, t) + LSS_INLINE _syscall2(int, shutdown, int, s, + int, h) + LSS_INLINE _syscall3(int, socket, int, d, + int, t, int, p) + LSS_INLINE _syscall4(int, socketpair, int, d, + int, t, int, p, int*, s) + #endif + #if defined(__x86_64__) + LSS_INLINE _syscall4(int, fallocate, int, fd, int, mode, + loff_t, offset, loff_t, len) + + LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid, + gid_t *egid, + gid_t *sgid) { + return LSS_NAME(getresgid)(rgid, egid, sgid); + } + + LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid, + uid_t *euid, + uid_t *suid) { + return LSS_NAME(getresuid)(ruid, euid, suid); + } + + LSS_INLINE _syscall6(void*, mmap, void*, s, + size_t, l, int, p, + int, f, int, d, + __off64_t, o) + + LSS_INLINE _syscall4(int, newfstatat, int, d, + const char *, p, + struct kernel_stat*, b, int, f) + + LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) { + return LSS_NAME(setfsgid)(gid); + } + + LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) { + return LSS_NAME(setfsuid)(uid); + } + + LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) { + return LSS_NAME(setresgid)(rgid, egid, sgid); + } + + LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) { + return LSS_NAME(setresuid)(ruid, euid, suid); + } + + LSS_INLINE int LSS_NAME(sigaction)(int signum, + const struct kernel_sigaction *act, + struct kernel_sigaction *oldact) { + /* On x86_64, the kernel requires us to 
always set our own + * SA_RESTORER in order to be able to return from a signal handler. + * This function must have a "magic" signature that the "gdb" + * (and maybe the kernel?) can recognize. + */ + if (act != NULL && !(act->sa_flags & SA_RESTORER)) { + struct kernel_sigaction a = *act; + a.sa_flags |= SA_RESTORER; + a.sa_restorer = LSS_NAME(restore_rt)(); + return LSS_NAME(rt_sigaction)(signum, &a, oldact, + (KERNEL_NSIG+7)/8); + } else { + return LSS_NAME(rt_sigaction)(signum, act, oldact, + (KERNEL_NSIG+7)/8); + } + } + + LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) { + return LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8); + } + + LSS_INLINE int LSS_NAME(sigprocmask)(int how, + const struct kernel_sigset_t *set, + struct kernel_sigset_t *oldset) { + return LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8); + } + + LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) { + return LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8); + } + #endif + #if defined(__x86_64__) || defined(__ARM_ARCH_3__) || \ + (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32) + LSS_INLINE _syscall4(pid_t, wait4, pid_t, p, + int*, s, int, o, + struct kernel_rusage*, r) + + LSS_INLINE pid_t LSS_NAME(waitpid)(pid_t pid, int *status, int options){ + return LSS_NAME(wait4)(pid, status, options, 0); + } + #endif + #if defined(__i386__) || defined(__x86_64__) + LSS_INLINE _syscall4(int, openat, int, d, const char *, p, int, f, int, m) + LSS_INLINE _syscall3(int, unlinkat, int, d, const char *, p, int, f) + #endif + #if defined(__i386__) || defined(__ARM_ARCH_3__) + #define __NR__getresgid32 __NR_getresgid32 + #define __NR__getresuid32 __NR_getresuid32 + #define __NR__setfsgid32 __NR_setfsgid32 + #define __NR__setfsuid32 __NR_setfsuid32 + #define __NR__setresgid32 __NR_setresgid32 + #define __NR__setresuid32 __NR_setresuid32 + LSS_INLINE _syscall2(int, ugetrlimit, int, r, + struct kernel_rlimit*, l) + LSS_INLINE _syscall3(int, _getresgid32, 
gid_t *, r, + gid_t *, e, gid_t *, s) + LSS_INLINE _syscall3(int, _getresuid32, uid_t *, r, + uid_t *, e, uid_t *, s) + LSS_INLINE _syscall1(int, _setfsgid32, gid_t, f) + LSS_INLINE _syscall1(int, _setfsuid32, uid_t, f) + LSS_INLINE _syscall3(int, _setresgid32, gid_t, r, + gid_t, e, gid_t, s) + LSS_INLINE _syscall3(int, _setresuid32, uid_t, r, + uid_t, e, uid_t, s) + + LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid, + gid_t *egid, + gid_t *sgid) { + int rc; + if ((rc = LSS_NAME(_getresgid32)(rgid, egid, sgid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((rgid == NULL) || (egid == NULL) || (sgid == NULL)) { + return EFAULT; + } + // Clear the high bits first, since getresgid only sets 16 bits + *rgid = *egid = *sgid = 0; + rc = LSS_NAME(getresgid)(rgid, egid, sgid); + } + return rc; + } + + LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid, + uid_t *euid, + uid_t *suid) { + int rc; + if ((rc = LSS_NAME(_getresuid32)(ruid, euid, suid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((ruid == NULL) || (euid == NULL) || (suid == NULL)) { + return EFAULT; + } + // Clear the high bits first, since getresuid only sets 16 bits + *ruid = *euid = *suid = 0; + rc = LSS_NAME(getresuid)(ruid, euid, suid); + } + return rc; + } + + LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) { + int rc; + if ((rc = LSS_NAME(_setfsgid32)(gid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)gid & ~0xFFFFu) { + rc = EINVAL; + } else { + rc = LSS_NAME(setfsgid)(gid); + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) { + int rc; + if ((rc = LSS_NAME(_setfsuid32)(uid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)uid & ~0xFFFFu) { + rc = EINVAL; + } else { + rc = LSS_NAME(setfsuid)(uid); + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) { + int rc; + if ((rc = LSS_NAME(_setresgid32)(rgid, egid, sgid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)rgid & ~0xFFFFu || + (unsigned int)egid & ~0xFFFFu || + (unsigned int)sgid 
& ~0xFFFFu) { + rc = EINVAL; + } else { + rc = LSS_NAME(setresgid)(rgid, egid, sgid); + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) { + int rc; + if ((rc = LSS_NAME(_setresuid32)(ruid, euid, suid)) < 0 && + LSS_ERRNO == ENOSYS) { + if ((unsigned int)ruid & ~0xFFFFu || + (unsigned int)euid & ~0xFFFFu || + (unsigned int)suid & ~0xFFFFu) { + rc = EINVAL; + } else { + rc = LSS_NAME(setresuid)(ruid, euid, suid); + } + } + return rc; + } + #endif + LSS_INLINE int LSS_NAME(sigemptyset)(struct kernel_sigset_t *set) { + memset(&set->sig, 0, sizeof(set->sig)); + return 0; + } + + LSS_INLINE int LSS_NAME(sigfillset)(struct kernel_sigset_t *set) { + memset(&set->sig, -1, sizeof(set->sig)); + return 0; + } + + LSS_INLINE int LSS_NAME(sigaddset)(struct kernel_sigset_t *set, + int signum) { + if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { + LSS_ERRNO = EINVAL; + return -1; + } else { + set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] + |= 1UL << ((signum - 1) % (8*sizeof(set->sig[0]))); + return 0; + } + } + + LSS_INLINE int LSS_NAME(sigdelset)(struct kernel_sigset_t *set, + int signum) { + if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { + LSS_ERRNO = EINVAL; + return -1; + } else { + set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] + &= ~(1UL << ((signum - 1) % (8*sizeof(set->sig[0])))); + return 0; + } + } + + LSS_INLINE int LSS_NAME(sigismember)(struct kernel_sigset_t *set, + int signum) { + if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { + LSS_ERRNO = EINVAL; + return -1; + } else { + return !!(set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] & + (1UL << ((signum - 1) % (8*sizeof(set->sig[0]))))); + } + } + #if defined(__i386__) || defined(__ARM_ARCH_3__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || defined(__PPC__) + #define __NR__sigaction __NR_sigaction + #define __NR__sigpending __NR_sigpending + #define __NR__sigprocmask __NR_sigprocmask + #define __NR__sigsuspend __NR_sigsuspend 
+ #define __NR__socketcall __NR_socketcall + LSS_INLINE _syscall2(int, fstat64, int, f, + struct kernel_stat64 *, b) + LSS_INLINE _syscall5(int, _llseek, uint, fd, ulong, hi, ulong, lo, + loff_t *, res, uint, wh) + LSS_INLINE _syscall1(void*, mmap, void*, a) + LSS_INLINE _syscall6(void*, mmap2, void*, s, + size_t, l, int, p, + int, f, int, d, + __off64_t, o) + LSS_INLINE _syscall3(int, _sigaction, int, s, + const struct kernel_old_sigaction*, a, + struct kernel_old_sigaction*, o) + LSS_INLINE _syscall1(int, _sigpending, unsigned long*, s) + LSS_INLINE _syscall3(int, _sigprocmask, int, h, + const unsigned long*, s, + unsigned long*, o) + #ifdef __PPC__ + LSS_INLINE _syscall1(int, _sigsuspend, unsigned long, s) + #else + LSS_INLINE _syscall3(int, _sigsuspend, const void*, a, + int, b, + unsigned long, s) + #endif + LSS_INLINE _syscall2(int, stat64, const char *, p, + struct kernel_stat64 *, b) + + LSS_INLINE int LSS_NAME(sigaction)(int signum, + const struct kernel_sigaction *act, + struct kernel_sigaction *oldact) { + int old_errno = LSS_ERRNO; + int rc; + struct kernel_sigaction a; + if (act != NULL) { + a = *act; + #ifdef __i386__ + /* On i386, the kernel requires us to always set our own + * SA_RESTORER when using realtime signals. Otherwise, it does not + * know how to return from a signal handler. This function must have + * a "magic" signature that the "gdb" (and maybe the kernel?) can + * recognize. + * Apparently, a SA_RESTORER is implicitly set by the kernel, when + * using non-realtime signals. + * + * TODO: Test whether ARM needs a restorer + */ + if (!(a.sa_flags & SA_RESTORER)) { + a.sa_flags |= SA_RESTORER; + a.sa_restorer = (a.sa_flags & SA_SIGINFO) + ? LSS_NAME(restore_rt)() : LSS_NAME(restore)(); + } + #endif + } + rc = LSS_NAME(rt_sigaction)(signum, act ? 
&a : act, oldact, + (KERNEL_NSIG+7)/8); + if (rc < 0 && LSS_ERRNO == ENOSYS) { + struct kernel_old_sigaction oa, ooa, *ptr_a = &oa, *ptr_oa = &ooa; + if (!act) { + ptr_a = NULL; + } else { + oa.sa_handler_ = act->sa_handler_; + memcpy(&oa.sa_mask, &act->sa_mask, sizeof(oa.sa_mask)); + #ifndef __mips__ + oa.sa_restorer = act->sa_restorer; + #endif + oa.sa_flags = act->sa_flags; + } + if (!oldact) { + ptr_oa = NULL; + } + LSS_ERRNO = old_errno; + rc = LSS_NAME(_sigaction)(signum, ptr_a, ptr_oa); + if (rc == 0 && oldact) { + if (act) { + memcpy(oldact, act, sizeof(*act)); + } else { + memset(oldact, 0, sizeof(*oldact)); + } + oldact->sa_handler_ = ptr_oa->sa_handler_; + oldact->sa_flags = ptr_oa->sa_flags; + memcpy(&oldact->sa_mask, &ptr_oa->sa_mask, sizeof(ptr_oa->sa_mask)); + #ifndef __mips__ + oldact->sa_restorer = ptr_oa->sa_restorer; + #endif + } + } + return rc; + } + + LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) { + int old_errno = LSS_ERRNO; + int rc = LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8); + if (rc < 0 && LSS_ERRNO == ENOSYS) { + LSS_ERRNO = old_errno; + LSS_NAME(sigemptyset)(set); + rc = LSS_NAME(_sigpending)(&set->sig[0]); + } + return rc; + } + + LSS_INLINE int LSS_NAME(sigprocmask)(int how, + const struct kernel_sigset_t *set, + struct kernel_sigset_t *oldset) { + int olderrno = LSS_ERRNO; + int rc = LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8); + if (rc < 0 && LSS_ERRNO == ENOSYS) { + LSS_ERRNO = olderrno; + if (oldset) { + LSS_NAME(sigemptyset)(oldset); + } + rc = LSS_NAME(_sigprocmask)(how, + set ? &set->sig[0] : NULL, + oldset ? 
&oldset->sig[0] : NULL); + } + return rc; + } + + LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) { + int olderrno = LSS_ERRNO; + int rc = LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8); + if (rc < 0 && LSS_ERRNO == ENOSYS) { + LSS_ERRNO = olderrno; + rc = LSS_NAME(_sigsuspend)( + #ifndef __PPC__ + set, 0, + #endif + set->sig[0]); + } + return rc; + } + #endif + #if defined(__PPC__) + #undef LSS_SC_LOADARGS_0 + #define LSS_SC_LOADARGS_0(dummy...) + #undef LSS_SC_LOADARGS_1 + #define LSS_SC_LOADARGS_1(arg1) \ + __sc_4 = (unsigned long) (arg1) + #undef LSS_SC_LOADARGS_2 + #define LSS_SC_LOADARGS_2(arg1, arg2) \ + LSS_SC_LOADARGS_1(arg1); \ + __sc_5 = (unsigned long) (arg2) + #undef LSS_SC_LOADARGS_3 + #define LSS_SC_LOADARGS_3(arg1, arg2, arg3) \ + LSS_SC_LOADARGS_2(arg1, arg2); \ + __sc_6 = (unsigned long) (arg3) + #undef LSS_SC_LOADARGS_4 + #define LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4) \ + LSS_SC_LOADARGS_3(arg1, arg2, arg3); \ + __sc_7 = (unsigned long) (arg4) + #undef LSS_SC_LOADARGS_5 + #define LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5) \ + LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4); \ + __sc_8 = (unsigned long) (arg5) + #undef LSS_SC_BODY + #define LSS_SC_BODY(nr, type, opt, args...) 
\ + long __sc_ret, __sc_err; \ + { \ + register unsigned long __sc_0 __asm__ ("r0") = __NR_socketcall; \ + register unsigned long __sc_3 __asm__ ("r3") = opt; \ + register unsigned long __sc_4 __asm__ ("r4"); \ + register unsigned long __sc_5 __asm__ ("r5"); \ + register unsigned long __sc_6 __asm__ ("r6"); \ + register unsigned long __sc_7 __asm__ ("r7"); \ + register unsigned long __sc_8 __asm__ ("r8"); \ + LSS_SC_LOADARGS_##nr(args); \ + __asm__ __volatile__ \ + ("stwu 1, -48(1)\n\t" \ + "stw 4, 20(1)\n\t" \ + "stw 5, 24(1)\n\t" \ + "stw 6, 28(1)\n\t" \ + "stw 7, 32(1)\n\t" \ + "stw 8, 36(1)\n\t" \ + "addi 4, 1, 20\n\t" \ + "sc\n\t" \ + "mfcr %0" \ + : "=&r" (__sc_0), \ + "=&r" (__sc_3), "=&r" (__sc_4), \ + "=&r" (__sc_5), "=&r" (__sc_6), \ + "=&r" (__sc_7), "=&r" (__sc_8) \ + : LSS_ASMINPUT_##nr \ + : "cr0", "ctr", "memory"); \ + __sc_ret = __sc_3; \ + __sc_err = __sc_0; \ + } \ + LSS_RETURN(type, __sc_ret, __sc_err) + + LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg, + int flags){ + LSS_SC_BODY(3, ssize_t, 17, s, msg, flags); + } + + LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s, + const struct kernel_msghdr *msg, + int flags) { + LSS_SC_BODY(3, ssize_t, 16, s, msg, flags); + } + + // TODO(csilvers): why is this ifdef'ed out? 
+#if 0 + LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len, + int flags, + const struct kernel_sockaddr *to, + unsigned int tolen) { + LSS_BODY(6, ssize_t, 11, s, buf, len, flags, to, tolen); + } +#endif + + LSS_INLINE int LSS_NAME(shutdown)(int s, int how) { + LSS_SC_BODY(2, int, 13, s, how); + } + + LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) { + LSS_SC_BODY(3, int, 1, domain, type, protocol); + } + + LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol, + int sv[2]) { + LSS_SC_BODY(4, int, 8, d, type, protocol, sv); + } + #endif + #if defined(__i386__) || defined(__ARM_ARCH_3__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) + #define __NR__socketcall __NR_socketcall + LSS_INLINE _syscall2(int, _socketcall, int, c, + va_list, a) + + LSS_INLINE int LSS_NAME(socketcall)(int op, ...) { + int rc; + va_list ap; + va_start(ap, op); + rc = LSS_NAME(_socketcall)(op, ap); + va_end(ap); + return rc; + } + + LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg, + int flags){ + return (ssize_t)LSS_NAME(socketcall)(17, s, msg, flags); + } + + LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s, + const struct kernel_msghdr *msg, + int flags) { + return (ssize_t)LSS_NAME(socketcall)(16, s, msg, flags); + } + + LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len, + int flags, + const struct kernel_sockaddr *to, + unsigned int tolen) { + return (ssize_t)LSS_NAME(socketcall)(11, s, buf, len, flags, to, tolen); + } + + LSS_INLINE int LSS_NAME(shutdown)(int s, int how) { + return LSS_NAME(socketcall)(13, s, how); + } + + LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) { + return LSS_NAME(socketcall)(1, domain, type, protocol); + } + + LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol, + int sv[2]) { + return LSS_NAME(socketcall)(8, d, type, protocol, sv); + } + #endif + #if defined(__i386__) || defined(__PPC__) + LSS_INLINE _syscall4(int, fstatat64, 
int, d, + const char *, p, + struct kernel_stat64 *, b, int, f) + #endif + #if defined(__i386__) || defined(__PPC__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) + LSS_INLINE _syscall3(pid_t, waitpid, pid_t, p, + int*, s, int, o) + #endif + #if defined(__mips__) + /* sys_pipe() on MIPS has non-standard calling conventions, as it returns + * both file handles through CPU registers. + */ + LSS_INLINE int LSS_NAME(pipe)(int *p) { + register unsigned long __v0 __asm__("$2") = __NR_pipe; + register unsigned long __v1 __asm__("$3"); + register unsigned long __r7 __asm__("$7"); + __asm__ __volatile__ ("syscall\n" + : "=&r"(__v0), "=&r"(__v1), "+r" (__r7) + : "0"(__v0) + : "$8", "$9", "$10", "$11", "$12", + "$13", "$14", "$15", "$24", "memory"); + if (__r7) { + LSS_ERRNO = __v0; + return -1; + } else { + p[0] = __v0; + p[1] = __v1; + return 0; + } + } + #else + LSS_INLINE _syscall1(int, pipe, int *, p) + #endif + /* TODO(csilvers): see if ppc can/should support this as well */ + #if defined(__i386__) || defined(__ARM_ARCH_3__) || \ + (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64) + #define __NR__statfs64 __NR_statfs64 + #define __NR__fstatfs64 __NR_fstatfs64 + LSS_INLINE _syscall3(int, _statfs64, const char*, p, + size_t, s,struct kernel_statfs64*, b) + LSS_INLINE _syscall3(int, _fstatfs64, int, f, + size_t, s,struct kernel_statfs64*, b) + LSS_INLINE int LSS_NAME(statfs64)(const char *p, + struct kernel_statfs64 *b) { + return LSS_NAME(_statfs64)(p, sizeof(*b), b); + } + LSS_INLINE int LSS_NAME(fstatfs64)(int f,struct kernel_statfs64 *b) { + return LSS_NAME(_fstatfs64)(f, sizeof(*b), b); + } + #endif + + LSS_INLINE int LSS_NAME(execv)(const char *path, const char *const argv[]) { + extern char **environ; + return LSS_NAME(execve)(path, argv, (const char *const *)environ); + } + + LSS_INLINE pid_t LSS_NAME(gettid)() { + pid_t tid = LSS_NAME(_gettid)(); + if (tid != -1) { + return tid; + } + return LSS_NAME(getpid)(); + } + + LSS_INLINE void 
*LSS_NAME(mremap)(void *old_address, size_t old_size, + size_t new_size, int flags, ...) { + va_list ap; + void *new_address, *rc; + va_start(ap, flags); + new_address = va_arg(ap, void *); + rc = LSS_NAME(_mremap)(old_address, old_size, new_size, + flags, new_address); + va_end(ap); + return rc; + } + + LSS_INLINE int LSS_NAME(ptrace_detach)(pid_t pid) { + /* PTRACE_DETACH can sometimes forget to wake up the tracee and it + * then sends job control signals to the real parent, rather than to + * the tracer. We reduce the risk of this happening by starting a + * whole new time slice, and then quickly sending a SIGCONT signal + * right after detaching from the tracee. + * + * We use tkill to ensure that we only issue a wakeup for the thread being + * detached. Large multi threaded apps can take a long time in the kernel + * processing SIGCONT. + */ + int rc, err; + LSS_NAME(sched_yield)(); + rc = LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0); + err = LSS_ERRNO; + LSS_NAME(tkill)(pid, SIGCONT); + /* Old systems don't have tkill */ + if (LSS_ERRNO == ENOSYS) + LSS_NAME(kill)(pid, SIGCONT); + LSS_ERRNO = err; + return rc; + } + + LSS_INLINE int LSS_NAME(raise)(int sig) { + return LSS_NAME(kill)(LSS_NAME(getpid)(), sig); + } + + LSS_INLINE int LSS_NAME(setpgrp)() { + return LSS_NAME(setpgid)(0, 0); + } + + LSS_INLINE int LSS_NAME(sysconf)(int name) { + extern int __getpagesize(void); + switch (name) { + case _SC_OPEN_MAX: { + struct kernel_rlimit limit; + return LSS_NAME(getrlimit)(RLIMIT_NOFILE, &limit) < 0 + ? 
8192 : limit.rlim_cur; + } + case _SC_PAGESIZE: + return __getpagesize(); + default: + LSS_ERRNO = ENOSYS; + return -1; + } + } + #if defined(__x86_64__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64) + LSS_INLINE _syscall4(ssize_t, pread64, int, f, + void *, b, size_t, c, + loff_t, o) + LSS_INLINE _syscall4(ssize_t, pwrite64, int, f, + const void *, b, size_t, c, + loff_t, o) + LSS_INLINE _syscall3(int, readahead, int, f, + loff_t, o, unsigned, c) + #else + #define __NR__pread64 __NR_pread64 + #define __NR__pwrite64 __NR_pwrite64 + #define __NR__readahead __NR_readahead + LSS_INLINE _syscall5(ssize_t, _pread64, int, f, + void *, b, size_t, c, unsigned, o1, + unsigned, o2) + LSS_INLINE _syscall5(ssize_t, _pwrite64, int, f, + const void *, b, size_t, c, unsigned, o1, + long, o2) + LSS_INLINE _syscall4(int, _readahead, int, f, + unsigned, o1, unsigned, o2, size_t, c); + /* We force 64bit-wide parameters onto the stack, then access each + * 32-bit component individually. This guarantees that we build the + * correct parameters independent of the native byte-order of the + * underlying architecture. 
+ */ + LSS_INLINE ssize_t LSS_NAME(pread64)(int fd, void *buf, size_t count, + loff_t off) { + union { loff_t off; unsigned arg[2]; } o = { off }; + return LSS_NAME(_pread64)(fd, buf, count, o.arg[0], o.arg[1]); + } + LSS_INLINE ssize_t LSS_NAME(pwrite64)(int fd, const void *buf, + size_t count, loff_t off) { + union { loff_t off; unsigned arg[2]; } o = { off }; + return LSS_NAME(_pwrite64)(fd, buf, count, o.arg[0], o.arg[1]); + } + LSS_INLINE int LSS_NAME(readahead)(int fd, loff_t off, int len) { + union { loff_t off; unsigned arg[2]; } o = { off }; + return LSS_NAME(_readahead)(fd, o.arg[0], o.arg[1], len); + } + #endif +#endif + +#if defined(__cplusplus) && !defined(SYS_CPLUSPLUS) +} +#endif + +#endif +#endif diff --git a/sandbox/linux/seccomp/madvise.cc b/sandbox/linux/seccomp/madvise.cc new file mode 100644 index 0000000..738da7f --- /dev/null +++ b/sandbox/linux/seccomp/madvise.cc @@ -0,0 +1,75 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_madvise(void* start, size_t length, int advice) { + Debug::syscall(__NR_madvise, "Executing handler"); + struct { + int sysnum; + long long cookie; + MAdvise madvise_req; + } __attribute__((packed)) request; + request.sysnum = __NR_madvise; + request.cookie = cookie(); + request.madvise_req.start = start; + request.madvise_req.len = length; + request.madvise_req.advice = advice; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward madvise() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_madvise(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + MAdvise madvise_req; + SysCalls sys; + if (read(sys, sandboxFd, &madvise_req, sizeof(madvise_req)) != + sizeof(madvise_req)) { + die("Failed to read parameters for madvise() [process]"); + } + 
int rc = -EINVAL; + switch (madvise_req.advice) { + case MADV_NORMAL: + case MADV_RANDOM: + case MADV_SEQUENTIAL: + case MADV_WILLNEED: + ok: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_madvise, + madvise_req.start, madvise_req.len, + madvise_req.advice); + return true; + default: + // All other flags to madvise() are potential dangerous (as opposed to + // merely affecting overall performance). Do not allow them on memory + // ranges that were part of the original mappings. + void *stop = reinterpret_cast<void *>( + (char *)madvise_req.start + madvise_req.len); + ProtectedMap::const_iterator iter = protectedMap_.lower_bound( + (void *)madvise_req.start); + if (iter != protectedMap_.begin()) { + --iter; + } + for (; iter != protectedMap_.end() && iter->first < stop; ++iter) { + if (madvise_req.start < reinterpret_cast<void *>( + reinterpret_cast<char *>(iter->first) + iter->second) && + stop > iter->first) { + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } + } + + // Changing attributes on memory regions that were newly mapped inside of + // the sandbox is OK. 
+ goto ok; + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/maps.cc b/sandbox/linux/seccomp/maps.cc new file mode 100644 index 0000000..606b65d --- /dev/null +++ b/sandbox/linux/seccomp/maps.cc @@ -0,0 +1,330 @@ +#include <errno.h> +#include <fcntl.h> +#include <iostream> +#include <linux/unistd.h> +#include <signal.h> +#include <stdarg.h> +#include <stdlib.h> +#include <sys/ptrace.h> +#include <sys/types.h> +#include <sys/wait.h> + +#include "library.h" +#include "maps.h" +#include "sandbox_impl.h" + +namespace playground { + +Maps::Maps(const std::string& maps_file) : + maps_file_(maps_file), + begin_iter_(this, true, false), + end_iter_(this, false, true), + pid_(-1), + vsyscall_(0) { + memset(fds_, -1, sizeof(fds_)); + int fd = open(maps_file.c_str(), O_RDONLY); + Sandbox::SysCalls sys; + if (fd >= 0) { + char buf[256] = { 0 }; + int len = 0, rc = 1; + bool long_line = false; + do { + if (rc > 0) { + rc = Sandbox::read(sys, fd, buf + len, sizeof(buf) - len - 1); + if (rc > 0) { + len += rc; + } + } + char *ptr = buf; + if (!long_line) { + long_line = true; + unsigned long start = strtoul(ptr, &ptr, 16); + unsigned long stop = strtoul(ptr + 1, &ptr, 16); + while (*ptr == ' ' || *ptr == '\t') ++ptr; + char *perm_ptr = ptr; + while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; + std::string perm(perm_ptr, ptr - perm_ptr); + unsigned long offset = strtoul(ptr, &ptr, 16); + while (*ptr == ' ' || *ptr == '\t') ++ptr; + char *id_ptr = ptr; + while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; + while (*ptr == ' ' || *ptr == '\t') ++ptr; + while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; + std::string id(id_ptr, ptr - id_ptr); + while (*ptr == ' ' || *ptr == '\t') ++ptr; + char *library_ptr = ptr; + while (*ptr && *ptr != ' ' && *ptr != '\t' && *ptr != '\n') ++ptr; + std::string library(library_ptr, ptr - library_ptr); + bool isVDSO = false; + if (library == "[vdso]") { + // /proc/self/maps has a misleading file offset in the [vdso] entry. 
+ // Override it with a sane value. + offset = 0; + isVDSO = true; + } else if (library == "[vsyscall]") { + vsyscall_ = reinterpret_cast<char *>(start); + } else if (library.empty() || library[0] == '[') { + goto skip_entry; + } + int prot = 0; + if (perm.find('r') != std::string::npos) { + prot |= PROT_READ; + } + if (perm.find('w') != std::string::npos) { + prot |= PROT_WRITE; + } + if (perm.find('x') != std::string::npos) { + prot |= PROT_EXEC; + } + if ((prot & (PROT_EXEC | PROT_READ)) == 0) { + goto skip_entry; + } + libs_[id + ' ' + library].addMemoryRange( + reinterpret_cast<void *>(start), + reinterpret_cast<void *>(stop), + Elf_Addr(offset), + prot, isVDSO); + } + skip_entry: + for (;;) { + if (!*ptr || *ptr++ == '\n') { + long_line = false; + memmove(buf, ptr, len - (ptr - buf)); + memset(buf + len - (ptr - buf), 0, ptr - buf); + len -= (ptr - buf); + break; + } + } + } while (len || long_line); + NOINTR_SYS(close(fd)); + + // The runtime loader clobbers some of the data that we want to read, + // when it relocates objects. As we cannot trust the filename that we + // obtained from /proc/self/maps, we instead fork() a child process and + // use mremap() to uncover the obscured data. + int tmp_fds[4]; + pipe(tmp_fds); + pipe(tmp_fds + 2); + pid_ = fork(); + if (pid_ >= 0) { + // Set up read and write file descriptors for exchanging data + // between parent and child. 
+ fds_[ !pid_] = tmp_fds[ !pid_]; + fds_[!!pid_] = tmp_fds[2 + !!pid_]; + NOINTR_SYS(close( tmp_fds[ !!pid_])); + NOINTR_SYS(close( tmp_fds[2 + !pid_])); + + for (LibraryMap::iterator iter = libs_.begin(); iter != libs_.end(); ){ + Library* lib = &iter->second; + if (pid_) { + lib->recoverOriginalDataParent(this); + } else { + lib->recoverOriginalDataChild(strrchr(iter->first.c_str(), ' ') + 1); + } + if (pid_ && !lib->parseElf()) { + libs_.erase(iter++); + } else { + ++iter; + } + } + + // Handle requests sent from the parent to the child + if (!pid_) { + Request req; + for (;;) { + if (Sandbox::read(sys, fds_[0], &req, sizeof(Request)) != + sizeof(Request)) { + _exit(0); + } + switch (req.type) { + case Request::REQ_GET: + { + char *buf = new char[req.length]; + if (!req.library->get(req.offset, buf, req.length)) { + req.length = -1; + Sandbox::write(sys, fds_[1], &req.length,sizeof(req.length)); + } else { + Sandbox::write(sys, fds_[1], &req.length,sizeof(req.length)); + Sandbox::write(sys, fds_[1], buf, req.length); + } + delete[] buf; + } + break; + case Request::REQ_GET_STR: + { + std::string s = req.library->get(req.offset); + req.length = s.length(); + Sandbox::write(sys, fds_[1], &req.length, sizeof(req.length)); + Sandbox::write(sys, fds_[1], s.c_str(), req.length); + } + break; + } + } + } + } else { + for (int i = 0; i < 4; i++) { + NOINTR_SYS(close(tmp_fds[i])); + } + } + } +} + +Maps::~Maps() { + Sandbox::SysCalls sys; + sys.kill(pid_, SIGKILL); + sys.waitpid(pid_, NULL, 0); +} + +char *Maps::forwardGetRequest(Library *library, Elf_Addr offset, + char *buf, size_t length) const { + Request req(Request::REQ_GET, library, offset, length); + Sandbox::SysCalls sys; + if (Sandbox::write(sys, fds_[1], &req, sizeof(Request)) != sizeof(Request) || + Sandbox::read(sys, fds_[0], &req.length, sizeof(req.length)) != + sizeof(req.length) || + req.length == -1 || + Sandbox::read(sys, fds_[0], buf, length) != (ssize_t)length) { + memset(buf, 0, length); + return 
NULL; + } + return buf; +} + +std::string Maps::forwardGetRequest(Library *library, + Elf_Addr offset) const { + Request req(Request::REQ_GET_STR, library, offset, -1); + Sandbox::SysCalls sys; + if (Sandbox::write(sys, fds_[1], &req, sizeof(Request)) != sizeof(Request) || + Sandbox::read(sys, fds_[0], &req.length, sizeof(req.length)) != + sizeof(req.length)) { + return ""; + } + char *buf = new char[req.length]; + if (Sandbox::read(sys, fds_[0], buf, req.length) != (ssize_t)req.length) { + delete[] buf; + return ""; + } + std::string s(buf, req.length); + delete[] buf; + return s; +} + +Maps::Iterator::Iterator(Maps* maps, bool at_beginning, bool at_end) + : maps_(maps), + at_beginning_(at_beginning), + at_end_(at_end) { +} + +Maps::LibraryMap::iterator& Maps::Iterator::getIterator() const { + if (at_beginning_) { + iter_ = maps_->libs_.begin(); + } else if (at_end_) { + iter_ = maps_->libs_.end(); + } + return iter_; +} + +Maps::Iterator Maps::Iterator::begin() { + return maps_->begin_iter_; +} + +Maps::Iterator Maps::Iterator::end() { + return maps_->end_iter_; +} + +Maps::Iterator& Maps::Iterator::operator++() { + getIterator().operator++(); + at_beginning_ = false; + return *this; +} + +Maps::Iterator Maps::Iterator::operator++(int i) { + getIterator().operator++(i); + at_beginning_ = false; + return *this; +} + +Library* Maps::Iterator::operator*() const { + return &getIterator().operator*().second; +} + +bool Maps::Iterator::operator==(const Maps::Iterator& iter) const { + return getIterator().operator==(iter.getIterator()); +} + +bool Maps::Iterator::operator!=(const Maps::Iterator& iter) const { + return !operator==(iter); +} + +std::string Maps::Iterator::name() const { + return getIterator()->first; +} + +char* Maps::allocNearAddr(char* addr, size_t size, int prot) const { + // We try to allocate memory within 1.5GB of a target address. This means, + // we will be able to perform relative 32bit jumps from the target address. 
+ size = (size + 4095) & ~4095; + Sandbox::SysCalls sys; + int fd = sys.open(maps_file_.c_str(), O_RDONLY, 0); + if (fd < 0) { + return NULL; + } + + char buf[256] = { 0 }; + int len = 0, rc = 1; + bool long_line = false; + unsigned long gap_start = 0x10000; + char *new_addr; + do { + if (rc > 0) { + do { + rc = Sandbox::read(sys, fd, buf + len, sizeof(buf) - len - 1); + if (rc > 0) { + len += rc; + } + } while (rc > 0 && len < (int)sizeof(buf) - 1); + } + char *ptr = buf; + if (!long_line) { + long_line = true; + unsigned long start = strtoul(ptr, &ptr, 16); + unsigned long stop = strtoul(ptr + 1, &ptr, 16); + if (start - gap_start >= size) { + if (reinterpret_cast<long>(addr) - static_cast<long>(start) >= 0) { + if (reinterpret_cast<long>(addr) - (start - size) < (1536 << 20)) { + new_addr = reinterpret_cast<char *>(sys.MMAP + (reinterpret_cast<void *>(start - size), size, prot, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0)); + if (new_addr != MAP_FAILED) { + goto done; + } + } + } else if (gap_start + size - reinterpret_cast<long>(addr) < + (1536 << 20)) { + new_addr = reinterpret_cast<char *>(sys.MMAP + (reinterpret_cast<void *>(gap_start), size, prot, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1 ,0)); + if (new_addr != MAP_FAILED) { + goto done; + } + } + } + gap_start = stop; + } + for (;;) { + if (!*ptr || *ptr++ == '\n') { + long_line = false; + memmove(buf, ptr, len - (ptr - buf)); + memset(buf + len - (ptr - buf), 0, ptr - buf); + len -= (ptr - buf); + break; + } + } + } while (len || long_line); + new_addr = NULL; +done: + sys.close(fd); + return new_addr; +} + +} // namespace diff --git a/sandbox/linux/seccomp/maps.h b/sandbox/linux/seccomp/maps.h new file mode 100644 index 0000000..6b86555 --- /dev/null +++ b/sandbox/linux/seccomp/maps.h @@ -0,0 +1,105 @@ +#ifndef MAPS_H__ +#define MAPS_H__ + +#include <elf.h> +#include <string> +#include <vector> + +#if defined(__x86_64__) +typedef Elf64_Addr Elf_Addr; +#elif defined(__i386__) +typedef Elf32_Addr 
Elf_Addr; +#else +#error Undefined target platform +#endif + +namespace playground { + +class Library; +class Maps { + friend class Library; + public: + Maps(const std::string& maps_file); + ~Maps(); + + protected: + char *forwardGetRequest(Library *library, Elf_Addr offset, char *buf, + size_t length) const; + std::string forwardGetRequest(Library *library, Elf_Addr offset) const; + + // A map with all the libraries currently loaded into the application. + // The key is a unique combination of device number, inode number, and + // file name. It should be treated as opaque. + typedef std::map<std::string, Library> LibraryMap; + friend class Iterator; + class Iterator { + friend class Maps; + + protected: + explicit Iterator(Maps* maps); + Iterator(Maps* maps, bool at_beginning, bool at_end); + Maps::LibraryMap::iterator& getIterator() const; + + public: + Iterator begin(); + Iterator end(); + Iterator& operator++(); + Iterator operator++(int i); + Library* operator*() const; + bool operator==(const Iterator& iter) const; + bool operator!=(const Iterator& iter) const; + std::string name() const; + + protected: + mutable LibraryMap::iterator iter_; + Maps *maps_; + bool at_beginning_; + bool at_end_; + }; + + public: + typedef class Iterator const_iterator; + + const_iterator begin() { + return begin_iter_; + } + + const_iterator end() { + return end_iter_; + } + + char* allocNearAddr(char *addr, size_t size, int prot) const; + + char* vsyscall() const { return vsyscall_; } + + private: + struct Request { + enum Type { REQ_GET, REQ_GET_STR }; + + Request() { } + + Request(enum Type t, Library* i, Elf_Addr o, ssize_t l) : + library(i), offset(o), length(l), type(t), padding(0) { + } + + Library* library; + Elf_Addr offset; + ssize_t length; + enum Type type; + int padding; // for valgrind + }; + + protected: + const std::string maps_file_; + const Iterator begin_iter_; + const Iterator end_iter_; + + LibraryMap libs_; + pid_t pid_; + int fds_[2]; + char* vsyscall_; 
+}; + +} // namespace + +#endif // MAPS_H__ diff --git a/sandbox/linux/seccomp/mmap.cc b/sandbox/linux/seccomp/mmap.cc new file mode 100644 index 0000000..9ffd110 --- /dev/null +++ b/sandbox/linux/seccomp/mmap.cc @@ -0,0 +1,69 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +void* Sandbox::sandbox_mmap(void *start, size_t length, int prot, int flags, + int fd, off_t offset) { + Debug::syscall(__NR_mmap, "Executing handler"); + struct { + int sysnum; + long long cookie; + MMap mmap_req; + } __attribute__((packed)) request; + request.sysnum = __NR_MMAP; + request.cookie = cookie(); + request.mmap_req.start = start; + request.mmap_req.length = length; + request.mmap_req.prot = prot; + request.mmap_req.flags = flags; + request.mmap_req.fd = fd; + request.mmap_req.offset = offset; + + void* rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward mmap() request [sandbox]"); + } + return rc; +} + +bool Sandbox::process_mmap(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SysCalls sys; + MMap mmap_req; + if (read(sys, sandboxFd, &mmap_req, sizeof(mmap_req)) != sizeof(mmap_req)) { + die("Failed to read parameters for mmap() [process]"); + } + + if (mmap_req.flags & MAP_FIXED) { + // Cannot map a memory area that was part of the original memory mappings. 
+ void *stop = reinterpret_cast<void *>( + (char *)mmap_req.start + mmap_req.length); + ProtectedMap::const_iterator iter = protectedMap_.lower_bound( + (void *)mmap_req.start); + if (iter != protectedMap_.begin()) { + --iter; + } + for (; iter != protectedMap_.end() && iter->first < stop; ++iter) { + if (mmap_req.start < reinterpret_cast<void *>( + reinterpret_cast<char *>(iter->first) + iter->second) && + stop > iter->first) { + int rc = -EINVAL; + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } + } + } + + // All other mmap() requests are OK + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_MMAP, + mmap_req.start, mmap_req.length, mmap_req.prot, + mmap_req.flags, mmap_req.fd, mmap_req.offset); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/mprotect.cc b/sandbox/linux/seccomp/mprotect.cc new file mode 100644 index 0000000..1852b7d --- /dev/null +++ b/sandbox/linux/seccomp/mprotect.cc @@ -0,0 +1,66 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_mprotect(const void *addr, size_t len, int prot) { + Debug::syscall(__NR_mprotect, "Executing handler"); + struct { + int sysnum; + long long cookie; + MProtect mprotect_req; + } __attribute__((packed)) request; + request.sysnum = __NR_mprotect; + request.cookie = cookie(); + request.mprotect_req.addr = addr; + request.mprotect_req.len = len; + request.mprotect_req.prot = prot; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward mprotect() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_mprotect(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SysCalls sys; + MProtect mprotect_req; + if (read(sys, sandboxFd, &mprotect_req, sizeof(mprotect_req)) != + sizeof(mprotect_req)) { + 
die("Failed to read parameters for mprotect() [process]"); + } + + // Cannot change permissions on any memory region that was part of the + // original memory mappings. + int rc = -EINVAL; + void *stop = reinterpret_cast<void *>( + (char *)mprotect_req.addr + mprotect_req.len); + ProtectedMap::const_iterator iter = protectedMap_.lower_bound( + (void *)mprotect_req.addr); + if (iter != protectedMap_.begin()) { + --iter; + } + for (; iter != protectedMap_.end() && iter->first < stop; ++iter) { + if (mprotect_req.addr < reinterpret_cast<void *>( + reinterpret_cast<char *>(iter->first) + iter->second) && + stop > iter->first) { + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } + } + + // Changing permissions on memory regions that were newly mapped inside of + // the sandbox is OK. + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_mprotect, + mprotect_req.addr, mprotect_req.len, + mprotect_req.prot); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/munmap.cc b/sandbox/linux/seccomp/munmap.cc new file mode 100644 index 0000000..ddab897 --- /dev/null +++ b/sandbox/linux/seccomp/munmap.cc @@ -0,0 +1,64 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_munmap(void* start, size_t length) { + Debug::syscall(__NR_munmap, "Executing handler"); + struct { + int sysnum; + long long cookie; + MUnmap munmap_req; + } __attribute__((packed)) request; + request.sysnum = __NR_munmap; + request.cookie = cookie(); + request.munmap_req.start = start; + request.munmap_req.length = length; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward munmap() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_munmap(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + 
SysCalls sys; + MUnmap munmap_req; + if (read(sys, sandboxFd, &munmap_req, sizeof(munmap_req)) != + sizeof(munmap_req)) { + die("Failed to read parameters for munmap() [process]"); + } + + // Cannot unmap any memory region that was part of the original memory + // mappings. + int rc = -EINVAL; + void *stop = reinterpret_cast<void *>( + reinterpret_cast<char *>(munmap_req.start) + munmap_req.length); + ProtectedMap::const_iterator iter = protectedMap_.lower_bound( + munmap_req.start); + if (iter != protectedMap_.begin()) { + --iter; + } + for (; iter != protectedMap_.end() && iter->first < stop; ++iter) { + if (munmap_req.start < reinterpret_cast<void *>( + reinterpret_cast<char *>(iter->first) + iter->second) && + stop > iter->first) { + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } + } + + // Unmapping memory regions that were newly mapped inside of the sandbox + // is OK. + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_munmap, + munmap_req.start, munmap_req.length); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/mutex.h b/sandbox/linux/seccomp/mutex.h new file mode 100644 index 0000000..7729be6 --- /dev/null +++ b/sandbox/linux/seccomp/mutex.h @@ -0,0 +1,149 @@ +#ifndef MUTEX_H__ +#define MUTEX_H__ + +#include "sandbox_impl.h" + +namespace playground { + +class Mutex { + public: + typedef int mutex_t; + + enum { kInitValue = 0 }; + + static void initMutex(mutex_t* mutex) { + // Mutex is unlocked, and nobody is waiting for it + *mutex = kInitValue; + } + + static void unlockMutex(mutex_t* mutex) { + char status; + #if defined(__x86_64__) || defined(__i386__) + asm volatile( + "lock; addl %2, %0\n" + "setz %1" + : "=m"(*mutex), "=qm"(status) + : "ir"(0x80000000), "m"(*mutex)); + #else + #error Unsupported target platform + #endif + if (status) { + // Mutex is zero now. No other waiters. So, we can return. + return; + } + // We unlocked the mutex, but still need to wake up other waiters. 
// Acquires the mutex, sleeping in the kernel (futex) while it is held by
// somebody else.
//
// The mutex word uses bit 31 as the "locked" flag (it is tested and set with
// "btsl 31" below); the increments and decrements of the word keep track of
// the callers that are currently trying to acquire it.
//
// "timeout" is interpreted in milliseconds (it is split into tv_sec/tv_nsec
// below). Returns true on the path where the lock bit was found clear and
// was set by this call; returns false when the futex wait reported
// ETIMEDOUT. In both cases this caller's waiter increment is undone before
// returning.
//
// NOTE(review): when timeout == 0 a zero-valued timespec (not a NULL
// pointer) is handed to FUTEX_WAIT -- confirm whether an immediate timeout
// or an unbounded wait is intended for the default argument.
static bool lockMutex(mutex_t* mutex, int timeout = 0) {
  bool rc = true;
  // Increment mutex to add ourselves to the list of waiters
  #if defined(__x86_64__) || defined(__i386__)
  asm volatile(
      "lock; incl %0\n"
      : "=m"(*mutex)
      : "m"(*mutex));
  #else
  #error Unsupported target platform
  #endif
  for (;;) {
    // Atomically check whether the mutex is available and if so, acquire it
    // (status receives the previous value of bit 31 via the carry flag).
    char status;
    #if defined(__x86_64__) || defined(__i386__)
    asm volatile(
        "lock; btsl %3, %1\n"
        "setc %0"
        : "=q"(status), "=m"(*mutex)
        : "m"(*mutex), "ir"(31));
    #else
    #error Unsupported target platform
    #endif
    if (!status) {
    done:
      // If the mutex was available, remove ourselves from list of waiters.
      // The timeout path below also jumps here (with rc == false) so that
      // the waiter count is corrected on every exit.
      #if defined(__x86_64__) || defined(__i386__)
      asm volatile(
          "lock; decl %0\n"
          : "=m"(*mutex)
          : "m"(*mutex));
      #else
      #error Unsupported target platform
      #endif
      return rc;
    }
    // A non-negative value means bit 31 is clear again.
    int value = *mutex;
    if (value >= 0) {
      // Mutex has just become available, no need to call kernel
      continue;
    }
    Sandbox::SysCalls sys;
    Sandbox::SysCalls::kernel_timespec tm;
    if (timeout) {
      // Convert milliseconds into seconds + nanoseconds.
      tm.tv_sec = timeout / 1000;
      tm.tv_nsec = (timeout % 1000) * 1000 * 1000;
    } else {
      tm.tv_sec = 0;
      tm.tv_nsec = 0;
    }
    // FUTEX_WAIT is asked to sleep only while the word still equals the
    // value sampled above. Any result other than ETIMEDOUT loops around
    // and retries the acquisition.
    if (NOINTR_SYS(sys.futex(mutex, FUTEX_WAIT, value, &tm)) &&
        sys.my_errno == ETIMEDOUT) {
      rc = false;
      goto done;
    }
  }
}
+ #if defined(__x86_64__) || defined(__i386__) + asm volatile( + "lock; decl %0\n" + : "=m"(*mutex) + : "m"(*mutex)); + #else + #error Unsupported target platform + #endif + NOINTR_SYS(sys.futex(mutex, FUTEX_WAKE, 1, 0)); + return rc; + } + + // Wait for mutex to become unlocked + Sandbox::SysCalls::kernel_timespec tm; + if (timeout) { + tm.tv_sec = timeout / 1000; + tm.tv_nsec = (timeout % 1000) * 1000 * 1000; + } else { + tm.tv_sec = 0; + tm.tv_nsec = 0; + } + + if (NOINTR_SYS(sys.futex(mutex, FUTEX_WAIT, value, &tm)) && + sys.my_errno == ETIMEDOUT) { + rc = false; + goto done; + } + } + } + +}; + +} // namespace + +#endif // MUTEX_H__ diff --git a/sandbox/linux/seccomp/open.cc b/sandbox/linux/seccomp/open.cc new file mode 100644 index 0000000..9b4786b --- /dev/null +++ b/sandbox/linux/seccomp/open.cc @@ -0,0 +1,92 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_open(const char *pathname, int flags, mode_t mode) { + Debug::syscall(__NR_open, "Executing handler"); + size_t len = strlen(pathname); + struct Request { + int sysnum; + long long cookie; + Open open_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_open; + request->cookie = cookie(); + request->open_req.path_length = len; + request->open_req.flags = flags; + request->open_req.mode = mode; + memcpy(request->pathname, pathname, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward open() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_open(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SysCalls sys; + Open open_req; + if (read(sys, sandboxFd, &open_req, sizeof(open_req)) != 
sizeof(open_req)) { + read_parm_failed: + die("Failed to read parameters for open() [process]"); + } + int rc = -ENAMETOOLONG; + if (open_req.path_length >= sizeof(mem->pathname)) { + char buf[32]; + while (open_req.path_length > 0) { + size_t len = open_req.path_length > sizeof(buf) ? + sizeof(buf) : open_req.path_length; + ssize_t i = read(sys, sandboxFd, buf, len); + if (i <= 0) { + goto read_parm_failed; + } + open_req.path_length -= i; + } + if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to return data from open() [process]"); + } + return false; + } + + if ((open_req.flags & O_ACCMODE) != O_RDONLY) { + // After locking the mutex, we can no longer abandon the system call. So, + // perform checks before clobbering the securely shared memory. + char tmp[open_req.path_length]; + if (read(sys, sandboxFd, tmp, open_req.path_length) != + (ssize_t)open_req.path_length) { + goto read_parm_failed; + } + Debug::message(("Denying access to \"" + std::string(tmp) + "\"").c_str()); + SecureMem::abandonSystemCall(threadFd, -EACCES); + return false; + } + + SecureMem::lockSystemCall(parentProc, mem); + if (read(sys, sandboxFd, mem->pathname, open_req.path_length) != + (ssize_t)open_req.path_length) { + goto read_parm_failed; + } + mem->pathname[open_req.path_length] = '\000'; + + // TODO(markus): Implement sandboxing policy. For now, we allow read + // access to everything. That's probably not correct. + Debug::message(("Allowing access to \"" + std::string(mem->pathname) + + "\"").c_str()); + + // Tell trusted thread to open the file. 
+ SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, __NR_open, + mem->pathname - (char*)mem + (char*)mem->self, + open_req.flags, open_req.mode); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc new file mode 100644 index 0000000..0c3e499 --- /dev/null +++ b/sandbox/linux/seccomp/sandbox.cc @@ -0,0 +1,421 @@ +#include "library.h" +#include "sandbox_impl.h" +#include "syscall_table.h" + +namespace playground { + +// Global variables +int Sandbox::pid_; +int Sandbox::processFdPub_; +int Sandbox::cloneFdPub_; +Sandbox::ProtectedMap Sandbox::protectedMap_; +std::vector<SecureMem::Args*> Sandbox::secureMemPool_; + + +bool Sandbox::sendFd(int transport, int fd0, int fd1, const void* buf, + size_t len) { + int fds[2], count = 0; + if (fd0 >= 0) { fds[count++] = fd0; } + if (fd1 >= 0) { fds[count++] = fd1; } + if (!count) { + return false; + } + char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; + memset(cmsg_buf, 0, sizeof(cmsg_buf)); + struct SysCalls::kernel_iovec iov[2] = { { 0 } }; + struct SysCalls::kernel_msghdr msg = { 0 }; + int dummy = 0; + iov[0].iov_base = &dummy; + iov[0].iov_len = sizeof(dummy); + if (buf && len > 0) { + iov[1].iov_base = const_cast<void *>(buf); + iov[1].iov_len = len; + } + msg.msg_iov = iov; + msg.msg_iovlen = (buf && len > 0) ? 2 : 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = CMSG_LEN(count*sizeof(int)); + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(count*sizeof(int)); + memcpy(CMSG_DATA(cmsg), fds, count*sizeof(int)); + SysCalls sys; + return NOINTR_SYS(sys.sendmsg(transport, &msg, 0)) == + (ssize_t)(sizeof(dummy) + ((buf && len > 0) ? 
len : 0)); +} + +bool Sandbox::getFd(int transport, int* fd0, int* fd1, void* buf, size_t*len) { + int count = 0; + int *err = NULL; + if (fd0) { + count++; + err = fd0; + *fd0 = -1; + } + if (fd1) { + if (!count++) { + err = fd1; + } + *fd1 = -1; + } + if (!count) { + return false; + } + char cmsg_buf[CMSG_SPACE(count*sizeof(int))]; + memset(cmsg_buf, 0, sizeof(cmsg_buf)); + struct SysCalls::kernel_iovec iov[2] = { { 0 } }; + struct SysCalls::kernel_msghdr msg = { 0 }; + iov[0].iov_base = err; + iov[0].iov_len = sizeof(int); + if (buf && len && *len > 0) { + iov[1].iov_base = buf; + iov[1].iov_len = *len; + } + msg.msg_iov = iov; + msg.msg_iovlen = (buf && len && *len > 0) ? 2 : 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = CMSG_LEN(count*sizeof(int)); + SysCalls sys; + ssize_t bytes = NOINTR_SYS(sys.recvmsg(transport, &msg, 0)); + if (len) { + *len = bytes > (int)sizeof(int) ? + bytes - sizeof(int) : 0; + } + if (bytes != (ssize_t)(sizeof(int) + ((buf && len && *len > 0) ? *len : 0))){ + *err = bytes >= 0 ? 0 : -EBADF; + return false; + } + if (*err) { + // "err" is the first four bytes of the payload. If these are non-zero, + // the sender on the other side of the socketpair sent us an errno value. + // We don't expect to get any file handles in this case. 
+ return false; + } + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if ((msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) || + !cmsg || + cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS || + cmsg->cmsg_len != CMSG_LEN(count*sizeof(int))) { + *err = -EBADF; + return false; + } + if (fd1) { *fd1 = ((int *)CMSG_DATA(cmsg))[--count]; } + if (fd0) { *fd0 = ((int *)CMSG_DATA(cmsg))[--count]; } + return true; +} + +void Sandbox::setupSignalHandlers() { + SysCalls sys; + struct SysCalls::kernel_sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler_ = SIG_DFL; + sys.sigaction(SIGCHLD, &sa, NULL); + + // Set up SEGV handler for dealing with RDTSC instructions + sa.sa_handler_ = segv(); + sys.sigaction(SIGSEGV, &sa, NULL); + + // Block all asynchronous signals, except for SIGCHLD which needs to be + // set to SIG_DFL for waitpid() to work. + SysCalls::kernel_sigset_t mask; + memset(&mask, 0xFF, sizeof(mask)); + mask.sig[0] &= ~((1 << (SIGSEGV - 1)) | (1 << (SIGINT - 1)) | + (1 << (SIGTERM - 1)) | (1 << (SIGQUIT - 1)) | + (1 << (SIGHUP - 1)) | (1 << (SIGABRT - 1)) | + (1 << (SIGCHLD - 1))); + sys.sigprocmask(SIG_SETMASK, &mask, 0); +} + +void (*Sandbox::segv())(int signo) { + void (*fnc)(int signo); + asm volatile( + "call 999f\n" +#if defined(__x86_64__) + // Inspect instruction at the point where the segmentation fault + // happened. If it is RDTSC, forward the request to the trusted + // thread. 
+ "mov $-3, %%r14\n" // request for RDTSC + "mov 0xB0(%%rsp), %%r15\n" // %rip at time of segmentation fault + "cmpw $0x310F, (%%r15)\n" // RDTSC + "jz 0f\n" + "cmpw $0x010F, (%%r15)\n" // RDTSCP + "jnz 8f\n" + "cmpb $0xF9, 2(%%r15)\n" + "jnz 8f\n" + "mov $-4, %%r14\n" // request for RDTSCP + "0:" +#ifndef NDEBUG + "lea 100f(%%rip), %%rdi\n" + "call playground$debugMessage\n" +#endif + "sub $4, %%rsp\n" + "push %%r14\n" + "mov %%gs:16, %%edi\n" // fd = threadFdPub + "mov %%rsp, %%rsi\n" // buf = %esp + "mov $4, %%edx\n" // len = sizeof(int) + "1:mov $1, %%eax\n" // NR_write + "syscall\n" + "cmp %%rax, %%rdx\n" + "jz 5f\n" + "cmp $-4, %%eax\n" // EINTR + "jz 1b\n" + "2:add $12, %%rsp\n" + "movq $0, 0x98(%%rsp)\n" // %rax at time of segmentation fault + "movq $0, 0x90(%%rsp)\n" // %rdx at time of segmentation fault + "cmpw $0x310F, (%%r15)\n" // RDTSC + "jz 3f\n" + "movq $0, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault + "3:addq $2, 0xB0(%%rsp)\n" // %rip at time of segmentation fault + "cmpw $0x010F, (%%r15)\n" // RDTSC + "jnz 4f\n" + "addq $1, 0xB0(%%rsp)\n" // %rip at time of segmentation fault + "4:ret\n" + "5:mov $12, %%edx\n" // len = 3*sizeof(int) + "6:mov $0, %%eax\n" // NR_read + "syscall\n" + "cmp $-4, %%eax\n" // EINTR + "jz 6b\n" + "cmp %%rax, %%rdx\n" + "jnz 2b\n" + "mov 0(%%rsp), %%eax\n" + "mov 4(%%rsp), %%edx\n" + "mov 8(%%rsp), %%ecx\n" + "add $12, %%rsp\n" + "mov %%rdx, 0x90(%%rsp)\n" // %rdx at time of segmentation fault + "cmpw $0x310F, (%%r15)\n" // RDTSC + "jz 7f\n" + "mov %%rcx, 0xA0(%%rsp)\n" // %rcx at time of segmentation fault + "7:mov %%rax, 0x98(%%rsp)\n" // %rax at time of segmentation fault + "jmp 3b\n" + + // If the instruction is INT 0, then this was probably the result + // of playground::Library being unable to find a way to safely + // rewrite the system call instruction. Retrieve the CPU register + // at the time of the segmentation fault and invoke syscallWrapper(). 
+ "8:cmpw $0xCD, (%%r15)\n" // INT $0x0 + "jnz 9f\n" +#ifndef NDEBUG + "lea 200f(%%rip), %%rdi\n" + "call playground$debugMessage\n" +#endif + "mov 0x98(%%rsp), %%rax\n" // %rax at time of segmentation fault + "mov 0x70(%%rsp), %%rdi\n" // %rdi at time of segmentation fault + "mov 0x78(%%rsp), %%rsi\n" // %rsi at time of segmentation fault + "mov 0x90(%%rsp), %%rdx\n" // %rdx at time of segmentation fault + "mov 0x40(%%rsp), %%r10\n" // %r10 at time of segmentation fault + "mov 0x30(%%rsp), %%r8\n" // %r8 at time of segmentation fault + "mov 0x38(%%rsp), %%r9\n" // %r9 at time of segmentation fault + "lea 7b(%%rip), %%rcx\n" + "push %%rcx\n" + "push 0xB8(%%rsp)\n" // %rip at time of segmentation fault + "lea playground$syscallWrapper(%%rip), %%rcx\n" + "jmp *%%rcx\n" + + // This was a genuine segmentation fault. Trigger the kernel's default + // signal disposition. The only way we can do this from seccomp mode + // is by blocking the signal and retriggering it. + "9:mov $2, %%edi\n" // stderr + "lea 300f(%%rip), %%rsi\n" // "Segmentation fault\n" + "mov $301f-300f, %%edx\n" + "mov $1, %%eax\n" // NR_write + "syscall\n" + "orb $4, 0x131(%%rsp)\n" // signal mask at time of segmentation fault + "ret\n" +#elif defined(__i386__) + // Inspect instruction at the point where the segmentation fault + // happened. If it is RDTSC, forward the request to the trusted + // thread. 
+ "mov $-3, %%ebx\n" // request for RDTSC + "mov 0x40(%%esp), %%ebp\n" // %eip at time of segmentation fault + "cmpw $0x310F, (%%ebp)\n" // RDTSC + "jz 0f\n" + "cmpw $0x010F, (%%ebp)\n" + "jnz 8f\n" + "cmpb $0xF9, 2(%%ebp)\n" + "jnz 8f\n" + "mov $-4, %%ebx\n" // request for RDTSCP + "0:" +#ifndef NDEBUG + "lea 100f, %%eax\n" + "push %%eax\n" + "call playground$debugMessage\n" + "sub $4, %%esp\n" +#else + "sub $8, %%esp\n" +#endif + "push %%ebx\n" + "mov %%fs:16, %%ebx\n" // fd = threadFdPub + "mov %%esp, %%ecx\n" // buf = %esp + "mov $4, %%edx\n" // len = sizeof(int) + "1:mov %%edx, %%eax\n" // NR_write + "int $0x80\n" + "cmp %%eax, %%edx\n" + "jz 5f\n" + "cmp $-4, %%eax\n" // EINTR + "jz 1b\n" + "2:add $12, %%esp\n" + "movl $0, 0x34(%%esp)\n" // %eax at time of segmentation fault + "movl $0, 0x2C(%%esp)\n" // %edx at time of segmentation fault + "cmpw $0x310F, (%%ebp)\n" // RDTSC + "jz 3f\n" + "movl $0, 0x30(%%esp)\n" // %ecx at time of segmentation fault + "3:addl $2, 0x40(%%esp)\n" // %eip at time of segmentation fault + "mov 0x40(%%esp), %%ebp\n" // %eip at time of segmentation fault + "cmpw $0x010F, (%%ebp)\n" // RDTSC + "jnz 4f\n" + "addl $1, 0x40(%%esp)\n" // %eip at time of segmentation fault + "4:ret\n" + "5:mov $12, %%edx\n" // len = 3*sizeof(int) + "6:mov $3, %%eax\n" // NR_read + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 6b\n" + "cmp %%eax, %%edx\n" + "jnz 2b\n" + "pop %%eax\n" + "pop %%edx\n" + "pop %%ecx\n" + "mov %%edx, 0x2C(%%esp)\n" // %edx at time of segmentation fault + "cmpw $0x310F, (%%ebp)\n" // RDTSC + "jz 7f\n" + "mov %%ecx, 0x30(%%esp)\n" // %ecx at time of segmentation fault + "7:mov %%eax, 0x34(%%esp)\n" // %eax at time of segmentation fault + "jmp 3b\n" + + // If the instruction is INT 0, then this was probably the result + // of playground::Library being unable to find a way to safely + // rewrite the system call instruction. Retrieve the CPU register + // at the time of the segmentation fault and invoke syscallWrapper(). 
+ "8:cmpw $0xCD, (%%ebp)\n" // INT $0x0 + "jnz 9f\n" +#ifndef NDEBUG + "lea 200f, %%eax\n" + "push %%eax\n" + "call playground$debugMessage\n" + "add $0x4, %%esp\n" +#endif + "mov 0x34(%%esp), %%eax\n" // %eax at time of segmentation fault + "mov 0x28(%%esp), %%ebx\n" // %ebx at time of segmentation fault + "mov 0x30(%%esp), %%ecx\n" // %ecx at time of segmentation fault + "mov 0x2C(%%esp), %%edx\n" // %edx at time of segmentation fault + "mov 0x1C(%%esp), %%esi\n" // %esi at time of segmentation fault + "mov 0x18(%%esp), %%edi\n" // %edi at time of segmentation fault + "mov 0x20(%%esp), %%ebp\n" // %ebp at time of segmentation fault + "call playground$syscallWrapper\n" + "jmp 7b\n" + + // This was a genuine segmentation fault. Trigger the kernel's default + // signal disposition. The only way we can do this from seccomp mode + // is by blocking the signal and retriggering it. + "9:mov $2, %%ebx\n" // stderr + "lea 300f, %%ecx\n" // "Segmentation fault\n" + "mov $301f-300f, %%edx\n" + "mov $4, %%eax\n" // NR_write + "int $0x80\n" + "orb $4, 0x59(%%esp)\n" // signal mask at time of segmentation fault + "ret\n" +#else +#error Unsupported target platform +#endif + ".pushsection \".rodata\"\n" +#ifndef NDEBUG + "100:.asciz \"RDTSC(P): Executing handler\\n\"\n" + "200:.asciz \"INT $0x0: Executing handler\\n\"\n" +#endif + "300:.ascii \"Segmentation fault\\n\"\n" + "301:\n" + ".popsection\n" + "999:pop %0\n" + : "=g"(fnc) + ); + return fnc; +} + +void Sandbox::snapshotMemoryMappings(int processFd) { + SysCalls sys; + int mapsFd = sys.open("/proc/self/maps", O_RDONLY, 0); + if (mapsFd < 0 || !sendFd(processFd, mapsFd, -1, NULL, NULL)) { + failure: + die("Cannot access /proc/self/maps"); + } + NOINTR_SYS(sys.close(mapsFd)); + int dummy; + if (read(sys, processFd, &dummy, sizeof(dummy)) != sizeof(dummy)) { + goto failure; + } +} + +void Sandbox::startSandbox() { + SysCalls sys; + + // The pid is unchanged for the entire program, so we can retrieve it once + // and store it 
in a global variable. + pid_ = sys.getpid(); + + // Block all signals, except for the RDTSC handler + setupSignalHandlers(); + + // Get socketpairs for talking to the trusted process + int pair[4]; + if (socketpair(AF_UNIX, SOCK_STREAM, 0, pair) || + socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) { + die("Failed to create trusted thread"); + } + processFdPub_ = pair[0]; + cloneFdPub_ = pair[2]; + SecureMemArgs::Args* secureMem = createTrustedProcess(pair[0], pair[1], + pair[2], pair[3]); + + // We find all libraries that have system calls and redirect the system + // calls to the sandbox. If we miss any system calls, the application will be + // terminated by the kernel's seccomp code. So, from a security point of + // view, if this code fails to identify system calls, we are still behaving + // correctly. + { + Maps maps("/proc/self/maps"); + const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL }; + + // Intercept system calls in the VDSO segment (if any). This has to happen + // before intercepting system calls in any of the other libraries, as + // the main kernel entry point might be inside of the VDSO and we need to + // determine its address before we can compare it to jumps from inside + // other libraries. + for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){ + Library* library = *iter; + if (library->isVDSO()) { + library->makeWritable(true); + library->patchSystemCalls(); + library->makeWritable(false); + break; + } + } + + // Intercept system calls in libraries that are known to have them. 
+ for (Maps::const_iterator iter = maps.begin(); iter != maps.end(); ++iter){ + Library* library = *iter; + for (const char **ptr = libs; *ptr; ptr++) { + char *name = strstr(iter.name().c_str(), *ptr); + if (name) { + char ch = name[strlen(*ptr)]; + if (ch < 'A' || (ch > 'Z' && ch < 'a') || ch > 'z') { + library->makeWritable(true); + library->patchSystemCalls(); + library->makeWritable(false); + break; + } + } + } + } + } + + // Take a snapshot of the current memory mappings. These mappings will be + // off-limits to all future mmap(), munmap(), mremap(), and mprotect() calls. + snapshotMemoryMappings(processFdPub_); + + // Creating the trusted thread enables sandboxing + createTrustedThread(processFdPub_, cloneFdPub_, secureMem); +} + +} // namespace diff --git a/sandbox/linux/seccomp/sandbox.h b/sandbox/linux/seccomp/sandbox.h new file mode 100644 index 0000000..959156b --- /dev/null +++ b/sandbox/linux/seccomp/sandbox.h @@ -0,0 +1,6 @@ +#ifndef SANDBOX_H__ +#define SANDBOX_H__ + +extern "C" void StartSeccompSandbox(); + +#endif // SANDBOX_H__ diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h new file mode 100644 index 0000000..3edb8c9 --- /dev/null +++ b/sandbox/linux/seccomp/sandbox_impl.h @@ -0,0 +1,621 @@ +#ifndef SANDBOX_IMPL_H__ +#define SANDBOX_IMPL_H__ + +#include <asm/ldt.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/futex.h> +#include <linux/prctl.h> +#include <linux/unistd.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <sched.h> +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/ptrace.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> + +#define NOINTR_SYS(x) \ + ({ typeof(x) i__; while ((i__ = (x)) < 0 && sys.my_errno == EINTR); 
i__;}) + +#ifdef __cplusplus +#include <iostream> +#include <map> +#include <vector> +#include "sandbox.h" +#include "securemem.h" +#include "tls.h" + +namespace playground { + +class Sandbox { + // TODO(markus): restrict access to our private file handles + public: + enum { kMaxThreads = 100 }; + + // This is the main public entry point. It finds all system calls that + // need rewriting, sets up the resources needed by the sandbox, and + // enters Seccomp mode. + static void startSandbox() asm("StartSeccompSandbox"); + + private: +// syscall_table.c has to be implemented in C, as C++ does not support +// designated initializers for arrays. The only other alternative would be +// to have a source code generator for this table. +// +// We would still like the C source file to include our header file. This +// requires some define statements to transform C++ specific constructs to +// something that is palatable to a C compiler. +#define STATIC static +#define SecureMemArgs SecureMem::Args + // Clone() is special as it has a wrapper in syscall_table.c. The wrapper + // adds one extra argument (the pointer to the saved registers) and then + // calls playground$sandbox__clone(). + static int sandbox_clone(int flags, void* stack, int* pid, int* ctid, + void* tls, void* wrapper_sp) + asm("playground$sandbox__clone"); +#else +#define STATIC +#define bool int +#define SecureMemArgs void + // This is the wrapper entry point that is found in the syscall_table. 
+ int sandbox_clone(int flags, void* stack, int* pid, int* ctid, void* tls) + asm("playground$sandbox_clone"); +#endif + + // Entry points for sandboxed code that is attempting to make system calls + STATIC int sandbox_access(const char*, int) + asm("playground$sandbox_access"); + STATIC int sandbox_exit(int status) asm("playground$sandbox_exit"); + STATIC int sandbox_getpid() asm("playground$sandbox_getpid"); + #if defined(__NR_getsockopt) + STATIC int sandbox_getsockopt(int, int, int, void*, socklen_t*) + asm("playground$sandbox_getsockopt"); + #endif + STATIC int sandbox_gettid() asm("playground$sandbox_gettid"); + STATIC int sandbox_ioctl(int d, int req, void* arg) + asm("playground$sandbox_ioctl"); + #if defined(__NR_ipc) + STATIC int sandbox_ipc(unsigned, int, int, int, void*, long) + asm("playground$sandbox_ipc"); + #endif + STATIC int sandbox_madvise(void*, size_t, int) + asm("playground$sandbox_madvise"); + STATIC void *sandbox_mmap(void* start, size_t length, int prot, int flags, + int fd, off_t offset) + asm("playground$sandbox_mmap"); + STATIC int sandbox_mprotect(const void*, size_t, int) + asm("playground$sandbox_mprotect"); + STATIC int sandbox_munmap(void* start, size_t length) + asm("playground$sandbox_munmap"); + STATIC int sandbox_open(const char*, int, mode_t) + asm("playground$sandbox_open"); + #if defined(__NR_recvfrom) + STATIC ssize_t sandbox_recvfrom(int, void*, size_t, int, void*, socklen_t*) + asm("playground$sandbox_recvfrom"); + STATIC ssize_t sandbox_recvmsg(int, struct msghdr*, int) + asm("playground$sandbox_recvmsg"); + STATIC size_t sandbox_sendmsg(int, const struct msghdr*, int) + asm("playground$sandbox_sendmsg"); + STATIC ssize_t sandbox_sendto(int, const void*, size_t, int, const void*, + socklen_t)asm("playground$sandbox_sendto"); + #if defined(__NR_shmat) + STATIC void* sandbox_shmat(int, const void*, int) + asm("playground$sandbox_shmat"); + STATIC int sandbox_shmctl(int, int, void*) + asm("playground$sandbox_shmctl"); + 
STATIC int sandbox_shmdt(const void*) asm("playground$sandbox_shmdt"); + STATIC int sandbox_shmget(int, size_t, int) + asm("playground$sandbox_shmget"); + #endif + STATIC int sandbox_setsockopt(int, int, int, const void*, socklen_t) + asm("playground$sandbox_setsockopt"); + #endif + #if defined(__NR_socketcall) + STATIC int sandbox_socketcall(int call, void* args) + asm("playground$sandbox_socketcall"); + #endif + STATIC int sandbox_stat(const char* path, void* buf) + asm("playground$sandbox_stat"); + #if defined(__NR_stat64) + STATIC int sandbox_stat64(const char *path, void* b) + asm("playground$sandbox_stat64"); + #endif + + // Functions for system calls that need to be handled in the trusted process + STATIC bool process_access(int, int, int, int, SecureMemArgs*) + asm("playground$process_access"); + STATIC bool process_clone(int, int, int, int, SecureMemArgs*) + asm("playground$process_clone"); + STATIC bool process_exit(int, int, int, int, SecureMemArgs*) + asm("playground$process_exit"); + #if defined(__NR_getsockopt) + STATIC bool process_getsockopt(int, int, int, int, SecureMemArgs*) + asm("playground$process_getsockopt"); + #endif + STATIC bool process_ioctl(int, int, int, int, SecureMemArgs*) + asm("playground$process_ioctl"); + #if defined(__NR_ipc) + STATIC bool process_ipc(int, int, int, int, SecureMemArgs*) + asm("playground$process_ipc"); + #endif + STATIC bool process_madvise(int, int, int, int, SecureMemArgs*) + asm("playground$process_madvise"); + STATIC bool process_mmap(int, int, int, int, SecureMemArgs*) + asm("playground$process_mmap"); + STATIC bool process_mprotect(int, int, int, int, SecureMemArgs*) + asm("playground$process_mprotect"); + STATIC bool process_munmap(int, int, int, int, SecureMemArgs*) + asm("playground$process_munmap"); + STATIC bool process_open(int, int, int, int, SecureMemArgs*) + asm("playground$process_open"); + #if defined(__NR_recvfrom) + STATIC bool process_recvfrom(int, int, int, int, SecureMemArgs*) + 
asm("playground$process_recvfrom"); + STATIC bool process_recvmsg(int, int, int, int, SecureMemArgs*) + asm("playground$process_recvmsg"); + STATIC bool process_sendmsg(int, int, int, int, SecureMemArgs*) + asm("playground$process_sendmsg"); + STATIC bool process_sendto(int, int, int, int, SecureMemArgs*) + asm("playground$process_sendto"); + STATIC bool process_setsockopt(int, int, int, int, SecureMemArgs*) + asm("playground$process_setsockopt"); + #endif + #if defined(__NR_shmat) + STATIC bool process_shmat(int, int, int, int, SecureMemArgs*) + asm("playground$process_shmat"); + STATIC bool process_shmctl(int, int, int, int, SecureMemArgs*) + asm("playground$process_shmctl"); + STATIC bool process_shmdt(int, int, int, int, SecureMemArgs*) + asm("playground$process_shmdt"); + STATIC bool process_shmget(int, int, int, int, SecureMemArgs*) + asm("playground$process_shmget"); + #endif + #if defined(__NR_socketcall) + STATIC bool process_socketcall(int, int, int, int, SecureMemArgs*) + asm("playground$process_socketcall"); + #endif + STATIC bool process_stat(int, int, int, int, SecureMemArgs*) + asm("playground$process_stat"); + +#ifdef __cplusplus + friend class Debug; + friend class Library; + friend class Maps; + friend class Mutex; + friend class SecureMem; + friend class TLS; + + // Define our own inline system calls. These calls will not be rewritten + // to point to the sandboxed wrapper functions. They thus allow us to + // make actual system calls (e.g. 
in the sandbox initialization code, and + // in the trusted process) + class SysCalls { + public: + #define SYS_CPLUSPLUS + #define SYS_ERRNO my_errno + #define SYS_INLINE inline + #define SYS_PREFIX -1 + #undef SYS_LINUX_SYSCALL_SUPPORT_H + #include "linux_syscall_support.h" + SysCalls() : my_errno(0) { } + int my_errno; + }; + #ifdef __NR_mmap2 + #define MMAP mmap2 + #define __NR_MMAP __NR_mmap2 + #else + #define MMAP mmap + #define __NR_MMAP __NR_mmap + #endif + + // Print an error message and terminate the program. Used for fatal errors. + static void die(const char *msg = 0) __attribute__((noreturn)) { + SysCalls sys; + if (msg) { + sys.write(2, msg, strlen(msg)); + sys.write(2, "\n", 1); + } + for (;;) { + sys.exit_group(1); + sys._exit(1); + } + } + + // Wrapper around "read()" that can deal with partial and interrupted reads + // and that does not modify the global errno variable. + static ssize_t read(SysCalls& sys, int fd, void* buf, size_t len) { + if (len < 0) { + sys.my_errno = EINVAL; + return -1; + } + size_t offset = 0; + while (offset < len) { + ssize_t partial = + NOINTR_SYS(sys.read(fd, reinterpret_cast<char*>(buf) + offset, + len - offset)); + if (partial < 0) { + return partial; + } else if (!partial) { + break; + } + offset += partial; + } + return offset; + } + + // Wrapper around "write()" that can deal with interrupted writes and that + // does not modify the global errno variable. + static ssize_t write(SysCalls& sys, int fd, const void* buf, size_t len){ + return NOINTR_SYS(sys.write(fd, buf, len)); + } + + // Sends a file handle to another process. + static bool sendFd(int transport, int fd0, int fd1, const void* buf, + size_t len) asm("playground$sendFd"); + + // If getFd() fails, it will set the first valid fd slot (e.g. fd0) to + // -errno. + static bool getFd(int transport, int* fd0, int* fd1, void* buf, + size_t* len); + + // Data structures used to forward system calls to the trusted process. 
+  struct Accept {  // fields named like the arguments of accept(); member of the SocketCall args union
+    int        sockfd;
+    void*      addr;
+    socklen_t* addrlen;
+  } __attribute__((packed));  // packed: no padding between members
+
+  struct Accept4 {  // same fields as Accept plus "flags"; also a member of the SocketCall args union
+    int        sockfd;
+    void*      addr;
+    socklen_t* addrlen;
+    int        flags;
+  } __attribute__((packed));
+
+  struct Access {  // header of an access() request; sandbox_access() appends the path bytes after it
+    size_t path_length;  // number of pathname bytes that follow (strlen() of the path, no NUL)
+    int    mode;  // the "mode" argument passed to sandbox_access()
+  } __attribute__((packed));
+
+  struct Bind {  // member of the SocketCall args union
+    int       sockfd;
+    void*     addr;
+    socklen_t addrlen;
+  } __attribute__((packed));
+
+  struct Clone {  // first five fields match the leading parameters of sandbox_clone()
+    int       flags;
+    void*     stack;
+    int*      pid;
+    int*      ctid;
+    void*     tls;
+    #if defined(__x86_64__)
+      struct {  // NOTE(review): presumably the registers saved by the clone wrapper - confirm in syscall_table.c
+        void* r15;
+        void* r14;
+        void* r13;
+        void* r12;
+        void* r11;
+        void* r10;
+        void* r9;
+        void* r8;
+        void* rdi;
+        void* rsi;
+        void* rdx;
+        void* rcx;
+        void* rbx;
+        void* rbp;
+        void* fake_ret;
+      } regs64 __attribute__((packed));
+    #elif defined(__i386__)
+      struct {  // NOTE(review): i386 counterpart of regs64 above; same caveat applies
+        void* ebp;
+        void* edi;
+        void* esi;
+        void* edx;
+        void* ecx;
+        void* ebx;
+        void* ret2;
+      } regs32 __attribute__((packed));
+    #else
+    #error Unsupported target platform
+    #endif
+    void*     ret;  // NOTE(review): looks like the wrapper's return address - verify against the clone handler
+  } __attribute__((packed));
+
+  struct Connect {  // member of the SocketCall args union
+    int       sockfd;
+    void*     addr;
+    socklen_t addrlen;
+  } __attribute__((packed));
+
+  struct GetSockName {  // member of the SocketCall args union
+    int        sockfd;
+    void*      name;
+    socklen_t* namelen;
+  } __attribute__((packed));
+
+  struct GetPeerName {  // member of the SocketCall args union
+    int        sockfd;
+    void*      name;
+    socklen_t* namelen;
+  } __attribute__((packed));
+
+  struct GetSockOpt {  // field types match the parameter list of sandbox_getsockopt()
+    int        sockfd;
+    int        level;
+    int        optname;
+    void*      optval;
+    socklen_t* optlen;
+  } __attribute__((packed));
+
+  struct IOCtl {  // fields match the parameters of sandbox_ioctl(d, req, arg)
+    int  d;
+    int  req;
+    void *arg;
+  } __attribute__((packed));
+
+  #if defined(__NR_ipc)
+  struct IPC {  // field types match the parameter list of sandbox_ipc(); only built where __NR_ipc exists
+    unsigned call;
+    int      first;
+    int      second;
+    int      third;
+    void*    ptr;
+    long     fifth;
+  } __attribute__((packed));
+  #endif
+
+  struct Listen {  // member of the SocketCall args union
+    int sockfd;
+    int backlog;
+  } __attribute__((packed));
+
+  struct MAdvise {  // field types match the parameter list of sandbox_madvise()
+    const void*  start;
+    size_t       len;
+    int          advice;
+  } __attribute__((packed));
+
+  struct MMap {  // fields match the parameters of sandbox_mmap(start, length, prot, flags, fd, offset)
+    void*  start;
+    size_t length;
+    int    prot;
+    int    flags;
+    int    fd;
+    off_t  offset;
+  } __attribute__((packed));
+
+  
struct MProtect { + const void* addr; + size_t len; + int prot; + }; + + struct MUnmap { + void* start; + size_t length; + } __attribute__((packed)); + + struct Open { + size_t path_length; + int flags; + mode_t mode; + } __attribute__((packed)); + + struct Recv { + int sockfd; + void* buf; + size_t len; + int flags; + } __attribute__((packed)); + + struct RecvFrom { + int sockfd; + void* buf; + size_t len; + int flags; + void* from; + socklen_t *fromlen; + } __attribute__((packed)); + + struct RecvMsg { + int sockfd; + struct msghdr* msg; + int flags; + } __attribute__((packed)); + + struct Send { + int sockfd; + const void* buf; + size_t len; + int flags; + } __attribute__((packed)); + + struct SendMsg { + int sockfd; + const struct msghdr* msg; + int flags; + } __attribute__((packed)); + + struct SendTo { + int sockfd; + const void* buf; + size_t len; + int flags; + const void* to; + socklen_t tolen; + } __attribute__((packed)); + + struct SetSockOpt { + int sockfd; + int level; + int optname; + const void* optval; + socklen_t optlen; + } __attribute__((packed)); + + #if defined(__NR_shmat) + struct ShmAt { + int shmid; + const void* shmaddr; + int shmflg; + } __attribute__((packed)); + + struct ShmCtl { + int shmid; + int cmd; + void *buf; + } __attribute__((packed)); + + struct ShmDt { + const void *shmaddr; + } __attribute__((packed)); + + struct ShmGet { + int key; + size_t size; + int shmflg; + } __attribute__((packed)); + #endif + + struct ShutDown { + int sockfd; + int how; + } __attribute__((packed)); + + struct Socket { + int domain; + int type; + int protocol; + } __attribute__((packed)); + + struct SocketPair { + int domain; + int type; + int protocol; + int* pair; + } __attribute__((packed)); + + #if defined(__NR_socketcall) + struct SocketCall { + int call; + void* arg_ptr; + union { + Socket socket; + Bind bind; + Connect connect; + Listen listen; + Accept accept; + GetSockName getsockname; + GetPeerName getpeername; + SocketPair socketpair; + Send 
send; + Recv recv; + SendTo sendto; + RecvFrom recvfrom; + ShutDown shutdown; + SetSockOpt setsockopt; + GetSockOpt getsockopt; + SendMsg sendmsg; + RecvMsg recvmsg; + Accept4 accept4; + } args; + } __attribute__((packed)); + #endif + + struct Stat { + int sysnum; + size_t path_length; + void* buf; + } __attribute__((packed)); + + // Thread local data available from each sandboxed thread. + enum { TLS_COOKIE, TLS_TID, TLS_THREAD_FD }; + static long long cookie() { return TLS::getTLSValue<long long>(TLS_COOKIE); } + static int tid() { return TLS::getTLSValue<int>(TLS_TID); } + static int threadFdPub() { return TLS::getTLSValue<int>(TLS_THREAD_FD); } + static int processFdPub() { return processFdPub_; } + + // The SEGV handler knows how to handle RDTSC instructions + static void setupSignalHandlers(); + static void (*segv())(int signo); + + // If no specific handler has been registered for a system call, call this + // function which asks the trusted thread to perform the call. This is used + // for system calls that are not restricted. + static void* defaultSystemCallHandler(int syscallNum, void* arg0, + void* arg1, void* arg2, void* arg3, + void* arg4, void* arg5) + asm("playground$defaultSystemCallHandler"); + + // Return a secure memory structure that can be used by a newly created + // thread. + static SecureMem::Args* getSecureMem(); + + // This functions runs in the trusted process at startup and finds all the + // memory mappings that existed when the sandbox was first enabled. Going + // forward, all these mappings are off-limits for operations such as + // mmap(), munmap(), and mprotect(). + static void initializeProtectedMap(int fd); + + // Helper functions that allows the trusted process to get access to + // "/proc/self/maps" in the sandbox. + static void snapshotMemoryMappings(int processFd); + + // Main loop for the trusted process. 
+ static void trustedProcess(int parentProc, int processFdPub, int sandboxFd, + int cloneFd, SecureMem::Args* secureArena) + __attribute__((noreturn)); + + // Fork()s of the trusted process. + static SecureMem::Args* createTrustedProcess(int processFdPub, int sandboxFd, + int cloneFdPub, int cloneFd); + + // Creates the trusted thread for the initial thread, then enables + // Seccomp mode. + static void createTrustedThread(int processFdPub, int cloneFdPub, + SecureMem::Args* secureMem); + + static int pid_; + static int processFdPub_; + static int cloneFdPub_; + + #ifdef __i386__ + struct SocketCallArgInfo; + static const struct SocketCallArgInfo socketCallArgInfo[]; + #endif + + // The syscall_mutex_ can only be directly accessed by the trusted process. + // It can be accessed by the trusted thread after fork()ing and calling + // mprotect(PROT_READ|PROT_WRITE). The mutex is used for system calls that + // require passing additional data, and that require the trusted process to + // wait until the trusted thread is done processing (e.g. 
exit(), clone(), + // open(), stat()) + static int syscall_mutex_ asm("playground$syscall_mutex"); + + // Available in trusted process, only + typedef std::map<void *, long> ProtectedMap; + static ProtectedMap protectedMap_; + static std::vector<SecureMem::Args*> secureMemPool_; +}; + +} // namespace + +using playground::Sandbox; +#endif // __cplusplus + +#endif // SANDBOX_IMPL_H__ diff --git a/sandbox/linux/seccomp/securemem.cc b/sandbox/linux/seccomp/securemem.cc new file mode 100644 index 0000000..c8e59f9 --- /dev/null +++ b/sandbox/linux/seccomp/securemem.cc @@ -0,0 +1,97 @@ +#include "debug.h" +#include "mutex.h" +#include "sandbox_impl.h" +#include "securemem.h" + +namespace playground { + +void SecureMem::abandonSystemCall(int fd, int err) { + void* rc = reinterpret_cast<void *>(err); + if (err) { + Debug::message("System call failed\n"); + } + Sandbox::SysCalls sys; + if (Sandbox::write(sys, fd, &rc, sizeof(rc)) != sizeof(rc)) { + Sandbox::die("Failed to send system call"); + } +} + +void SecureMem::dieIfParentDied(int parentProc) { + // The syscall_mutex_ should not be contended. If it is, we are either + // experiencing a very unusual load of system calls that the sandbox is not + // optimized for; or, more likely, the sandboxed process terminated while the + // trusted process was in the middle of waiting for the mutex. We detect + // this situation and terminate the trusted process. 
+ char proc[80]; + sprintf(proc, "/proc/self/fd/%d/status", parentProc); + struct stat sb; + if (stat(proc, &sb)) { + Sandbox::die(); + } +} + +void SecureMem::lockSystemCall(int parentProc, Args* mem) { + while (!Mutex::lockMutex(&Sandbox::syscall_mutex_, 500)) { + dieIfParentDied(parentProc); + } + asm volatile( + #if defined(__x86_64__) + "lock; incq (%0)\n" + #elif defined(__i386__) + "lock; incl (%0)\n" + #else + #error Unsupported target platform + #endif + : + : "q"(&mem->sequence) + : "memory"); +} + +void SecureMem::sendSystemCallInternal(int fd, bool locked, int parentProc, + Args* mem, int syscallNum, void* arg1, + void* arg2, void* arg3, void* arg4, + void* arg5, void* arg6) { + if (!locked) { + asm volatile( + #if defined(__x86_64__) + "lock; incq (%0)\n" + #elif defined(__i386__) + "lock; incl (%0)\n" + #else + #error Unsupported target platform + #endif + : + : "q"(&mem->sequence) + : "memory"); + } + mem->syscallNum = syscallNum; + mem->arg1 = arg1; + mem->arg2 = arg2; + mem->arg3 = arg3; + mem->arg4 = arg4; + mem->arg5 = arg5; + mem->arg6 = arg6; + asm volatile( + #if defined(__x86_64__) + "lock; incq (%0)\n" + #elif defined(__i386__) + "lock; incl (%0)\n" + #else + #error Unsupported target platform + #endif + : + : "q"(&mem->sequence) + : "memory"); + int data = locked ? 
-2 : -1; + Sandbox::SysCalls sys; + if (Sandbox::write(sys, fd, &data, sizeof(data)) != sizeof(data)) { + Sandbox::die("Failed to send system call"); + } + if (parentProc >= 0) { + while (!Mutex::waitForUnlock(&Sandbox::syscall_mutex_, 500)) { + dieIfParentDied(parentProc); + } + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h new file mode 100644 index 0000000..4c208ce --- /dev/null +++ b/sandbox/linux/seccomp/securemem.h @@ -0,0 +1,179 @@ +#ifndef SECURE_MEM_H__ +#define SECURE_MEM_H__ + +#include <stdlib.h> + +namespace playground { + +class SecureMem { + public: + // Each thread is associated with two memory pages (i.e. 8192 bytes). This + // memory is fully accessible by the trusted process, but in the trusted + // thread and the sandboxed thread, the first page is only mapped PROT_READ, + // and the second one is PROT_READ|PROT_WRITE. + // + // The first page can be modified by the trusted process and this is the + // main mechanism how it communicates with the trusted thread. After each + // update, it updates the "sequence" number. The trusted process must + // check the "sequence" number has the expected value, and only then can + // it trust the data in this page. + typedef struct Args { + union { + struct { + union { + struct { + struct Args* self; + long sequence; + long syscallNum; + void* arg1; + void* arg2; + void* arg3; + void* arg4; + void* arg5; + void* arg6; + + // Used by clone() to allow return from the syscall wrapper. + void* ret; + #if defined(__x86_64__) + void* rbp; + void* rbx; + void* rcx; + void* rdx; + void* rsi; + void* rdi; + void* r8; + void* r9; + void* r10; + void* r11; + void* r12; + void* r13; + void* r14; + void* r15; + #elif defined(__i386__) + void* ret2; + void* ebp; + void* edi; + void* esi; + void* edx; + void* ecx; + void* ebx; + #else + #error Unsupported target platform + #endif + + // Used by clone() to set up data for the new thread. 
+ struct Args* newSecureMem; + int processFdPub; + int cloneFdPub; + + // Set to non-zero, if in debugging mode + int allowAllSystemCalls; + + // The most recent SysV SHM identifier returned by + // shmget(IPC_PRIVATE) + int shmId; + + // The following entries make up the sandboxed thread's TLS + long long cookie; + long long threadId; + long long threadFdPub; + } __attribute__((packed)); + char header[512]; + }; + // Used for calls such as open() and stat(). + char pathname[4096 - 512]; + } __attribute__((packed)); + char securePage[4096]; + }; + union { + // This scratch space is used by the trusted thread to read parameters + // for unrestricted system calls. + struct { + long tmpSyscallNum; + void* tmpArg1; + void* tmpArg2; + void* tmpArg3; + void* tmpArg4; + void* tmpArg5; + void* tmpArg6; + void* tmpReturnValue; + } __attribute__((packed)); + char scratchPage[4096]; + }; + } __attribute__((packed)) Args; + + // Allows the trusted process to check whether the parent process still + // exists. If it doesn't, kill the trusted process. + static void dieIfParentDied(int parentProc); + + // The trusted process received a system call that it intends to deny. + static void abandonSystemCall(int fd, int err); + + // Acquires the syscall_mutex_ prior to making changes to the parameters in + // the secure memory page. Used by calls such as exit(), clone(), open(), + // socketcall(), and stat(). + // After locking the mutex, it is no longer valid to abandon the system + // call! + static void lockSystemCall(int parentProc, Args* mem); + + // Sends a system call to the trusted thread. If "locked" is true, the + // caller must first call lockSystemCall() and must also provide + // "parentProc". In locked mode, sendSystemCall() won't return until the + // trusted thread has completed processing. + // Use sparingly as it serializes the operation of the trusted process. 
  // Zero-argument form: only the system call number is forwarded. All six
  // argument slots take sendSystemCallInternal()'s default value of 0.
  static void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
                             int syscallNum) {
    sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum);
  }

  // One- to six-argument forms. Each argument is converted to void* with a
  // C-style cast (the argument types are template parameters, so callers can
  // pass integers and pointers alike) and handed to sendSystemCallInternal().
  // Argument slots that are not supplied keep their default of 0.
  template<class T1> static
  void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
                      int syscallNum, T1 arg1) {
    sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
                           (void*)arg1);
  }
  template<class T1, class T2> static
  void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
                      int syscallNum, T1 arg1, T2 arg2) {
    sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
                           (void*)arg1, (void*)arg2);
  }
  template<class T1, class T2, class T3> static
  void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
                      int syscallNum, T1 arg1, T2 arg2, T3 arg3) {
    sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
                           (void*)arg1, (void*)arg2, (void*)arg3);
  }
  template<class T1, class T2, class T3, class T4> static
  void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
                      int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4) {
    sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
                           (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4);
  }
  template<class T1, class T2, class T3, class T4, class T5> static
  void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
                      int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4,
                      T5 arg5) {
    sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
                           (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4,
                           (void*)arg5);
  }
  template<class T1, class T2, class T3, class T4, class T5, class T6> static
  void sendSystemCall(int fd, bool locked, int parentProc, Args* mem,
                      int syscallNum, T1 arg1, T2 arg2, T3 arg3, T4 arg4,
                      T5 arg5, T6 arg6) {
    sendSystemCallInternal(fd, locked, parentProc, mem, syscallNum,
                           (void*)arg1, (void*)arg2, (void*)arg3, (void*)arg4,
                           (void*)arg5, (void*)arg6);
  }

 private:
  // Worker that every sendSystemCall() overload forwards to. The rest of its
  // parameter list is on the following line of this flattened listing.
  static void sendSystemCallInternal(int fd, bool
locked, int parentProc, + Args* mem, int syscallNum, void* arg1 = 0, + void* arg2 = 0, void* arg3 = 0, + void* arg4 = 0, void* arg5 = 0, + void* arg6 = 0); +}; + +} // namespace + +#endif // SECURE_MEM_H__ diff --git a/sandbox/linux/seccomp/socketcall.cc b/sandbox/linux/seccomp/socketcall.cc new file mode 100644 index 0000000..d51431d --- /dev/null +++ b/sandbox/linux/seccomp/socketcall.cc @@ -0,0 +1,1013 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +#if defined(__NR_socket) + +ssize_t Sandbox::sandbox_recvfrom(int sockfd, void* buf, size_t len, int flags, + void* from, socklen_t* fromlen) { + Debug::syscall(__NR_recvfrom, "Executing handler"); + + SysCalls sys; + if (!from && !flags) { + // recv() with a NULL sender and no flags is the same as read(), which + // is unrestricted in seccomp mode. + Debug::message("Replaced recv() with call to read()"); + ssize_t rc = sys.read(sockfd, buf, len); + if (rc < 0) { + return -sys.my_errno; + } else { + return rc; + } + } + + struct { + int sysnum; + long long cookie; + RecvFrom recvfrom_req; + } __attribute__((packed)) request; + request.sysnum = __NR_recvfrom; + request.cookie = cookie(); + request.recvfrom_req.sockfd = sockfd; + request.recvfrom_req.buf = buf; + request.recvfrom_req.len = len; + request.recvfrom_req.flags = flags; + request.recvfrom_req.from = from; + request.recvfrom_req.fromlen = fromlen; + + long rc; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward recvfrom() request [sandbox]"); + } + return static_cast<int>(rc); +} + +ssize_t Sandbox::sandbox_recvmsg(int sockfd, struct msghdr* msg, int flags) { + Debug::syscall(__NR_recvmsg, "Executing handler"); + + // We cannot simplify recvmsg() to recvfrom(), recv() or read(), as we do + // not know whether the caller needs us to set msg->msg_flags. 
+ struct { + int sysnum; + long long cookie; + RecvMsg recvmsg_req; + } __attribute__((packed)) request; + request.sysnum = __NR_recvmsg; + request.cookie = cookie(); + request.recvmsg_req.sockfd = sockfd; + request.recvmsg_req.msg = msg; + request.recvmsg_req.flags = flags; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward recvmsg() request [sandbox]"); + } + return static_cast<int>(rc); +} + +size_t Sandbox::sandbox_sendmsg(int sockfd, const struct msghdr* msg, + int flags) { + Debug::syscall(__NR_sendmsg, "Executing handler"); + + if (msg->msg_iovlen == 1 && msg->msg_controllen == 0) { + // sendmsg() can sometimes be simplified as sendto() + return sandbox_sendto(sockfd, msg->msg_iov, msg->msg_iovlen, + flags, msg->msg_name, msg->msg_namelen); + } + + struct Request { + int sysnum; + long long cookie; + SendMsg sendmsg_req; + struct msghdr msg; + } __attribute__((packed)); + char data[sizeof(struct Request) + msg->msg_namelen + msg->msg_controllen]; + struct Request *request = reinterpret_cast<struct Request *>(data); + request->sysnum = __NR_sendmsg; + request->cookie = cookie(); + request->sendmsg_req.sockfd = sockfd; + request->sendmsg_req.msg = msg; + request->sendmsg_req.flags = flags; + request->msg = *msg; + memcpy(reinterpret_cast<char *>( + memcpy(request + 1, msg->msg_name, msg->msg_namelen)) + + msg->msg_namelen, + msg->msg_control, msg->msg_controllen); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &data, sizeof(data)) != + (ssize_t)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward sendmsg() request [sandbox]"); + } + return static_cast<int>(rc); +} + +ssize_t Sandbox::sandbox_sendto(int sockfd, const void* buf, size_t len, + int flags, const void* to, socklen_t tolen) { + Debug::syscall(__NR_sendto, "Executing handler"); + + SysCalls 
sys; + if (!to && !flags) { + // sendto() with a NULL recipient and no flags is the same as write(), + // which is unrestricted in seccomp mode. + Debug::message("Replaced sendto() with call to write()"); + ssize_t rc = sys.write(sockfd, buf, len); + if (rc < 0) { + return -sys.my_errno; + } else { + return rc; + } + } + + struct { + int sysnum; + long long cookie; + SendTo sendto_req; + } __attribute__((packed)) request; + request.sysnum = __NR_sendto; + request.cookie = cookie(); + request.sendto_req.sockfd = sockfd; + request.sendto_req.buf = buf; + request.sendto_req.len = len; + request.sendto_req.flags = flags; + request.sendto_req.to = to; + request.sendto_req.tolen = tolen; + + long rc; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward sendto() request [sandbox]"); + } + return static_cast<int>(rc); +} + +int Sandbox::sandbox_setsockopt(int sockfd, int level, int optname, + const void* optval, socklen_t optlen) { + Debug::syscall(__NR_setsockopt, "Executing handler"); + + struct { + int sysnum; + long long cookie; + SetSockOpt setsockopt_req; + } __attribute__((packed)) request; + request.sysnum = __NR_setsockopt; + request.cookie = cookie(); + request.setsockopt_req.sockfd = sockfd; + request.setsockopt_req.level = level; + request.setsockopt_req.optname = optname; + request.setsockopt_req.optval = optval; + request.setsockopt_req.optlen = optlen; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward setsockopt() request [sandbox]"); + } + return static_cast<int>(rc); +} + +int Sandbox::sandbox_getsockopt(int sockfd, int level, int optname, + void* optval, socklen_t* optlen) { + Debug::syscall(__NR_getsockopt, "Executing handler"); + + struct { + int sysnum; + long long cookie; + GetSockOpt 
getsockopt_req; + } __attribute__((packed)) request; + request.sysnum = __NR_getsockopt; + request.cookie = cookie(); + request.getsockopt_req.sockfd = sockfd; + request.getsockopt_req.level = level; + request.getsockopt_req.optname = optname; + request.getsockopt_req.optval = optval; + request.getsockopt_req.optlen = optlen; + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), &request, sizeof(request)) != + sizeof(request) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward getsockopt() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_recvfrom(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + RecvFrom recvfrom_req; + SysCalls sys; + if (read(sys, sandboxFd, &recvfrom_req, sizeof(recvfrom_req)) != + sizeof(recvfrom_req)) { + die("Failed to read parameters for recvfrom() [process]"); + } + + // Unsupported flag encountered. Deny the call. + if (recvfrom_req.flags & + ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) { + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // While we do not anticipate any particular need to receive data on + // unconnected sockets, there is no particular risk in doing so. + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_recvfrom, recvfrom_req.sockfd, + recvfrom_req.buf, recvfrom_req.len, + recvfrom_req.flags, recvfrom_req.from, + recvfrom_req.fromlen); + return true; +} + +bool Sandbox::process_recvmsg(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + RecvMsg recvmsg_req; + SysCalls sys; + if (read(sys, sandboxFd, &recvmsg_req, sizeof(recvmsg_req)) != + sizeof(recvmsg_req)) { + die("Failed to read parameters for recvmsg() [process]"); + } + + // Unsupported flag encountered. Deny the call. 
+ if (recvmsg_req.flags & + ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) { + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // Receiving messages is general not security critical. + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_recvmsg, recvmsg_req.sockfd, + recvmsg_req.msg, recvmsg_req.flags); + return true; +} + +bool Sandbox::process_sendmsg(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + struct { + SendMsg sendmsg_req; + struct msghdr msg; + } __attribute__((packed)) data; + SysCalls sys; + if (read(sys, sandboxFd, &data, sizeof(data)) != sizeof(data)) { + die("Failed to read parameters for sendmsg() [process]"); + } + + if (data.msg.msg_namelen < 0 || data.msg.msg_namelen > 4096 || + data.msg.msg_controllen < 0 || data.msg.msg_controllen > 4096) { + die("Unexpected size for socketcall() payload [process]"); + } + char extra[data.msg.msg_namelen + data.msg.msg_controllen]; + if (read(sys, sandboxFd, &extra, sizeof(extra)) != (ssize_t)sizeof(extra)) { + die("Failed to read parameters for sendmsg() [process]"); + } + if (sizeof(struct msghdr) + sizeof(extra) > sizeof(mem->pathname)) { + goto deny; + } + + if (data.msg.msg_namelen || + (data.sendmsg_req.flags & + ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB))) { + deny: + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // The trusted process receives file handles when a new untrusted thread + // gets created. We have security checks in place that prevent any + // critical information from being tampered with during thread creation. + // But if we disallowed passing of file handles, this would add an extra + // hurdle for an attacker. + // Unfortunately, for now, this is not possible as Chrome's + // base::SendRecvMsg() needs the ability to pass file handles. 
+ if (data.msg.msg_controllen) { + data.msg.msg_control = extra + data.msg.msg_namelen; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&data.msg); + do { + if (cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) { + goto deny; + } + } while ((cmsg = CMSG_NXTHDR(&data.msg, cmsg)) != NULL); + } + + // This must be a locked system call, because we have to ensure that the + // untrusted code does not tamper with the msghdr after we have examined it. + SecureMem::lockSystemCall(parentProc, mem); + if (sizeof(extra) > 0) { + if (data.msg.msg_namelen > 0) { + data.msg.msg_name = mem->pathname + sizeof(struct msghdr); + } + if (data.msg.msg_controllen > 0) { + data.msg.msg_control = mem->pathname + sizeof(struct msghdr) + + data.msg.msg_namelen; + } + memcpy(mem->pathname + sizeof(struct msghdr), extra, sizeof(extra)); + } + memcpy(mem->pathname, &data.msg, sizeof(struct msghdr)); + SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, + __NR_sendmsg, data.sendmsg_req.sockfd, + mem->pathname - (char*)mem + (char*)mem->self, + data.sendmsg_req.flags); + return true; +} + +bool Sandbox::process_sendto(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SendTo sendto_req; + SysCalls sys; + if (read(sys, sandboxFd, &sendto_req, sizeof(sendto_req)) != + sizeof(sendto_req)) { + die("Failed to read parameters for sendto() [process]"); + } + + // The sandbox does not allow sending to arbitrary addresses. + if (sendto_req.to) { + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // Unsupported flag encountered. Deny the call. + if (sendto_req.flags & + ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB)) { + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; + } + + // Sending data on a connected socket is similar to calling write(). + // Allow it. 
+ SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_sendto, sendto_req.sockfd, + sendto_req.buf, sendto_req.len, + sendto_req.flags, sendto_req.to, + sendto_req.tolen); + return true; +} + +bool Sandbox::process_setsockopt(int parentProc, int sandboxFd, + int threadFdPub, int threadFd, + SecureMem::Args* mem) { + // Read request + SetSockOpt setsockopt_req; + SysCalls sys; + if (read(sys, sandboxFd, &setsockopt_req, sizeof(setsockopt_req)) != + sizeof(setsockopt_req)) { + die("Failed to read parameters for setsockopt() [process]"); + } + + switch (setsockopt_req.level) { + case SOL_SOCKET: + switch (setsockopt_req.optname) { + case SO_KEEPALIVE: + case SO_LINGER: + case SO_OOBINLINE: + case SO_RCVBUF: + case SO_RCVLOWAT: + case SO_SNDLOWAT: + case SO_RCVTIMEO: + case SO_SNDTIMEO: + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_TIMESTAMP: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_setsockopt, setsockopt_req.sockfd, + setsockopt_req.level, setsockopt_req.optname, + setsockopt_req.optval, setsockopt_req.optlen); + return true; + default: + break; + } + break; + case IPPROTO_TCP: + switch (setsockopt_req.optname) { + case TCP_CORK: + case TCP_DEFER_ACCEPT: + case TCP_INFO: + case TCP_KEEPCNT: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_LINGER2: + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_QUICKACK: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_setsockopt, setsockopt_req.sockfd, + setsockopt_req.level, setsockopt_req.optname, + setsockopt_req.optval, setsockopt_req.optlen); + return true; + default: + break; + } + break; + default: + break; + } + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; +} + +bool Sandbox::process_getsockopt(int parentProc, int sandboxFd, + int threadFdPub, int threadFd, + SecureMem::Args* mem) { + // Read request + GetSockOpt getsockopt_req; + SysCalls sys; + if (read(sys, sandboxFd, &getsockopt_req, 
sizeof(getsockopt_req)) != + sizeof(getsockopt_req)) { + die("Failed to read parameters for getsockopt() [process]"); + } + + switch (getsockopt_req.level) { + case SOL_SOCKET: + switch (getsockopt_req.optname) { + case SO_ACCEPTCONN: + case SO_ERROR: + case SO_KEEPALIVE: + case SO_LINGER: + case SO_OOBINLINE: + case SO_RCVBUF: + case SO_RCVLOWAT: + case SO_SNDLOWAT: + case SO_RCVTIMEO: + case SO_SNDTIMEO: + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_TIMESTAMP: + case SO_TYPE: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_getsockopt, getsockopt_req.sockfd, + getsockopt_req.level, getsockopt_req.optname, + getsockopt_req.optval, getsockopt_req.optlen); + return true; + default: + break; + } + break; + case IPPROTO_TCP: + switch (getsockopt_req.optname) { + case TCP_CORK: + case TCP_DEFER_ACCEPT: + case TCP_INFO: + case TCP_KEEPCNT: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_LINGER2: + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_QUICKACK: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, + __NR_getsockopt, getsockopt_req.sockfd, + getsockopt_req.level, getsockopt_req.optname, + getsockopt_req.optval, getsockopt_req.optlen); + return true; + default: + break; + } + break; + default: + break; + } + SecureMem::abandonSystemCall(threadFd, -EINVAL); + return false; +} + +#endif +#if defined(__NR_socketcall) + +enum { + SYS_SOCKET = 1, + SYS_BIND = 2, + SYS_CONNECT = 3, + SYS_LISTEN = 4, + SYS_ACCEPT = 5, + SYS_GETSOCKNAME = 6, + SYS_GETPEERNAME = 7, + SYS_SOCKETPAIR = 8, + SYS_SEND = 9, + SYS_RECV = 10, + SYS_SENDTO = 11, + SYS_RECVFROM = 12, + SYS_SHUTDOWN = 13, + SYS_SETSOCKOPT = 14, + SYS_GETSOCKOPT = 15, + SYS_SENDMSG = 16, + SYS_RECVMSG = 17, + SYS_ACCEPT4 = 18 +}; + +struct Sandbox::SocketCallArgInfo { + size_t len; + off_t addrOff; + off_t lengthOff; +}; +const struct Sandbox::SocketCallArgInfo Sandbox::socketCallArgInfo[] = { + #define STRUCT(s) reinterpret_cast<SocketCall 
*>(0)->args.s + #define SIZE(s) sizeof(STRUCT(s)) + #define OFF(s, f) offsetof(typeof STRUCT(s), f) + { 0 }, + { SIZE(socket) }, + { SIZE(bind), OFF(bind, addr), OFF(bind, addrlen) }, + { SIZE(connect), OFF(connect, addr), OFF(connect, addrlen) }, + { SIZE(listen) }, + { SIZE(accept) }, + { SIZE(getsockname) }, + { SIZE(getpeername) }, + { SIZE(socketpair) }, + { SIZE(send) }, + { SIZE(recv) }, + { SIZE(sendto), OFF(sendto, to), OFF(sendto, tolen) }, + { SIZE(recvfrom) }, + { SIZE(shutdown) }, + { SIZE(setsockopt), OFF(setsockopt, optval), OFF(setsockopt, optlen) }, + { SIZE(getsockopt) }, + { SIZE(sendmsg) }, + { SIZE(recvmsg) }, + { SIZE(accept4) } + #undef STRUCT + #undef SIZE + #undef OFF +}; + +int Sandbox::sandbox_socketcall(int call, void* args) { + Debug::syscall(__NR_socketcall, "Executing handler", call); + + // When demultiplexing socketcall(), only accept calls that have a valid + // "call" opcode. + if (call < SYS_SOCKET || call > SYS_ACCEPT4) { + return -ENOSYS; + } + + // Some type of calls include a pointer to an address or name, which cannot + // be accessed by the trusted process, as it lives in a separate address + // space. For these calls, append the extra data to the serialized request. + // This requires some copying of data, as we have to make sure there is + // only a single atomic call to write(). + socklen_t numExtraData = 0; + const void* extraDataAddr = NULL; + if (socketCallArgInfo[call].lengthOff) { + memcpy(&numExtraData, + reinterpret_cast<char *>(args) + socketCallArgInfo[call].lengthOff, + sizeof(socklen_t)); + extraDataAddr = reinterpret_cast<char *>(args) + + socketCallArgInfo[call].addrOff; + } + + // sendmsg() and recvmsg() have more complicated requirements for computing + // the amount of extra data that needs to be sent to the trusted process. 
+ if (call == SYS_SENDMSG) { + SendMsg *sendmsg_args = reinterpret_cast<SendMsg *>(args); + if (sendmsg_args->msg->msg_iovlen == 1 && + !sendmsg_args->msg->msg_control) { + // Further down in the code, this sendmsg() call will be simplified to + // a sendto() call. Make sure we already compute the correct value for + // numExtraData, as it is needed when we allocate "data[]" on the stack. + numExtraData = sendmsg_args->msg->msg_namelen; + extraDataAddr = sendmsg_args->msg->msg_name; + } else { + // sendmsg() needs to include some of the extra data so that we can + // inspect it in process_socketcall() + numExtraData = sizeof(*sendmsg_args->msg) + + sendmsg_args->msg->msg_namelen + + sendmsg_args->msg->msg_controllen; + extraDataAddr = NULL; + } + } + if (call == SYS_RECVMSG) { + RecvMsg *recvmsg_args = reinterpret_cast<RecvMsg *>(args); + numExtraData = sizeof(*recvmsg_args->msg); + extraDataAddr = recvmsg_args->msg; + } + + // Set up storage for the request header and copy the data from "args" + // into it. + struct Request { + int sysnum; + long long cookie; + SocketCall socketcall_req; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + numExtraData]; + request = reinterpret_cast<struct Request *>(data); + memcpy(&request->socketcall_req.args, args, socketCallArgInfo[call].len); + + // Simplify send(), sendto() and sendmsg(), if there are simpler equivalent + // calls. This allows us to occasionally replace them with calls to write(), + // which don't have to be forwarded to the trusted process. + SysCalls sys; + if (call == SYS_SENDMSG && + request->socketcall_req.args.sendmsg.msg->msg_iovlen == 1 && + !request->socketcall_req.args.sendmsg.msg->msg_control) { + // Ordering of these assignments is important, as we are reshuffling + // fields inside of a union. 
+ call = SYS_SENDTO; + request->socketcall_req.args.sendto.flags = + request->socketcall_req.args.sendmsg.flags; + request->socketcall_req.args.sendto.to = + request->socketcall_req.args.sendmsg.msg->msg_name; + request->socketcall_req.args.sendto.tolen = + request->socketcall_req.args.sendmsg.msg->msg_namelen; + request->socketcall_req.args.sendto.len = + request->socketcall_req.args.sendmsg.msg->msg_iov->iov_len; + request->socketcall_req.args.sendto.buf = + request->socketcall_req.args.sendmsg.msg->msg_iov->iov_base; + } + if (call == SYS_SENDTO && !request->socketcall_req.args.sendto.to) { + // sendto() with a NULL address is the same as send() + call = SYS_SEND; + numExtraData = 0; + } + if (call == SYS_SEND && !request->socketcall_req.args.send.flags) { + // send() with no flags is the same as write(), which is unrestricted + // in seccomp mode. + Debug::message("Replaced socketcall() with call to write()"); + ssize_t rc = sys.write(request->socketcall_req.args.send.sockfd, + request->socketcall_req.args.send.buf, + request->socketcall_req.args.send.len); + if (rc < 0) { + return -sys.my_errno; + } else { + return rc; + } + } + + // Simplify recv(), and recvfrom(), if there are simpler equivalent calls. + // This allows us to occasionally replace them with calls to read(), which + // don't have to be forwarded to the trusted process. + // We cannot simplify recvmsg() to recvfrom(), recv() or read(), as we do + // not know whether the caller needs us to set msg->msg_flags. + if (call == SYS_RECVFROM && !request->socketcall_req.args.recvfrom.from) { + // recvfrom() with a NULL address buffer is the same as recv() + call = SYS_RECV; + } + if (call == SYS_RECV && !request->socketcall_req.args.recv.flags) { + // recv() with no flags is the same as read(), which is unrestricted + // in seccomp mode. 
+ Debug::message("Replaced socketcall() with call to read()"); + ssize_t rc = sys.read(request->socketcall_req.args.recv.sockfd, + request->socketcall_req.args.recv.buf, + request->socketcall_req.args.recv.len); + if (rc < 0) { + return -sys.my_errno; + } else { + return rc; + } + } + + // Fill in the rest of the request header. + request->sysnum = __NR_socketcall; + request->cookie = cookie(); + request->socketcall_req.call = call; + request->socketcall_req.arg_ptr = args; + int padding = sizeof(request->socketcall_req.args) - + socketCallArgInfo[call].len; + if (padding > 0) { + memset((char *)(&request->socketcall_req.args + 1) - padding, 0, padding); + } + if (call == SYS_SENDMSG) { + // for sendmsg() we include the (optional) destination address, and the + // (optional) control data in the payload. + SendMsg *sendmsg_args = reinterpret_cast<SendMsg *>(args); + memcpy(reinterpret_cast<char *>( + memcpy(reinterpret_cast<char *>( + memcpy(request + 1, sendmsg_args->msg, sizeof(*sendmsg_args->msg))) + + sizeof(*sendmsg_args->msg), + sendmsg_args->msg->msg_name, sendmsg_args->msg->msg_namelen)) + + sendmsg_args->msg->msg_namelen, + sendmsg_args->msg->msg_control, sendmsg_args->msg->msg_controllen); + } else if (extraDataAddr) { + memcpy(request + 1, extraDataAddr, numExtraData); + } + + // Send request to trusted process and collect response from trusted thread. 
+ long rc; + ssize_t len = sizeof(struct Request) + numExtraData; + if (write(sys, processFdPub(), data, len) != len || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward socketcall() request [sandbox]"); + } + return static_cast<int>(rc); +} + +bool Sandbox::process_socketcall(int parentProc, int sandboxFd, + int threadFdPub, int threadFd, + SecureMem::Args* mem) { + // Read request + SocketCall socketcall_req; + SysCalls sys; + if (read(sys, sandboxFd, &socketcall_req, sizeof(socketcall_req)) != + sizeof(socketcall_req)) { + die("Failed to read parameters for socketcall() [process]"); + } + + // sandbox_socketcall() should never send us an unexpected "call" opcode. + // If it did, something went very wrong and we better terminate the process. + if (socketcall_req.call < SYS_SOCKET || socketcall_req.call > SYS_ACCEPT4) { + die("Unexpected socketcall() [process]"); + } + + // Check if this particular operation carries an extra payload. + socklen_t numExtraData = 0; + if (socketCallArgInfo[socketcall_req.call].lengthOff) { + memcpy(&numExtraData, + reinterpret_cast<char *>(&socketcall_req) + + socketCallArgInfo[socketcall_req.call].lengthOff, + sizeof(socklen_t)); + } else if (socketcall_req.call == SYS_SENDMSG) { + numExtraData = sizeof(*socketcall_req.args.sendmsg.msg); + } else if (socketcall_req.call == SYS_RECVMSG) { + numExtraData = sizeof(*socketcall_req.args.recvmsg.msg); + } + + // Verify that the length for the payload is reasonable. We don't want to + // blow up our stack, and excessive (or negative) buffer sizes are almost + // certainly a bug. + if (numExtraData < 0 || numExtraData > 4096) { + die("Unexpected size for socketcall() payload [process]"); + } + + // Read the extra payload, if any. 
+ char extra[numExtraData]; + if (numExtraData) { + if (read(sys, sandboxFd, extra, numExtraData) != (ssize_t)numExtraData) { + die("Failed to read socketcall() payload [process]"); + } + } + + // sendmsg() has another level of indirection and can carry even more payload + ssize_t numSendmsgExtra = 0; + if (socketcall_req.call == SYS_SENDMSG) { + struct msghdr* msg = reinterpret_cast<struct msghdr*>(extra); + if (msg->msg_namelen < 0 || msg->msg_namelen > 4096 || + msg->msg_controllen < 0 || msg->msg_controllen > 4096) { + die("Unexpected size for socketcall() payload [process]"); + } + numSendmsgExtra = msg->msg_namelen + msg->msg_controllen; + } + char sendmsgExtra[numSendmsgExtra]; + if (numSendmsgExtra) { + if (read(sys, sandboxFd, sendmsgExtra, numSendmsgExtra) != + numSendmsgExtra) { + die("Failed to read socketcall() payload [process]"); + } + } + + int rc = -EINVAL; + switch (socketcall_req.call) { + case SYS_SOCKET: + // The sandbox does not allow creation of any new sockets. + goto deny; + case SYS_BIND: + // The sandbox does not allow binding an address to a socket. + goto deny; + case SYS_CONNECT: + // The sandbox does not allow connecting a socket. + goto deny; + case SYS_LISTEN: + // The sandbox does not allow a socket to enter listening state. + goto deny; + case SYS_ACCEPT4: + case SYS_ACCEPT: + // If the sandbox obtained a socket that is already in the listening + // state (e.g. because somebody sent it a suitable file descriptor), it + // is permissible to call accept(). + + accept_simple: + // None of the parameters need to be checked, so it is OK to refer + // to the parameter block created by the untrusted code. + SecureMem::sendSystemCall(threadFdPub, false, -1, mem, __NR_socketcall, + socketcall_req.call, socketcall_req.arg_ptr); + return true; + case SYS_GETSOCKNAME: + case SYS_GETPEERNAME: + // Querying the local and the remote name is not considered security + // sensitive for the purposes of the sandbox. 
+ goto accept_simple; + case SYS_SOCKETPAIR: + // Socket pairs are connected to each other and not considered + // security sensitive. + goto accept_simple; + case SYS_SENDTO: + if (socketcall_req.args.sendto.to) { + // The sandbox does not allow sending to arbitrary addresses. + goto deny; + } + // Fall through + case SYS_SEND: + if (socketcall_req.args.send.flags & + ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB)) { + // Unsupported flag encountered. Deny the call. + goto deny; + } + // Sending data on a connected socket is similar to calling write(). + // Allow it. + + accept_complex: + // The parameter block contains potentially security critical information + // that should not be tampered with after it has been inspected. Copy it + // into the write-protected securely shared memory before telling the + // trusted thread to execute the socket call. + SecureMem::lockSystemCall(parentProc, mem); + memcpy(mem->pathname, &socketcall_req.args, sizeof(socketcall_req.args)); + SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, + __NR_socketcall, socketcall_req.call, + mem->pathname - (char*)mem + (char*)mem->self); + return true; + case SYS_RECVFROM: + // While we do not anticipate any particular need to receive data on + // unconnected sockets, there is no particular risk in doing so. + // Fall through + case SYS_RECV: + if (socketcall_req.args.recv.flags & + ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) { + // Unsupported flag encountered. Deny the call. + goto deny; + } + // Receiving data on a connected socket is similar to calling read(). + // Allow it. + goto accept_complex; + case SYS_SHUTDOWN: + // Shutting down a socket is always OK. 
+ goto accept_simple; + case SYS_SETSOCKOPT: + switch (socketcall_req.args.setsockopt.level) { + case SOL_SOCKET: + switch (socketcall_req.args.setsockopt.optname) { + case SO_KEEPALIVE: + case SO_LINGER: + case SO_OOBINLINE: + case SO_RCVBUF: + case SO_RCVLOWAT: + case SO_SNDLOWAT: + case SO_RCVTIMEO: + case SO_SNDTIMEO: + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_TIMESTAMP: + goto accept_complex; + default: + break; + } + break; + case IPPROTO_TCP: + switch (socketcall_req.args.setsockopt.optname) { + case TCP_CORK: + case TCP_DEFER_ACCEPT: + case TCP_INFO: + case TCP_KEEPCNT: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_LINGER2: + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_QUICKACK: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + goto accept_complex; + default: + break; + } + break; + default: + break; + } + goto deny; + case SYS_GETSOCKOPT: + switch (socketcall_req.args.getsockopt.level) { + case SOL_SOCKET: + switch (socketcall_req.args.getsockopt.optname) { + case SO_ACCEPTCONN: + case SO_ERROR: + case SO_KEEPALIVE: + case SO_LINGER: + case SO_OOBINLINE: + case SO_RCVBUF: + case SO_RCVLOWAT: + case SO_SNDLOWAT: + case SO_RCVTIMEO: + case SO_SNDTIMEO: + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_TIMESTAMP: + case SO_TYPE: + goto accept_complex; + default: + break; + } + break; + case IPPROTO_TCP: + switch (socketcall_req.args.getsockopt.optname) { + case TCP_CORK: + case TCP_DEFER_ACCEPT: + case TCP_INFO: + case TCP_KEEPCNT: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_LINGER2: + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_QUICKACK: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + goto accept_complex; + default: + break; + } + break; + default: + break; + } + goto deny; + case SYS_SENDMSG: { + struct msghdr* msg = reinterpret_cast<struct msghdr*>(extra); + + if (sizeof(socketcall_req.args) + sizeof(*msg) + numSendmsgExtra > + sizeof(mem->pathname)) { + goto deny; + } + + if (msg->msg_namelen || + 
(socketcall_req.args.sendmsg.flags & + ~(MSG_CONFIRM|MSG_DONTWAIT|MSG_EOR|MSG_MORE|MSG_NOSIGNAL|MSG_OOB))){ + goto deny; + } + + // The trusted process receives file handles when a new untrusted thread + // gets created. We have security checks in place that prevent any + // critical information from being tampered with during thread creation. + // But if we disallowed passing of file handles, this would add an extra + // hurdle for an attacker. + // Unfortunately, for now, this is not possible as Chrome's + // base::SendRecvMsg() needs the ability to pass file handles. + if (msg->msg_controllen) { + msg->msg_control = sendmsgExtra + msg->msg_namelen; + struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); + do { + if (cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) { + goto deny; + } + } while ((cmsg = CMSG_NXTHDR(msg, cmsg)) != NULL); + } + + // This must be a locked system call, because we have to ensure that + // the untrusted code does not tamper with the msghdr after we have + // examined it. 
+ SecureMem::lockSystemCall(parentProc, mem); + socketcall_req.args.sendmsg.msg = + reinterpret_cast<struct msghdr*>(mem->pathname + + sizeof(socketcall_req.args) - + (char*)mem + (char*)mem->self); + memcpy(mem->pathname, &socketcall_req.args, sizeof(socketcall_req.args)); + if (numSendmsgExtra) { + if (msg->msg_namelen > 0) { + msg->msg_name = const_cast<struct msghdr*>( + socketcall_req.args.sendmsg.msg) + 1; + } + if (msg->msg_controllen > 0) { + msg->msg_control = (char *)( + socketcall_req.args.sendmsg.msg + 1) + msg->msg_namelen; + } + memcpy(mem->pathname + sizeof(socketcall_req.args) + sizeof(*msg), + sendmsgExtra, numSendmsgExtra); + } + memcpy(mem->pathname + sizeof(socketcall_req.args), msg, sizeof(*msg)); + SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, + __NR_socketcall, socketcall_req.call, + mem->pathname - (char*)mem + (char*)mem->self); + return true; + } + case SYS_RECVMSG: + // Receiving messages is general not security critical. + if (socketcall_req.args.recvmsg.flags & + ~(MSG_DONTWAIT|MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_WAITALL)) { + goto deny; + } + goto accept_complex; + default: + deny: + SecureMem::abandonSystemCall(threadFd, rc); + return false; + } +} + +#endif + +} // namespace diff --git a/sandbox/linux/seccomp/stat.cc b/sandbox/linux/seccomp/stat.cc new file mode 100644 index 0000000..8634fdf --- /dev/null +++ b/sandbox/linux/seccomp/stat.cc @@ -0,0 +1,110 @@ +#include "debug.h" +#include "sandbox_impl.h" + +namespace playground { + +int Sandbox::sandbox_stat(const char *path, void *buf) { + Debug::syscall(__NR_stat, "Executing handler"); + size_t len = strlen(path); + struct Request { + int sysnum; + long long cookie; + Stat stat_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_stat; + request->cookie = cookie(); + request->stat_req.sysnum = __NR_stat; + request->stat_req.path_length = len; 
+ request->stat_req.buf = buf; + memcpy(request->pathname, path, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward stat() request [sandbox]"); + } + return static_cast<int>(rc); +} + +#if defined(__NR_stat64) +int Sandbox::sandbox_stat64(const char *path, void *buf) { + Debug::syscall(__NR_stat64, "Executing handler"); + size_t len = strlen(path); + struct Request { + int sysnum; + long long cookie; + Stat stat_req; + char pathname[0]; + } __attribute__((packed)) *request; + char data[sizeof(struct Request) + len]; + request = reinterpret_cast<struct Request*>(data); + request->sysnum = __NR_stat64; + request->cookie = cookie(); + request->stat_req.sysnum = __NR_stat64; + request->stat_req.path_length = len; + request->stat_req.buf = buf; + memcpy(request->pathname, path, len); + + long rc; + SysCalls sys; + if (write(sys, processFdPub(), request, sizeof(data)) != (int)sizeof(data) || + read(sys, threadFdPub(), &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward stat64() request [sandbox]"); + } + return static_cast<int>(rc); +} +#endif + +bool Sandbox::process_stat(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMem::Args* mem) { + // Read request + SysCalls sys; + Stat stat_req; + if (read(sys, sandboxFd, &stat_req, sizeof(stat_req)) != sizeof(stat_req)) { + read_parm_failed: + die("Failed to read parameters for stat() [process]"); + } + int rc = -ENAMETOOLONG; + if (stat_req.path_length >= (int)sizeof(mem->pathname)) { + char buf[32]; + while (stat_req.path_length > 0) { + size_t len = stat_req.path_length > sizeof(buf) ? 
+ sizeof(buf) : stat_req.path_length; + ssize_t i = read(sys, sandboxFd, buf, len); + if (i <= 0) { + goto read_parm_failed; + } + stat_req.path_length -= i; + } + if (write(sys, threadFd, &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to return data from stat() [process]"); + } + return false; + } + SecureMem::lockSystemCall(parentProc, mem); + if (read(sys, sandboxFd, mem->pathname, stat_req.path_length) != + (ssize_t)stat_req.path_length) { + goto read_parm_failed; + } + mem->pathname[stat_req.path_length] = '\000'; + + // TODO(markus): Implement sandboxing policy + Debug::message(("Allowing access to \"" + std::string(mem->pathname) + + "\"").c_str()); + + // Tell trusted thread to stat the file. + SecureMem::sendSystemCall(threadFdPub, true, parentProc, mem, + #if defined(__i386__) + stat_req.sysnum == __NR_stat64 ? __NR_stat64 : + #endif + __NR_stat, + mem->pathname - (char*)mem + (char*)mem->self, + stat_req.buf); + return true; +} + +} // namespace diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc new file mode 100644 index 0000000..b25146b --- /dev/null +++ b/sandbox/linux/seccomp/syscall.cc @@ -0,0 +1,258 @@ +#include "debug.h" +#include "sandbox_impl.h" +#include "syscall_table.h" + +namespace playground { + +// TODO(markus): change this into a function that returns the address of the assembly code. If that isn't possible for sandbox_clone, then move that function into a *.S file +asm( + ".pushsection .text, \"ax\", @progbits\n" + + // This is the special wrapper for the clone() system call. The code + // relies on the stack layout of the system call wrapper (c.f. below). It + // passes the stack pointer as an additional argument to sandbox__clone(), + // so that upon starting the child, register values can be restored and + // the child can start executing at the correct IP, instead of trying to + // run in the trusted thread. 
+ "playground$sandbox_clone:" + ".globl playground$sandbox_clone\n" + ".type playground$sandbox_clone, @function\n" + #if defined(__x86_64__) + // Skip the 8 byte return address into the system call wrapper. The + // following bytes are the saved register values that we need to restore + // upon return from clone() in the new thread. + "lea 8(%rsp), %r9\n" + "jmp playground$sandbox__clone\n" + #elif defined(__i386__) + // As i386 passes function arguments on the stack, we need to skip a few + // more values before we can get to the saved registers. + "lea 28(%esp), %eax\n" + "mov %eax, 24(%esp)\n" + "jmp playground$sandbox__clone\n" + #else + #error Unsupported target platform + #endif + ".size playground$sandbox_clone, .-playground$sandbox_clone\n" + + + // This is the wrapper which is called by the untrusted code, trying to + // make a system call. + "playground$syscallWrapper:" + ".globl playground$syscallWrapper\n" + ".type playground$syscallWrapper, @function\n" + #if defined(__x86_64__) + // Save all registers + "push %rbp\n" + "mov %rsp, %rbp\n" + "push %rbx\n" + "push %rcx\n" + "push %rdx\n" + "push %rsi\n" + "push %rdi\n" + "push %r8\n" + "push %r9\n" + "push %r10\n" + "push %r11\n" + "push %r12\n" + "push %r13\n" + "push %r14\n" + "push %r15\n" + + // Convert from syscall calling conventions to C calling conventions. + // System calls have a subtly different register ordering than the user- + // space x86-64 ABI. + "mov %r10, %rcx\n" + + // Check range of system call + "cmp playground$maxSyscall(%rip), %eax\n" + "ja 1f\n" + + // Retrieve function call from system call table (c.f. syscall_table.c). + // We have three different types of entries; zero for denied system calls, + // that should be handled by the defaultSystemCallHandler(); minus one + // for unrestricted system calls that need to be forwarded to the trusted + // thread; and function pointers to specific handler functions. 
+ "mov %rax, %r10\n" + "shl $4, %r10\n" + "lea playground$syscallTable(%rip), %r11\n" + "add %r11, %r10\n" + "mov 0(%r10), %r10\n" + + // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise + // jump to fallback handler. + "cmp $1, %r10\n" + "jbe 1f\n" + "call *%r10\n" + "0:" + + // Restore CPU registers, except for %rax which was set by the system call. + "pop %r15\n" + "pop %r14\n" + "pop %r13\n" + "pop %r12\n" + "pop %r11\n" + "pop %r10\n" + "pop %r9\n" + "pop %r8\n" + "pop %rdi\n" + "pop %rsi\n" + "pop %rdx\n" + "pop %rcx\n" + "pop %rbx\n" + "pop %rbp\n" + + // Remove fake return address. This is added in the patching code in + // library.cc and it makes stack traces a little cleaner. + "add $8, %rsp\n" + + // Return to caller + "ret\n" + + "1:" + // If we end up calling a specific handler, we don't need to know the + // system call number. However, in the generic case, we do. Shift + // registers so that the system call number becomes visible as the + // first function argument. + "push %r9\n" + "mov %r8, %r9\n" + "mov %rcx, %r8\n" + "mov %rdx, %rcx\n" + "mov %rsi, %rdx\n" + "mov %rdi, %rsi\n" + "mov %rax, %rdi\n" + + // Call default handler. + "call playground$defaultSystemCallHandler\n" + "pop %r9\n" + "jmp 0b\n" + #elif defined(__i386__) + // Preserve all registers + "push %ebx\n" + "push %ecx\n" + "push %edx\n" + "push %esi\n" + "push %edi\n" + "push %ebp\n" + + // Convert from syscall calling conventions to C calling conventions + "push %ebp\n" + "push %edi\n" + "push %esi\n" + "push %edx\n" + "push %ecx\n" + "push %ebx\n" + "push %eax\n" + + // Check range of system call + "cmp playground$maxSyscall, %eax\n" + "ja 1f\n" + + // Retrieve function call from system call table (c.f. syscall_table.c). 
+ // We have three different types of entries; zero for denied system calls, + // that should be handled by the defaultSystemCallHandler(); minus one + // for unrestricted system calls that need to be forwarded to the trusted + // thread; and function pointers to specific handler functions. + "shl $3, %eax\n" + "lea playground$syscallTable, %ebx\n" + "add %ebx, %eax\n" + "mov 0(%eax), %eax\n" + + // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise + // jump to fallback handler. + "cmp $1, %eax\n" + "jbe 1f\n" + "add $4, %esp\n" + "call *%eax\n" + "add $24, %esp\n" + "0:" + + // Restore CPU registers, except for %eax which was set by the system call. + "pop %ebp\n" + "pop %edi\n" + "pop %esi\n" + "pop %edx\n" + "pop %ecx\n" + "pop %ebx\n" + + // Return to caller + "ret\n" + + "1:" + // Call default handler. + "push $2f\n" + "push $playground$defaultSystemCallHandler\n" + "ret\n" + "2:add $28, %esp\n" + "jmp 0b\n" + + #else + #error Unsupported target platform + #endif + ".size playground$syscallWrapper, .-playground$syscallWrapper\n" + ".popsection\n" +); + + +void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1, + void* arg2, void* arg3, void* arg4, + void* arg5) { + // TODO(markus): The following comment is currently not true, we do intercept these system calls. Try to fix that. + + // We try to avoid intercepting read(), write(), and sigreturn(), as + // these system calls are not restricted in Seccomp mode. But depending on + // the exact instruction sequence in libc, we might not be able to reliably + // filter out these system calls at the time when we instrument the code. 
+ SysCalls sys; + unsigned long rc; + switch (syscallNum) { + case __NR_read: + Debug::syscall(syscallNum, "Allowing unrestricted system call"); + rc = sys.read((long)arg0, arg1, (size_t)arg2); + break; + case __NR_write: + Debug::syscall(syscallNum, "Allowing unrestricted system call"); + rc = sys.write((long)arg0, arg1, (size_t)arg2); + break; + case __NR_rt_sigreturn: + Debug::syscall(syscallNum, "Allowing unrestricted system call"); + rc = sys.rt_sigreturn((unsigned long)arg0); + break; + default: + if (Debug::isEnabled()) { + // In debug mode, prevent stderr from being closed + if (syscallNum == __NR_close && arg0 == (void *)2) + return 0; + } + + if ((unsigned)syscallNum <= maxSyscall && + syscallTable[syscallNum].handler == UNRESTRICTED_SYSCALL) { + Debug::syscall(syscallNum, "Allowing unrestricted system call"); + perform_unrestricted: + struct { + int sysnum; + void* unrestricted_req[6]; + } __attribute__((packed)) request = { + syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } }; + + int thread = threadFdPub(); + void* rc; + if (write(sys, thread, &request, sizeof(request)) != sizeof(request) || + read(sys, thread, &rc, sizeof(rc)) != sizeof(rc)) { + die("Failed to forward unrestricted system call"); + } + return rc; + } else if (Debug::isEnabled()) { + Debug::syscall(syscallNum, + "In production mode, this call would be disallowed"); + goto perform_unrestricted; + } else { + return (void *)-ENOSYS; + } + } + if (rc < 0) { + rc = -sys.my_errno; + } + return (void *)rc; +} + +} // namespace diff --git a/sandbox/linux/seccomp/syscall.h b/sandbox/linux/seccomp/syscall.h new file mode 100644 index 0000000..e4390c2 --- /dev/null +++ b/sandbox/linux/seccomp/syscall.h @@ -0,0 +1,14 @@ +#ifndef SYSCALL_H__ +#define SYSCALL_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +void syscallWrapper() asm("playground$syscallWrapper"); + +#ifdef __cplusplus +} +#endif + +#endif // SYSCALL_H__ diff --git a/sandbox/linux/seccomp/syscall_table.c 
b/sandbox/linux/seccomp/syscall_table.c new file mode 100644 index 0000000..79b281e --- /dev/null +++ b/sandbox/linux/seccomp/syscall_table.c @@ -0,0 +1,118 @@ +#include <asm/unistd.h> +#include "sandbox_impl.h" +#include "syscall_table.h" + +#if defined(__x86_64__) +#ifndef __NR_set_robust_list +#define __NR_set_robust_list 273 +#endif +#ifndef __NR_accept4 +#define __NR_accept4 288 +#endif +#elif defined(__i386__) +#ifndef __NR_set_robust_list +#define __NR_set_robust_list 311 +#endif +#else +#error Unsupported target platform +#endif + +// TODO(markus): This is an incredibly dirty hack to make the syscallTable +// live in r/o memory. +// Unfortunately, gcc doesn't give us a clean option to do +// this. Ultimately, we should probably write some code that +// parses /usr/include/asm/unistd*.h and generates a *.S file. +// But we then need to figure out how to integrate this code +// with our build system. + +const struct SyscallTable syscallTable[] __attribute__(( + section(".rodata, \"a\", @progbits\n#"))) ={ + + #if defined(__NR_accept) + [ __NR_accept ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_accept4 ] = { UNRESTRICTED_SYSCALL, 0 }, + #endif + [ __NR_access ] = { (void*)&sandbox_access, process_access }, + [ __NR_brk ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_clock_gettime ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_clone ] = { (void*)&sandbox_clone, process_clone }, + [ __NR_close ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_epoll_create ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_epoll_ctl ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_epoll_wait ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_exit ] = { (void*)&sandbox_exit, process_exit }, + [ __NR_exit_group ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_fcntl ] = { UNRESTRICTED_SYSCALL, 0 }, + #if defined(__NR_fcntl64) + [ __NR_fcntl64 ] = { UNRESTRICTED_SYSCALL, 0 }, + #endif + [ __NR_fstat ] = { UNRESTRICTED_SYSCALL, 0 }, + #if defined(__NR_fstat64) + [ __NR_fstat64 ] = { UNRESTRICTED_SYSCALL, 0 }, + #endif + [ __NR_futex ] = 
{ UNRESTRICTED_SYSCALL, 0 }, + [ __NR_getdents ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_getdents64 ] = { UNRESTRICTED_SYSCALL, 0 }, + #if defined(__NR_getpeername) + [ __NR_getpeername ] = { UNRESTRICTED_SYSCALL, 0 }, + #endif + [ __NR_getpid ] = { (void*)&sandbox_getpid, 0 }, + #if defined(__NR_getsockname) + [ __NR_getsockname ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_getsockopt ] = { (void*)&sandbox_getsockopt,process_getsockopt }, + #endif + [ __NR_gettid ] = { (void*)&sandbox_gettid, 0 }, + [ __NR_gettimeofday ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_ioctl ] = { (void*)&sandbox_ioctl, process_ioctl }, + #if defined(__NR_ipc) + [ __NR_ipc ] = { (void*)&sandbox_ipc, process_ipc }, + #endif + #if defined(__NR__llseek) + [ __NR__llseek ] = { UNRESTRICTED_SYSCALL, 0 }, + #endif + [ __NR_lseek ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_madvise ] = { (void*)&sandbox_madvise, process_madvise }, + #if defined(__NR_mmap2) + [ __NR_mmap2 ] = + #else + [ __NR_mmap ] = + #endif + { (void*)&sandbox_mmap, process_mmap }, + [ __NR_mprotect ] = { (void*)&sandbox_mprotect, process_mprotect }, + [ __NR_munmap ] = { (void*)&sandbox_munmap, process_munmap }, + [ __NR_open ] = { (void*)&sandbox_open, process_open }, + [ __NR_pipe ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_poll ] = { UNRESTRICTED_SYSCALL, 0 }, + #if defined(__NR_recvfrom) + [ __NR_recvfrom ] = { (void*)&sandbox_recvfrom, process_recvfrom }, + [ __NR_recvmsg ] = { (void*)&sandbox_recvmsg, process_recvmsg }, + [ __NR_sendmsg ] = { (void*)&sandbox_sendmsg, process_sendmsg }, + [ __NR_sendto ] = { (void*)&sandbox_sendto, process_sendto }, + #endif + [ __NR_set_robust_list ] = { UNRESTRICTED_SYSCALL, 0 }, + #if defined(__NR_setsockopt) + [ __NR_setsockopt ] = { (void*)&sandbox_setsockopt,process_setsockopt }, + #if defined(__NR_shmat) + [ __NR_shmat ] = { (void*)&sandbox_shmat, process_shmat }, + [ __NR_shmctl ] = { (void*)&sandbox_shmctl, process_shmctl }, + [ __NR_shmdt ] = { (void*)&sandbox_shmdt, process_shmdt }, + [ 
__NR_shmget ] = { (void*)&sandbox_shmget, process_shmget }, + #endif + [ __NR_shutdown ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_socketpair ] = { UNRESTRICTED_SYSCALL, 0 }, + #endif + #if defined(__NR_socketcall) + [ __NR_socketcall ] = { (void*)&sandbox_socketcall,process_socketcall }, + #endif + [ __NR_stat ] = { (void*)&sandbox_stat, process_stat }, + #if defined(__NR_stat64) + [ __NR_stat64 ] = { (void*)&sandbox_stat64, process_stat }, + #endif + [ __NR_time ] = { UNRESTRICTED_SYSCALL, 0 }, + [ __NR_uname ] = { UNRESTRICTED_SYSCALL, 0 }, +}; +const unsigned maxSyscall __attribute__((section(".rodata"))) = + sizeof(syscallTable)/sizeof(struct SyscallTable); + +const int syscall_mutex_[4096/sizeof(int)] asm("playground$syscall_mutex") + __attribute__((section(".rodata"),aligned(4096))) = { 0x80000000 }; diff --git a/sandbox/linux/seccomp/syscall_table.h b/sandbox/linux/seccomp/syscall_table.h new file mode 100644 index 0000000..d678c0b --- /dev/null +++ b/sandbox/linux/seccomp/syscall_table.h @@ -0,0 +1,30 @@ +#ifndef SYSCALL_TABLE_H__ +#define SYSCALL_TABLE_H__ + +#include <sys/types.h> + +#ifdef __cplusplus +#include "securemem.h" +extern "C" { +namespace playground { +#define SecureMemArgs SecureMem::Args +#else +#define SecureMemArgs void +#define bool int +#endif + #define UNRESTRICTED_SYSCALL ((void *)1) + + struct SyscallTable { + void *handler; + bool (*trustedProcess)(int parentProc, int sandboxFd, int threadFdPub, + int threadFd, SecureMemArgs* mem); + }; + extern const struct SyscallTable syscallTable[] + asm("playground$syscallTable"); + extern const unsigned maxSyscall asm("playground$maxSyscall"); +#ifdef __cplusplus +} // namespace +} +#endif + +#endif // SYSCALL_TABLE_H__ diff --git a/sandbox/linux/seccomp/tls.h b/sandbox/linux/seccomp/tls.h new file mode 100644 index 0000000..8eae697 --- /dev/null +++ b/sandbox/linux/seccomp/tls.h @@ -0,0 +1,151 @@ +#ifndef TLS_H__ +#define TLS_H__ + +#include <asm/ldt.h> +#include <stdlib.h> +#include 
<sys/mman.h> +#include <sys/prctl.h> + +namespace playground { + +class TLS { + private: + class SysCalls { + public: + #define SYS_CPLUSPLUS + #define SYS_ERRNO my_errno + #define SYS_INLINE inline + #define SYS_PREFIX -1 + #undef SYS_LINUX_SYSCALL_SUPPORT_H + #include "linux_syscall_support.h" + SysCalls() : my_errno(0) { } + int my_errno; + }; + + public: + static void *allocateTLS() { + SysCalls sys; + #if defined(__x86_64__) + void *addr = sys.mmap(0, 4096, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (sys.arch_prctl(ARCH_SET_GS, addr) < 0) { + return NULL; + } + #elif defined(__i386__) + void *addr = sys.mmap2(0, 4096, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + struct user_desc u; + u.entry_number = (typeof u.entry_number)-1; + u.base_addr = (int)addr; + u.limit = 0xfffff; + u.seg_32bit = 1; + u.contents = 0; + u.read_exec_only = 0; + u.limit_in_pages = 1; + u.seg_not_present = 0; + u.useable = 1; + if (sys.set_thread_area(&u) < 0) { + return NULL; + } + asm volatile( + "movw %w0, %%fs" + : + : "q"(8*u.entry_number+3)); + #else + #error Unsupported target platform + #endif + return addr; + } + + static void freeTLS() { + SysCalls sys; + void *addr; + #if defined(__x86_64__) + sys.arch_prctl(ARCH_GET_GS, &addr); + #elif defined(__i386__) + struct user_desc u; + sys.get_thread_area(&u); + addr = (void *)u.base_addr; + #else + #error Unsupported target platform + #endif + sys.munmap(addr, 4096); + } + + template<class T> static inline bool setTLSValue(int idx, T val) { + #if defined(__x86_64__) + if (idx < 0 || idx >= 4096/8) { + return false; + } + asm volatile( + "movq %0, %%gs:(%1)\n" + : + : "q"((void *)val), "q"(8ll * idx)); + #elif defined(__i386__) + if (idx < 0 || idx >= 4096/8) { + return false; + } + if (sizeof(T) == 8) { + asm volatile( + "movl %0, %%fs:(%1)\n" + : + : "r"((unsigned)val), "r"(8 * idx)); + asm volatile( + "movl %0, %%fs:(%1)\n" + : + : "r"((unsigned)((unsigned long long)val >> 32)), "r"(8 * idx + 
4)); + } else { + asm volatile( + "movl %0, %%fs:(%1)\n" + : + : "r"(val), "r"(8 * idx)); + } + #else + #error Unsupported target platform + #endif + return true; + } + + template<class T> static inline T getTLSValue(int idx) { + #if defined(__x86_64__) + long long rc; + if (idx < 0 || idx >= 4096/8) { + return 0; + } + asm volatile( + "movq %%gs:(%1), %0\n" + : "=q"(rc) + : "q"(8ll * idx)); + return (T)rc; + #elif defined(__i386__) + if (idx < 0 || idx >= 4096/8) { + return 0; + } + if (sizeof(T) == 8) { + unsigned lo, hi; + asm volatile( + "movl %%fs:(%1), %0\n" + : "=r"(lo) + : "r"(8 * idx)); + asm volatile( + "movl %%fs:(%1), %0\n" + : "=r"(hi) + : "r"(8 * idx + 4)); + return (T)((unsigned long long)lo + ((unsigned long long)hi << 32)); + } else { + long rc; + asm volatile( + "movl %%fs:(%1), %0\n" + : "=r"(rc) + : "r"(8 * idx)); + return (T)rc; + } + #else + #error Unsupported target platform + #endif + } + +}; + +} // namespace +#endif diff --git a/sandbox/linux/seccomp/trusted_process.cc b/sandbox/linux/seccomp/trusted_process.cc new file mode 100644 index 0000000..b4bee94 --- /dev/null +++ b/sandbox/linux/seccomp/trusted_process.cc @@ -0,0 +1,258 @@ +#include <dirent.h> +#include <map> + +#include "debug.h" +#include "sandbox_impl.h" +#include "syscall_table.h" + +namespace playground { + +struct Thread { + int fdPub, fd; + SecureMem::Args* mem; +}; + +SecureMem::Args* Sandbox::getSecureMem() { + if (!secureMemPool_.empty()) { + SecureMem::Args* rc = secureMemPool_.back(); + secureMemPool_.pop_back(); + return rc; + } + return NULL; +} + +void Sandbox::trustedProcess(int parentProc, int processFdPub, int sandboxFd, + int cloneFd, SecureMem::Args* secureArena) { + std::map<long long, struct Thread> threads; + SysCalls sys; + long long cookie = 0; + + // The very first entry in the secure memory arena has been assigned to the + // initial thread. The remaining entries are available for allocation. 
+ SecureMem::Args* startAddress = secureArena; + SecureMem::Args* nextThread = startAddress; + for (int i = 0; i < kMaxThreads-1; i++) { + secureMemPool_.push_back(++startAddress); + } + +newThreadCreated: + // Receive information from newly created thread + Thread *newThread = &threads[++cookie]; + memset(newThread, 0, sizeof(Thread)); + struct { + SecureMem::Args* self; + int tid; + int fdPub; + } __attribute__((packed)) data; + + size_t dataLen = sizeof(data); + if (!getFd(cloneFd, &newThread->fdPub, &newThread->fd, &data, &dataLen) || + dataLen != sizeof(data)) { + // We get here either because the sandbox got corrupted, or because our + // parent process has terminated. + if (newThread->fdPub || dataLen) { + die("Failed to receive new thread information"); + } + die(); + } + if (data.self != nextThread) { + // The only potentially security critical information received from the + // newly created thread is "self". The "tid" is for informational purposes + // (and for use in the new thread's TLS), and "fdPub" is uncritical as all + // file descriptors are considered untrusted. + // Thus, we only use "self" for a sanity check, but don't actually trust + // it beyond that. + die("Received corrupted thread information"); + } + newThread->mem = nextThread; + + // Set up TLS area and let thread know that the data is now ready + nextThread->cookie = cookie; + nextThread->threadId = data.tid; + nextThread->threadFdPub = data.fdPub; + write(sys, newThread->fd, "", 1); + + // Dispatch system calls that have been forwarded from the trusted thread(s). 
+ for (;;) { + struct { + unsigned int sysnum; + long long cookie; + } __attribute__((packed)) header; + + int rc; + if ((rc = read(sys, sandboxFd, &header, sizeof(header))) !=sizeof(header)){ + if (rc) { + die("Failed to read system call number and thread id"); + } + die(); + } + std::map<long long, struct Thread>::iterator iter = + threads.find(header.cookie); + if (iter == threads.end()) { + die("Received request from unknown thread"); + } + struct Thread* currentThread = &iter->second; + if (header.sysnum > maxSyscall || + !syscallTable[header.sysnum].trustedProcess) { + die("Trusted process encountered unexpected system call"); + } + + // Dispatch system call to handler function. Treat both exit() and clone() + // specially. + if (syscallTable[header.sysnum].trustedProcess(parentProc, + sandboxFd, + currentThread->fdPub, + currentThread->fd, + currentThread->mem) && + header.sysnum == __NR_clone) { + nextThread = currentThread->mem->newSecureMem; + goto newThreadCreated; + } else if (header.sysnum == __NR_exit) { + NOINTR_SYS(sys.close(iter->second.fdPub)); + NOINTR_SYS(sys.close(iter->second.fd)); + SecureMem::Args* secureMem = currentThread->mem; + threads.erase(iter); + secureMemPool_.push_back(secureMem); + } + } +} + +void Sandbox::initializeProtectedMap(int fd) { + int mapsFd; + if (!getFd(fd, &mapsFd, NULL, NULL, NULL)) { + maps_failure: + die("Cannot access /proc/self/maps"); + } + + // Read the memory mappings as they were before the sandbox takes effect. + // These mappings cannot be changed by the sandboxed process. 
+ char line[80]; + FILE *fp = fdopen(mapsFd, "r"); + for (bool truncated = false;;) { + if (fgets(line, sizeof(line), fp) == NULL) { + if (feof(fp) || errno != EINTR) { + break; + } + continue; + } + if (!truncated) { + unsigned long start, stop; + char *ptr = line; + errno = 0; + start = strtoul(ptr, &ptr, 16); + if (errno || *ptr++ != '-') { + parse_failure: + die("Failed to parse /proc/self/maps"); + } + stop = strtoul(ptr, &ptr, 16); + if (errno || *ptr++ != ' ') { + goto parse_failure; + } + protectedMap_[reinterpret_cast<void *>(start)] = stop - start; + } + truncated = strchr(line, '\n') == NULL; + } + SysCalls sys; + NOINTR_SYS(sys.close(mapsFd)); + + // Prevent low address memory allocations. Some buggy kernels allow those + if (protectedMap_[0] < (64 << 10)) { + protectedMap_[0] = 64 << 10; + } + + // Let the sandbox know that we are done parsing the memory map. + if (write(sys, fd, &mapsFd, sizeof(mapsFd)) != sizeof(mapsFd)) { + goto maps_failure; + } +} + +SecureMem::Args* Sandbox::createTrustedProcess(int processFdPub, int sandboxFd, + int cloneFdPub, int cloneFd) { + // Allocate memory that will be used by an arena for storing the secure + // memory. While we allow this memory area to be empty at times (e.g. when + // not all threads are in use), we make sure that it never gets overwritten + // by user-allocated memory. This happens in initializeProtectedMap() and + // snapshotMemoryMappings(). 
+ SecureMem::Args* secureArena = reinterpret_cast<SecureMem::Args*>( + mmap(NULL, 8192*kMaxThreads, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS, -1, 0)); + if (secureArena == MAP_FAILED) { + die("Failed to allocate secure memory arena"); + } + + // Set up the mutex to be accessible from the trusted process and from + // children of the trusted thread(s) + if (mmap(&syscall_mutex_, 4096, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, -1, 0) != &syscall_mutex_) { + die("Failed to initialize secure mutex"); + } + syscall_mutex_ = 0x80000000; + + + // Hold on to a file handle in the parent's process directory. We can use + // this later to reliably tell if the parent died. + int parentProc = open("/proc/self/", O_RDONLY|O_DIRECTORY); + if (parentProc < 0) { + die("Failed to access /proc/self"); + } + + // Create a trusted process that can evaluate system call parameters and + // decide whether a system call should execute. This process runs outside of + // the seccomp sandbox. It communicates with the sandbox'd process through + // a socketpair() and through securely shared memory. + pid_t pid = fork(); + if (pid < 0) { + die("Failed to create trusted process"); + } + if (!pid) { + // Close all file handles except for sandboxFd, cloneFd, and stdio + DIR *dir = opendir("/proc/self/fd"); + if (dir == 0) { + // If we don't know the list of our open file handles, just try closing + // all valid ones. 
+ for (int fd = sysconf(_SC_OPEN_MAX); --fd > 2; ) { + if (fd != parentProc && fd != sandboxFd && fd != cloneFd) { + close(fd); + } + } + } else { + // If available, if is much more efficient to just close the file + // handles that show up in /proc/self/fd/ + struct dirent de, *res; + while (!readdir_r(dir, &de, &res) && res) { + if (res->d_name[0] < '0') + continue; + int fd = atoi(res->d_name); + if (fd > 2 && + fd != parentProc && fd != sandboxFd && fd != cloneFd && + fd != dirfd(dir)) { + close(fd); + } + } + closedir(dir); + } + + // Initialize secure memory used for threads + for (int i = 0; i < kMaxThreads; i++) { + SecureMem::Args* args = secureArena + i; + args->self = args; + #ifndef NDEBUG + args->allowAllSystemCalls= Debug::isEnabled(); + #endif + } + + initializeProtectedMap(sandboxFd); + trustedProcess(parentProc, processFdPub, sandboxFd, cloneFd, secureArena); + die(); + } + + // We are still in the untrusted code. Deny access to restricted resources. + mprotect(secureArena, 8192*kMaxThreads, PROT_NONE); + mprotect(&syscall_mutex_, 4096, PROT_NONE); + close(parentProc); + close(sandboxFd); + + return secureArena; +} + +} // namespace diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc new file mode 100644 index 0000000..985d053 --- /dev/null +++ b/sandbox/linux/seccomp/trusted_thread.cc @@ -0,0 +1,1207 @@ +#include "sandbox_impl.h" +#include "syscall_table.h" + +namespace playground { + +void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, + SecureMem::Args* secureMem) { + SecureMem::Args args = { { { { { 0 } } } } }; + args.self = &args; + args.newSecureMem = secureMem; + args.processFdPub = processFdPub; + args.cloneFdPub = cloneFdPub; +#if defined(__x86_64__) + asm volatile( + "push %%rbx\n" + "push %%rbp\n" + "mov %0, %%rbp\n" // %rbp = args + "xor %%rbx, %%rbx\n" // initial sequence number + "lea 999f(%%rip), %%r15\n" // continue in same thread + "jmp 19f\n" // create trusted thread + + 
// TODO(markus): Coalesce the read() operations by reading into a bigger + // buffer. + + // Parameters: + // *%fs: secure memory region + // the page following this one contains the scratch space + // %r13: thread's side of threadFd + // %r15: processFdPub + + // Local variables: + // %rbx: sequence number for trusted calls + + // Temporary variables: + // %r9: system call number + // %rbp: secure memory of previous thread + + // Layout of secure shared memory region (c.f. securemem.h): + // 0x00: pointer to the secure shared memory region (i.e. self) + // 0x08: sequence number; must match %rbx + // 0x10: system call number; passed to syscall in %rax + // 0x18: first argument; passed to syscall in %rdi + // 0x20: second argument; passed to syscall in %rsi + // 0x28: third argument; passed to syscall in %rdx + // 0x30: fourth argument; passed to syscall in %r10 + // 0x38: fifth argument; passed to syscall in %r8 + // 0x40: sixth argument; passed to syscall in %r9 + // 0x48: stored return address for clone() system call + // 0x50: stored %rbp value for clone() system call + // 0x58: stored %rbx value for clone() system call + // 0x60: stored %rcx value for clone() system call + // 0x68: stored %rdx value for clone() system call + // 0x70: stored %rsi value for clone() system call + // 0x78: stored %rdi value for clone() system call + // 0x80: stored %r8 value for clone() system call + // 0x88: stored %r9 value for clone() system call + // 0x90: stored %r10 value for clone() system call + // 0x98: stored %r11 value for clone() system call + // 0xA0: stored %r12 value for clone() system call + // 0xA8: stored %r13 value for clone() system call + // 0xB0: stored %r14 value for clone() system call + // 0xB8: stored %r15 value for clone() system call + // 0xC0: new shared memory for clone() + // 0xC8: processFdPub for talking to trusted process + // 0xCC: cloneFdPub for talking to trusted process + // 0xD0: set to non-zero, if in debugging mode + // 0xD4: most recent SHM 
id returned by shmget(IPC_PRIVATE) + // 0xD8: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0xE0: thread id (TLS_TID) + // 0xE8: threadFdPub (TLS_THREAD_FD) + // 0x200-0x1000: securely passed verified file name(s) + + // Layout of (untrusted) scratch space: + // 0x00: syscall number; passed in %rax + // 0x04: first argument; passed in %rdi + // 0x0C: second argument; passed in %rsi + // 0x14: third argument; passed in %rdx + // 0x1C: fourth argument; passed in %r10 + // 0x24: fifth argument; passed in %r8 + // 0x2C: sixth argument; passed in %r9 + // 0x34: return value + // 0x3C: RDTSCP result (%eax) + // 0x40: RDTSCP result (%edx) + // 0x44: RDTSCP result (%ecx) + + // We use the %fs register for accessing the secure read-only page, and + // the untrusted scratch space immediately following it. The segment + // register and the local descriptor table are set up by passing + // appropriate arguments to clone(). + + "0:xor %%rsp, %%rsp\n" + "mov $2, %%ebx\n" // %rbx = initial sequence number + + // Read request from untrusted thread, or from trusted process. In either + // case, the data that we read has to be considered untrusted. + // read(threadFd, &scratch, 4) + "1:xor %%rax, %%rax\n" // NR_read + "mov %%r13, %%rdi\n" // fd = threadFd + "mov %%fs:0x0, %%rsi\n" + "add $0x1000, %%rsi\n" // buf = &scratch + "mov $4, %%edx\n" // len = 4 + "2:syscall\n" + "cmp $-4, %%rax\n" // EINTR + "jz 2b\n" + "cmp %%rdx, %%rax\n" + "jnz 25f\n" // exit process + + // Retrieve system call number. It is crucial that we only dereference + // %fs:0x1000 exactly once. Afterwards, memory becomes untrusted and + // we must use the value that we have read the first time. 
+ "mov 0(%%rsi), %%eax\n" + + // If syscall number is -1, execute an unlocked system call from the + // secure memory area + "cmp $-1, %%eax\n" + "jnz 5f\n" + "3:cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + "mov %%fs:0x10, %%rax\n" + "mov %%fs:0x18, %%rdi\n" + "mov %%fs:0x20, %%rsi\n" + "mov %%fs:0x28, %%rdx\n" + "mov %%fs:0x30, %%r10\n" + "mov %%fs:0x38, %%r8\n" + "mov %%fs:0x40, %%r9\n" + "cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + "add $2, %%rbx\n" + + // shmget() gets some special treatment. Whenever we return from this + // system call, we remember the most recently returned SysV shm id. + "cmp $29, %%eax\n" // NR_shmget + "jnz 4f\n" + "syscall\n" + "mov %%rax, %%r8\n" + "mov $56, %%eax\n" // NR_clone + "mov $17, %%edi\n" // flags = SIGCHLD + "mov $1, %%esi\n" // stack = 1 + "syscall\n" + "test %%rax, %%rax\n" + "js 25f\n" // exit process + "mov %%rax, %%rdi\n" + "jnz 7f\n" // wait for child, then return result + "mov %%fs:0x0, %%rdi\n" // start = secure_mem + "mov $4096, %%esi\n" // len = 4096 + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "mov $10, %%eax\n" // NR_mprotect + "syscall\n" + "mov %%r8d, 0xD4(%%rdi)\n" // set most recently returned SysV shm id + "xor %%rdi, %%rdi\n" + "jmp 26f\n" // exit program, no message + "4:syscall\n" + "jmp 14f\n" // return result + + // If syscall number is -2, execute locked system call from the + // secure memory area + "5:jg 11f\n" + "cmp $-2, %%eax\n" + "jnz 8f\n" + "cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + "mov %%fs:0x10, %%rax\n" + "mov %%fs:0x18, %%rdi\n" + "mov %%fs:0x20, %%rsi\n" + "mov %%fs:0x28, %%rdx\n" + "mov %%fs:0x30, %%r10\n" + "mov %%fs:0x38, %%r8\n" + "mov %%fs:0x40, %%r9\n" + "cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + + // clone() has unusual calling conventions and must be handled specially + "cmp $56, %%rax\n" // NR_clone + "jz 18f\n" + + // exit() terminates trusted thread + "cmp $60, %%eax\n" // NR_exit + "jz 17f\n" + + // Perform requested 
system call + "syscall\n" + + // Unlock mutex + "6:cmp %%rbx, %%fs:0x8\n" + "jne 25f\n" // exit process + "add $2, %%rbx\n" + "mov %%rax, %%r8\n" + "mov $56, %%eax\n" // NR_clone + "mov $17, %%rdi\n" // flags = SIGCHLD + "mov $1, %%rsi\n" // stack = 1 + "syscall\n" + "test %%rax, %%rax\n" + "js 25f\n" // exit process + "jz 22f\n" // unlock and exit + "mov %%rax, %%rdi\n" + "7:xor %%rsi, %%rsi\n" + "xor %%rdx, %%rdx\n" + "xor %%r10, %%r10\n" + "mov $61, %%eax\n" // NR_wait4 + "syscall\n" + "cmp $-4, %%eax\n" // EINTR + "jz 7b\n" + "mov %%r8, %%rax\n" + "jmp 14f\n" // return result + + // If syscall number is -3, read the time stamp counter + "8:cmp $-3, %%eax\n" + "jnz 9f\n" + "rdtsc\n" // sets %edx:%eax + "xor %%rcx, %%rcx\n" + "jmp 10f\n" + "9:cmp $-4, %%eax\n" + "jnz 11f\n" + "rdtscp\n" // sets %edx:%eax and %ecx + "10:add $0x3C, %%rsi\n" + "mov %%eax, 0(%%rsi)\n" + "mov %%edx, 4(%%rsi)\n" + "mov %%ecx, 8(%%rsi)\n" + "mov $12, %%edx\n" + "jmp 15f\n" // return result + + // Check in syscallTable whether this system call is unrestricted + "11:mov %%rax, %%r9\n" + #ifndef NDEBUG + "cmpw $0, %%fs:0xD0\n" // debug mode + "jnz 12f\n" + #endif + "cmp playground$maxSyscall(%%rip), %%eax\n" + "ja 25f\n" // exit process + "shl $4, %%rax\n" + "lea playground$syscallTable(%%rip), %%rdi\n" + "add %%rdi, %%rax\n" + "mov 0(%%rax), %%rax\n" + "cmp $1, %%rax\n" + "jne 25f\n" // exit process + + // Default behavior for unrestricted system calls is to just execute + // them. Read the remaining arguments first. 
+ "12:mov %%rsi, %%r8\n" + "xor %%rax, %%rax\n" // NR_read + "mov %%r13, %%rdi\n" // fd = threadFd + "add $4, %%rsi\n" // buf = &scratch + 4 + "mov $48, %%edx\n" // len = 6*sizeof(void *) + "13:syscall\n" + "cmp $-4, %%rax\n" // EINTR + "jz 13b\n" + "cmp %%rdx, %%rax\n" + "jnz 25f\n" // exit process + "mov %%r9, %%rax\n" + "mov 0x04(%%r8), %%rdi\n" + "mov 0x0C(%%r8), %%rsi\n" + "mov 0x14(%%r8), %%rdx\n" + "mov 0x1C(%%r8), %%r10\n" + "mov 0x2C(%%r8), %%r9\n" + "mov 0x24(%%r8), %%r8\n" + "cmp $231, %%rax\n" // NR_exit_group + "jz 26f\n" // exit program, no message + "syscall\n" + + // Return result of system call to sandboxed thread + "14:mov %%fs:0x0, %%rsi\n" + "add $0x1034, %%rsi\n" // buf = &scratch + 52 + "mov %%rax, (%%rsi)\n" + "mov $8, %%edx\n" // len = 8 + "15:mov %%r13, %%rdi\n" // fd = threadFd + "mov $1, %%eax\n" // NR_write + "16:syscall\n" + "cmp %%rdx, %%rax\n" + "jz 1b\n" + "cmp $-4, %%rax\n" // EINTR + "jz 16b\n" + "jmp 25f\n" // exit process + + // NR_exit: + // Exit trusted thread after cleaning up resources + "17:mov %%fs:0x0, %%rsi\n" + "mov 0xE8(%%rsi), %%rdi\n" // fd = threadFdPub + "mov $3, %%eax\n" // NR_close + "syscall\n" + "mov %%rsi, %%rdi\n" // start = secure_mem + "mov $8192, %%esi\n" // length = 8192 + "xor %%rdx, %%rdx\n" // prot = PROT_NONE + "mov $10, %%eax\n" // NR_mprotect + "syscall\n" + "mov %%r13, %%rdi\n" // fd = threadFd + "mov $3, %%eax\n" // NR_close + "syscall\n" + "mov $56, %%eax\n" // NR_clone + "mov $17, %%rdi\n" // flags = SIGCHLD + "mov $1, %%rsi\n" // stack = 1 + "syscall\n" + "mov %%rax, %%rdi\n" + "test %%rax, %%rax\n" + "jne 21f\n" // reap helper, exit thread + "jmp 22f\n" // unlock mutex + + // NR_clone: + // Original trusted thread calls clone() to create new nascent + // thread. This thread is (typically) fully privileged and shares all + // resources with the caller (i.e. the previous trusted thread), + // and by extension it shares all resources with the sandbox'd + // threads. + // N.B. 
It is possible to make the thread creation code crash before + // it releases seccomp privileges. This is generally OK, as it just + // terminates the program. But if we ever support signal handling, + // we have to be careful that the user cannot install a SIGSEGV + // handler that gets executed with elevated privileges. + "18:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem + "syscall\n" // calls NR_clone + "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values + "jae 6b\n" + "add $2, %%rbx\n" + "test %%rax, %%rax\n" + "jne 14b\n" // return result + + // In nascent thread, now. + "sub $2, %%rbx\n" + "xor %%r15, %%r15\n" // Request to return from clone() when done + + // Get thread id of nascent thread + "19:mov $186, %%eax\n" // NR_gettid + "syscall\n" + "mov %%rax, %%r14\n" + + // Nascent thread creates socketpair() for sending requests to + // trusted thread. + // We can create the filehandles on the stack. Filehandles are + // always treated as untrusted. + // socketpair(AF_UNIX, SOCK_STREAM, 0, fds) + "push %%r15\n" + "mov $53, %%eax\n" // NR_socketpair + "mov $1, %%edi\n" // domain = AF_UNIX + "mov $1, %%esi\n" // type = SOCK_STREAM + "xor %%rdx, %%rdx\n" // protocol = 0 + "sub $8, %%rsp\n" // sv = %rsp + "mov %%rsp, %%r10\n" + "syscall\n" + "test %%rax, %%rax\n" + "jz 27f\n" + + // If things went wrong, we don't have an (easy) way of signaling + // the parent. For our purposes, it is sufficient to fail with a + // fatal error. 
+ "jmp 25f\n" // exit process + "20:mov $56, %%eax\n" // NR_clone + "mov $17, %%rdi\n" // flags = SIGCHLD + "mov $1, %%rsi\n" // stack = 1 + "syscall\n" + "test %%rax, %%rax\n" + "js 25f\n" // exit process + "jz 22f\n" // unlock and exit + "mov %%rax, %%rdi\n" + "21:xor %%rsi, %%rsi\n" + "xor %%rdx, %%rdx\n" + "xor %%r10, %%r10\n" + "mov $61, %%eax\n" // NR_wait4 + "syscall\n" + "cmp $-4, %%eax\n" // EINTR + "jz 21b\n" + "jmp 23f\n" // exit thread (no message) + "22:lea playground$syscall_mutex(%%rip), %%rdi\n" + "mov $4096, %%esi\n" + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "mov $10, %%eax\n" // NR_mprotect + "syscall\n" + "lock; addl $0x80000000, (%%rdi)\n" + "jz 23f\n" // exit thread + "mov $1, %%edx\n" + "mov %%rdx, %%rsi\n" // FUTEX_WAKE + "mov $202, %%eax\n" // NR_futex + "syscall\n" + "23:mov $60, %%eax\n" // NR_exit + "mov $1, %%edi\n" // status = 1 + "24:syscall\n" + "25:mov $1, %%eax\n" // NR_write + "mov $2, %%edi\n" // fd = stderr + "lea 100f(%%rip), %%rsi\n" + "mov $101f-100f, %%edx\n" // len = strlen(msg) + "syscall\n" + "mov $1, %%edi\n" + "26:mov $231, %%eax\n" // NR_exit_group + "jmp 24b\n" + + // The first page is mapped read-only for use as securely shared memory + "27:mov 0xC0(%%rbp), %%r12\n" // %r12 = secure shared memory + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process + "mov $10, %%eax\n" // NR_mprotect + "mov %%r12, %%rdi\n" // addr = secure_mem + "mov $4096, %%esi\n" // len = 4096 + "mov $1, %%edx\n" // prot = PROT_READ + "syscall\n" + + // The second page is used as scratch space by the trusted thread. + // Make it writable. + "mov $10, %%eax\n" // NR_mprotect + "add $4096, %%rdi\n" // addr = secure_mem + 4096 + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "syscall\n" + + // Call clone() to create new trusted thread(). 
+ // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| + // CLONE_SYSVSEM|CLONE_UNTRACED|CLONE_SETTLS, stack, NULL, NULL, + // tls) + "mov 4(%%rsp), %%r13d\n" // %r13 = threadFd + "mov $56, %%eax\n" // NR_clone + "mov $0x8D0F00, %%edi\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR|TLS + "mov $1, %%rsi\n" // stack = 1 + "mov %%r12, %%r8\n" // tls = new_secure_mem + "mov 0xC8(%%rbp), %%r15d\n" // %r15 = processFdPub + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process + "syscall\n" + "test %%rax, %%rax\n" + "js 25b\n" // exit process + "jz 0b\n" // invoke trustedThreadFnc() + + // Done creating trusted thread. We can now get ready to return to caller + "mov 0(%%rsp), %%r9d\n" // %r9 = threadFdPub + "add $8, %%rsp\n" + + // Set up thread local storage with information on how to talk to + // trusted thread and trusted process. + "lea 0xD8(%%r12), %%rsi\n" // args = &secure_mem.TLS; + "mov $158, %%eax\n" // NR_arch_prctl + "mov $0x1001, %%edi\n" // option = ARCH_SET_GS + "syscall\n" + "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values + "jae 20b\n" // exit thread, unlock global mutex + + // Check whether this is the initial thread, or a newly created one. + // At startup we run the same code as when we create a new thread. At + // the very top of this function, you will find that we push 999(%rip) + // on the stack. That is the signal that we should return on the same + // stack rather than return to where clone was called. + "pop %%r15\n" + "test %%r15, %%r15\n" + "jne 28f\n" + + // Returning from clone() into the newly created thread is special. We + // cannot unroll the stack, as we just set up a new stack for this + // thread. We have to explicitly restore CPU registers to the values + // that they had when the program originally called clone(). 
+ "sub $0x80, %%rsp\n" // redzone compensation + "mov 0x48(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x50(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x58(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x60(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x68(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x70(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x78(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x80(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x88(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x90(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0x98(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0xA0(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0xA8(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0xB0(%%rbp), %%rax\n" + "push %%rax\n" + "mov 0xB8(%%rbp), %%rax\n" + "push %%rax\n" + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process + + // Nascent thread launches a helper that doesn't share any of our + // resources, except for pages mapped as MAP_SHARED. + // clone(0, %rsp) + "28:mov $56, %%eax\n" // NR_clone + "mov $17, %%rdi\n" // flags = SIGCHLD + "mov %%rsp, %%rsi\n" // stack = %rsp + "syscall\n" + "test %%rax, %%rax\n" + "js 25b\n" // exit process + "jne 29f\n" + + // Use sendmsg() to send to the trusted process the file handles for + // communicating with the new trusted thread. We also send the address + // of the secure memory area (for sanity checks) and the thread id. + "mov 0xCC(%%rbp), %%edi\n" // transport = Sandbox::cloneFdPub() + "cmp %%rbx, 8(%%rbp)\n" + "jne 25b\n" // exit process + "mov %%r9, %%rsi\n" // fd0 = threadFdPub + "mov %%r13, %%rdx\n" // fd1 = threadFd + "push %%r14\n" // threadId + "mov %%esi, 4(%%rsp)\n" // threadFdPub + "push %%r12\n" // secure_mem + "mov %%rsp, %%rcx\n" // buf = &data + "mov $16, %%r8\n" // len = sizeof(void*) + 2*sizeof(int) + "call playground$sendFd\n" + + // Release syscall_mutex_. This signals the trusted process that + // it can write into the original thread's secure memory again. 
+ "mov $10, %%eax\n" // NR_mprotect + "lea playground$syscall_mutex(%%rip), %%rdi\n" + "mov $4096, %%esi\n" + "mov $3, %%edx\n" // PROT_READ | PROT_WRITE + "syscall\n" + "lock; addl $0x80000000, (%%rdi)\n" + "jz 26b\n" // exit process (no error message) + "mov $1, %%edx\n" + "mov %%rdx, %%rsi\n" // FUTEX_WAKE + "mov $202, %%eax\n" // NR_futex + "syscall\n" + "jmp 26b\n" // exit process (no error message) + + // Reap helper + "29:mov %%rax, %%rdi\n" + "30:xor %%rsi, %%rsi\n" + "xor %%rdx, %%rdx\n" + "xor %%r10, %%r10\n" + "mov $61, %%eax\n" // NR_wait4 + "syscall\n" + "cmp $-4, %%eax\n" // EINTR + "jz 30\n" + + // Release privileges by entering seccomp mode. + "mov $157, %%eax\n" // NR_prctl + "mov $22, %%edi\n" // PR_SET_SECCOMP + "mov $1, %%esi\n" + "syscall\n" + "test %%rax, %%rax\n" + "jnz 25b\n" // exit process + + // Back in the newly created sandboxed thread, wait for trusted process + // to receive request. It is possible for an attacker to make us + // continue even before the trusted process is done. This is OK. It'll + // result in us putting stale values into the new thread's TLS. But that + // data is considered untrusted anyway. + "push %%rax\n" + "mov $1, %%edx\n" // len = 1 + "mov %%rsp, %%rsi\n" // buf = %rsp + "mov %%r9, %%rdi\n" // fd = threadFdPub + "31:xor %%rax, %%rax\n" // NR_read + "syscall\n" + "cmp $-4, %%rax\n" // EINTR + "jz 31b\n" + "cmp %%rdx, %%rax\n" + "jne 25b\n" // exit process + "pop %%rax\n" + + // Return to caller. We are in the new thread, now. 
+ "xor %%rax, %%rax\n" + "test %%r15, %%r15\n" + + // Returning to createTrustedThread() + "jz 32f\n" + "jmp *%%r15\n" + + // Returning to the place where clone() had been called + "32:pop %%r15\n" + "pop %%r14\n" + "pop %%r13\n" + "pop %%r12\n" + "pop %%r11\n" + "pop %%r10\n" + "pop %%r9\n" + "pop %%r8\n" + "pop %%rdi\n" + "pop %%rsi\n" + "pop %%rdx\n" + "pop %%rcx\n" + "pop %%rbx\n" + "pop %%rbp\n" + "ret\n" + + ".pushsection \".rodata\"\n" + "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" + "101:\n" + ".popsection\n" + + "999:pop %%rbp\n" + "pop %%rbx\n" + : + : "g"(&args) + : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15" +#elif defined(__i386__) + struct user_desc u; + u.entry_number = (typeof u.entry_number)-1; + u.base_addr = NULL; + u.limit = 0xfffff; + u.seg_32bit = 1; + u.contents = 0; + u.read_exec_only = 0; + u.limit_in_pages = 1; + u.seg_not_present = 0; + u.useable = 1; + SysCalls sys; + if (sys.set_thread_area(&u) < 0) { + die("Cannot set up thread local storage"); + } + asm volatile("movw %w0, %%fs" + : + : "q"(8*u.entry_number+3)); + asm volatile( + "push %%ebx\n" + "push %%ebp\n" + "movd %0, %%mm6\n" // %mm6 = args + "lea 999f, %%ebx\n" // continue in same thread + "movd %%ebx, %%mm3\n" + "xor %%ebx, %%ebx\n" // initial sequence number + "movd %%ebx, %%mm2\n" + "jmp 19f\n" // create trusted thread + + // TODO(markus): Coalesce the read() operations by reading into a bigger + // buffer. 
+ + // Parameters: + // %mm5: secure memory region + // the page following this one contains the scratch space + // %mm0: thread's side of threadFd + // %mm1: processFdPub + // %mm3: return address after creation of new trusted thread + + // Local variables: + // %mm2: sequence number for trusted calls + // %mm4: thread id + + // Temporary variables: + // %ebp: system call number + // %mm6: secure memory of previous thread + // %mm7: temporary variable for spilling data + + // Layout of secure shared memory region (c.f. securemem.h): + // 0x00: pointer to the secure shared memory region (i.e. self) + // 0x04: sequence number; must match %mm2 + // 0x08: system call number; passed to syscall in %eax + // 0x0C: first argument; passed to syscall in %ebx + // 0x10: second argument; passed to syscall in %ecx + // 0x14: third argument; passed to syscall in %edx + // 0x18: fourth argument; passed to syscall in %esi + // 0x1C: fifth argument; passed to syscall in %edi + // 0x20: sixth argument; passed to syscall in %ebp + // 0x24: stored return address for clone() system call + // 0x28: second stored return address for clone() system call + // 0x2C: stored %ebp value for clone() system call + // 0x30: stored %edi value for clone() system call + // 0x34: stored %esi value for clone() system call + // 0x38: stored %edx value for clone() system call + // 0x3C: stored %ecx value for clone() system call + // 0x40: stored %ebx value for clone() system call + // 0x44: new shared memory for clone() + // 0x48: processFdPub for talking to trusted process + // 0x4C: cloneFdPub for talking to trusted process + // 0x50: set to non-zero, if in debugging mode + // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE) + // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0x60: thread id (TLS_TID) + // 0x68: threadFdPub (TLS_THREAD_FD) + // 0x200-0x1000: securely passed verified file name(s) + + // Layout of (untrusted) scratch space: + // 0x00: syscall number; passed 
in %eax + // 0x04: first argument; passed in %ebx + // 0x08: second argument; passed in %ecx + // 0x0C: third argument; passed in %edx + // 0x10: fourth argument; passed in %esi + // 0x14: fifth argument; passed in %edi + // 0x18: sixth argument; passed in %ebp + // 0x1C: return value + // 0x20: RDTSCP result (%eax) + // 0x24: RDTSCP result (%edx) + // 0x28: RDTSCP result (%ecx) + + "0:xor %%esp, %%esp\n" + "mov $2, %%eax\n" // %mm2 = initial sequence number + "movd %%eax, %%mm2\n" + + // Read request from untrusted thread, or from trusted process. In either + // case, the data that we read has to be considered untrusted. + // read(threadFd, &scratch, 4) + "1:mov $3, %%eax\n" // NR_read + "movd %%mm0, %%ebx\n" // fd = threadFd + "movd %%mm5, %%ecx\n" + "add $0x1000, %%ecx\n" // buf = &scratch + "mov $4, %%edx\n" // len = 4 + "2:int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 2b\n" + "cmp %%edx, %%eax\n" + "jnz 25f\n" // exit process + + // Retrieve system call number. It is crucial that we only dereference + // 0x1000(%mm5) exactly once. Afterwards, memory becomes untrusted and + // we must use the value that we have read the first time. + "mov 0(%%ecx), %%eax\n" + + // If syscall number is -1, execute an unlocked system call from the + // secure memory area + "cmp $-1, %%eax\n" + "jnz 5f\n" + "3:movd %%mm2, %%ebp\n" + "cmp %%ebp, 0x4-0x1000(%%ecx)\n" + "jne 25f\n" // exit process + "mov 0x08-0x1000(%%ecx), %%eax\n" + "mov 0x0C-0x1000(%%ecx), %%ebx\n" + "mov 0x14-0x1000(%%ecx), %%edx\n" + "mov 0x18-0x1000(%%ecx), %%esi\n" + "mov 0x1C-0x1000(%%ecx), %%edi\n" + "mov 0x20-0x1000(%%ecx), %%ebp\n" + "mov 0x10-0x1000(%%ecx), %%ecx\n" + "movd %%edi, %%mm4\n" + "movd %%ebp, %%mm7\n" + "movd %%mm2, %%ebp\n" + "movd %%mm5, %%edi\n" + "cmp %%ebp, 4(%%edi)\n" + "jne 25f\n" // exit process + "add $2, %%ebp\n" + "movd %%ebp, %%mm2\n" + "movd %%mm4, %%edi\n" + "movd %%mm7, %%ebp\n" + + // shmget() gets some special treatment. 
Whenever we return from this + // system call, we remember the most recently returned SysV shm id. + "cmp $117, %%eax\n" // NR_ipc + "jnz 4f\n" + "cmp $23, %%ebx\n" // shmget() + "jnz 4f\n" + "int $0x80\n" + "mov %%eax, %%ebp\n" + "mov $120, %%eax\n" // NR_clone + "mov $17, %%ebx\n" // flags = SIGCHLD + "mov $1, %%ecx\n" // stack = 1 + "int $0x80\n" + "test %%eax, %%eax\n" + "js 25f\n" // exit process + "mov %%eax, %%ebx\n" + "jnz 7f\n" // wait for child, then return result + "movd %%mm5, %%ebx\n" // start = secure_mem + "mov $4096, %%ecx\n" // len = 4096 + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "mov $125, %%eax\n" // NR_mprotect + "int $0x80\n" + "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id + "xor %%ebx, %%ebx\n" + "jmp 26f\n" // exit program, no message + "4:int $0x80\n" + "jmp 14f\n" // return result + + // If syscall number is -2, execute locked system call from the + // secure memory area + "5:jg 11f\n" + "cmp $-2, %%eax\n" + "jnz 8f\n" + "movd %%mm2, %%ebp\n" + "cmp %%ebp, 0x4-0x1000(%%ecx)\n" + "jne 25f\n" // exit process + "mov 0x08-0x1000(%%ecx), %%eax\n" + "mov 0x0C-0x1000(%%ecx), %%ebx\n" + "mov 0x14-0x1000(%%ecx), %%edx\n" + "mov 0x18-0x1000(%%ecx), %%esi\n" + "mov 0x1C-0x1000(%%ecx), %%edi\n" + "mov 0x20-0x1000(%%ecx), %%ebp\n" + "mov 0x10-0x1000(%%ecx), %%ecx\n" + "movd %%edi, %%mm4\n" + "movd %%ebp, %%mm7\n" + "movd %%mm2, %%ebp\n" + "movd %%mm5, %%edi\n" + "cmp %%ebp, 4(%%edi)\n" + "jne 25f\n" // exit process + + // clone() has unusual calling conventions and must be handled specially + "cmp $120, %%eax\n" // NR_clone + "jz 18f\n" + + // exit() terminates trusted thread + "cmp $1, %%eax\n" // NR_exit + "jz 17f\n" + + // Perform requested system call + "movd %%mm4, %%edi\n" + "movd %%mm7, %%ebp\n" + "int $0x80\n" + + // Unlock mutex + "6:movd %%mm2, %%ebp\n" + "movd %%mm5, %%edi\n" + "cmp %%ebp, 4(%%edi)\n" + "jne 25f\n" // exit process + "add $2, %%ebp\n" + "movd %%ebp, %%mm2\n" + "mov %%eax, %%ebp\n" + "mov 
$120, %%eax\n" // NR_clone + "mov $17, %%ebx\n" // flags = SIGCHLD + "mov $1, %%ecx\n" // stack = 1 + "int $0x80\n" + "test %%eax, %%eax\n" + "js 25f\n" // exit process + "jz 22f\n" // unlock and exit + "mov %%eax, %%ebx\n" + "7:xor %%ecx, %%ecx\n" + "xor %%edx, %%edx\n" + "mov $7, %%eax\n" // NR_waitpid + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 6\n" + "mov %%ebp, %%eax\n" + "jmp 14f\n" // return result + + // If syscall number is -3, read the time stamp counter + "8:cmp $-3, %%eax\n" + "jnz 9f\n" + "rdtsc\n" // sets %edx:%eax + "xor %%ecx, %%ecx\n" + "jmp 10f\n" + "9:cmp $-4, %%eax\n" + "jnz 11f\n" + "rdtscp\n" // sets %edx:%eax and %ecx + "10:movd %%mm5, %%ebx\n" + "add $0x1020, %%ebx\n" + "mov %%eax, 0(%%ebx)\n" + "mov %%edx, 4(%%ebx)\n" + "mov %%ecx, 8(%%ebx)\n" + "mov %%ebx, %%ecx\n" + "mov $12, %%edx\n" + "jmp 15f\n" // return result + + // Check in syscallTable whether this system call is unrestricted + "11:mov %%eax, %%ebp\n" + #ifndef NDEBUG + "cmpw $0, 0x50-0x1000(%%ecx)\n" + "jnz 12f\n" // debug mode + #endif + "cmp playground$maxSyscall, %%eax\n" + "ja 25f\n" // exit process + "shl $3, %%eax\n" + "add $playground$syscallTable, %%eax\n" + "mov 0(%%eax), %%eax\n" + "cmp $1, %%eax\n" + "jne 25f\n" // exit process + + // Default behavior for unrestricted system calls is to just execute + // them. Read the remaining arguments first. 
+ "12:mov $3, %%eax\n" // NR_read + "movd %%mm0, %%ebx\n" // fd = threadFd + "add $4, %%ecx\n" // buf = &scratch + 4 + "mov $24, %%edx\n" // len = 6*sizeof(void *) + "13:int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 13b\n" + "cmp %%edx, %%eax\n" + "jnz 25f\n" // exit process + "mov %%ebp, %%eax\n" + "mov 0x00(%%ecx), %%ebx\n" + "mov 0x08(%%ecx), %%edx\n" + "mov 0x0C(%%ecx), %%esi\n" + "mov 0x10(%%ecx), %%edi\n" + "mov 0x14(%%ecx), %%ebp\n" + "mov 0x04(%%ecx), %%ecx\n" + "cmp $252, %%eax\n" // NR_exit_group + "jz 26f\n" // exit program, no message + "int $0x80\n" + + // Return result of system call to sandboxed thread + "14:movd %%mm5, %%ecx\n" + "add $0x101C, %%ecx\n" // buf = &scratch + 28 + "mov %%eax, (%%ecx)\n" + "mov $4, %%edx\n" // len = 4 + "15:movd %%mm0, %%ebx\n" // fd = threadFd + "mov $4, %%eax\n" // NR_write + "16:int $0x80\n" + "cmp %%edx, %%eax\n" + "jz 1b\n" + "cmp $-4, %%eax\n" // EINTR + "jz 16b\n" + "jmp 25f\n" // exit process + + // NR_exit: + // Exit trusted thread after cleaning up resources + "17:mov %%edi, %%ecx\n" + "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub + "mov $6, %%eax\n" // NR_close + "int $0x80\n" + "mov %%ecx, %%ebx\n" // start = secure_mem + "mov $8192, %%ecx\n" // length = 8192 + "xor %%edx, %%edx\n" // prot = PROT_NONE + "mov $125, %%eax\n" // NR_mprotect + "int $0x80\n" + "movd %%mm0, %%ebx\n" // fd = threadFd + "mov $6, %%eax\n" // NR_close + "int $0x80\n" + "mov $120, %%eax\n" // NR_clone + "mov $17, %%ebx\n" // flags = SIGCHLD + "mov $1, %%ecx\n" // stack = 1 + "int $0x80\n" + "mov %%eax, %%ebx\n" + "test %%eax, %%eax\n" + "jne 21f\n" // reap helper, exit thread + "jmp 22f\n" // unlock mutex + + // NR_clone: + // Original trusted thread calls clone() to create new nascent + // thread. This thread is (typically) fully privileged and shares all + // resources with the caller (i.e. the previous trusted thread), + // and by extension it shares all resources with the sandbox'd + // threads. + // N.B. 
It is possible to make the thread creation code crash before + // it releases seccomp privileges. This is generally OK, as it just + // terminates the program. But if we ever support signal handling, + // we have to be careful that the user cannot install a SIGSEGV + // handler that gets executed with elevated privileges. + "18:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem + "movd %%mm4, %%edi\n" + "movd %%mm7, %%ebp\n" + "int $0x80\n" // calls NR_clone + "cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values + "jae 6b\n" + "movd %%mm2, %%edi\n" + "add $2, %%edi\n" + "movd %%edi, %%mm2\n" + "test %%eax, %%eax\n" + "jne 14b\n" // return result + + // In nascent thread, now. + "sub $2, %%edi\n" + "movd %%edi, %%mm2\n" + "movd %%eax, %%mm3\n" // Request to return from clone() when done + + // Get thread id of nascent thread + "19:mov $224, %%eax\n" // NR_gettid + "int $0x80\n" + "movd %%eax, %%mm4\n" + + // Nascent thread creates socketpair() for sending requests to + // trusted thread. + // We can create the filehandles on the stack. Filehandles are + // always treated as untrusted. + // socketpair(AF_UNIX, SOCK_STREAM, 0, fds) + "mov $102, %%eax\n" // NR_socketcall + "mov $8, %%ebx\n" // socketpair + "sub $8, %%esp\n" // sv = %rsp + "push %%esp\n" + "xor %%ecx, %%ecx\n" // protocol = 0 + "push %%ecx\n" + "mov $1, %%ecx\n" // type = SOCK_STREAM + "push %%ecx\n" + "push %%ecx\n" // domain = AF_UNIX + "mov %%esp, %%ecx\n" + "int $0x80\n" + "add $0x10, %%esp\n" + "test %%eax, %%eax\n" + "jz 27f\n" + + // If things went wrong, we don't have an (easy) way of signaling + // the parent. For our purposes, it is sufficient to fail with a + // fatal error. 
+ "jmp 25f\n" // exit process + "20:mov $120, %%eax\n" // NR_clone + "mov $17, %%ebx\n" // flags = SIGCHLD + "mov $1, %%ecx\n" // stack = 1 + "int $0x80\n" + "test %%eax, %%eax\n" + "js 25f\n" // exit process + "jz 22f\n" // unlock and exit + "mov %%eax, %%ebx\n" + "21:xor %%ecx, %%ecx\n" + "xor %%edx, %%edx\n" + "mov $7, %%eax\n" // NR_waitpid + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 21b\n" + "jmp 23f\n" // exit thread (no message) + "22:lea playground$syscall_mutex, %%ebx\n" + "mov $4096, %%ecx\n" + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "mov $125, %%eax\n" // NR_mprotect + "int $0x80\n" + "lock; addl $0x80000000, (%%ebx)\n" + "jz 23f\n" // exit thread + "mov $1, %%edx\n" + "mov %%edx, %%ecx\n" // FUTEX_WAKE + "mov $240, %%eax\n" // NR_futex + "int $0x80\n" + "23:mov $1, %%eax\n" // NR_exit + "mov $1, %%ebx\n" // status = 1 + "24:int $0x80\n" + "25:mov $4, %%eax\n" // NR_write + "mov $2, %%ebx\n" // fd = stderr + "lea 100f, %%ecx\n" + "mov $101f-100f, %%edx\n" // len = strlen(msg) + "int $0x80\n" + "mov $1, %%ebx\n" + "26:mov $252, %%eax\n" // NR_exit_group + "jmp 24b\n" + + // The first page is mapped read-only for use as securely shared memory + "27:movd %%mm6, %%ebp\n" + "mov 0x44(%%ebp), %%esi\n" + "movd %%esi, %%mm5\n" // %mm5 = secure shared memory + "movd %%mm2, %%edi\n" + "cmp %%edi, 4(%%ebp)\n" + "jne 25b\n" // exit process + "mov $125, %%eax\n" // NR_mprotect + "mov %%esi, %%ebx\n" + "mov $4096, %%ecx\n" // len = 4096 + "mov $1, %%edx\n" // prot = PROT_READ + "int $0x80\n" + + // The second page is used as scratch space by the trusted thread. + // Make it writable. + "mov $125, %%eax\n" // NR_mprotect + "add $4096, %%ebx\n" // addr = secure_mem + 4096 + "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE + "int $0x80\n" + + // Call clone() to create new trusted thread(). 
+ // clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| + // CLONE_SYSVSEM|CLONE_UNTRACED, stack, NULL, NULL, NULL) + "mov 4(%%esp), %%eax\n" + "movd %%eax, %%mm0\n" // %mm0 = threadFd + "mov $120, %%eax\n" // NR_clone + "mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR + "mov $1, %%ecx\n" // stack = 1 + "movd 0x48(%%ebp), %%mm1\n" // %mm1 = processFdPub + "cmp %%edi, 4(%%ebp)\n" + "jne 25b\n" // exit process + "int $0x80\n" + "test %%eax, %%eax\n" + "js 25b\n" // exit process + "jz 0b\n" // invoke trustedThreadFnc() + + // Set up thread local storage + "mov $0x51, %%eax\n" // seg_32bit, limit_in_pages, useable + "push %%eax\n" + "mov $0xFFFFF, %%eax\n" // limit + "push %%eax\n" + "add $0x58, %%esi\n" + "push %%esi\n" // base_addr = &secure_mem.TLS + "mov %%fs, %%eax\n" + "shr $3, %%eax\n" + "push %%eax\n" // entry_number + "mov $243, %%eax\n" // NR_set_thread_area + "mov %%esp, %%ebx\n" + "int $0x80\n" + "test %%eax, %%eax\n" + "jnz 25b\n" // exit process + "add $16, %%esp\n" + + // Done creating trusted thread. We can now get ready to return to caller + "mov 0(%%esp), %%esi\n" // %esi = threadFdPub + "add $8, %%esp\n" + + // Check whether this is the initial thread, or a newly created one. + // At startup we run the same code as when we create a new thread. At + // the very top of this function, you will find that we store 999(%rip) + // in %%mm3. That is the signal that we should return on the same + // stack rather than return to where clone was called. + "movd %%mm3, %%eax\n" + "test %%eax, %%eax\n" + "jne 28f\n" + + // Returning from clone() into the newly created thread is special. We + // cannot unroll the stack, as we just set up a new stack for this + // thread. We have to explicitly restore CPU registers to the values + // that they had when the program originally called clone(). 
+ "mov 0x24(%%ebp), %%eax\n" + "push %%eax\n" + "mov 0x28(%%ebp), %%eax\n" + "push %%eax\n" + "mov 0x2C(%%ebp), %%eax\n" + "push %%eax\n" + "mov 0x30(%%ebp), %%eax\n" + "push %%eax\n" + "mov 0x34(%%ebp), %%eax\n" + "push %%eax\n" + "mov 0x38(%%ebp), %%eax\n" + "push %%eax\n" + "mov 0x3C(%%ebp), %%eax\n" + "push %%eax\n" + "mov 0x40(%%ebp), %%eax\n" + "push %%eax\n" + "cmp %%edi, 4(%%ebp)\n" + "jne 25b\n" // exit process + + // Nascent thread launches a helper that doesn't share any of our + // resources, except for pages mapped as MAP_SHARED. + // clone(0, %esp) + "28:mov $120, %%eax\n" // NR_clone + "mov $17, %%ebx\n" // flags = SIGCHLD + "mov %%esp, %%ecx\n" // stack = %esp + "int $0x80\n" + "test %%eax, %%eax\n" + "js 25b\n" // exit process + "jne 29f\n" + + // Use sendmsg() to send to the trusted process the file handles for + // communicating with the new trusted thread. We also send the address + // of the secure memory area (for sanity checks) and the thread id. + "push %%esi\n" // threadFdPub + "movd %%mm4, %%eax\n" // threadId + "push %%eax\n" + "movd %%mm5, %%eax\n" // secure_mem + "push %%eax\n" + "mov %%esp, %%ebx\n" // buf = &data + "mov $12, %%eax\n" // len = sizeof(void*) + 2*sizeof(int) + "push %%eax\n" + "push %%ebx\n" + "movd %%mm0, %%eax\n" // fd1 = threadFd + "push %%eax\n" + "push %%esi\n" // fd0 = threadFdPub + "mov 0x4C(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub() + "cmp %%edi, 4(%%ebp)\n" + "jne 25b\n" // exit process + "push %%eax\n" + "call playground$sendFd\n" + + // Release syscall_mutex_. This signals the trusted process that + // it can write into the original thread's secure memory again. 
+ "mov $125, %%eax\n" // NR_mprotect + "lea playground$syscall_mutex, %%ebx\n" + "mov $4096, %%ecx\n" + "mov $3, %%edx\n" // PROT_READ | PROT_WRITE + "int $0x80\n" + "lock; addl $0x80000000, (%%ebx)\n" + "jz 26b\n" // exit process (no error message) + "mov $1, %%edx\n" + "mov %%edx, %%ecx\n" // FUTEX_WAKE + "mov $240, %%eax\n" // NR_futex + "int $0x80\n" + "jmp 26b\n" // exit process (no error message) + + // Reap helper + "29:mov %%eax, %%ebx\n" + "30:xor %%ecx, %%ecx\n" + "xor %%edx, %%edx\n" + "mov $7, %%eax\n" // NR_waitpid + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 30\n" + + // Release privileges by entering seccomp mode. + "mov $172, %%eax\n" // NR_prctl + "mov $22, %%ebx\n" // PR_SET_SECCOMP + "mov $1, %%ecx\n" + "int $0x80\n" + "test %%eax, %%eax\n" + "jnz 25b\n" // exit process + + // Back in the newly created sandboxed thread, wait for trusted process + // to receive request. It is possible for an attacker to make us + // continue even before the trusted process is done. This is OK. It'll + // result in us putting stale values into the new thread's TLS. But that + // data is considered untrusted anyway. + "push %%eax\n" + "mov $1, %%edx\n" // len = 1 + "mov %%esp, %%ecx\n" // buf = %rsp + "mov %%esi, %%ebx\n" // fd = threadFdPub + "31:mov $3, %%eax\n" // NR_read + "int $0x80\n" + "cmp $-4, %%eax\n" // EINTR + "jz 31b\n" + "cmp %%edx, %%eax\n" + "jne 25b\n" // exit process + "pop %%eax\n" + + // Return to caller. We are in the new thread, now. + "xor %%eax, %%eax\n" + "movd %%mm3, %%ebx\n" + + // Release MMX registers, so that they can be used for floating point + // operations. 
// ---------------------------------------------------------------------------
// Remainder of the preceding diff hunk: the end of an inline-assembly routine
// whose beginning lies before this block. It is kept verbatim (commented out)
// so that no content is lost.
//
//   "emms\n"
//
//   // Returning to createTrustedThread()
//   "test %%ebx, %%ebx\n"
//   "jz 32f\n"
//   "jmp *%%ebx\n"
//
//   // Returning to the place where clone() had been called
//   "32:pop %%ebx\n"
//   "pop %%ecx\n"
//   "pop %%edx\n"
//   "pop %%esi\n"
//   "pop %%edi\n"
//   "pop %%ebp\n"
//   "ret\n"
//
//   ".pushsection \".rodata\"\n"
//   "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n"
//   "101:\n"
//   ".popsection\n"
//
//   "999:pop %%ebp\n"
//   "pop %%ebx\n"
//   :
//   : "g"(&args)
//   : "eax", "ecx", "edx", "edi", "esi"
//   #else
//   #error Unsupported target platform
//   #endif
//   );
//   }
//
//   } // namespace
// ---------------------------------------------------------------------------

// ---- sandbox/linux/seccomp/x86_decode.h -----------------------------------
#ifndef X86_DECODE_H__
#define X86_DECODE_H__
namespace playground {

// Bit masks for the low nibble of a REX prefix byte (0x40 - 0x4F).
enum {
  REX_B = 0x01,  // extends ModR/M r/m, SIB base or opcode register field
  REX_X = 0x02,  // extends SIB index field
  REX_R = 0x04,  // extends ModR/M reg field
  REX_W = 0x08   // selects 64 bit operand width
};

// Decodes the single x86 instruction that starts at "*ip" and advances "*ip"
// to the first byte following it.
//
//   ip         - in/out: address of the instruction; updated on return.
//   is64bit    - decode for 64 bit mode (bytes 0x40 - 0x4F are REX prefixes,
//                addresses are 8 bytes wide by default) rather than 32 bit
//                mode.
//   has_prefix - if non-NULL, receives whether any legacy or REX prefix byte
//                preceded the opcode.
//   rex_ptr    - if non-NULL, receives the address of the last REX prefix
//                byte that was seen, or 0 if there was none.
//   mod_rm_ptr - if non-NULL, receives the address of the ModR/M byte, or 0
//                if the instruction does not have one.
//   sib_ptr    - if non-NULL, receives the address of the SIB byte, or 0 if
//                the instruction does not have one.
//   is_group   - if non-NULL, receives whether the opcode belongs to an
//                opcode group, i.e. the "reg" field of the ModR/M byte
//                selects the actual operation.
//
// Returns the opcode. One-byte opcodes are returned as is; two-byte opcodes
// are returned as 0x0F00 | second_byte.
//
// Opcodes that the decoder does not know about are not an error. The decoder
// then only consumes the prefix and opcode bytes and relies on the caller
// re-synchronizing with the instruction stream.
unsigned short next_inst(const char **ip, bool is64bit, bool *has_prefix = 0,
                         char **rex_ptr = 0, char **mod_rm_ptr = 0,
                         char **sib_ptr = 0, bool *is_group = 0);
} // namespace
#endif // X86_DECODE_H__

// ---- sandbox/linux/seccomp/x86_decode.cc ----------------------------------
// The declarations from "x86_decode.h" directly precede this definition, so
// the original '#include "x86_decode.h"' is not repeated here.

namespace playground {

namespace {

// The out-parameters of next_inst() are "char *" even though the instruction
// stream is read through a pointer-to-const. Make that conversion explicit.
inline char *writable_ptr(const unsigned char *ptr) {
  return const_cast<char *>(reinterpret_cast<const char *>(ptr));
}

} // namespace

// The decoder only inspects bytes; it is therefore built on all hosts, which
// matches the unconditional declaration in the header.
unsigned short next_inst(const char **ip, bool is64bit, bool *has_prefix,
                         char **rex_ptr, char **mod_rm_ptr, char **sib_ptr,
                         bool *is_group) {
  // Flags stored in the lookup tables below. A table entry of 0 marks an
  // opcode that the decoder does not support.
  enum {
    BYTE_OP     = (1<<1), // 0x02
    IMM         = (1<<2), // 0x04
    IMM_BYTE    = (2<<2), // 0x08
    MEM_ABS     = (3<<2), // 0x0C
    MODE_MASK   = (7<<2), // 0x1C
    MOD_RM      = (1<<5), // 0x20
    STACK       = (1<<6), // 0x40
    GROUP       = (1<<7), // 0x80
    GROUP_MASK  = 0x7F,
    GROUP_7     = 40      // offset of the "Group 7" row in group_table
  };

  // Entries 0 - 255 describe the one-byte opcodes, entries 256 - 511 the
  // two-byte opcodes that start with 0x0F.
  static const unsigned char opcode_types[512] = {
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x00 - 0x07
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x00, // 0x08 - 0x0F
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x10 - 0x17
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x01, 0x01, // 0x18 - 0x1F
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x20 - 0x27
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x28 - 0x2F
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x30 - 0x37
    0x23, 0x21, 0x23, 0x21, 0x09, 0x05, 0x00, 0x01, // 0x38 - 0x3F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x40 - 0x47
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x48 - 0x4F
    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0x50 - 0x57
    0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // 0x58 - 0x5F
    0x01, 0x01, 0x21, 0x21, 0x00, 0x00, 0x00, 0x00, // 0x60 - 0x67
    0x45, 0x25, 0x49, 0x29, 0x03, 0x01, 0x03, 0x01, // 0x68 - 0x6F
    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0x70 - 0x77
    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0x78 - 0x7F
    0x27, 0x25, 0x27, 0x29, 0x23, 0x21, 0x23, 0x21, // 0x80 - 0x87
    0x23, 0x21, 0x23, 0x21, 0x21, 0x21, 0x21, 0x80, // 0x88 - 0x8F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0x90 - 0x97
    0x01, 0x01, 0x05, 0x01, 0x41, 0x41, 0x01, 0x01, // 0x98 - 0x9F
    0x0F, 0x0D, 0x0F, 0x0D, 0x03, 0x01, 0x03, 0x01, // 0xA0 - 0xA7
    0x09, 0x05, 0x03, 0x01, 0x03, 0x01, 0x03, 0x01, // 0xA8 - 0xAF
    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // 0xB0 - 0xB7
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0xB8 - 0xBF
    0x27, 0x29, 0x01, 0x01, 0x21, 0x21, 0x27, 0x25, // 0xC0 - 0xC7
    0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x01, // 0xC8 - 0xCF
    0x23, 0x21, 0x23, 0x21, 0x09, 0x09, 0x01, 0x01, // 0xD0 - 0xD7
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xD8 - 0xDF
    0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, // 0xE0 - 0xE7
    0x05, 0x05, 0x05, 0x09, 0x03, 0x01, 0x03, 0x01, // 0xE8 - 0xEF
    0x00, 0x01, 0x00, 0x00, 0x01, 0x01, 0x88, 0x90, // 0xF0 - 0xF7
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x98, 0xA0, // 0xF8 - 0xFF
    0x00, 0xA8, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, // 0xF00 - 0xF07
    0x01, 0x01, 0x00, 0x01, 0x00, 0x21, 0x01, 0x00, // 0xF08 - 0xF0F
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF10 - 0xF17
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF18 - 0xF1F
    0x21, 0x21, 0x21, 0x21, 0x00, 0x00, 0x00, 0x00, // 0xF20 - 0xF27
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF28 - 0xF2F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, // 0xF30 - 0xF37
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF38 - 0xF3F
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF40 - 0xF47
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF48 - 0xF4F
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF50 - 0xF57
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF58 - 0xF5F
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF60 - 0xF67
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xF68 - 0xF6F
    0x21, 0x00, 0x00, 0x00, 0x21, 0x21, 0x21, 0x00, // 0xF70 - 0xF77
    0x21, 0x21, 0x00, 0x00, 0x21, 0x21, 0x21, 0x21, // 0xF78 - 0xF7F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xF80 - 0xF87
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xF88 - 0xF8F
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF90 - 0xF97
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xF98 - 0xF9F
    0x01, 0x01, 0x01, 0x21, 0x29, 0x21, 0x00, 0x00, // 0xFA0 - 0xFA7
    0x01, 0x01, 0x01, 0x21, 0x29, 0x21, 0x21, 0x21, // 0xFA8 - 0xFAF
    0x23, 0x21, 0x00, 0x21, 0x00, 0x00, 0x23, 0x21, // 0xFB0 - 0xFB7
    0x21, 0x00, 0x29, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFB8 - 0xFBF
    0x21, 0x21, 0x00, 0x21, 0x00, 0x00, 0x00, 0x21, // 0xFC0 - 0xFC7
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // 0xFC8 - 0xFCF
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFD0 - 0xFD7
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFD8 - 0xFDF
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFE0 - 0xFE7
    0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // 0xFE8 - 0xFEF
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xFF0 - 0xFF7
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0xFF8 - 0xFFF
  };

  // One row of eight entries per opcode group, indexed by the "reg" field of
  // the ModR/M byte.
  static const unsigned char group_table[56] = {
    0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Group 1A
    0x27, 0x27, 0x23, 0x23, 0x23, 0x23, 0x23, 0x23, // Group 3 (Byte)
    0x25, 0x25, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // Group 3
    0x23, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Group 4
    0x21, 0x21, 0x61, 0x21, 0x61, 0x21, 0x61, 0x00, // Group 5
    0x00, 0x00, 0x21, 0x21, 0x21, 0x00, 0x21, 0x23, // Group 7
    0x21, 0x00, 0x00, 0x21, 0x21, 0x00, 0x21, 0x00, // Group 7 (Alternate)
  };

  const unsigned char *insn_ptr = reinterpret_cast<const unsigned char *>(*ip);
  int operand_width = 4;
  int address_width = is64bit ? 8 : 4;
  unsigned char byte = 0;
  unsigned char rex = 0;
  bool found_prefix = false;
  if (rex_ptr) {
    *rex_ptr = 0;
  }
  if (mod_rm_ptr) {
    *mod_rm_ptr = 0;
  }
  if (sib_ptr) {
    *sib_ptr = 0;
  }

  // Consume all prefix bytes. On exit "byte" holds the first opcode byte and
  // "insn_ptr" points just past it.
  for (;; ++insn_ptr) {
    switch (byte = *insn_ptr) {
      case 0x66: // Operand width prefix
        operand_width ^= 6;                 // toggles 4 <-> 2
        break;
      case 0x67: // Address width prefix
        address_width ^= is64bit ? 12 : 6;  // toggles 8 <-> 4 or 4 <-> 2
        break;
      case 0x26: // Segment selector prefixes
      case 0x2e:
      case 0x36:
      case 0x3e:
      case 0x64:
      case 0x65:
      case 0xF0:
      case 0xF2:
      case 0xF3:
        break;
      case 0x40: case 0x41: case 0x42: case 0x43: // 64 bit REX prefixes
      case 0x44: case 0x45: case 0x46: case 0x47:
      case 0x48: case 0x49: case 0x4A: case 0x4B:
      case 0x4C: case 0x4D: case 0x4E: case 0x4F:
        if (is64bit) {
          if (rex_ptr) {
            *rex_ptr = writable_ptr(insn_ptr);
          }
          rex = byte;
          found_prefix = true;
          continue;
        }
        // In 32 bit mode, these bytes are ordinary opcodes.
        // fall through
      default:
        ++insn_ptr;
        goto no_more_prefixes;
    }
    // Only reached for non-REX prefixes. A REX prefix that is not the last
    // prefix before the opcode is forgotten.
    rex = 0;
    found_prefix = true;
  }
no_more_prefixes:
  if (has_prefix) {
    *has_prefix = found_prefix;
  }
  if (rex & REX_W) {
    operand_width = 8;
  }

  // Look up the opcode; 0x0F introduces a two-byte opcode.
  unsigned short insn = byte;
  unsigned int idx = 0;
  if (byte == 0x0F) {
    byte = *insn_ptr++;
    insn = (insn << 8) | byte;
    idx = 256;
  }
  unsigned char type = opcode_types[idx + byte];

  // For opcode groups, the "reg" field of the ModR/M byte selects the entry
  // that describes the actual instruction. The ModR/M byte is only peeked at
  // here; it is consumed further down if the selected entry has MOD_RM set.
  bool found_group = false;
  if (type & GROUP) {
    found_group = true;
    const unsigned char group_mod_rm = *insn_ptr;
    if (mod_rm_ptr) {
      *mod_rm_ptr = writable_ptr(insn_ptr);
    }
    unsigned char group = (type & GROUP_MASK) + ((group_mod_rm >> 3) & 0x7);
    if ((type & GROUP_MASK) == GROUP_7 && (group_mod_rm >> 6) == 3) {
      // Register forms of Group 7 use the alternate row.
      group += 8;
    }
    type = group_table[group];
  }

  // We know that we still don't decode some of the more obscure
  // instructions, but for all practical purposes that doesn't matter.
  // Compilers are unlikely to output them, and even if we encounter
  // hand-coded assembly, we will soon synchronize to the instruction
  // stream again. Such opcodes (type == 0) are returned without consuming
  // any operand bytes.
  if (type) {
    if (is64bit && (type & STACK)) {
      operand_width = 8;
    }

    if (type & MOD_RM) {
      if (mod_rm_ptr) {
        *mod_rm_ptr = writable_ptr(insn_ptr);
      }
      const unsigned char mod_rm = *insn_ptr++;
      const int mod = (mod_rm >> 6) & 0x3;
      // Only the three bits stored in the ModR/M byte decide whether a SIB
      // byte or a displacement follows. REX.B is deliberately not mixed in:
      // with mod == 0, r/m == 101 always means "disp32" (RIP relative in
      // 64 bit mode) and r/m == 100 always means "SIB byte", no matter
      // whether REX.B is set.
      const int rm = mod_rm & 0x7;
      if (mod != 3) {
        if (address_width == 2) {
          // 16 bit addressing: no SIB byte, displacements are 8 or 16 bits.
          switch (mod) {
            case 0:
              if (rm != 6 /* SI */) {
                break;
              }
              // fall through
            case 2:
              insn_ptr++;
              // fall through
            case 1:
              insn_ptr++;
              break;
          }
        } else {
          if (rm == 4) {
            if (sib_ptr) {
              *sib_ptr = writable_ptr(insn_ptr);
            }
            const unsigned char sib = *insn_ptr++;
            if (!mod && (sib & 0x7) == 5 /* BP */) {
              insn_ptr += 4;
            }
          }
          switch (mod) {
            case 0:
              if (rm != 5 /* BP */) {
                break;
              }
              // fall through
            case 2:
              insn_ptr += 3;
              // fall through
            case 1:
              insn_ptr++;
              break;
          }
        }
      }
    }

    // Immediates and relative displacements never exceed 32 bits, even when
    // the operand width is 64 bits. The only exception is MOV r64, imm64
    // which is handled separately below.
    const int imm_width = (operand_width == 8) ? 4 : operand_width;

    // Operands that are not described by the table flags.
    switch (insn) {
      case 0xC8: // ENTER
        insn_ptr++;
        // fall through
      case 0x9A: // CALL (far)
      case 0xC2: // RET (near)
      case 0xCA: // LRET
      case 0xEA: // JMP (far)
        insn_ptr += 2;
        break;
      case 0xF80: case 0xF81: case 0xF82: case 0xF83: // Jcc (rel)
      case 0xF84: case 0xF85: case 0xF86: case 0xF87:
      case 0xF88: case 0xF89: case 0xF8A: case 0xF8B:
      case 0xF8C: case 0xF8D: case 0xF8E: case 0xF8F:
        insn_ptr += imm_width;
        break;
    }

    switch (type & MODE_MASK) {
      case IMM:
        if (!(type & BYTE_OP)) {
          switch (insn) {
            case 0xB8: case 0xB9: case 0xBA: case 0xBB:
            case 0xBC: case 0xBD: case 0xBE: case 0xBF:
              // Allow MOV to/from 64bit addresses
              insn_ptr += operand_width;
              break;
            default:
              insn_ptr += imm_width;
              break;
          }
          break;
        }
        // Byte sized operation: the immediate is a single byte.
        // fall through
      case IMM_BYTE:
        insn_ptr++;
        break;
      case MEM_ABS:
        insn_ptr += address_width;
        break;
    }
  }

  if (is_group) {
    *is_group = found_group;
  }
  *ip = reinterpret_cast<const char *>(insn_ptr);
  return insn;
}

} // namespace