summaryrefslogtreecommitdiffstats
path: root/sandbox/linux/seccomp/syscall.cc
diff options
context:
space:
mode:
authornsylvain@chromium.org <nsylvain@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-08-31 01:16:35 +0000
committernsylvain@chromium.org <nsylvain@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-08-31 01:16:35 +0000
commitfb7b5328a5fd3aecfc27f765dea94b961c657597 (patch)
tree84adc617db0031a881265e95f9c569de66fa733d /sandbox/linux/seccomp/syscall.cc
parent7302ea910ce937d482780649d6a84bbfff4ac521 (diff)
downloadchromium_src-fb7b5328a5fd3aecfc27f765dea94b961c657597.zip
chromium_src-fb7b5328a5fd3aecfc27f765dea94b961c657597.tar.gz
chromium_src-fb7b5328a5fd3aecfc27f765dea94b961c657597.tar.bz2
Revert 57921 - Pull seccomp-sandbox in via DEPS rather than using an in-tree copy
This means changes to the sandbox won't have to be committed twice, to both trees. BUG=none TEST=smoke test of running chromium with --enable-seccomp-sandbox Review URL: http://codereview.chromium.org/3249003 TBR=mseaborn@chromium.org Review URL: http://codereview.chromium.org/3245011 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@57933 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'sandbox/linux/seccomp/syscall.cc')
-rw-r--r--sandbox/linux/seccomp/syscall.cc380
1 files changed, 380 insertions, 0 deletions
diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc
new file mode 100644
index 0000000..681fec9
--- /dev/null
+++ b/sandbox/linux/seccomp/syscall.cc
@@ -0,0 +1,380 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "debug.h"
+#include "sandbox_impl.h"
+#include "syscall_table.h"
+
+namespace playground {
+
+// TODO(markus): Change this into a function that returns the address of the assembly code. If that isn't possible for sandbox_clone, then move that function into a *.S file.
+asm(
+ ".pushsection .text, \"ax\", @progbits\n"
+
+ // This is the special wrapper for the clone() system call. The code
+ // relies on the stack layout of the system call wrapper (c.f. below). It
+ // passes the stack pointer as an additional argument to sandbox__clone(),
+ // so that upon starting the child, register values can be restored and
+ // the child can start executing at the correct IP, instead of trying to
+ // run in the trusted thread.
+ "playground$sandbox_clone:"
+ ".globl playground$sandbox_clone\n"
+ ".type playground$sandbox_clone, @function\n"
+ #if defined(__x86_64__)
+ // Skip the 8 byte return address into the system call wrapper. The
+ // following bytes are the saved register values that we need to restore
+ // upon return from clone() in the new thread.
+ "lea 8(%rsp), %r9\n" // %r9 = address of the saved registers
+ "jmp playground$sandbox__clone\n" // tail call into sandbox__clone()
+ #elif defined(__i386__)
+ // As i386 passes function arguments on the stack, we need to skip a few
+ // more values before we can get to the saved registers.
+ "lea 28(%esp), %eax\n" // %eax = address of the saved registers ...
+ "mov %eax, 24(%esp)\n" // ... written into the stack slot at 24(%esp)
+ "jmp playground$sandbox__clone\n" // tail call into sandbox__clone()
+ #else
+ #error Unsupported target platform
+ #endif
+ ".size playground$sandbox_clone, .-playground$sandbox_clone\n"
+
+
+ // This is the wrapper which is called by the untrusted code, trying to
+ // make a system call.
+ "playground$syscallWrapper:"
+ ".internal playground$syscallWrapper\n"
+ ".globl playground$syscallWrapper\n"
+ ".type playground$syscallWrapper, @function\n"
+ #if defined(__x86_64__)
+ // Check for rt_sigreturn(). It needs to be handled specially.
+ "cmp $15, %rax\n" // NR_rt_sigreturn
+ "jnz 1f\n"
+ "add $0x90, %rsp\n" // pop return addresses and red zone
+ "0:syscall\n" // rt_sigreturn() is unrestricted
+ "mov $66, %edi\n" // rt_sigreturn() should never return
+ "mov $231, %eax\n" // NR_exit_group
+ "jmp 0b\n" // loop back to the syscall instruction
+
+ // Save all registers
+ "1:push %rbp\n"
+ "mov %rsp, %rbp\n"
+ "push %rbx\n"
+ "push %rcx\n"
+ "push %rdx\n"
+ "push %rsi\n"
+ "push %rdi\n"
+ "push %r8\n"
+ "push %r9\n"
+ "push %r10\n"
+ "push %r11\n"
+ "push %r12\n"
+ "push %r13\n"
+ "push %r14\n"
+ "push %r15\n"
+
+ // Convert from syscall calling conventions to C calling conventions.
+ // System calls have a subtly different register ordering than the user-
+ // space x86-64 ABI.
+ "mov %r10, %rcx\n"
+
+ // Check range of system call
+ "cmp playground$maxSyscall(%rip), %eax\n"
+ "ja 3f\n" // unsigned compare: out of range => default handler
+
+ // Retrieve function call from system call table (c.f. syscall_table.c).
+ // Entries are: zero for denied system calls, UNRESTRICTED_SYSCALL for calls
+ // that get forwarded to the trusted thread (both of which end up in
+ // defaultSystemCallHandler()), or a pointer to a specific handler function.
+ // NOTE(review): the check below diverts the values 0 and 1 only -- confirm.
+ "mov %rax, %r10\n" // table index = system call number
+ "shl $4, %r10\n" // scale by 16 (presumably the table entry size)
+ "lea playground$syscallTable(%rip), %r11\n"
+ "add %r11, %r10\n"
+ "mov 0(%r10), %r10\n" // load the word at the start of the entry
+
+ // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
+ // jump to fallback handler.
+ "cmp $1, %r10\n"
+ "jbe 3f\n" // unsigned "<= 1"
+ "call *%r10\n"
+ "2:"
+
+ // Restore CPU registers, except for %rax which was set by the system call.
+ "pop %r15\n"
+ "pop %r14\n"
+ "pop %r13\n"
+ "pop %r12\n"
+ "pop %r11\n"
+ "pop %r10\n"
+ "pop %r9\n"
+ "pop %r8\n"
+ "pop %rdi\n"
+ "pop %rsi\n"
+ "pop %rdx\n"
+ "pop %rcx\n"
+ "pop %rbx\n"
+ "pop %rbp\n"
+
+ // Remove fake return address. This is added in the patching code in
+ // library.cc and it makes stack traces a little cleaner.
+ "add $8, %rsp\n"
+
+ // Return to caller
+ "ret\n"
+
+ "3:"
+ // If we end up calling a specific handler, we don't need to know the
+ // system call number. However, in the generic case, we do. Shift
+ // registers so that the system call number becomes visible as the
+ // first function argument.
+ "push %r9\n" // old %r9 goes onto the stack; popped again below
+ "mov %r8, %r9\n"
+ "mov %rcx, %r8\n"
+ "mov %rdx, %rcx\n"
+ "mov %rsi, %rdx\n"
+ "mov %rdi, %rsi\n"
+ "mov %rax, %rdi\n" // system call number becomes the first argument
+
+ // Call default handler.
+ "call playground$defaultSystemCallHandler\n"
+ "pop %r9\n" // undo the push at label 3
+ "jmp 2b\n" // share the register restore code
+ #elif defined(__i386__)
+ "cmp $119, %eax\n" // NR_sigreturn
+ "jnz 1f\n"
+ "add $0x4, %esp\n" // pop return address
+ "0:int $0x80\n" // sigreturn() is unrestricted
+ "mov $66, %ebx\n" // sigreturn() should never return
+ "mov %ebx, %eax\n" // NOTE(review): labelled NR_exit, but %eax = 66 here and i386 NR_exit is 1 -- confirm
+ "jmp 0b\n" // loop back to the int $0x80 instruction
+ "1:cmp $173, %eax\n" // NR_rt_sigreturn
+ "jnz 3f\n"
+
+ // Convert rt_sigframe into sigframe, allowing us to call sigreturn().
+ // This is possible since the first part of signal stack frames have
+ // stayed very stable since the earliest kernel versions. While never
+ // officially documented, lots of user space applications rely on this
+ // part of the ABI, and kernel developers have been careful to maintain
+ // backwards compatibility.
+ // In general, the rt_sigframe includes a lot of extra information that
+ // the signal handler can look at. Most notably, this means a complete
+ // siginfo record.
+ // Fortunately though, the kernel doesn't look at any of this extra data
+ // when returning from a signal handler. So, we can safely convert an
+ // rt_sigframe to a legacy sigframe, discarding the extra data in the
+ // process. Interestingly, the legacy signal frame is actually larger than
+ // the rt signal frame, as it includes a lot more padding.
+ "sub $0x1C8, %esp\n" // a legacy signal stack is much larger
+ "mov 0x1CC(%esp), %eax\n" // push signal number
+ "push %eax\n"
+ "lea 0x270(%esp), %esi\n" // copy siginfo register values
+ "lea 0x4(%esp), %edi\n" // into new location
+ "mov $0x16, %ecx\n" // 0x16 32-bit words
+ "cld\n"
+ "rep movsl\n"
+ "mov 0x2C8(%esp), %ebx\n" // copy first half of signal mask
+ "mov %ebx, 0x54(%esp)\n"
+ "lea 2f, %esi\n"
+ "push %esi\n" // push restorer function
+ "lea 0x2D4(%esp), %edi\n" // patch up retcode magic numbers
+ "movb $2, %cl\n" // two 32-bit words, copied from label 2
+ "rep movsl\n"
+ "ret\n" // return to restorer function
+ "2:pop %eax\n" // remove dummy argument (signo)
+ "mov $119, %eax\n" // NR_sigreturn
+ "int $0x80\n"
+
+
+ // Preserve all registers
+ "3:push %ebx\n"
+ "push %ecx\n"
+ "push %edx\n"
+ "push %esi\n"
+ "push %edi\n"
+ "push %ebp\n"
+
+ // Convert from syscall calling conventions to C calling conventions
+ "push %ebp\n"
+ "push %edi\n"
+ "push %esi\n"
+ "push %edx\n"
+ "push %ecx\n"
+ "push %ebx\n"
+ "push %eax\n" // system call number ends up on top of the stack
+
+ // Check range of system call
+ "cmp playground$maxSyscall, %eax\n"
+ "ja 9f\n" // unsigned compare: out of range => default handler
+
+ // We often have long sequences of calls to gettimeofday(). This is
+ // needlessly expensive. Coalesce them into a single call.
+ //
+ // We keep track of state in TLS storage that we can access through
+ // the %fs segment register. See trusted_thread.cc for the exact
+ // memory layout.
+ //
+ // TODO(markus): maybe, we should proactively call gettimeofday() and
+ // clock_gettime(), whenever we talk to the trusted thread?
+ // or maybe, if we have recently seen requests to compute
+ // the time. There might be a repeated pattern of those.
+ "cmp $78, %eax\n" // __NR_gettimeofday
+ "jnz 6f\n"
+ "cmp %eax, %fs:0x102C-0x58\n" // last system call
+ "jnz 4f\n"
+
+ // This system call and the last system call prior to this one both are
+ // calls to gettimeofday(). Try to avoid making the new call and just
+ // return the same result as in the previous call.
+ // Just in case the caller is spinning on the result from gettimeofday(),
+ // every so often, call the actual system call.
+ "decl %fs:0x1030-0x58\n" // countdown calls to gettimeofday()
+ "jz 4f\n"
+
+ // Atomically read the 64bit word representing last-known timestamp and
+ // return it to the caller. On x86-32 this is a little more complicated and
+ // requires the use of the cmpxchg8b instruction.
+ "mov %ebx, %eax\n" // make %edx:%eax equal to %ecx:%ebx, so that ...
+ "mov %ecx, %edx\n" // ... cmpxchg8b cannot change the value at 100f
+ "lock; cmpxchg8b 100f\n" // afterwards %edx:%eax holds the cached value
+ "mov %eax, 0(%ebx)\n" // store it through the first argument (%ebx)
+ "mov %edx, 4(%ebx)\n"
+ "xor %eax, %eax\n" // return value 0
+ "add $28, %esp\n" // drop the seven words pushed for the C call
+ "jmp 8f\n"
+
+ // This is a call to gettimeofday(), but we don't have a valid cached
+ // result, yet.
+ "4:mov %eax, %fs:0x102C-0x58\n" // remember syscall number
+ "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations
+ "call playground$defaultSystemCallHandler\n"
+
+ // Returned from gettimeofday(). Remember return value, in case the
+ // application calls us again right away.
+ // Again, this has to happen atomically and requires cmpxchg8b.
+ "mov 4(%ebx), %ecx\n" // new value, high word
+ "mov 0(%ebx), %ebx\n" // new value, low word
+ "mov 100f, %eax\n" // expected old value, low word
+ "mov 101f, %edx\n" // expected old value, high word
+ "5:lock; cmpxchg8b 100f\n"
+ "jnz 5b\n" // retry until the store has succeeded
+ "xor %eax, %eax\n" // return 0 (%eax was already overwritten above)
+ "jmp 10f\n"
+
+ // Remember the number of the last system call made. We deliberately do
+ // not remember calls to gettid(), as we have often seen long sequences
+ // of calls to just gettimeofday() and gettid(). In that situation, we
+ // would still like to coalesce the gettimeofday() calls.
+ "6:cmp $224, %eax\n" // __NR_gettid
+ "jz 7f\n"
+ "mov %eax, %fs:0x102C-0x58\n" // remember syscall number
+
+ // Retrieve function call from system call table (c.f. syscall_table.c).
+ // Entries are: zero for denied system calls, UNRESTRICTED_SYSCALL for calls
+ // that get forwarded to the trusted thread (both of which end up in
+ // defaultSystemCallHandler()), or a pointer to a specific handler function.
+ // NOTE(review): the check below diverts the values 0 and 1 only -- confirm.
+ "7:shl $3, %eax\n" // scale by 8 (presumably the table entry size)
+ "lea playground$syscallTable, %ebx\n"
+ "add %ebx, %eax\n"
+ "mov 0(%eax), %eax\n" // load the word at the start of the entry
+
+ // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
+ // jump to fallback handler.
+ "cmp $1, %eax\n"
+ "jbe 9f\n" // unsigned "<= 1"
+ "add $4, %esp\n" // drop the system call number
+ "call *%eax\n"
+ "add $24, %esp\n" // drop the six arguments
+
+ // Restore CPU registers, except for %eax which was set by the system call.
+ "8:pop %ebp\n"
+ "pop %edi\n"
+ "pop %esi\n"
+ "pop %edx\n"
+ "pop %ecx\n"
+ "pop %ebx\n"
+
+ // Return to caller
+ "ret\n"
+
+ // Call default handler.
+ "9:call playground$defaultSystemCallHandler\n"
+ "10:add $28, %esp\n" // drop system call number and six arguments
+ "jmp 8b\n"
+
+ ".pushsection \".bss\"\n"
+ ".balign 8\n"
+"100:.byte 0, 0, 0, 0\n" // cached gettimeofday() value, low 32 bits
+"101:.byte 0, 0, 0, 0\n" // cached gettimeofday() value, high 32 bits
+ ".popsection\n"
+
+ #else
+ #error Unsupported target platform
+ #endif
+ ".size playground$syscallWrapper, .-playground$syscallWrapper\n"
+ ".popsection\n"
+);
+
+
+void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,
+                                        void* arg2, void* arg3, void* arg4,
+                                        void* arg5) {
+  // Fallback for calls without a dedicated handler: read()/write() are run
+  // directly (failure => negated my_errno), UNRESTRICTED_SYSCALL table entries
+  // are forwarded via threadFdPub(), and all others get -ENOSYS unless
+  // debugging is enabled, in which case they are forwarded as well.
+  //
+  // TODO(markus): The following comment is currently not true, we do
+  //               intercept these system calls. Try to fix that.
+  //
+  // We try to avoid intercepting read() and write(), as Seccomp mode does not
+  // restrict them. Depending on the exact instruction sequence in libc, it is
+  // not always possible to filter them out reliably at instrumentation time.
+  SysCalls sys;
+  long long tm;
+  if (syscallNum == __NR_read || syscallNum == __NR_write) {
+    Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
+    long result = (syscallNum == __NR_read)
+                      ? sys.read((long)arg0, arg1, (size_t)arg2)
+                      : sys.write((long)arg0, arg1, (size_t)arg2);
+    if (result < 0) {
+      result = -sys.my_errno;
+    }
+    Debug::elapsed(tm, syscallNum);
+    return (void *)result;
+  }
+
+  // In debug mode, prevent stderr from being closed.
+  if (Debug::isEnabled() && syscallNum == __NR_close && arg0 == (void *)2) {
+    return 0;
+  }
+
+  const bool unrestricted =
+      (unsigned)syscallNum <= maxSyscall &&
+      syscallTable[syscallNum].handler == UNRESTRICTED_SYSCALL;
+  if (unrestricted) {
+    Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
+  } else if (Debug::isEnabled()) {
+    Debug::syscall(&tm, syscallNum,
+                   "In production mode, this call would be disallowed");
+  } else {
+    return (void *)-ENOSYS;
+  }
+
+  // Forward the request and block until the reply has been read back.
+  struct {
+    int   sysnum;
+    void* unrestricted_req[6];
+  } __attribute__((packed)) request = {
+    syscallNum, { arg0, arg1, arg2, arg3, arg4, arg5 } };
+  const int fd = threadFdPub();
+  void* reply;
+  if (write(sys, fd, &request, sizeof(request)) != sizeof(request) ||
+      read(sys, fd, &reply, sizeof(reply)) != sizeof(reply)) {
+    die("Failed to forward unrestricted system call");
+  }
+  Debug::elapsed(tm, syscallNum);
+  return reply;
+}
+
+} // namespace