author     markus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2012-11-22 03:51:04 +0000
committer  markus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2012-11-22 03:51:04 +0000
commit     4149acf3704cb1aad4b8fca7dc530eefa995066b (patch)
tree       1497718f31512a6dda8764f73bad936efbecb9d6 /sandbox
parent     bfa268b7919819d2ca99aaa1d359cd7be0ab19a1 (diff)
SECCOMP-BPF: Added support for greylisting of system calls.
In addition to a Sandbox::Trap() handler, we now have a Sandbox::UnsafeTrap() handler. This feature should only be used for debugging purposes, as it subverts the security of the sandbox, but it is useful for tracking down problems with the sandboxing policy. Within an unsafe trap handler, all sandbox restrictions are lifted. For example, this lets us allow system calls that would normally be denied by the policy, while logging their arguments, return value, and call stack.

N.B.: this is the second attempt at submitting this CL. See https://chromiumcodereview.appspot.com/11363212/ for previous code reviews.

BUG=130662
TEST=sandbox_linux_unittests
NOTRY=true

Review URL: https://chromiumcodereview.appspot.com/11419121

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@169213 0039d316-1c4b-4281-b951-d872f2087c98
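For illustration, a minimal sketch of how a policy might use the new UnsafeTrap()/ForwardSyscall() API. It is not part of this CL; the names LogAndForward and GreylistOpenPolicy are made up here, and the structure mirrors the GreyListedPolicy and PrctlPolicy tests added to sandbox_bpf_unittest.cc below.

// Sketch only: greylist open() by forwarding it from an unsafe trap handler.
#include <asm/unistd.h>
#include <errno.h>

#include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
#include "sandbox/linux/seccomp-bpf/syscall.h"

using playground2::ErrorCode;
using playground2::Sandbox;
using playground2::SandboxSyscall;

// Matches the TrapFnc signature declared in sandbox_bpf.h; runs in signal
// context, so only async-signal-safe operations are used here.
static intptr_t LogAndForward(const struct arch_seccomp_data& args, void *aux) {
  // Inside an UnsafeTrap() handler all filtering is temporarily lifted, so we
  // can make otherwise-denied system calls, e.g. to log the event to stderr.
  static const char msg[] = "greylisted syscall\n";
  SandboxSyscall(__NR_write, 2, msg, sizeof(msg) - 1);
  // ForwardSyscall() re-issues the original system call and returns a
  // kernel-style result that is directly usable as the handler's return value.
  return Sandbox::ForwardSyscall(args);
}

static ErrorCode GreylistOpenPolicy(int sysno, void *aux) {
  // UnsafeTrap() requires sigreturn() and sigprocmask() to be allowed
  // unconditionally; installFilter() refuses the policy otherwise.
  if (sysno == __NR_rt_sigprocmask || sysno == __NR_rt_sigreturn
#if defined(__NR_sigprocmask)
      || sysno == __NR_sigprocmask
#endif
#if defined(__NR_sigreturn)
      || sysno == __NR_sigreturn
#endif
      ) {
    return ErrorCode(ErrorCode::ERR_ALLOWED);
  } else if (sysno == __NR_open) {
    return Sandbox::UnsafeTrap(LogAndForward, NULL);  // Debugging only!
  } else if (Sandbox::isValidSyscallNumber(sysno)) {
    return ErrorCode(ErrorCode::ERR_ALLOWED);
  } else {
    return ErrorCode(ENOSYS);
  }
}

// The policy is installed as usual, e.g. with
// Sandbox::setSandboxPolicy(GreylistOpenPolicy, NULL) before starting the
// sandbox; a single UnsafeTrap() rule marks the whole sandbox as unsafe.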
Diffstat (limited to 'sandbox')
-rw-r--r--sandbox/linux/sandbox_linux.gypi3
-rw-r--r--sandbox/linux/seccomp-bpf/Makefile2
-rw-r--r--sandbox/linux/seccomp-bpf/codegen.cc36
-rw-r--r--sandbox/linux/seccomp-bpf/codegen.h9
-rw-r--r--sandbox/linux/seccomp-bpf/die.cc22
-rw-r--r--sandbox/linux/seccomp-bpf/die.h12
-rw-r--r--sandbox/linux/seccomp-bpf/errorcode.cc4
-rw-r--r--sandbox/linux/seccomp-bpf/errorcode.h3
-rw-r--r--sandbox/linux/seccomp-bpf/sandbox_bpf.cc340
-rw-r--r--sandbox/linux/seccomp-bpf/sandbox_bpf.h71
-rw-r--r--sandbox/linux/seccomp-bpf/sandbox_bpf_unittest.cc215
-rw-r--r--sandbox/linux/seccomp-bpf/syscall.cc288
-rw-r--r--sandbox/linux/seccomp-bpf/syscall.h23
-rw-r--r--sandbox/linux/seccomp-bpf/syscall_unittest.cc113
14 files changed, 1074 insertions, 67 deletions
diff --git a/sandbox/linux/sandbox_linux.gypi b/sandbox/linux/sandbox_linux.gypi
index c02cd31..535fb89 100644
--- a/sandbox/linux/sandbox_linux.gypi
+++ b/sandbox/linux/sandbox_linux.gypi
@@ -58,6 +58,7 @@
'seccomp-bpf/errorcode_unittest.cc',
'seccomp-bpf/sandbox_bpf_unittest.cc',
'seccomp-bpf/syscall_iterator_unittest.cc',
+ 'seccomp-bpf/syscall_unittest.cc',
],
}],
],
@@ -77,6 +78,8 @@
'seccomp-bpf/instruction.h',
'seccomp-bpf/sandbox_bpf.cc',
'seccomp-bpf/sandbox_bpf.h',
+ 'seccomp-bpf/syscall.cc',
+ 'seccomp-bpf/syscall.h',
'seccomp-bpf/syscall_iterator.cc',
'seccomp-bpf/syscall_iterator.h',
'seccomp-bpf/verifier.cc',
diff --git a/sandbox/linux/seccomp-bpf/Makefile b/sandbox/linux/seccomp-bpf/Makefile
index a697198..6d644b8 100644
--- a/sandbox/linux/seccomp-bpf/Makefile
+++ b/sandbox/linux/seccomp-bpf/Makefile
@@ -2,7 +2,7 @@ DEF_CFLAGS = -g -O3 -Wall -Werror -Wextra -Wno-missing-field-initializers -fPIC
DEF_CPPFLAGS = -D_GNU_SOURCE -DSECCOMP_BPF_STANDALONE -DSECCOMP_BPF_VALGRIND_HACKS -include valgrind/valgrind.h -iquote ../../..
DEF_LDFLAGS = -g -lpthread
DEPFLAGS = -MMD -MF .$@.d
-MODS := demo sandbox_bpf die codegen errorcode syscall_iterator util verifier
+MODS := demo sandbox_bpf basicblock codegen die errorcode syscall syscall_iterator util verifier
OBJS64 := $(shell echo ${MODS} | xargs -n 1 | sed -e 's/$$/.o64/')
OBJS32 := $(shell echo ${MODS} | xargs -n 1 | sed -e 's/$$/.o32/')
ALL_OBJS = $(OBJS32) $(OBJS64)
diff --git a/sandbox/linux/seccomp-bpf/codegen.cc b/sandbox/linux/seccomp-bpf/codegen.cc
index 8b36315..649793c 100644
--- a/sandbox/linux/seccomp-bpf/codegen.cc
+++ b/sandbox/linux/seccomp-bpf/codegen.cc
@@ -5,6 +5,31 @@
#include "sandbox/linux/seccomp-bpf/codegen.h"
+namespace {
+
+// Helper function for Traverse().
+void TraverseRecursively(std::set<playground2::Instruction *> *visited,
+ playground2::Instruction *instruction) {
+ if (visited->find(instruction) == visited->end()) {
+ visited->insert(instruction);
+ switch (BPF_CLASS(instruction->code)) {
+ case BPF_JMP:
+ if (BPF_OP(instruction->code) != BPF_JA) {
+ TraverseRecursively(visited, instruction->jf_ptr);
+ }
+ TraverseRecursively(visited, instruction->jt_ptr);
+ break;
+ case BPF_RET:
+ break;
+ default:
+ TraverseRecursively(visited, instruction->next);
+ break;
+ }
+ }
+}
+
+} // namespace
+
namespace playground2 {
CodeGen::CodeGen()
@@ -145,6 +170,17 @@ void CodeGen::JoinInstructions(Instruction *head, Instruction *tail) {
return;
}
+void CodeGen::Traverse(Instruction *instruction,
+ void (*fnc)(Instruction *, void *), void *aux) {
+ std::set<Instruction *> visited;
+ TraverseRecursively(&visited, instruction);
+ for (std::set<Instruction *>::const_iterator iter = visited.begin();
+ iter != visited.end();
+ ++iter) {
+ fnc(*iter, aux);
+ }
+}
+
void CodeGen::FindBranchTargets(const Instruction& instructions,
BranchTargets *branch_targets) {
// Follow all possible paths through the "instructions" graph and compute
diff --git a/sandbox/linux/seccomp-bpf/codegen.h b/sandbox/linux/seccomp-bpf/codegen.h
index b7d1d39..88521c2 100644
--- a/sandbox/linux/seccomp-bpf/codegen.h
+++ b/sandbox/linux/seccomp-bpf/codegen.h
@@ -77,6 +77,15 @@ class CodeGen {
// or if a (conditional) jump still has an unsatisfied target.
void JoinInstructions(Instruction *head, Instruction *tail);
+ // Traverse the graph of instructions and visit each instruction once.
+ // Traversal order is implementation-defined. It is acceptable to make
+ // changes to the graph from within the callback function. These changes
+ // do not affect traversal.
+ // The "fnc" function gets called with both the instruction and the opaque
+ // "aux" pointer.
+ void Traverse(Instruction *, void (*fnc)(Instruction *, void *aux),
+ void *aux);
+
// Compiles the graph of instructions into a BPF program that can be passed
// to the kernel. Please note that this function modifies the graph in place
// and must therefore only be called once per graph.
diff --git a/sandbox/linux/seccomp-bpf/die.cc b/sandbox/linux/seccomp-bpf/die.cc
index b141424..92ffa2a 100644
--- a/sandbox/linux/seccomp-bpf/die.cc
+++ b/sandbox/linux/seccomp-bpf/die.cc
@@ -5,6 +5,7 @@
#include <string>
#include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
+#include "sandbox/linux/seccomp-bpf/syscall.h"
namespace playground2 {
@@ -15,7 +16,7 @@ void Die::ExitGroup() {
// Especially, since we are dealing with system call filters. Continuing
// execution would be very bad in most cases where ExitGroup() gets called.
// So, we'll try a few other strategies too.
- syscall(__NR_exit_group, 1);
+ SandboxSyscall(__NR_exit_group, 1);
// We have no idea what our run-time environment looks like. So, signal
// handlers might or might not do the right thing. Try to reset settings
@@ -23,7 +24,7 @@ void Die::ExitGroup() {
// succeeded in doing so. Nonetheless, triggering a fatal signal could help
// us terminate.
signal(SIGSEGV, SIG_DFL);
- syscall(__NR_prctl, PR_SET_DUMPABLE, (void *)0, (void *)0, (void *)0);
+ SandboxSyscall(__NR_prctl, PR_SET_DUMPABLE, (void *)0, (void *)0, (void *)0);
if (*(volatile char *)0) { }
// If there is no way for us to ask for the program to exit, the next
@@ -32,7 +33,7 @@ void Die::ExitGroup() {
// We in fact retry the system call inside of our loop so that it will
// stand out when somebody tries to diagnose the problem by using "strace".
for (;;) {
- syscall(__NR_exit_group, 1);
+ SandboxSyscall(__NR_exit_group, 1);
}
}
@@ -49,6 +50,16 @@ void Die::SandboxDie(const char *msg, const char *file, int line) {
ExitGroup();
}
+void Die::SandboxInfo(const char *msg, const char *file, int line) {
+ if (!suppress_info_) {
+ #if defined(SECCOMP_BPF_STANDALONE)
+ Die::LogToStderr(msg, file, line);
+ #else
+ logging::LogMessage(file, line, logging::LOG_INFO).stream() << msg;
+ #endif
+ }
+}
+
void Die::LogToStderr(const char *msg, const char *file, int line) {
if (msg) {
char buf[40];
@@ -57,10 +68,11 @@ void Die::LogToStderr(const char *msg, const char *file, int line) {
// No need to loop. Short write()s are unlikely and if they happen we
// probably prefer them over a loop that blocks.
- if (HANDLE_EINTR(write(2, s.c_str(), s.length()))) { }
+ if (HANDLE_EINTR(SandboxSyscall(__NR_write, 2, s.c_str(), s.length()))) { }
}
}
-bool Die::simple_exit_ = false;
+bool Die::simple_exit_ = false;
+bool Die::suppress_info_ = false;
} // namespace
diff --git a/sandbox/linux/seccomp-bpf/die.h b/sandbox/linux/seccomp-bpf/die.h
index 608afde..c0ad8fd 100644
--- a/sandbox/linux/seccomp-bpf/die.h
+++ b/sandbox/linux/seccomp-bpf/die.h
@@ -13,6 +13,9 @@ class Die {
// exits with a fatal error.
#define SANDBOX_DIE(m) Die::SandboxDie(m, __FILE__, __LINE__)
+ // Adds an informational message to the log file or stderr as appropriate.
+ #define SANDBOX_INFO(m) Die::SandboxInfo(m, __FILE__, __LINE__)
+
// Terminate the program, even if the current sandbox policy prevents some
// of the more commonly used functions used for exiting.
// Most users would want to call SANDBOX_DIE() instead, as it logs extra
@@ -25,6 +28,10 @@ class Die {
static void SandboxDie(const char *msg, const char *file, int line)
__attribute__((noreturn));
+ // This method gets called by SANDBOX_INFO(). There is normally no reason
+ // to call it directly unless you are defining your own logging macro.
+ static void SandboxInfo(const char *msg, const char *file, int line);
+
// Writes a message to stderr. Used as a fall-back choice, if we don't have
// any other way to report an error.
static void LogToStderr(const char *msg, const char *file, int line);
@@ -36,8 +43,13 @@ class Die {
// unit tests or in the supportsSeccompSandbox() method).
static void EnableSimpleExit() { simple_exit_ = true; }
+ // Sometimes we need to disable all informational messages (e.g. from within
+ // unittests).
+ static void SuppressInfoMessages(bool flag) { suppress_info_ = flag; }
+
private:
static bool simple_exit_;
+ static bool suppress_info_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Die);
};
diff --git a/sandbox/linux/seccomp-bpf/errorcode.cc b/sandbox/linux/seccomp-bpf/errorcode.cc
index cc79cb6..4d21b792 100644
--- a/sandbox/linux/seccomp-bpf/errorcode.cc
+++ b/sandbox/linux/seccomp-bpf/errorcode.cc
@@ -22,10 +22,12 @@ ErrorCode::ErrorCode(int err) {
}
}
-ErrorCode::ErrorCode(ErrorCode::TrapFnc fnc, const void *aux, uint16_t id)
+ErrorCode::ErrorCode(ErrorCode::TrapFnc fnc, const void *aux, bool safe,
+ uint16_t id)
: error_type_(ET_TRAP),
fnc_(fnc),
aux_(const_cast<void *>(aux)),
+ safe_(safe),
err_(SECCOMP_RET_TRAP + id) {
}
diff --git a/sandbox/linux/seccomp-bpf/errorcode.h b/sandbox/linux/seccomp-bpf/errorcode.h
index 2b941ee..d2661db 100644
--- a/sandbox/linux/seccomp-bpf/errorcode.h
+++ b/sandbox/linux/seccomp-bpf/errorcode.h
@@ -94,7 +94,7 @@ class ErrorCode {
// If we are wrapping a callback, we must assign a unique id. This id is
// how the kernel tells us which one of our different SECCOMP_RET_TRAP
// cases has been triggered.
- ErrorCode(TrapFnc fnc, const void *aux, uint16_t id);
+ ErrorCode(TrapFnc fnc, const void *aux, bool safe, uint16_t id);
// Some system calls require inspection of arguments. This constructor
// allows us to specify additional constraints.
@@ -108,6 +108,7 @@ class ErrorCode {
struct {
TrapFnc fnc_; // Callback function and arg, if trap was
void *aux_; // triggered by the kernel's BPF filter.
+ bool safe_; // Keep sandbox active while calling fnc_()
};
// Fields needed when inspecting additional arguments.
diff --git a/sandbox/linux/seccomp-bpf/sandbox_bpf.cc b/sandbox/linux/seccomp-bpf/sandbox_bpf.cc
index eb03995..ff855b8 100644
--- a/sandbox/linux/seccomp-bpf/sandbox_bpf.cc
+++ b/sandbox/linux/seccomp-bpf/sandbox_bpf.cc
@@ -2,8 +2,27 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include <endian.h>
+#if __BYTE_ORDER == __BIG_ENDIAN
+// The BPF "struct seccomp_data" layout has to deal with storing 64bit
+// values that need to be inspected by a virtual machine that only ever
+// operates on 32bit values. The kernel developers decided how values
+// should be split into two 32bit words to achieve this goal. But at this
+// time, there is no existing BPF implementation in the kernel that uses
+// 64bit big endian values. So, all we have to go by is the consensus
+// from a discussion on LKML. Actual implementations, if and when they
+// happen, might very well differ.
+// If this code is ever going to be used with such a kernel, you should
+// disable the "#error" and carefully test the code (e.g. run the unit
+// tests). If things don't work, search for all occurrences of __BYTE_ORDER
+// and verify that the proposed implementation agrees with what the kernel
+// actually does.
+#error Big endian operation is untested and expected to be broken
+#endif
+
#include "sandbox/linux/seccomp-bpf/codegen.h"
#include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
+#include "sandbox/linux/seccomp-bpf/syscall.h"
#include "sandbox/linux/seccomp-bpf/syscall_iterator.h"
#include "sandbox/linux/seccomp-bpf/verifier.h"
@@ -18,6 +37,30 @@ void WriteFailedStderrSetupMessage(int out_fd) {
}
}
+// We need to tell whether we are performing a "normal" callback, or
+// whether we were called recursively from within an UnsafeTrap() callback.
+// This is a little tricky to do, because we need to somehow get access to
+// per-thread data from within a signal context. Normal TLS storage is not
+// safely accessible at this time. We could roll our own, but that involves
+// a lot of complexity. Instead, we co-opt one bit in the signal mask.
+// If SIGBUS is blocked, we assume that we have been called recursively.
+// There is a possibility for collision with other code that needs to do
+// this, but in practice the risks are low.
+// If SIGBUS turns out to be a problem, we could instead co-opt one of the
+// realtime signals. There are plenty of them. Unfortunately, there is no
+// way to mark a signal as allocated. So, the potential for collision is
+// possibly even worse.
+bool GetIsInSigHandler(const ucontext_t *ctx) {
+ return sigismember(&ctx->uc_sigmask, SIGBUS);
+}
+
+void SetIsInSigHandler() {
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ sigprocmask(SIG_BLOCK, &mask, NULL);
+}
+
} // namespace
// The kernel gives us a sandbox, we turn it into a playground :-)
@@ -319,6 +362,48 @@ void Sandbox::policySanityChecks(EvaluateSyscall syscallEvaluator,
return;
}
+void Sandbox::CheckForUnsafeErrorCodes(Instruction *insn, void *aux) {
+ if (BPF_CLASS(insn->code) == BPF_RET &&
+ insn->k > SECCOMP_RET_TRAP &&
+ insn->k - SECCOMP_RET_TRAP <= trapArraySize_) {
+ const ErrorCode& err = trapArray_[insn->k - SECCOMP_RET_TRAP - 1];
+ if (!err.safe_) {
+ bool *is_unsafe = static_cast<bool *>(aux);
+ *is_unsafe = true;
+ }
+ }
+}
+
+void Sandbox::RedirectToUserspace(Instruction *insn, void *aux) {
+ // When inside an UnsafeTrap() callback, we want to allow all system calls.
+ // This means, we must conditionally disable the sandbox -- and that's not
+ // something that kernel-side BPF filters can do, as they cannot inspect
+ // any state other than the syscall arguments.
+ // But if we redirect all error handlers to user-space, then we can easily
+ // make this decision.
+ // The performance penalty for this extra round-trip to user-space is not
+ // actually that bad, as we only ever pay it for denied system calls; and a
+ // typical program has very few of these.
+ if (BPF_CLASS(insn->code) == BPF_RET &&
+ (insn->k & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
+ insn->k = Trap(ReturnErrno,
+ reinterpret_cast<void *>(insn->k & SECCOMP_RET_DATA)).err();
+ }
+}
+
+ErrorCode Sandbox::RedirectToUserspaceEvalWrapper(int sysnum, void *aux) {
+ // We need to replicate the behavior of RedirectToUserspace(), so that our
+ // Verifier can still work correctly.
+ Evaluators *evaluators = reinterpret_cast<Evaluators *>(aux);
+ const std::pair<EvaluateSyscall, void *>& evaluator = *evaluators->begin();
+ ErrorCode err = evaluator.first(sysnum, evaluator.second);
+ if ((err.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
+ return Trap(ReturnErrno,
+ reinterpret_cast<void *>(err.err() & SECCOMP_RET_DATA));
+ }
+ return err;
+}
+
void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, void *aux) {
if (status_ == STATUS_ENABLED) {
SANDBOX_DIE("Cannot change policy after sandbox has started");
@@ -337,8 +422,8 @@ void Sandbox::installFilter(bool quiet) {
// Set new SIGSYS handler
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
- sa.sa_sigaction = &sigSys;
- sa.sa_flags = SA_SIGINFO;
+ sa.sa_sigaction = sigSys;
+ sa.sa_flags = SA_SIGINFO | SA_NODEFER;
if (sigaction(SIGSYS, &sa, NULL) < 0) {
goto filter_failed;
}
@@ -369,33 +454,13 @@ void Sandbox::installFilter(bool quiet) {
Instruction *head =
gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
offsetof(struct arch_seccomp_data, arch),
- gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH,
tail =
- // Grab the system call number, so that we can implement jump tables.
- gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
- offsetof(struct arch_seccomp_data, nr)),
+ gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH,
+ NULL,
gen->MakeInstruction(BPF_RET+BPF_K,
Kill(
"Invalid audit architecture in BPF filter").err_)));
- // On Intel architectures, verify that system call numbers are in the
- // expected number range. The older i386 and x86-64 APIs clear bit 30
- // on all system calls. The newer x32 API always sets bit 30.
-#if defined(__i386__) || defined(__x86_64__)
- Instruction *invalidX32 =
- gen->MakeInstruction(BPF_RET+BPF_K,
- Kill("Illegal mixing of system call ABIs").err_);
- Instruction *checkX32 =
-#if defined(__x86_64__) && defined(__ILP32__)
- gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32);
-#else
- gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0);
-#endif
- gen->JoinInstructions(tail, checkX32);
- tail = checkX32;
-#endif
-
-
{
// Evaluate all possible system calls and group their ErrorCodes into
// ranges of identical codes.
@@ -406,6 +471,109 @@ void Sandbox::installFilter(bool quiet) {
Instruction *jumptable =
assembleJumpTable(gen, ranges.begin(), ranges.end());
+ // If there is at least one UnsafeTrap() in our program, the entire sandbox
+ // is unsafe. We need to modify the program so that all non-
+ // SECCOMP_RET_ALLOW ErrorCodes are handled in user-space. This will then
+ // allow us to temporarily disable sandboxing rules inside of callbacks to
+ // UnsafeTrap().
+ has_unsafe_traps_ = false;
+ gen->Traverse(jumptable, CheckForUnsafeErrorCodes, &has_unsafe_traps_);
+
+ // Grab the system call number, so that we can implement jump tables.
+ Instruction *load_nr =
+ gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
+ offsetof(struct arch_seccomp_data, nr));
+
+ // If our BPF program has unsafe jumps, enable support for them. This
+ // test happens very early in the BPF filter program, even before we
+ // consider looking at system call numbers.
+ // As support for unsafe jumps essentially defeats all the security
+ // measures that the sandbox provides, we print a big warning message --
+ // and of course, we make sure to only ever enable this feature if it
+ // is actually requested by the sandbox policy.
+ if (has_unsafe_traps_) {
+ if (SandboxSyscall(-1) == -1 && errno == ENOSYS) {
+ SANDBOX_DIE("Support for UnsafeTrap() has not yet been ported to this "
+ "architecture");
+ }
+
+ EvaluateSyscall evaluateSyscall = evaluators_.begin()->first;
+ void *aux = evaluators_.begin()->second;
+ if (!evaluateSyscall(__NR_rt_sigprocmask, aux).
+ Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) ||
+ !evaluateSyscall(__NR_rt_sigreturn, aux).
+ Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
+#if defined(__NR_sigprocmask)
+ || !evaluateSyscall(__NR_sigprocmask, aux).
+ Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
+#endif
+#if defined(__NR_sigreturn)
+ || !evaluateSyscall(__NR_sigreturn, aux).
+ Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
+#endif
+ ) {
+ SANDBOX_DIE("Invalid seccomp policy; if using UnsafeTrap(), you must "
+ "unconditionally allow sigreturn() and sigprocmask()");
+ }
+
+ SANDBOX_INFO("WARNING! Disabling sandbox for debugging purposes");
+ gen->Traverse(jumptable, RedirectToUserspace, NULL);
+
+ // Allow system calls, if they originate from our magic return address
+ // (which we can query by calling SandboxSyscall(-1)).
+ uintptr_t syscall_entry_point =
+ static_cast<uintptr_t>(SandboxSyscall(-1));
+ uint32_t low = static_cast<uint32_t>(syscall_entry_point);
+#if __SIZEOF_POINTER__ > 4
+ uint32_t hi = static_cast<uint32_t>(syscall_entry_point >> 32);
+#endif
+
+ // BPF cannot do native 64bit comparisons. On 64bit architectures, we
+ // have to compare both 32bit halves of the instruction pointer. If they
+ // match what we expect, we return ERR_ALLOWED. If either or both don't
+ // match, we continue evaluating the rest of the sandbox policy.
+ Instruction *escape_hatch =
+ gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
+ offsetof(struct arch_seccomp_data,
+ instruction_pointer) +
+ (__SIZEOF_POINTER__ > 4 &&
+ __BYTE_ORDER == __BIG_ENDIAN ? 4 : 0),
+ gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, low,
+#if __SIZEOF_POINTER__ > 4
+ gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
+ offsetof(struct arch_seccomp_data,
+ instruction_pointer) +
+ (__BYTE_ORDER == __BIG_ENDIAN ? 0 : 4),
+ gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, hi,
+#endif
+ gen->MakeInstruction(BPF_RET+BPF_K, ErrorCode(ErrorCode::ERR_ALLOWED)),
+#if __SIZEOF_POINTER__ > 4
+ load_nr)),
+#endif
+ load_nr));
+ gen->JoinInstructions(tail, escape_hatch);
+ } else {
+ gen->JoinInstructions(tail, load_nr);
+ }
+ tail = load_nr;
+
+ // On Intel architectures, verify that system call numbers are in the
+ // expected number range. The older i386 and x86-64 APIs clear bit 30
+ // on all system calls. The newer x32 API always sets bit 30.
+#if defined(__i386__) || defined(__x86_64__)
+ Instruction *invalidX32 =
+ gen->MakeInstruction(BPF_RET+BPF_K,
+ Kill("Illegal mixing of system call ABIs").err_);
+ Instruction *checkX32 =
+#if defined(__x86_64__) && defined(__ILP32__)
+ gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32);
+#else
+ gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0);
+#endif
+ gen->JoinInstructions(tail, checkX32);
+ tail = checkX32;
+#endif
+
// Append jump table to our pre-amble
gen->JoinInstructions(tail, jumptable);
}
@@ -419,9 +587,22 @@ void Sandbox::installFilter(bool quiet) {
// correctly. Otherwise, there is an internal error in our BPF compiler.
// There is really nothing the caller can do until the bug is fixed.
#ifndef NDEBUG
- const char *err = NULL;
- if (!Verifier::VerifyBPF(*program, evaluators_, &err)) {
- SANDBOX_DIE(err);
+ {
+ // If we previously rewrote the BPF program so that it calls user-space
+ // whenever we return an "errno" value from the filter, then we have to
+ // wrap our system call evaluator to perform the same operation. Otherwise,
+ // the verifier would also report a mismatch in return codes.
+ Evaluators redirected_evaluators;
+ redirected_evaluators.push_back(
+ std::make_pair(RedirectToUserspaceEvalWrapper, &evaluators_));
+
+ const char *err = NULL;
+ if (!Verifier::VerifyBPF(
+ *program,
+ has_unsafe_traps_ ? redirected_evaluators : evaluators_,
+ &err)) {
+ SANDBOX_DIE(err);
+ }
}
#endif
@@ -444,7 +625,6 @@ void Sandbox::installFilter(bool quiet) {
// Release memory that is no longer needed
evaluators_.clear();
- errMap_.clear();
#if defined(SECCOMP_BPF_VALGRIND_HACKS)
// Valgrind is really not happy about our sandbox. Disable it when running
@@ -561,27 +741,43 @@ void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) {
goto sigsys_err;
}
- // Copy the seccomp-specific data into a arch_seccomp_data structure. This
- // is what we are showing to TrapFnc callbacks that the system call evaluator
- // registered with the sandbox.
- struct arch_seccomp_data data = {
- sigsys.nr,
- SECCOMP_ARCH,
- reinterpret_cast<uint64_t>(sigsys.ip),
- {
- static_cast<uint64_t>(SECCOMP_PARM1(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM2(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM3(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM4(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM5(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM6(ctx))
+ intptr_t rc;
+ if (has_unsafe_traps_ && GetIsInSigHandler(ctx)) {
+ errno = old_errno;
+ if (sigsys.nr == __NR_clone) {
+ SANDBOX_DIE("Cannot call clone() from an UnsafeTrap() handler");
+ }
+ rc = SandboxSyscall(sigsys.nr,
+ SECCOMP_PARM1(ctx), SECCOMP_PARM2(ctx),
+ SECCOMP_PARM3(ctx), SECCOMP_PARM4(ctx),
+ SECCOMP_PARM5(ctx), SECCOMP_PARM6(ctx));
+ } else {
+ const ErrorCode& err = trapArray_[info->si_errno - 1];
+ if (!err.safe_) {
+ SetIsInSigHandler();
}
- };
- // Now call the TrapFnc callback associated with this particular instance
- // of SECCOMP_RET_TRAP.
- const ErrorCode& err = trapArray_[info->si_errno - 1];
- intptr_t rc = err.fnc_(data, err.aux_);
+ // Copy the seccomp-specific data into an arch_seccomp_data structure. This
+ // is what we are showing to TrapFnc callbacks that the system call
+ // evaluator registered with the sandbox.
+ struct arch_seccomp_data data = {
+ sigsys.nr,
+ SECCOMP_ARCH,
+ reinterpret_cast<uint64_t>(sigsys.ip),
+ {
+ static_cast<uint64_t>(SECCOMP_PARM1(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM2(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM3(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM4(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM5(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM6(ctx))
+ }
+ };
+
+ // Now call the TrapFnc callback associated with this particular instance
+ // of SECCOMP_RET_TRAP.
+ rc = err.fnc_(data, err.aux_);
+ }
// Update the CPU register that stores the return code of the system call
// that we just handled, and restore "errno" to the value that it had
@@ -592,10 +788,21 @@ void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) {
return;
}
-ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
+bool Sandbox::TrapKey::operator<(const Sandbox::TrapKey& o) const {
+ if (fnc != o.fnc) {
+ return fnc < o.fnc;
+ } else if (aux != o.aux) {
+ return aux < o.aux;
+ } else {
+ return safe < o.safe;
+ }
+}
+
+ErrorCode Sandbox::MakeTrap(ErrorCode::TrapFnc fnc, const void *aux,
+ bool safe) {
// Each unique pair of TrapFnc and auxiliary data make up a distinct instance
// of a SECCOMP_RET_TRAP.
- std::pair<ErrorCode::TrapFnc, const void *> key(fnc, aux);
+ TrapKey key(fnc, aux, safe);
TrapIds::const_iterator iter = trapIds_.find(key);
uint16_t id;
if (iter != trapIds_.end()) {
@@ -618,7 +825,7 @@ ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
}
id = traps_->size() + 1;
- traps_->push_back(ErrorCode(fnc, aux, id));
+ traps_->push_back(ErrorCode(fnc, aux, safe, id));
trapIds_[key] = id;
// We want to access the traps_ vector from our signal handler. But
@@ -629,10 +836,37 @@ ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
// signal handler, where we can safely do so.
trapArray_ = &(*traps_)[0];
trapArraySize_ = id;
+ return traps_->back();
}
- ErrorCode err = ErrorCode(fnc, aux, id);
- return errMap_[err.err()] = err;
+ return ErrorCode(fnc, aux, safe, id);
+}
+
+ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
+ return MakeTrap(fnc, aux, true /* Safe Trap */);
+}
+
+ErrorCode Sandbox::UnsafeTrap(ErrorCode::TrapFnc fnc, const void *aux) {
+ return MakeTrap(fnc, aux, false /* Unsafe Trap */);
+}
+
+intptr_t Sandbox::ForwardSyscall(const struct arch_seccomp_data& args) {
+ return SandboxSyscall(args.nr,
+ static_cast<intptr_t>(args.args[0]),
+ static_cast<intptr_t>(args.args[1]),
+ static_cast<intptr_t>(args.args[2]),
+ static_cast<intptr_t>(args.args[3]),
+ static_cast<intptr_t>(args.args[4]),
+ static_cast<intptr_t>(args.args[5]));
+}
+
+intptr_t Sandbox::ReturnErrno(const struct arch_seccomp_data&, void *aux) {
+ // TrapFnc functions report error by following the native kernel convention
+ // of returning an exit code in the range of -1..-4096. They do not try to
+ // set errno themselves. The glibc wrapper that triggered the SIGSYS will
+ // ultimately do so for us.
+ int err = reinterpret_cast<intptr_t>(aux) & SECCOMP_RET_DATA;
+ return -err;
}
intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) {
@@ -646,10 +880,10 @@ ErrorCode Sandbox::Kill(const char *msg) {
Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;
int Sandbox::proc_fd_ = -1;
Sandbox::Evaluators Sandbox::evaluators_;
-Sandbox::ErrMap Sandbox::errMap_;
Sandbox::Traps *Sandbox::traps_ = NULL;
Sandbox::TrapIds Sandbox::trapIds_;
ErrorCode *Sandbox::trapArray_ = NULL;
size_t Sandbox::trapArraySize_ = 0;
+ bool Sandbox::has_unsafe_traps_ = false;
} // namespace
diff --git a/sandbox/linux/seccomp-bpf/sandbox_bpf.h b/sandbox/linux/seccomp-bpf/sandbox_bpf.h
index 16ab1d3..5497963 100644
--- a/sandbox/linux/seccomp-bpf/sandbox_bpf.h
+++ b/sandbox/linux/seccomp-bpf/sandbox_bpf.h
@@ -207,6 +207,11 @@ class Sandbox {
// Please note that TrapFnc is executed from signal context and must be
// async-signal safe:
// http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_04.html
+ // Also note that it follows the calling convention of native system calls.
+ // In other words, it reports an error by returning an exit code in the
+ // range -1..-4096. It should not set errno when reporting errors; on the
+ // other hand, accidentally modifying errno is harmless and the changes will
+ // be undone afterwards.
typedef intptr_t (*TrapFnc)(const struct arch_seccomp_data& args, void *aux);
enum Operation {
@@ -271,6 +276,25 @@ class Sandbox {
// handler.
static ErrorCode Trap(ErrorCode::TrapFnc fnc, const void *aux);
+ // Calls a user-space trap handler and disables all sandboxing for system
+ // calls made from this trap handler.
+ // NOTE: This feature, by definition, disables all security features of
+ // the sandbox. It should never be used in production, but it can be
+ // very useful to diagnose code that is incompatible with the sandbox.
+ // If even a single system call returns "UnsafeTrap", the security of the
+ // entire sandbox should be considered compromised.
+ static ErrorCode UnsafeTrap(ErrorCode::TrapFnc fnc, const void *aux);
+
+ // From within an UnsafeTrap() it is often useful to be able to execute
+ // the system call that triggered the trap. The ForwardSyscall() method
+ // makes this easy. It is more efficient than calling glibc's syscall()
+ // function, as it avoids the extra round-trip to the signal handler. And
+ // it automatically does the correct thing to report kernel-style error
+ // conditions, rather than setting errno. See the comments for TrapFnc for
+ // details. In other words, the return value from ForwardSyscall() is
+ // directly suitable as a return value for a trap handler.
+ static intptr_t ForwardSyscall(const struct arch_seccomp_data& args);
+
// Kill the program and print an error message.
static ErrorCode Kill(const char *msg);
@@ -289,18 +313,29 @@ class Sandbox {
typedef std::vector<struct sock_filter> Program;
struct Range {
- Range(uint32_t f, uint32_t t, const ErrorCode& e) :
- from(f),
- to(t),
- err(e) {
+ Range(uint32_t f, uint32_t t, const ErrorCode& e)
+ : from(f),
+ to(t),
+ err(e) {
}
uint32_t from, to;
ErrorCode err;
};
+ struct TrapKey {
+ TrapKey(TrapFnc f, const void *a, bool s)
+ : fnc(f),
+ aux(a),
+ safe(s) {
+ }
+ TrapFnc fnc;
+ const void *aux;
+ bool safe;
+ bool operator<(const TrapKey&) const;
+ };
typedef std::vector<Range> Ranges;
typedef std::map<uint32_t, ErrorCode> ErrMap;
typedef std::vector<ErrorCode> Traps;
- typedef std::map<std::pair<TrapFnc, const void *>, int> TrapIds;
+ typedef std::map<TrapKey, uint16_t> TrapIds;
// Get a file descriptor pointing to "/proc", if currently available.
static int proc_fd() { return proc_fd_; }
@@ -320,23 +355,47 @@ class Sandbox {
static bool disableFilesystem();
static void policySanityChecks(EvaluateSyscall syscallEvaluator,
void *aux);
+
+ // Function that can be passed as a callback function to CodeGen::Traverse().
+ // Checks whether the "insn" returns an UnsafeTrap() ErrorCode. If so, it
+ // sets the "bool" variable pointed to by "aux".
+ static void CheckForUnsafeErrorCodes(Instruction *insn, void *aux);
+
+ // Function that can be passed as a callback function to CodeGen::Traverse().
+ // Checks whether the "insn" returns an errno value from a BPF filter. If so,
+ // it rewrites the instruction to instead call a Trap() handler that does
+ // the same thing. "aux" is ignored.
+ static void RedirectToUserspace(Instruction *insn, void *aux);
+
+ // Stackable wrapper around an Evaluators handler. Changes ErrorCodes
+ // returned by a system call evaluator to match the changes made by
+ // RedirectToUserspace(). "aux" should be pointer to wrapped system call
+ // evaluator.
+ static ErrorCode RedirectToUserspaceEvalWrapper(int sysnum, void *aux);
+
static void installFilter(bool quiet);
static void findRanges(Ranges *ranges);
static Instruction *assembleJumpTable(CodeGen *gen,
Ranges::const_iterator start,
Ranges::const_iterator stop);
static void sigSys(int nr, siginfo_t *info, void *void_context);
+ static ErrorCode MakeTrap(ErrorCode::TrapFnc fn, const void *aux, bool safe);
+
+ // A Trap() handler that returns an "errno" value. The value is encoded
+ // in the "aux" parameter.
+ static intptr_t ReturnErrno(const struct arch_seccomp_data&, void *aux);
+
static intptr_t bpfFailure(const struct arch_seccomp_data& data, void *aux);
static int getTrapId(TrapFnc fnc, const void *aux);
static SandboxStatus status_;
static int proc_fd_;
static Evaluators evaluators_;
- static ErrMap errMap_;
static Traps *traps_;
static TrapIds trapIds_;
static ErrorCode *trapArray_;
static size_t trapArraySize_;
+ static bool has_unsafe_traps_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Sandbox);
};
diff --git a/sandbox/linux/seccomp-bpf/sandbox_bpf_unittest.cc b/sandbox/linux/seccomp-bpf/sandbox_bpf_unittest.cc
index 23ab74c..72ca4c6 100644
--- a/sandbox/linux/seccomp-bpf/sandbox_bpf_unittest.cc
+++ b/sandbox/linux/seccomp-bpf/sandbox_bpf_unittest.cc
@@ -2,9 +2,13 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include <sys/prctl.h>
+#include <sys/utsname.h>
+
#include <ostream>
#include "sandbox/linux/seccomp-bpf/bpf_tests.h"
+#include "sandbox/linux/seccomp-bpf/syscall.h"
#include "sandbox/linux/seccomp-bpf/verifier.h"
#include "testing/gtest/include/gtest/gtest.h"
@@ -266,4 +270,215 @@ BPF_TEST(SandboxBpf, ArmPrivatePolicy, ArmPrivatePolicy) {
}
#endif // defined(__arm__)
+intptr_t CountSyscalls(const struct arch_seccomp_data& args, void *aux) {
+ // Count all invocations of our callback function.
+ ++*reinterpret_cast<int *>(aux);
+
+ // Verify that within the callback function all filtering is temporarily
+ // disabled.
+ BPF_ASSERT(syscall(__NR_getpid) > 1);
+
+ // Verify that we can now call the underlying system call without causing
+ // infinite recursion.
+ return Sandbox::ForwardSyscall(args);
+}
+
+ErrorCode GreyListedPolicy(int sysno, void *aux) {
+ // The use of UnsafeTrap() causes us to print a warning message. This is
+ // generally desirable, but it results in the unittest failing, as it doesn't
+ // expect any messages on "stderr". So, temporarily disable messages. The
+ // BPF_TEST() is guaranteed to turn messages back on, after the policy
+ // function has completed.
+ Die::SuppressInfoMessages(true);
+
+ // Some system calls must always be allowed, if our policy wants to make
+ // use of UnsafeTrap()
+ if (sysno == __NR_rt_sigprocmask ||
+ sysno == __NR_rt_sigreturn
+#if defined(__NR_sigprocmask)
+ || sysno == __NR_sigprocmask
+#endif
+#if defined(__NR_sigreturn)
+ || sysno == __NR_sigreturn
+#endif
+ ) {
+ return ErrorCode(ErrorCode::ERR_ALLOWED);
+ } else if (sysno == __NR_getpid) {
+ // Disallow getpid()
+ return ErrorCode(EPERM);
+ } else if (Sandbox::isValidSyscallNumber(sysno)) {
+ // Allow (and count) all other system calls.
+ return Sandbox::UnsafeTrap(CountSyscalls, aux);
+ } else {
+ return ErrorCode(ENOSYS);
+ }
+}
+
+BPF_TEST(SandboxBpf, GreyListedPolicy,
+ GreyListedPolicy, int /* BPF_AUX */) {
+ BPF_ASSERT(syscall(__NR_getpid) == -1);
+ BPF_ASSERT(errno == EPERM);
+ BPF_ASSERT(BPF_AUX == 0);
+ BPF_ASSERT(syscall(__NR_geteuid) == syscall(__NR_getuid));
+ BPF_ASSERT(BPF_AUX == 2);
+ char name[17] = { };
+ BPF_ASSERT(!syscall(__NR_prctl, PR_GET_NAME, name, (void *)NULL,
+ (void *)NULL, (void *)NULL));
+ BPF_ASSERT(BPF_AUX == 3);
+ BPF_ASSERT(*name);
+}
+
+intptr_t PrctlHandler(const struct arch_seccomp_data& args, void *) {
+ if (args.args[0] == PR_CAPBSET_DROP &&
+ static_cast<int>(args.args[1]) == -1) {
+ // prctl(PR_CAPBSET_DROP, -1) is never valid. The kernel will always
+ // return an error. But our handler allows this call.
+ return 0;
+ } else {
+ return Sandbox::ForwardSyscall(args);
+ }
+}
+
+ErrorCode PrctlPolicy(int sysno, void *aux) {
+ Die::SuppressInfoMessages(true);
+
+ if (sysno == __NR_prctl) {
+ // Handle prctl() inside an UnsafeTrap()
+ return Sandbox::UnsafeTrap(PrctlHandler, NULL);
+ } else if (Sandbox::isValidSyscallNumber(sysno)) {
+ // Allow all other system calls.
+ return ErrorCode(ErrorCode::ERR_ALLOWED);
+ } else {
+ return ErrorCode(ENOSYS);
+ }
+}
+
+BPF_TEST(SandboxBpf, ForwardSyscall, PrctlPolicy) {
+ // This call should never be allowed. But our policy will intercept it and
+ // let it pass successfully.
+ BPF_ASSERT(!prctl(PR_CAPBSET_DROP, -1, (void *)NULL, (void *)NULL,
+ (void *)NULL));
+
+ // Verify that the call will fail, if it makes it all the way to the kernel.
+ BPF_ASSERT(prctl(PR_CAPBSET_DROP, -2, (void *)NULL, (void *)NULL,
+ (void *)NULL) == -1);
+
+ // And verify that other uses of prctl() work just fine.
+ char name[17] = { };
+ BPF_ASSERT(!syscall(__NR_prctl, PR_GET_NAME, name, (void *)NULL,
+ (void *)NULL, (void *)NULL));
+ BPF_ASSERT(*name);
+
+ // Finally, verify that system calls other than prctl() are completely
+ // unaffected by our policy.
+ struct utsname uts = { };
+ BPF_ASSERT(!uname(&uts));
+ BPF_ASSERT(!strcmp(uts.sysname, "Linux"));
+}
+
+intptr_t AllowRedirectedSyscall(const struct arch_seccomp_data& args, void *) {
+ return Sandbox::ForwardSyscall(args);
+}
+
+ErrorCode RedirectAllSyscallsPolicy(int sysno, void *aux) {
+ Die::SuppressInfoMessages(true);
+
+ // Some system calls must always be allowed, if our policy wants to make
+ // use of UnsafeTrap()
+ if (sysno == __NR_rt_sigprocmask ||
+ sysno == __NR_rt_sigreturn
+#if defined(__NR_sigprocmask)
+ || sysno == __NR_sigprocmask
+#endif
+#if defined(__NR_sigreturn)
+ || sysno == __NR_sigreturn
+#endif
+ ) {
+ return ErrorCode(ErrorCode::ERR_ALLOWED);
+ } else if (Sandbox::isValidSyscallNumber(sysno)) {
+ return Sandbox::UnsafeTrap(AllowRedirectedSyscall, aux);
+ } else {
+ return ErrorCode(ENOSYS);
+ }
+}
+
+int bus_handler_fd_ = -1;
+
+void SigBusHandler(int, siginfo_t *info, void *void_context) {
+ BPF_ASSERT(write(bus_handler_fd_, "\x55", 1) == 1);
+}
+
+BPF_TEST(SandboxBpf, SigBus, RedirectAllSyscallsPolicy) {
+ // We use the SIGBUS bit in the signal mask as a thread-local boolean
+ // value in the implementation of UnsafeTrap(). This is obviously a bit
+ // of a hack that could conceivably interfere with code that uses SIGBUS
+ // in more traditional ways. This test verifies that basic functionality
+ // of SIGBUS is not impacted, but it is certainly possible to construe
+ // more complex uses of signals where our use of the SIGBUS mask is not
+ // 100% transparent. This is expected behavior.
+ int fds[2];
+ BPF_ASSERT(pipe(fds) == 0);
+ bus_handler_fd_ = fds[1];
+ struct sigaction sa = { };
+ sa.sa_sigaction = SigBusHandler;
+ sa.sa_flags = SA_SIGINFO;
+ BPF_ASSERT(sigaction(SIGBUS, &sa, NULL) == 0);
+ raise(SIGBUS);
+ char c = '\000';
+ BPF_ASSERT(read(fds[0], &c, 1) == 1);
+ BPF_ASSERT(close(fds[0]) == 0);
+ BPF_ASSERT(close(fds[1]) == 0);
+ BPF_ASSERT(c == 0x55);
+}
+
+BPF_TEST(SandboxBpf, SigMask, RedirectAllSyscallsPolicy) {
+ // Signal masks are potentially tricky to handle. For instance, if we
+ // ever tried to update them from inside a Trap() or UnsafeTrap() handler,
+ // the call to sigreturn() at the end of the signal handler would undo
+ // all of our efforts. So, it makes sense to test that sigprocmask()
+ // works, even if we have a policy in place that makes use of UnsafeTrap().
+ // In practice, this works because we force sigprocmask() to be handled
+ // entirely in the kernel.
+ sigset_t mask0, mask1, mask2;
+
+ // Call sigprocmask() to verify that SIGUSR1 wasn't blocked, if we didn't
+ // change the mask (it shouldn't have been, as it isn't blocked by default
+ // in POSIX).
+ sigemptyset(&mask0);
+ BPF_ASSERT(!sigprocmask(SIG_BLOCK, &mask0, &mask1));
+ BPF_ASSERT(!sigismember(&mask1, SIGUSR1));
+
+ // Try again, and this time we verify that we can block it. This
+ // requires a second call to sigprocmask().
+ sigaddset(&mask0, SIGUSR1);
+ BPF_ASSERT(!sigprocmask(SIG_BLOCK, &mask0, NULL));
+ BPF_ASSERT(!sigprocmask(SIG_BLOCK, NULL, &mask2));
+ BPF_ASSERT( sigismember(&mask2, SIGUSR1));
+}
+
+BPF_TEST(SandboxBpf, UnsafeTrapWithErrno, RedirectAllSyscallsPolicy) {
+ // An UnsafeTrap() (or for that matter, a Trap()) has to report error
+ // conditions by returning an exit code in the range -1..-4096. This
+ // should happen automatically if using ForwardSyscall(). If the TrapFnc()
+ // uses some other method to make system calls, then it is responsible
+ // for computing the correct return code.
+ // This test verifies that ForwardSyscall() does the correct thing.
+
+ // The glibc system wrapper will ultimately set errno for us. So, from normal
+ // userspace, all of this should be completely transparent.
+ errno = 0;
+ BPF_ASSERT(close(-1) == -1);
+ BPF_ASSERT(errno == EBADF);
+
+ // Explicitly avoid the glibc wrapper. This is not normally the way anybody
+ // would make system calls, but it allows us to verify that we don't
+ // accidentally mess with errno, when we shouldn't.
+ errno = 0;
+ struct arch_seccomp_data args = { };
+ args.nr = __NR_close;
+ args.args[0] = -1;
+ BPF_ASSERT(Sandbox::ForwardSyscall(args) == -EBADF);
+ BPF_ASSERT(errno == 0);
+}
+
} // namespace
diff --git a/sandbox/linux/seccomp-bpf/syscall.cc b/sandbox/linux/seccomp-bpf/syscall.cc
new file mode 100644
index 0000000..3f5b263
--- /dev/null
+++ b/sandbox/linux/seccomp-bpf/syscall.cc
@@ -0,0 +1,288 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <asm/unistd.h>
+#include <bits/wordsize.h>
+#include <errno.h>
+#include <stdarg.h>
+
+#include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
+#include "sandbox/linux/seccomp-bpf/syscall.h"
+
+
+namespace playground2 {
+
+ asm( // We need to be able to tell the kernel exactly where we made a
+ // system call. The C++ compiler likes to sometimes clone or
+ // inline code, which would inadvertently end up duplicating
+ // the entry point.
+ // "gcc" can suppress code duplication with suitable function
+ // attributes, but "clang" doesn't have this ability.
+ // The "clang" developer mailing list suggested that the correct
+ // and portable solution is a file-scope assembly block.
+ // N.B. We do mark our code as a proper function so that backtraces
+ // work correctly. But we make absolutely no attempt to use the
+ // ABI's calling conventions for passing arguments. We will only
+ // ever be called from assembly code and thus can pick more
+ // suitable calling conventions.
+#if defined(__i386__)
+ ".text\n"
+ ".align 16, 0x90\n"
+ ".type SyscallAsm, @function\n"
+ "SyscallAsm:.cfi_startproc\n"
+ // Check if "%eax" is negative. If so, do not attempt to make a
+ // system call. Instead, compute the return address that is visible
+ // to the kernel after we execute "int $0x80". This address can be
+ // used as a marker that BPF code inspects.
+ "test %eax, %eax\n"
+ "jge 1f\n"
+ // Always make sure that our code is position-independent, or
+ // address space randomization might not work on i386. This means,
+ // we can't use "lea", but instead have to rely on "call/pop".
+ "call 0f; .cfi_adjust_cfa_offset 4\n"
+ "0:pop %eax; .cfi_adjust_cfa_offset -4\n"
+ "addl $2f-0b, %eax\n"
+ "ret\n"
+ // Save registers that we don't want to clobber. On i386, we need to
+ // save relatively aggressively, as there are a couple of registers
+ // that are used internally (e.g. %ebx for position-independent
+ // code, and %ebp for the frame pointer), and as we need to keep at
+ // least a few registers available for the register allocator.
+ "1:push %esi; .cfi_adjust_cfa_offset 4\n"
+ "push %edi; .cfi_adjust_cfa_offset 4\n"
+ "push %ebx; .cfi_adjust_cfa_offset 4\n"
+ "push %ebp; .cfi_adjust_cfa_offset 4\n"
+ // Copy entries from the array holding the arguments into the
+ // correct CPU registers.
+ "movl 0(%edi), %ebx\n"
+ "movl 4(%edi), %ecx\n"
+ "movl 8(%edi), %edx\n"
+ "movl 12(%edi), %esi\n"
+ "movl 20(%edi), %ebp\n"
+ "movl 16(%edi), %edi\n"
+ // Enter the kernel.
+ "int $0x80\n"
+ // This is our "magic" return address that the BPF filter sees.
+ "2:"
+ // Restore any clobbered registers that we didn't declare to the
+ // compiler.
+ "pop %ebp; .cfi_adjust_cfa_offset -4\n"
+ "pop %ebx; .cfi_adjust_cfa_offset -4\n"
+ "pop %edi; .cfi_adjust_cfa_offset -4\n"
+ "pop %esi; .cfi_adjust_cfa_offset -4\n"
+ "ret\n"
+ ".cfi_endproc\n"
+ "9:.size SyscallAsm, 9b-SyscallAsm\n"
+#elif defined(__x86_64__)
+ ".text\n"
+ ".align 16, 0x90\n"
+ ".type SyscallAsm, @function\n"
+ "SyscallAsm:.cfi_startproc\n"
+ // Check if "%rax" is negative. If so, do not attempt to make a
+ // system call. Instead, compute the return address that is visible
+ // to the kernel after we execute "syscall". This address can be
+ // used as a marker that BPF code inspects.
+ "test %rax, %rax\n"
+ "jge 1f\n"
+ // Always make sure that our code is position-independent, or the
+ // linker will throw a hissy fit on x86-64.
+ "call 0f; .cfi_adjust_cfa_offset 8\n"
+ "0:pop %rax; .cfi_adjust_cfa_offset -8\n"
+ "addq $2f-0b, %rax\n"
+ "ret\n"
+ // We declared all clobbered registers to the compiler. On x86-64,
+ // there really isn't much of a problem with register pressure. So,
+ // we can go ahead and directly copy the entries from the arguments
+ // array into the appropriate CPU registers.
+ "1:movq 0(%r12), %rdi\n"
+ "movq 8(%r12), %rsi\n"
+ "movq 16(%r12), %rdx\n"
+ "movq 24(%r12), %r10\n"
+ "movq 32(%r12), %r8\n"
+ "movq 40(%r12), %r9\n"
+ // Enter the kernel.
+ "syscall\n"
+ // This is our "magic" return address that the BPF filter sees.
+ "2:ret\n"
+ ".cfi_endproc\n"
+ "9:.size SyscallAsm, 9b-SyscallAsm\n"
+#elif defined(__arm__)
+ // Throughout this file, we use the same mode (ARM vs. thumb)
+ // that the C++ compiler uses. This means, when transferring control
+ // from C++ to assembly code, we do not need to switch modes (e.g.
+ // by using the "bx" instruction). It also means that our assembly
+ // code should not be invoked directly from code that lives in
+ // other compilation units, as we don't bother implementing thumb
+ // interworking. That's OK, as we don't make any of the assembly
+ // symbols public. They are all local to this file.
+ ".text\n"
+ ".align 2\n"
+ ".type SyscallAsm, %function\n"
+#if defined(__thumb__)
+ ".thumb_func\n"
+#else
+ ".arm\n"
+#endif
+ "SyscallAsm:.fnstart\n"
+ "@ args = 0, pretend = 0, frame = 8\n"
+ "@ frame_needed = 1, uses_anonymous_args = 0\n"
+#if defined(__thumb__)
+ ".cfi_startproc\n"
+ "push {r7, lr}\n"
+ ".cfi_offset 14, -4\n"
+ ".cfi_offset 7, -8\n"
+ "mov r7, sp\n"
+ ".cfi_def_cfa_register 7\n"
+ ".cfi_def_cfa_offset 8\n"
+#else
+ "stmfd sp!, {fp, lr}\n"
+ "add fp, sp, #4\n"
+#endif
+ // Check if "r0" is negative. If so, do not attempt to make a
+ // system call. Instead, compute the return address that is visible
+ // to the kernel after we execute "swi 0". This address can be
+ // used as a marker that BPF code inspects.
+ "cmp r0, #0\n"
+ "bge 1f\n"
+ "ldr r0, =2f\n"
+ "b 2f\n"
+ // We declared (almost) all clobbered registers to the compiler. On
+ // ARM there is no particular register pressure. So, we can go
+ // ahead and directly copy the entries from the arguments array
+ // into the appropriate CPU registers.
+ "1:ldr r5, [r6, #20]\n"
+ "ldr r4, [r6, #16]\n"
+ "ldr r3, [r6, #12]\n"
+ "ldr r2, [r6, #8]\n"
+ "ldr r1, [r6, #4]\n"
+ "mov r7, r0\n"
+ "ldr r0, [r6, #0]\n"
+ // Enter the kernel
+ "swi 0\n"
+ // Restore the frame pointer. Also restore the program counter from
+ // the link register; this makes us return to the caller.
+#if defined(__thumb__)
+ "2:pop {r7, pc}\n"
+ ".cfi_endproc\n"
+#else
+ "2:ldmfd sp!, {fp, pc}\n"
+#endif
+ ".fnend\n"
+ "9:.size SyscallAsm, 9b-SyscallAsm\n"
+#endif
+ ); // asm
+
+intptr_t SandboxSyscall(int nr, ...) {
+ // It is most convenient for the caller to pass a variadic list of arguments.
+ // But this is difficult to handle in assembly code without making
+ // assumptions about internal implementation details of "va_list". So, we
+ // first use C code to copy all the arguments into an array, where they are
+ // easily accessible to asm().
+ // This is preferable over copying them into individual variables, which
+ // can result in too much register pressure.
+ if (sizeof(void *)*8 != __WORDSIZE) {
+ SANDBOX_DIE("This can't happen! "
+ "__WORDSIZE doesn't agree with actual size");
+ }
+ void *args[6];
+ va_list ap;
+
+ // System calls take a system call number (typically passed in %eax or
+ // %rax) and up to six arguments (passed in general-purpose CPU registers).
+ //
+ // On 32bit systems, all variadic arguments are passed on the stack as 32bit
+ // quantities. We can use an arbitrary 32bit type to retrieve them with
+ // va_arg() and then forward them to the kernel in the appropriate CPU
+ // register. We do not need to know whether this is an integer or a pointer
+ // value.
+ //
+ // On 64bit systems, variadic arguments can be either 32bit or 64bit wide,
+ // which would seem to make it more important that we pass the correct type
+ // to va_arg(). And we really can't know what this type is unless we have a
+ // table with function signatures for all system calls.
+ //
+ // Fortunately, on x86-64 this is less critical. The first six function
+ // arguments will be passed in CPU registers, no matter whether they were
+ // named or variadic. This only leaves us with a single argument (if present)
+ // that could be passed on the stack. And since x86-64 is little endian,
+ // it will have the correct value both for 32bit and 64bit quantities.
+ //
+ // N.B. Because of how the x86-64 ABI works, it is possible that 32bit
+ // quantities will have undefined garbage bits in the upper 32 bits of a
+ // 64bit register. This is relatively unlikely for the first five system
+ // call arguments, as the processor does automatic sign extensions and zero
+ // filling so frequently, there rarely is garbage in CPU registers. But it
+ // is quite likely for the last argument, which is passed on the stack.
+ // That's generally OK, because the kernel has the correct function
+ // signatures and knows to only inspect the LSB of a 32bit value.
+ // But callers must be careful in cases where the compiler cannot tell
+ // the difference (e.g. when passing NULL to any system call, it must
+ // always be cast to a pointer type).
+ // The glibc implementation of syscall() has the exact same issues.
+ // In the unlikely event that this ever becomes a problem, we could add
+ // code that handles six-argument system calls specially. The number of
+ // system calls that take six arguments and expect a 32bit value in the
+ // sixth argument is very limited.
+ va_start(ap, nr);
+ args[0] = va_arg(ap, void *);
+ args[1] = va_arg(ap, void *);
+ args[2] = va_arg(ap, void *);
+ args[3] = va_arg(ap, void *);
+ args[4] = va_arg(ap, void *);
+ args[5] = va_arg(ap, void *);
+ va_end(ap);
+
+ // Invoke our file-scope assembly code. The constraints have been picked
+ // carefully to match what the rest of the assembly code expects in input,
+ // output, and clobbered registers.
+#if defined(__i386__)
+ intptr_t ret = nr;
+ asm volatile(
+ "call SyscallAsm\n"
+ // N.B. These are not the calling conventions normally used by the ABI.
+ : "=a"(ret)
+ : "0"(ret), "D"(args)
+ : "esp", "memory", "ecx", "edx");
+#elif defined(__x86_64__)
+ intptr_t ret = nr;
+ {
+ register void **data __asm__("r12") = args;
+ asm volatile(
+ "call SyscallAsm\n"
+ // N.B. These are not the calling conventions normally used by the ABI.
+ : "=a"(ret)
+ : "0"(ret), "r"(data)
+ : "rsp", "memory",
+ "rcx", "rdi", "rsi", "rdx", "r8", "r9", "r10", "r11");
+ }
+#elif defined(__arm__)
+ intptr_t ret;
+ {
+ register intptr_t inout __asm__("r0") = nr;
+ register void **data __asm__("r6") = args;
+ asm volatile(
+ "bl SyscallAsm\n"
+ // N.B. These are not the calling conventions normally used by the ABI.
+ : "=r"(inout)
+ : "0"(inout), "r"(data)
+ : "lr", "memory", "r1", "r2", "r3", "r4", "r5"
+#if !defined(__thumb__)
+ // In thumb mode, we cannot use "r7" as a general purpose register, as
+ // it is our frame pointer. We have to manually manage and preserve it.
+ // In ARM mode, we have a dedicated frame pointer register and "r7" is
+ // thus available as a general purpose register. We don't preserve it,
+ // but instead mark it as clobbered.
+ , "r7"
+#endif
+ );
+ ret = inout;
+ }
+#else
+ errno = ENOSYS;
+ intptr_t ret = -1;
+#endif
+ return ret;
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp-bpf/syscall.h b/sandbox/linux/seccomp-bpf/syscall.h
new file mode 100644
index 0000000..932e398
--- /dev/null
+++ b/sandbox/linux/seccomp-bpf/syscall.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef SANDBOX_LINUX_SECCOMP_BPF_SYSCALL_H__
+#define SANDBOX_LINUX_SECCOMP_BPF_SYSCALL_H__
+
+#include <signal.h>
+#include <stdint.h>
+
+namespace playground2 {
+
+// We have to make sure that we have a single "magic" return address for
+// our system calls, which we can check from within a BPF filter. This
+// works by writing a little bit of asm() code that a) enters the kernel, and
+// that also b) can be invoked in a way that computes this return address.
+// Passing "nr" as "-1" computes the "magic" return address. Passing any
+// other value invokes the appropriate system call.
+intptr_t SandboxSyscall(int nr, ...);
+
+} // namespace
+
+#endif // SANDBOX_LINUX_SECCOMP_BPF_SYSCALL_H__
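For illustration, a minimal usage sketch of the SandboxSyscall() entry point declared above. It is not part of this CL; the Demo() function is hypothetical, and the unit test that follows exercises the same behavior.

// Sketch only: invoking the raw system-call wrapper from syscall.h.
#include <asm/unistd.h>

#include "sandbox/linux/seccomp-bpf/syscall.h"

void Demo() {
  // Arguments are forwarded to the kernel unchanged; the result follows the
  // kernel convention (a negative errno value on failure, errno untouched).
  intptr_t pid = playground2::SandboxSyscall(__NR_getpid);
  // A negative syscall number does not enter the kernel; it returns the
  // "magic" address immediately after the kernel-entry instruction, which the
  // BPF filter uses to recognize calls made through this wrapper.
  intptr_t magic = playground2::SandboxSyscall(-1);
  (void)pid;
  (void)magic;
}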
diff --git a/sandbox/linux/seccomp-bpf/syscall_unittest.cc b/sandbox/linux/seccomp-bpf/syscall_unittest.cc
new file mode 100644
index 0000000..374a0fb
--- /dev/null
+++ b/sandbox/linux/seccomp-bpf/syscall_unittest.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <asm/unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "base/posix/eintr_wrapper.h"
+#include "sandbox/linux/seccomp-bpf/syscall.h"
+#include "sandbox/linux/tests/unit_tests.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+using namespace playground2;
+
+namespace {
+
+// Different platforms use different symbols for the six-argument version
+// of the mmap() system call. Test for the correct symbol at compile time.
+#ifdef __NR_mmap2
+const int kMMapNr = __NR_mmap2;
+#else
+const int kMMapNr = __NR_mmap;
+#endif
+
+TEST(Syscall, WellKnownEntryPoint) {
+ // Test that SandboxSyscall(-1) is handled specially. Don't do this on ARM,
+ // where syscall(-1) crashes with SIGILL. Not running the test is fine, as we
+ // are still testing ARM code in the next set of tests.
+#if !defined(__arm__)
+ EXPECT_NE(SandboxSyscall(-1), syscall(-1));
+#endif
+
+ // If possible, test that SandboxSyscall(-1) returns the address right after
+ // a kernel entry point.
+#if defined(__i386__)
+ EXPECT_EQ(0x80CDu, ((uint16_t *)SandboxSyscall(-1))[-1]); // INT 0x80
+#elif defined(__x86_64__)
+ EXPECT_EQ(0x050Fu, ((uint16_t *)SandboxSyscall(-1))[-1]); // SYSCALL
+#elif defined(__arm__)
+#if defined(__thumb__)
+ EXPECT_EQ(0xDF00u, ((uint16_t *)SandboxSyscall(-1))[-1]); // SWI 0
+#else
+ EXPECT_EQ(0xEF000000u, ((uint32_t *)SandboxSyscall(-1))[-1]); // SVC 0
+#endif
+#else
+ #warning Incomplete test case; need port for target platform
+#endif
+}
+
+TEST(Syscall, TrivialSyscallNoArgs) {
+ // Test that we can do basic system calls
+ EXPECT_EQ(SandboxSyscall(__NR_getpid), syscall(__NR_getpid));
+}
+
+TEST(Syscall, ComplexSyscallSixArgs) {
+ int fd;
+ ASSERT_LE(0, fd = SandboxSyscall(__NR_open, "/dev/null", O_RDWR, 0L));
+
+ // Use mmap() to allocate some read-only memory
+ char *addr0;
+ ASSERT_NE((char *)NULL,
+ addr0 = reinterpret_cast<char *>(
+ SandboxSyscall(kMMapNr, (void *)NULL, 4096, PROT_READ,
+ MAP_PRIVATE|MAP_ANONYMOUS, fd, 0L)));
+
+ // Try to replace the existing mapping with a read-write mapping
+ char *addr1;
+ ASSERT_EQ(addr0,
+ addr1 = reinterpret_cast<char *>(
+ SandboxSyscall(kMMapNr, addr0, 4096L, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED,
+ fd, 0L)));
+ ++*addr1; // This should not seg fault
+
+ // Clean up
+ EXPECT_EQ(0, SandboxSyscall(__NR_munmap, addr1, 4096L));
+ EXPECT_EQ(0, HANDLE_EINTR(SandboxSyscall(__NR_close, fd)));
+
+ // Check that the offset argument (i.e. the sixth argument) is processed
+ // correctly.
+ ASSERT_GE(fd = SandboxSyscall(__NR_open, "/proc/self/exe", O_RDONLY, 0L), 0);
+ char *addr2, *addr3;
+ ASSERT_NE((char *)NULL,
+ addr2 = reinterpret_cast<char *>(
+ SandboxSyscall(kMMapNr, (void *)NULL, 8192L, PROT_READ,
+ MAP_PRIVATE, fd, 0L)));
+ ASSERT_NE((char *)NULL,
+ addr3 = reinterpret_cast<char *>(
+ SandboxSyscall(kMMapNr, (void *)NULL, 4096L, PROT_READ,
+ MAP_PRIVATE, fd,
+#if defined(__NR_mmap2)
+ 1L
+#else
+ 4096L
+#endif
+ )));
+ EXPECT_EQ(0, memcmp(addr2 + 4096, addr3, 4096));
+
+ // Just to be absolutely on the safe side, also verify that the file
+ // contents matches what we are getting from a read() operation.
+ char buf[8192];
+ EXPECT_EQ(8192, SandboxSyscall(__NR_read, fd, buf, 8192L));
+ EXPECT_EQ(0, memcmp(addr2, buf, 8192));
+
+ // Clean up
+ EXPECT_EQ(0, SandboxSyscall(__NR_munmap, addr2, 8192L));
+ EXPECT_EQ(0, SandboxSyscall(__NR_munmap, addr3, 4096L));
+ EXPECT_EQ(0, HANDLE_EINTR(SandboxSyscall(__NR_close, fd)));
+}
+
+} // namespace