// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "content/public/common/sandbox_init.h" #if defined(OS_LINUX) && defined(__x86_64__) #include #include #include #include #include #include #include #include #include #include #include "base/command_line.h" #include "base/file_util.h" #include "base/logging.h" #include "base/time.h" #include "content/public/common/content_switches.h" #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #endif #ifndef SYS_SECCOMP #define SYS_SECCOMP 1 #endif #ifndef __NR_eventfd2 #define __NR_eventfd2 290 #endif // Constants from very new header files that we can't yet include. #ifndef SECCOMP_MODE_FILTER #define SECCOMP_MODE_FILTER 2 #define SECCOMP_RET_KILL 0x00000000U #define SECCOMP_RET_TRAP 0x00030000U #define SECCOMP_RET_ERRNO 0x00050000U #define SECCOMP_RET_ALLOW 0x7fff0000U #endif namespace { static void CheckSingleThreaded() { // Possibly racy, but it's ok because this is more of a debug check to catch // new threaded situations arising during development. // It also has to be a DCHECK() because production builds will be running // the suid sandbox, which will prevent /proc access in some contexts. DCHECK_EQ(file_util::CountFilesCreatedAfter(FilePath("/proc/self/task"), base::Time::UnixEpoch()), 1); } static void SIGSYS_Handler(int signal, siginfo_t* info, void* void_context) { if (signal != SIGSYS) return; if (info->si_code != SYS_SECCOMP) return; if (!void_context) return; ucontext_t* context = reinterpret_cast(void_context); uintptr_t syscall = context->uc_mcontext.gregs[REG_RAX]; if (syscall >= 1024) syscall = 0; // Encode 8-bits of the 1st two arguments too, so we can discern which socket // type, which fcntl, ... etc., without being likely to hit a mapped // address. // Do not encode more bits here without thinking about increasing the // likelihood of collision with mapped pages. syscall |= ((context->uc_mcontext.gregs[REG_RDI] & 0xffUL) << 12); syscall |= ((context->uc_mcontext.gregs[REG_RSI] & 0xffUL) << 20); // Purposefully dereference the syscall as an address so it'll show up very // clearly and easily in crash dumps. volatile char* addr = reinterpret_cast(syscall); *addr = '\0'; // In case we hit a mapped address, hit the null page with just the syscall, // for paranoia. syscall &= 0xfffUL; addr = reinterpret_cast(syscall); *addr = '\0'; _exit(1); } static void InstallSIGSYSHandler() { struct sigaction sa; memset(&sa, 0, sizeof(sa)); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = SIGSYS_Handler; int ret = sigaction(SIGSYS, &sa, NULL); PLOG_IF(FATAL, ret != 0) << "Failed to install SIGSYS handler."; } static void EmitLoad(int offset, std::vector* program) { struct sock_filter filter; filter.code = BPF_LD+BPF_W+BPF_ABS; filter.jt = 0; filter.jf = 0; filter.k = offset; program->push_back(filter); } static void EmitLoadArg(int arg, std::vector* program) { // Each argument is 8 bytes, independent of architecture, and start at // an offset of 16 bytes, indepdendent of architecture. EmitLoad(((arg - 1) * 8) + 16, program); } static void EmitJEQJT1(int value, std::vector* program) { struct sock_filter filter; filter.code = BPF_JMP+BPF_JEQ+BPF_K; filter.jt = 1; filter.jf = 0; filter.k = value; program->push_back(filter); } static void EmitJEQJF(int value, int jf, std::vector* program) { struct sock_filter filter; filter.code = BPF_JMP+BPF_JEQ+BPF_K; filter.jt = 0; filter.jf = jf; filter.k = value; program->push_back(filter); } static void EmitRet(int value, std::vector* program) { struct sock_filter filter; filter.code = BPF_RET+BPF_K; filter.jt = 0; filter.jf = 0; filter.k = value; program->push_back(filter); } static void EmitPreamble(std::vector* program) { // First, check correct syscall arch. // "4" is magic offset for the arch number. EmitLoad(4, program); EmitJEQJT1(AUDIT_ARCH_X86_64, program); EmitRet(SECCOMP_RET_KILL, program); // Load the syscall number. // "0" is magic offset for the syscall number. EmitLoad(0, program); } static void EmitAllowSyscall(int nr, std::vector* program) { EmitJEQJF(nr, 1, program); EmitRet(SECCOMP_RET_ALLOW, program); } static void EmitAllowSyscallArgN(int nr, int arg_nr, int arg_val, std::vector* program) { // Jump forward 4 on no-match so that we also skip the unneccessary reload of // syscall_nr. (It is unneccessary because we have not trashed it yet.) EmitJEQJF(nr, 4, program); EmitLoadArg(arg_nr, program); EmitJEQJF(arg_val, 1, program); EmitRet(SECCOMP_RET_ALLOW, program); // We trashed syscall_nr so put it back in the accumulator. EmitLoad(0, program); } static void EmitFailSyscall(int nr, int err, std::vector* program) { EmitJEQJF(nr, 1, program); EmitRet(SECCOMP_RET_ERRNO | err, program); } static void EmitTrap(std::vector* program) { EmitRet(SECCOMP_RET_TRAP, program); } static void EmitAllowKillSelf(int signal, std::vector* program) { EmitAllowSyscallArgN(__NR_kill, 2, signal, program); } static void EmitAllowGettime(std::vector* program) { EmitAllowSyscall(__NR_clock_gettime, program); EmitAllowSyscall(__NR_gettimeofday, program); } static void ApplyGPUPolicy(std::vector* program) { // "Hot" syscalls go first. EmitAllowSyscall(__NR_read, program); EmitAllowSyscall(__NR_ioctl, program); EmitAllowSyscall(__NR_poll, program); EmitAllowSyscall(__NR_epoll_wait, program); EmitAllowSyscall(__NR_recvfrom, program); EmitAllowSyscall(__NR_write, program); EmitAllowSyscall(__NR_writev, program); EmitAllowSyscall(__NR_gettid, program); EmitAllowSyscall(__NR_sched_yield, program); // Nvidia binary driver. EmitAllowGettime(program); // Less hot syscalls. EmitAllowSyscall(__NR_futex, program); EmitAllowSyscall(__NR_madvise, program); EmitAllowSyscall(__NR_sendmsg, program); EmitAllowSyscall(__NR_recvmsg, program); EmitAllowSyscall(__NR_eventfd2, program); EmitAllowSyscall(__NR_pipe, program); EmitAllowSyscall(__NR_mmap, program); EmitAllowSyscall(__NR_mprotect, program); EmitAllowSyscall(__NR_clone, program); EmitAllowSyscall(__NR_set_robust_list, program); EmitAllowSyscall(__NR_getuid, program); EmitAllowSyscall(__NR_geteuid, program); EmitAllowSyscall(__NR_getgid, program); EmitAllowSyscall(__NR_getegid, program); EmitAllowSyscall(__NR_epoll_create, program); EmitAllowSyscall(__NR_fcntl, program); EmitAllowSyscall(__NR_socketpair, program); EmitAllowSyscall(__NR_epoll_ctl, program); EmitAllowSyscall(__NR_prctl, program); EmitAllowSyscall(__NR_fstat, program); EmitAllowSyscall(__NR_close, program); EmitAllowSyscall(__NR_restart_syscall, program); EmitAllowSyscall(__NR_rt_sigreturn, program); EmitAllowSyscall(__NR_brk, program); EmitAllowSyscall(__NR_rt_sigprocmask, program); EmitAllowSyscall(__NR_munmap, program); EmitAllowSyscall(__NR_dup, program); EmitAllowSyscall(__NR_mlock, program); EmitAllowSyscall(__NR_munlock, program); EmitAllowSyscall(__NR_exit, program); EmitAllowSyscall(__NR_exit_group, program); EmitAllowSyscall(__NR_getpid, program); // Nvidia binary driver. EmitAllowSyscall(__NR_getppid, program); // ATI binary driver. EmitAllowSyscall(__NR_lseek, program); // Nvidia binary driver. EmitAllowKillSelf(SIGTERM, program); // GPU watchdog. // Generally, filename-based syscalls will fail with ENOENT to behave // similarly to a possible future setuid sandbox. EmitFailSyscall(__NR_open, ENOENT, program); EmitFailSyscall(__NR_access, ENOENT, program); EmitFailSyscall(__NR_mkdir, ENOENT, program); // Nvidia binary driver. EmitFailSyscall(__NR_readlink, ENOENT, program); // ATI binary driver. } static void ApplyFlashPolicy(std::vector* program) { // "Hot" syscalls go first. EmitAllowSyscall(__NR_futex, program); EmitAllowSyscall(__NR_write, program); EmitAllowSyscall(__NR_epoll_wait, program); EmitAllowSyscall(__NR_read, program); EmitAllowSyscall(__NR_times, program); // Less hot syscalls. EmitAllowGettime(program); EmitAllowSyscall(__NR_clone, program); EmitAllowSyscall(__NR_set_robust_list, program); EmitAllowSyscall(__NR_getuid, program); EmitAllowSyscall(__NR_geteuid, program); EmitAllowSyscall(__NR_getgid, program); EmitAllowSyscall(__NR_getegid, program); EmitAllowSyscall(__NR_epoll_create, program); EmitAllowSyscall(__NR_fcntl, program); EmitAllowSyscall(__NR_socketpair, program); EmitAllowSyscall(__NR_pipe, program); EmitAllowSyscall(__NR_epoll_ctl, program); EmitAllowSyscall(__NR_gettid, program); EmitAllowSyscall(__NR_prctl, program); EmitAllowSyscall(__NR_fstat, program); EmitAllowSyscall(__NR_sendmsg, program); EmitAllowSyscall(__NR_mmap, program); EmitAllowSyscall(__NR_munmap, program); EmitAllowSyscall(__NR_mprotect, program); EmitAllowSyscall(__NR_madvise, program); EmitAllowSyscall(__NR_rt_sigaction, program); EmitAllowSyscall(__NR_rt_sigprocmask, program); EmitAllowSyscall(__NR_wait4, program); EmitAllowSyscall(__NR_exit_group, program); EmitAllowSyscall(__NR_exit, program); EmitAllowSyscall(__NR_rt_sigreturn, program); EmitAllowSyscall(__NR_restart_syscall, program); EmitAllowSyscall(__NR_close, program); EmitAllowSyscall(__NR_recvmsg, program); EmitAllowSyscall(__NR_lseek, program); EmitAllowSyscall(__NR_brk, program); EmitAllowSyscall(__NR_sched_yield, program); // These are under investigation, and hopefully not here for the long term. EmitAllowSyscall(__NR_shmctl, program); EmitAllowSyscall(__NR_shmat, program); EmitAllowSyscall(__NR_shmdt, program); EmitFailSyscall(__NR_open, ENOENT, program); EmitFailSyscall(__NR_execve, ENOENT, program); EmitFailSyscall(__NR_access, ENOENT, program); } static bool CanUseSeccompFilters() { int ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 0, 0, 0); if (ret != 0 && errno == EFAULT) return true; return false; } static void InstallFilter(const std::vector& program) { int ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); PLOG_IF(FATAL, ret != 0) << "prctl(PR_SET_NO_NEW_PRIVS) failed"; struct sock_fprog fprog; fprog.len = program.size(); fprog.filter = const_cast(&program[0]); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog, 0, 0); PLOG_IF(FATAL, ret != 0) << "Failed to install filter."; } } // anonymous namespace namespace content { void InitializeSandbox() { const CommandLine& command_line = *CommandLine::ForCurrentProcess(); if (command_line.HasSwitch(switches::kNoSandbox) || command_line.HasSwitch(switches::kDisableSeccompFilterSandbox)) return; std::string process_type = command_line.GetSwitchValueASCII(switches::kProcessType); if (process_type == switches::kGpuProcess && command_line.HasSwitch(switches::kDisableGpuSandbox)) return; if (!CanUseSeccompFilters()) return; CheckSingleThreaded(); std::vector program; EmitPreamble(&program); if (process_type == switches::kGpuProcess) { ApplyGPUPolicy(&program); } else if (process_type == switches::kPpapiPluginProcess) { ApplyFlashPolicy(&program); } else { NOTREACHED(); } EmitTrap(&program); InstallSIGSYSHandler(); InstallFilter(program); } } // namespace content #else namespace content { void InitializeSandbox() { } } // namespace content #endif