diff options
author | markus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-08 17:06:40 +0000 |
---|---|---|
committer | markus@chromium.org <markus@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-03-08 17:06:40 +0000 |
commit | 9c856aad878f62b8517c74546455bd9909e055d3 (patch) | |
tree | 4897ec23fa996c543b1e415f8af5be0572031e28 /sandbox/linux | |
parent | 668c911c24e6ddef75a7ee6578da11b0055b1f16 (diff) | |
download | chromium_src-9c856aad878f62b8517c74546455bd9909e055d3.zip chromium_src-9c856aad878f62b8517c74546455bd9909e055d3.tar.gz chromium_src-9c856aad878f62b8517c74546455bd9909e055d3.tar.bz2 |
- Add a custom allocator for STL objects. This fixes sandbox failures that
were observed on some machines (in particular in 32bit mode).
- Some more changes to avoid calling into glibc when we can make a direct
system call, instead. These particular call sites were unlikely to cause
any problems. But it makes the code easier to audit if we avoid all
unnecessary calls into glibc.
- In 64bit mode, gettimeofday() is handled by vsyscalls and tends to be cheap.
In 32bit mode, it is just a regular system call. Some users rely on being
able to call gettimeofday() at a very high rate (up to thousands of
consecutive calls). Recognize this system call pattern and optimize for it.
- Add a debugging option that allows us to warn about expensive system calls.
In many cases, these warnings can then be used to optimize the sandboxed
application.
- Fix compilation on newer versions of gcc.
- Changed the x86-32 version of the code that we use when intercepting
system calls. Previously, we would use CALL to jump to the set of
instructions that we had relocated. But we made the mistake of allowing
relocation of instructions that reference %esp. This doesn't work, as
CALL modifies the stack. We now avoid using CALL and instead jump
directly. On x86-32 that requires the use of a PUSH/RET combination as
there is no 32bit wide JMP instruction.
The x86-64 version of the code was already written in a way that would
avoid this particular problem.
(I would like to thank Craig Schlenter for his exceptional detective
work in tracking down the root cause of this bug!)
- For debugging purposes, injected a really small library (less than 4kB)
and discovered that some of our memory map manipulations implicitly
relied on mappings being at least two pages long. Fixed the code that
made this incorrect assumption.
- For really small libraries, the runtime linker can choose a different,
more compact layout. Our computation of the ASR offset did not know how
to deal with that. Fixed by explicitly looking for a ".text" section
instead of looking for a PT_DYNAMIC segment.
- Closed a file descriptor that we kept open longer than needed.
- Removed some unused code.
- Added copyright headers.
TEST=tested on i386 and x86-64
BUG=36133
Review URL: http://codereview.chromium.org/661438
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@40900 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'sandbox/linux')
-rw-r--r-- | sandbox/linux/seccomp/allocator.cc | 136 | ||||
-rw-r--r-- | sandbox/linux/seccomp/allocator.h | 88 | ||||
-rw-r--r-- | sandbox/linux/seccomp/clone.cc | 5 | ||||
-rw-r--r-- | sandbox/linux/seccomp/library.cc | 153 | ||||
-rw-r--r-- | sandbox/linux/seccomp/library.h | 34 | ||||
-rw-r--r-- | sandbox/linux/seccomp/maps.cc | 18 | ||||
-rw-r--r-- | sandbox/linux/seccomp/maps.h | 17 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox.cc | 29 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox_impl.h | 5 | ||||
-rw-r--r-- | sandbox/linux/seccomp/securemem.h | 14 | ||||
-rw-r--r-- | sandbox/linux/seccomp/syscall.cc | 90 | ||||
-rw-r--r-- | sandbox/linux/seccomp/trusted_thread.cc | 270 |
12 files changed, 641 insertions, 218 deletions
diff --git a/sandbox/linux/seccomp/allocator.cc b/sandbox/linux/seccomp/allocator.cc new file mode 100644 index 0000000..6e11a4a --- /dev/null +++ b/sandbox/linux/seccomp/allocator.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// The allocator is very simplistic. It requests memory pages directly from +// the system. Each page starts with a header describing the allocation. This +// makes sure that we can return the memory to the system when it is +// deallocated. +// For allocations that are smaller than a single page, we try to squeeze +// multiple of them into the same page. +// We expect to use this allocator for a moderate number of small allocations. +// In most cases, it will only need to ever make a single request to the +// operating system for the lifetime of the STL container object. +// We don't worry about memory fragmentation as the allocator is expected to +// be short-lived. + +#include <stdint.h> +#include <sys/mman.h> + +#include "allocator.h" +#include "linux_syscall_support.h" + +namespace playground { + +class SysCalls { + public: + #define SYS_CPLUSPLUS + #define SYS_ERRNO my_errno + #define SYS_INLINE inline + #define SYS_PREFIX -1 + #undef SYS_LINUX_SYSCALL_SUPPORT_H + #include "linux_syscall_support.h" + SysCalls() : my_errno(0) { } + int my_errno; +}; +#ifdef __NR_mmap2 + #define MMAP mmap2 + #define __NR_MMAP __NR_mmap2 +#else + #define MMAP mmap + #define __NR_MMAP __NR_mmap +#endif + +// We only ever keep track of the very last partial page that was used for +// allocations. This approach simplifies the code a lot. It can theoretically +// lead to more memory fragmentation, but for our use case that is unlikely +// to happen. +struct Header { + // The total amount of memory allocated for this chunk of memory. Typically, + // this would be a single page. 
+ size_t total_len; + + // "used" keeps track of the number of bytes currently allocated in this + // page. Note that as elements are freed from this page, "used" is updated + // allowing us to track when the page is free. However, these holes in the + // page are never re-used, so "tail" is the only way to find out how much + // free space remains and when we need to request another chunk of memory + // from the system. + size_t used; + void *tail; +}; +static Header* last_alloc; + +void* SystemAllocatorHelper::sys_allocate(size_t size) { + // Number of bytes that need to be allocated + if (size + 3 < size) { + return NULL; + } + size_t len = (size + 3) & ~3; + + if (last_alloc) { + // Remaining space in the last chunk of memory allocated from system + size_t remainder = last_alloc->total_len - + (reinterpret_cast<char *>(last_alloc->tail) - + reinterpret_cast<char *>(last_alloc)); + + if (remainder >= len) { + void* ret = last_alloc->tail; + last_alloc->tail = reinterpret_cast<char *>(last_alloc->tail) + len; + last_alloc->used += len; + return ret; + } + } + + SysCalls sys; + if (sizeof(Header) + len + 4095 < len) { + return NULL; + } + size_t total_len = (sizeof(Header) + len + 4095) & ~4095; + Header* mem = reinterpret_cast<Header *>( + sys.MMAP(NULL, total_len, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); + if (mem == MAP_FAILED) { + return NULL; + } + + // If we were only asked to allocate a single page, then we will use any + // remaining space for other small allocations. 
+ if (total_len - sizeof(Header) - len >= 4) { + last_alloc = mem; + } + mem->total_len = total_len; + mem->used = len; + char* ret = reinterpret_cast<char *>(mem) + sizeof(Header); + mem->tail = ret + len; + + return ret; +} + +void SystemAllocatorHelper::sys_deallocate(void* p, size_t size) { + // Number of bytes in this allocation + if (size + 3 < size) { + return; + } + size_t len = (size + 3) & ~3; + + // All allocations (small and large) have starting addresses in the + // first page that was allocated from the system. This page starts with + // a header that keeps track of how many bytes are currently used. The + // header can be found by truncating the last few bits of the address. + Header* header = reinterpret_cast<Header *>( + reinterpret_cast<uintptr_t>(p) & ~4095); + header->used -= len; + + // After the last allocation has been freed, return the page(s) to the + // system + if (!header->used) { + SysCalls sys; + sys.munmap(header, header->total_len); + if (last_alloc == header) { + last_alloc = NULL; + } + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/allocator.h b/sandbox/linux/seccomp/allocator.h new file mode 100644 index 0000000..29e0065 --- /dev/null +++ b/sandbox/linux/seccomp/allocator.h @@ -0,0 +1,88 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Implement a very basic memory allocator that make direct system calls +// instead of relying on libc. +// This allocator is not thread-safe. 
+ +#ifndef ALLOCATOR_H__ +#define ALLOCATOR_H__ + +#include <cstddef> + +namespace playground { + +class SystemAllocatorHelper { + protected: + static void *sys_allocate(size_t size); + static void sys_deallocate(void* p, size_t size); +}; + +template <class T> +class SystemAllocator : SystemAllocatorHelper { + public: + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + + template <class U> + struct rebind { + typedef SystemAllocator<U> other; + }; + + pointer address(reference value) const { + return &value; + } + + const_pointer address(const_reference value) const { + return &value; + } + + SystemAllocator() throw() { } + SystemAllocator(const SystemAllocator& src) throw() { } + template <class U> SystemAllocator(const SystemAllocator<U>& src) throw() { } + ~SystemAllocator() throw() { } + + size_type max_size() const throw() { + return (1 << 30) / sizeof(T); + } + + pointer allocate(size_type num, const void* = 0) { + if (num > max_size()) { + return NULL; + } + return (pointer)sys_allocate(num * sizeof(T)); + } + + void construct(pointer p, const T& value) { + new(reinterpret_cast<void *>(p))T(value); + } + + void destroy(pointer p) { + p->~T(); + } + + void deallocate(pointer p, size_type num) { + sys_deallocate(p, num * sizeof(T)); + } +}; + +template <class T1, class T2> +bool operator== (const SystemAllocator<T1>&, const SystemAllocator<T2>&) + throw() { + return true; +} +template <class T1, class T2> +bool operator!= (const SystemAllocator<T1>&, const SystemAllocator<T2>&) + throw() { + return false; +} + +} // namespace + +#endif // ALLOCATOR_H__ diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc index 2b6703f..28a3584 100644 --- a/sandbox/linux/seccomp/clone.cc +++ b/sandbox/linux/seccomp/clone.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #include "debug.h" #include "sandbox_impl.h" @@ -84,7 +88,6 @@ bool Sandbox::process_clone(int parentMapsFd, int sandboxFd, int threadFdPub, mem->r14 = clone_req.regs64.r14; mem->r15 = clone_req.regs64.r15; #elif defined(__i386__) - mem->ret2 = clone_req.regs32.ret2; mem->ebp = clone_req.regs32.ebp; mem->edi = clone_req.regs32.edi; mem->esi = clone_req.regs32.esi; diff --git a/sandbox/linux/seccomp/library.cc b/sandbox/linux/seccomp/library.cc index cf7477b..1b06bc1 100644 --- a/sandbox/linux/seccomp/library.cc +++ b/sandbox/linux/seccomp/library.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #define XOPEN_SOURCE 500 #include <algorithm> #include <elf.h> @@ -16,6 +20,7 @@ #include <sys/stat.h> #include <sys/types.h> +#include "allocator.h" #include "debug.h" #include "library.h" #include "sandbox_impl.h" @@ -84,7 +89,11 @@ Library::~Library() { // found. Make sure to preserve any changes that we might have made since. Sandbox::SysCalls sys; sys.mprotect(image_, 4096, PROT_READ | PROT_WRITE); - memcpy(image_, memory_ranges_.rbegin()->second.start, 4096); + if (memcmp(image_, memory_ranges_.rbegin()->second.start, 4096)) { + // Only copy data, if we made any changes in this data. Otherwise there + // is no need to create another modified COW mapping. 
+ memcpy(image_, memory_ranges_.rbegin()->second.start, 4096); + } sys.mprotect(image_, 4096, PROT_READ | PROT_EXEC); sys.mremap(image_, image_size_, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, memory_ranges_.rbegin()->second.start); @@ -173,7 +182,7 @@ char *Library::get(Elf_Addr offset, char *buf, size_t len) { return buf; } -std::string Library::get(Elf_Addr offset) { +Library::string Library::get(Elf_Addr offset) { if (!valid_) { return ""; } @@ -192,7 +201,7 @@ std::string Library::get(Elf_Addr offset) { while (*stop) { ++stop; } - std::string s = stop > start ? std::string(start, stop - start) : ""; + string s = stop > start ? string(start, stop - start) : ""; return s; } @@ -215,8 +224,21 @@ char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) { image_size_ = memory_ranges_.begin()->first + (reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) - reinterpret_cast<char *>(memory_ranges_.begin()->second.start)); + if (image_size_ < 8192) { + // It is possible to create a library that is only a single page in + // size. In that case, we have to make sure that we artificially map + // one extra page past the end of it, as our code relies on mremap() + // actually moving the mapping. + image_size_ = 8192; + } image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_, MREMAP_MAYMOVE)); + if (image_size_ == 8192 && image_ == start) { + // We really mean it, when we say we want the memory to be moved. + image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_, + MREMAP_MAYMOVE)); + sys.munmap(reinterpret_cast<char *>(start) + 4096, 4096); + } if (image_ == MAP_FAILED) { image_ = NULL; } else { @@ -250,7 +272,7 @@ char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) { return buf ? 
get(offset, buf, len) : NULL; } -std::string Library::getOriginal(Elf_Addr offset) { +Library::string Library::getOriginal(Elf_Addr offset) { if (!valid_) { return ""; } @@ -271,7 +293,7 @@ std::string Library::getOriginal(Elf_Addr offset) { getOriginal(stop - image_, NULL, 1); } } - return std::string(start, stop - start); + return string(start, stop - start); } return ""; } @@ -285,7 +307,7 @@ const Elf_Ehdr* Library::getEhdr() { return &ehdr_; } -const Elf_Shdr* Library::getSection(const std::string& section) { +const Elf_Shdr* Library::getSection(const string& section) { if (!valid_) { return NULL; } @@ -296,7 +318,7 @@ const Elf_Shdr* Library::getSection(const std::string& section) { return &iter->second.second; } -const int Library::getSectionIndex(const std::string& section) { +const int Library::getSectionIndex(const string& section) { if (!valid_) { return -1; } @@ -307,22 +329,6 @@ const int Library::getSectionIndex(const std::string& section) { return iter->second.first; } -void **Library::getRelocation(const std::string& symbol) { - PltTable::const_iterator iter = plt_entries_.find(symbol); - if (iter == plt_entries_.end()) { - return NULL; - } - return reinterpret_cast<void **>(asr_offset_ + iter->second); -} - -void *Library::getSymbol(const std::string& symbol) { - SymbolTable::const_iterator iter = symbols_.find(symbol); - if (iter == symbols_.end() || !iter->second.st_value) { - return NULL; - } - return asr_offset_ + iter->second.st_value; -} - void Library::makeWritable(bool state) const { for (RangeMap::const_iterator iter = memory_ranges_.begin(); iter != memory_ranges_.end(); ++iter) { @@ -380,7 +386,7 @@ char* Library::getScratchSpace(const Maps* maps, char* near, int needed, void Library::patchSystemCallsInFunction(const Maps* maps, char *start, char *end, char** extraSpace, int* extraLength) { - std::set<char *> branch_targets; + std::set<char *, std::less<char *>, SystemAllocator<char *> > branch_targets; for (char *ptr = start; ptr < end; 
) { unsigned short insn = next_inst((const char **)&ptr, __WORDSIZE == 64); char *target; @@ -516,12 +522,21 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, } } // We now know, how many instructions neighboring the system call we - // can safely overwrite. We need five bytes to insert a JMP/CALL and a - // 32bit address. We then jump to a code fragment that safely forwards - // to our system call wrapper. On x86-64, this is complicated by - // the fact that the API allows up to 128 bytes of red-zones below the - // current stack pointer. So, we cannot write to the stack until we - // have adjusted the stack pointer. + // can safely overwrite. On x86-32 we need six bytes, and on x86-64 + // We need five bytes to insert a JMPQ and a 32bit address. We then + // jump to a code fragment that safely forwards to our system call + // wrapper. + // On x86-64, this is complicated by the fact that the API allows up + // to 128 bytes of red-zones below the current stack pointer. So, we + // cannot write to the stack until we have adjusted the stack + // pointer. + // On both x86-32 and x86-64 we take care to leave the stack unchanged + // while we are executing the preamble and postamble. This allows us + // to treat instructions that reference %esp/%rsp as safe for + // relocation. + // In particular, this means that on x86-32 we cannot use CALL, but + // have to use a PUSH/RET combination to change the instruction pointer. + // On x86-64, we can instead use a 32bit JMPQ. // // .. .. .. .. ; any leading instructions copied from original code // 48 81 EC 80 00 00 00 SUB $0x80, %rsp @@ -549,9 +564,10 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, // 68 .. .. .. .. PUSH $syscallWrapper // C3 RET // .. .. .. .. ; any trailing instructions copied from original code + // 68 .. .. .. .. 
PUSH return_addr // C3 RET // - // Total: 12 bytes + any bytes that were copied + // Total: 17 bytes + any bytes that were copied // // For indirect jumps from the VDSO to the VSyscall page, we instead // replace the following code (this is only necessary on x86-64). This @@ -575,7 +591,7 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, // // Total: 52 bytes + any bytes that were copied - if (length < 5) { + if (length < (__WORDSIZE == 32 ? 6 : 5)) { // There are a very small number of instruction sequences that we // cannot easily intercept, and that have been observed in real world // examples. Handle them here: @@ -648,7 +664,7 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, Sandbox::die("Cannot intercept system call"); } } - int needed = 5 - code[codeIdx].len; + int needed = (__WORDSIZE == 32 ? 6 : 5) - code[codeIdx].len; int first = codeIdx; while (needed > 0 && first != startIdx) { first = (first + (sizeof(code) / sizeof(struct Code)) - 1) % @@ -673,7 +689,7 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, needed = 52 + preamble + postamble; } #elif defined(__i386__) - needed = 12 + preamble + postamble; + needed = 17 + preamble + postamble; #else #error Unsupported target platform #endif @@ -752,7 +768,10 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, reinterpret_cast<void *>(&syscallWrapper); } #elif defined(__i386__) - *(dest + preamble + 11 + postamble) = '\xC3'; + *(dest + preamble + 11 + postamble) = '\x68'; // PUSH + *reinterpret_cast<char **>(dest + preamble + 12 + postamble) = + code[second].addr + code[second].len; + *(dest + preamble + 16 + postamble) = '\xC3'; // RET *reinterpret_cast<char **>(dest + preamble + 1) = dest + preamble + 11; *reinterpret_cast<void (**)()>(dest + preamble + 6) = syscallWrapper; @@ -766,14 +785,16 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, // Replace the system call with an 
unconditional jump to our new code. #if defined(__x86_64__) - *code[first].addr = '\xE9'; // JMPQ + *code[first].addr = '\xE9'; // JMPQ + *reinterpret_cast<int *>(code[first].addr + 1) = + dest - (code[first].addr + 5); #elif defined(__i386__) - *code[first].addr = '\xE8'; // CALL + code[first].addr[0] = '\x68'; // PUSH + *reinterpret_cast<char **>(code[first].addr + 1) = dest; + code[first].addr[5] = '\xC3'; // RET #else #error Unsupported target platform #endif - *reinterpret_cast<int *>(code[first].addr + 1) = - dest - (code[first].addr + 5); } replaced: codeIdx = (codeIdx + 1) % (sizeof(code) / sizeof(struct Code)); @@ -1049,27 +1070,11 @@ bool Library::parseElf() { &str_shdr)) { // Not all memory mappings are necessarily ELF files. Skip memory // mappings that we cannot identify. + error: valid_ = false; return false; } - // Find PT_DYNAMIC segment. This is what our PLT entries and symbols will - // point to. This information is probably incorrect in the child, as it - // requires access to the original memory mappings. - for (int i = 0; i < ehdr_.e_phnum; i++) { - Elf_Phdr phdr; - if (getOriginal(ehdr_.e_phoff + i*ehdr_.e_phentsize, &phdr) && - phdr.p_type == PT_DYNAMIC) { - RangeMap::const_iterator iter = - memory_ranges_.lower_bound(phdr.p_offset); - if (iter != memory_ranges_.end()) { - asr_offset_ = reinterpret_cast<char *>(iter->second.start) - - (phdr.p_vaddr - (phdr.p_offset - iter->first)); - } - break; - } - } - // Parse section table and find all sections in this ELF file for (int i = 0; i < ehdr_.e_shnum; i++) { Elf_Shdr shdr; @@ -1081,6 +1086,38 @@ bool Library::parseElf() { std::make_pair(i, shdr))); } + // Compute the offset of entries in the .text segment + const Elf_Shdr* text = getSection(".text"); + if (text == NULL) { + // On x86-32, the VDSO is unusual in as much as it does not have a single + // ".text" section. Instead, it has one section per function. Each + // section name starts with ".text". 
We just need to pick an arbitrary + // one in order to find the asr_offset_ -- which would typically be zero + // for the VDSO. + for (SectionTable::const_iterator iter = section_table_.begin(); + iter != section_table_.end(); ++iter) { + if (!strncmp(iter->first.c_str(), ".text", 5)) { + text = &iter->second.second; + break; + } + } + } + + // Now that we know where the .text segment is located, we can compute the + // asr_offset_. + if (text) { + RangeMap::const_iterator iter = + memory_ranges_.lower_bound(text->sh_offset); + if (iter != memory_ranges_.end()) { + asr_offset_ = reinterpret_cast<char *>(iter->second.start) - + (text->sh_addr - (text->sh_offset - iter->first)); + } else { + goto error; + } + } else { + goto error; + } + return !isVDSO_ || parseSymbols(); } @@ -1128,7 +1165,7 @@ bool Library::parseSymbols() { valid_ = false; return false; } - std::string name = getOriginal(strtab.sh_offset + sym.st_name); + string name = getOriginal(strtab.sh_offset + sym.st_name); if (name.empty()) { continue; } @@ -1147,7 +1184,7 @@ bool Library::parseSymbols() { valid_ = false; return false; } - std::string name = getOriginal(strtab.sh_offset + sym.st_name); + string name = getOriginal(strtab.sh_offset + sym.st_name); if (name.empty()) { continue; } diff --git a/sandbox/linux/seccomp/library.h b/sandbox/linux/seccomp/library.h index 523652c..29a755e 100644 --- a/sandbox/linux/seccomp/library.h +++ b/sandbox/linux/seccomp/library.h @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ #ifndef LIBRARY_H__ #define LIBRARY_H__ @@ -30,6 +34,8 @@ namespace playground { class Library { friend class Maps; public: + typedef Maps::string string; + Library() : valid_(false), isVDSO_(false), @@ -50,14 +56,24 @@ class Library { void addMemoryRange(void* start, void* stop, Elf_Addr offset, int prot, int isVDSO) { - memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot))); isVDSO_ = isVDSO; + RangeMap::const_iterator iter = memory_ranges_.find(offset); + if (iter != memory_ranges_.end()) { + // It is possible to have overlapping mappings. This is particularly + // likely to happen with very small programs or libraries. If it does + // happen, we really only care about the text segment. Look for a + // mapping that is mapped executable. + if ((prot & PROT_EXEC) == 0) { + return; + } + } + memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot))); } char *get(Elf_Addr offset, char *buf, size_t len); - std::string get(Elf_Addr offset); + string get(Elf_Addr offset); char *getOriginal(Elf_Addr offset, char *buf, size_t len); - std::string getOriginal(Elf_Addr offset); + string getOriginal(Elf_Addr offset); template<class T>T* get(Elf_Addr offset, T* t) { if (!valid_) { @@ -108,10 +124,8 @@ class Library { bool parseElf(); const Elf_Ehdr* getEhdr(); - const Elf_Shdr* getSection(const std::string& section); - const int getSectionIndex(const std::string& section); - void **getRelocation(const std::string& symbol); - void *getSymbol(const std::string& symbol); + const Elf_Shdr* getSection(const string& section); + const int getSectionIndex(const string& section); void makeWritable(bool state) const; void patchSystemCalls(); bool isVDSO() const { return isVDSO_; } @@ -136,9 +150,9 @@ class Library { }; typedef std::map<Elf_Addr, Range, GreaterThan> RangeMap; - typedef std::map<std::string, std::pair<int, Elf_Shdr> > SectionTable; - typedef std::map<std::string, Elf_Sym> SymbolTable; - typedef std::map<std::string, Elf_Addr> PltTable; + 
typedef std::map<string, std::pair<int, Elf_Shdr> > SectionTable; + typedef std::map<string, Elf_Sym> SymbolTable; + typedef std::map<string, Elf_Addr> PltTable; char* getBytes(char* dst, const char* src, ssize_t len); static bool isSafeInsn(unsigned short insn); diff --git a/sandbox/linux/seccomp/maps.cc b/sandbox/linux/seccomp/maps.cc index cb303e7..d18405a 100644 --- a/sandbox/linux/seccomp/maps.cc +++ b/sandbox/linux/seccomp/maps.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #include <errno.h> #include <fcntl.h> #include <iostream> @@ -42,18 +46,18 @@ Maps::Maps(int proc_self_maps) : while (*ptr == ' ' || *ptr == '\t') ++ptr; char *perm_ptr = ptr; while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; - std::string perm(perm_ptr, ptr - perm_ptr); + string perm(perm_ptr, ptr - perm_ptr); unsigned long offset = strtoul(ptr, &ptr, 16); while (*ptr == ' ' || *ptr == '\t') ++ptr; char *id_ptr = ptr; while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; while (*ptr == ' ' || *ptr == '\t') ++ptr; while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; - std::string id(id_ptr, ptr - id_ptr); + string id(id_ptr, ptr - id_ptr); while (*ptr == ' ' || *ptr == '\t') ++ptr; char *library_ptr = ptr; while (*ptr && *ptr != ' ' && *ptr != '\t' && *ptr != '\n') ++ptr; - std::string library(library_ptr, ptr - library_ptr); + string library(library_ptr, ptr - library_ptr); bool isVDSO = false; if (library == "[vdso]") { // /proc/self/maps has a misleading file offset in the [vdso] entry. 
@@ -66,13 +70,13 @@ Maps::Maps(int proc_self_maps) : goto skip_entry; } int prot = 0; - if (perm.find('r') != std::string::npos) { + if (perm.find('r') != string::npos) { prot |= PROT_READ; } - if (perm.find('w') != std::string::npos) { + if (perm.find('w') != string::npos) { prot |= PROT_WRITE; } - if (perm.find('x') != std::string::npos) { + if (perm.find('x') != string::npos) { prot |= PROT_EXEC; } if ((prot & (PROT_EXEC | PROT_READ)) == 0) { @@ -146,7 +150,7 @@ bool Maps::Iterator::operator!=(const Maps::Iterator& iter) const { return !operator==(iter); } -std::string Maps::Iterator::name() const { +Maps::string Maps::Iterator::name() const { return getIterator()->first; } diff --git a/sandbox/linux/seccomp/maps.h b/sandbox/linux/seccomp/maps.h index 1d30506..5f51782 100644 --- a/sandbox/linux/seccomp/maps.h +++ b/sandbox/linux/seccomp/maps.h @@ -1,9 +1,16 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #ifndef MAPS_H__ #define MAPS_H__ #include <elf.h> +#include <functional> +#include <map> #include <string> -#include <vector> + +#include "allocator.h" #if defined(__x86_64__) typedef Elf64_Addr Elf_Addr; @@ -19,6 +26,9 @@ class Library; class Maps { friend class Library; public: + typedef std::basic_string<char, std::char_traits<char>, + SystemAllocator<char> > string; + Maps(int proc_self_maps); ~Maps() { } @@ -26,7 +36,8 @@ class Maps { // A map with all the libraries currently loaded into the application. // The key is a unique combination of device number, inode number, and // file name. It should be treated as opaque. 
- typedef std::map<std::string, Library> LibraryMap; + typedef std::map<string, Library, std::less<string>, + SystemAllocator<string> > LibraryMap; friend class Iterator; class Iterator { friend class Maps; @@ -44,7 +55,7 @@ class Maps { Library* operator*() const; bool operator==(const Iterator& iter) const; bool operator!=(const Iterator& iter) const; - std::string name() const; + string name() const; protected: mutable LibraryMap::iterator iter_; diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc index ff2b59e..12f0c0f 100644 --- a/sandbox/linux/seccomp/sandbox.cc +++ b/sandbox/linux/seccomp/sandbox.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #include "library.h" #include "sandbox_impl.h" #include "syscall_table.h" @@ -372,9 +376,10 @@ int Sandbox::supportsSeccompSandbox(int proc_fd) { case 0: { int devnull = sys.open("/dev/null", O_RDWR, 0); if (devnull >= 0) { - dup2(devnull, 0); - dup2(devnull, 1); - dup2(devnull, 2); + sys.dup2(devnull, 0); + sys.dup2(devnull, 1); + sys.dup2(devnull, 2); + sys.close(devnull); } if (proc_fd >= 0) { setProcSelfMaps(sys.openat(proc_fd, "self/maps", O_RDONLY, 0)); @@ -423,7 +428,7 @@ void Sandbox::startSandbox() { SysCalls sys; if (proc_self_maps_ < 0) { - proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0); + proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0); if (proc_self_maps_ < 0) { die("Cannot access \"/proc/self/maps\""); } @@ -431,21 +436,21 @@ void Sandbox::startSandbox() { // The pid is unchanged for the entire program, so we can retrieve it once // and store it in a global variable. 
- pid_ = sys.getpid(); + pid_ = sys.getpid(); // Block all signals, except for the RDTSC handler setupSignalHandlers(); // Get socketpairs for talking to the trusted process int pair[4]; - if (socketpair(AF_UNIX, SOCK_STREAM, 0, pair) || - socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) { + if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair) || + sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) { die("Failed to create trusted thread"); } - processFdPub_ = pair[0]; - cloneFdPub_ = pair[2]; - SecureMemArgs::Args* secureMem = createTrustedProcess(pair[0], pair[1], - pair[2], pair[3]); + processFdPub_ = pair[0]; + cloneFdPub_ = pair[2]; + SecureMemArgs* secureMem = createTrustedProcess(pair[0], pair[1], + pair[2], pair[3]); // We find all libraries that have system calls and redirect the system // calls to the sandbox. If we miss any system calls, the application will be @@ -454,7 +459,7 @@ void Sandbox::startSandbox() { // correctly. { Maps maps(proc_self_maps_); - const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL }; + const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL }; // Intercept system calls in the VDSO segment (if any). This has to happen // before intercepting system calls in any of the other libraries, as diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h index 79621d6..0a98283 100644 --- a/sandbox/linux/seccomp/sandbox_impl.h +++ b/sandbox/linux/seccomp/sandbox_impl.h @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ #ifndef SANDBOX_IMPL_H__ #define SANDBOX_IMPL_H__ @@ -360,7 +364,6 @@ class Sandbox { void* edx; void* ecx; void* ebx; - void* ret2; } regs32 __attribute__((packed)); #else #error Unsupported target platform diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h index 4c208ce..f9a5c97 100644 --- a/sandbox/linux/seccomp/securemem.h +++ b/sandbox/linux/seccomp/securemem.h @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #ifndef SECURE_MEM_H__ #define SECURE_MEM_H__ @@ -50,7 +54,6 @@ class SecureMem { void* r14; void* r15; #elif defined(__i386__) - void* ret2; void* ebp; void* edi; void* esi; @@ -86,9 +89,9 @@ class SecureMem { char securePage[4096]; }; union { - // This scratch space is used by the trusted thread to read parameters - // for unrestricted system calls. struct { + // This scratch space is used by the trusted thread to read parameters + // for unrestricted system calls. long tmpSyscallNum; void* tmpArg1; void* tmpArg2; @@ -97,6 +100,11 @@ class SecureMem { void* tmpArg5; void* tmpArg6; void* tmpReturnValue; + + // We often have long sequences of calls to gettimeofday(). This is + // needlessly expensive. Coalesce them into a single call. + long lastSyscallNum; + int gettimeofdayCounter; } __attribute__((packed)); char scratchPage[4096]; }; diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc index e1e2547..d3dc7aa 100644 --- a/sandbox/linux/seccomp/syscall.cc +++ b/sandbox/linux/seccomp/syscall.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ #include "debug.h" #include "sandbox_impl.h" #include "syscall_table.h" @@ -146,14 +150,76 @@ asm( // Check range of system call "cmp playground$maxSyscall, %eax\n" - "ja 1f\n" + "ja 5f\n" + + // We often have long sequences of calls to gettimeofday(). This is + // needlessly expensive. Coalesce them into a single call. + // + // We keep track of state in TLS storage that we can access through + // the %fs segment register. See trusted_thread.cc for the exact + // memory layout. + // + // TODO(markus): maybe, we should proactively call gettimeofday() and + // clock_gettime(), whenever we talk to the trusted thread? + // or maybe, if we have recently seen requests to compute + // the time. There might be a repeated pattern of those. + "cmp $78, %eax\n" // __NR_gettimeofday + "jnz 2f\n" + "cmp %eax, %fs:0x102C-0x54\n" // last system call + "jnz 0f\n" + + // This system call and the last system call prior to this one both are + // calls to gettimeofday(). Try to avoid making the new call and just + // return the same result as in the previous call. + // Just in case the caller is spinning on the result from gettimeofday(), + // every so often, call the actual system call. + "decl %fs:0x1030-0x54\n" // countdown calls to gettimeofday() + "jz 0f\n" + + // Atomically read the 64bit word representing last-known timestamp and + // return it to the caller. On x86-32 this is a little more complicated and + // requires the use of the cmpxchg8b instruction. + "mov %ebx, %eax\n" + "mov %ecx, %edx\n" + "lock; cmpxchg8b 100f\n" + "mov %eax, 0(%ebx)\n" + "mov %edx, 4(%ebx)\n" + "xor %eax, %eax\n" + "add $28, %esp\n" + "jmp 4f\n" + + // This is a call to gettimeofday(), but we don't have a valid cached + // result, yet. + "0:mov %eax, %fs:0x102C-0x54\n" // remember syscall number + "movl $500, %fs:0x1030-0x54\n" // make system call, each 500 invocations + "call playground$defaultSystemCallHandler\n" + + // Returned from gettimeofday(). 
Remember return value, in case the + // application calls us again right away. + // Again, this has to happen atomically and requires cmpxchg8b. + "mov 4(%ebx), %ecx\n" + "mov 0(%ebx), %ebx\n" + "mov 100f, %eax\n" + "mov 101f, %edx\n" + "1:lock; cmpxchg8b 100f\n" + "jnz 1b\n" + "xor %eax, %eax\n" + "jmp 6f\n" + + // Remember the number of the last system call made. We deliberately do + // not remember calls to gettid(), as we have often seen long sequences + // of calls to just gettimeofday() and gettid(). In that situation, we + // would still like to coalesce the gettimeofday() calls. + "2:cmp $224, %eax\n" // __NR_gettid + "jz 3f\n" + "mov %eax, %fs:0x102C-0x54\n" // remember syscall number // Retrieve function call from system call table (c.f. syscall_table.c). // We have three different types of entries; zero for denied system calls, // that should be handled by the defaultSystemCallHandler(); minus one // for unrestricted system calls that need to be forwarded to the trusted // thread; and function pointers to specific handler functions. - "shl $3, %eax\n" + "3:shl $3, %eax\n" "lea playground$syscallTable, %ebx\n" "add %ebx, %eax\n" "mov 0(%eax), %eax\n" @@ -161,14 +227,13 @@ asm( // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise // jump to fallback handler. "cmp $1, %eax\n" - "jbe 1f\n" + "jbe 5f\n" "add $4, %esp\n" "call *%eax\n" "add $24, %esp\n" - "0:" // Restore CPU registers, except for %eax which was set by the system call. - "pop %ebp\n" + "4:pop %ebp\n" "pop %edi\n" "pop %esi\n" "pop %edx\n" @@ -178,13 +243,16 @@ asm( // Return to caller "ret\n" - "1:" // Call default handler. 
- "push $2f\n" - "push $playground$defaultSystemCallHandler\n" - "ret\n" - "2:add $28, %esp\n" - "jmp 0b\n" + "5:call playground$defaultSystemCallHandler\n" + "6:add $28, %esp\n" + "jmp 4b\n" + + ".pushsection \".bss\"\n" + ".balign 8\n" +"100:.byte 0, 0, 0, 0\n" +"101:.byte 0, 0, 0, 0\n" + ".popsection\n" #else #error Unsupported target platform diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc index 6edc05d..af2e913 100644 --- a/sandbox/linux/seccomp/trusted_thread.cc +++ b/sandbox/linux/seccomp/trusted_thread.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #include "sandbox_impl.h" #include "syscall_table.h" @@ -17,7 +21,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov %0, %%rbp\n" // %rbp = args "xor %%rbx, %%rbx\n" // initial sequence number "lea 999f(%%rip), %%r15\n" // continue in same thread - "jmp 19f\n" // create trusted thread + "jmp 20f\n" // create trusted thread // TODO(markus): Coalesce the read() operations by reading into a bigger // buffer. 
@@ -140,7 +144,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "test %%rax, %%rax\n" "js 25f\n" // exit process "mov %%rax, %%rdi\n" - "jnz 7f\n" // wait for child, then return result + "jnz 8f\n" // wait for child, then return result "mov %%fs:0x0, %%rdi\n" // start = secure_mem "mov $4096, %%esi\n" // len = 4096 "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE @@ -148,17 +152,43 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "syscall\n" "mov %%r8d, 0xD4(%%rdi)\n" // set most recently returned SysV shm id "xor %%rdi, %%rdi\n" + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "cmpw $0, %%fs:0xD0\n" // debug mode + "jz 26f\n" + "mov $1, %%eax\n" // NR_write + "mov $2, %%edi\n" // fd = stderr + "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "syscall\n" + "xor %%rdi, %%rdi\n" + #endif + "jmp 26f\n" // exit program, no message "4:syscall\n" - "jmp 14f\n" // return result + "jmp 15f\n" // return result // If syscall number is -2, execute locked system call from the // secure memory area - "5:jg 11f\n" + "5:jg 12f\n" "cmp $-2, %%eax\n" - "jnz 8f\n" + "jnz 9f\n" "cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "cmpw $0, %%fs:0xD0\n" // debug mode + "jz 6f\n" + "mov $1, %%eax\n" // NR_write + "mov $2, %%edi\n" // fd = stderr + "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "syscall\n" + "6:" + #endif + "mov %%fs:0x10, %%rax\n" "mov %%fs:0x18, %%rdi\n" "mov %%fs:0x20, %%rsi\n" @@ -171,17 +201,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // clone() has unusual calling conventions and must be handled specially "cmp $56, %%rax\n" // NR_clone - "jz 18f\n" + "jz 19f\n" // exit() terminates trusted thread "cmp $60, 
%%eax\n" // NR_exit - "jz 17f\n" + "jz 18f\n" // Perform requested system call "syscall\n" // Unlock mutex - "6:cmp %%rbx, %%fs:0x8\n" + "7:cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process "add $2, %%rbx\n" "mov %%rax, %%r8\n" @@ -193,37 +223,37 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "js 25f\n" // exit process "jz 22f\n" // unlock and exit "mov %%rax, %%rdi\n" - "7:xor %%rsi, %%rsi\n" + "8:xor %%rsi, %%rsi\n" "xor %%rdx, %%rdx\n" "xor %%r10, %%r10\n" "mov $61, %%eax\n" // NR_wait4 "syscall\n" "cmp $-4, %%eax\n" // EINTR - "jz 7b\n" + "jz 8b\n" "mov %%r8, %%rax\n" - "jmp 14f\n" // return result + "jmp 15f\n" // return result // If syscall number is -3, read the time stamp counter - "8:cmp $-3, %%eax\n" - "jnz 9f\n" + "9:cmp $-3, %%eax\n" + "jnz 10f\n" "rdtsc\n" // sets %edx:%eax "xor %%rcx, %%rcx\n" - "jmp 10f\n" - "9:cmp $-4, %%eax\n" - "jnz 11f\n" + "jmp 11f\n" + "10:cmp $-4, %%eax\n" + "jnz 12f\n" "rdtscp\n" // sets %edx:%eax and %ecx - "10:add $0x3C, %%rsi\n" + "11:add $0x3C, %%rsi\n" "mov %%eax, 0(%%rsi)\n" "mov %%edx, 4(%%rsi)\n" "mov %%ecx, 8(%%rsi)\n" "mov $12, %%edx\n" - "jmp 15f\n" // return result + "jmp 16f\n" // return result // Check in syscallTable whether this system call is unrestricted - "11:mov %%rax, %%r9\n" + "12:mov %%rax, %%r9\n" #ifndef NDEBUG "cmpw $0, %%fs:0xD0\n" // debug mode - "jnz 12f\n" + "jnz 13f\n" #endif "cmp playground$maxSyscall(%%rip), %%eax\n" "ja 25f\n" // exit process @@ -236,14 +266,14 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Default behavior for unrestricted system calls is to just execute // them. Read the remaining arguments first. 
- "12:mov %%rsi, %%r8\n" + "13:mov %%rsi, %%r8\n" "xor %%rax, %%rax\n" // NR_read "mov %%r13, %%rdi\n" // fd = threadFd "add $4, %%rsi\n" // buf = &scratch + 4 "mov $48, %%edx\n" // len = 6*sizeof(void *) - "13:syscall\n" + "14:syscall\n" "cmp $-4, %%rax\n" // EINTR - "jz 13b\n" + "jz 14b\n" "cmp %%rdx, %%rax\n" "jnz 25f\n" // exit process "mov %%r9, %%rax\n" @@ -258,27 +288,27 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "syscall\n" // Return result of system call to sandboxed thread - "14:mov %%fs:0x0, %%rsi\n" + "15:mov %%fs:0x0, %%rsi\n" "add $0x1034, %%rsi\n" // buf = &scratch + 52 "mov %%rax, (%%rsi)\n" "mov $8, %%edx\n" // len = 8 - "15:mov %%r13, %%rdi\n" // fd = threadFd + "16:mov %%r13, %%rdi\n" // fd = threadFd "mov $1, %%eax\n" // NR_write - "16:syscall\n" + "17:syscall\n" "cmp %%rdx, %%rax\n" "jz 1b\n" "cmp $-4, %%rax\n" // EINTR - "jz 16b\n" + "jz 17b\n" "jmp 25f\n" // exit process // NR_exit: // Exit trusted thread after cleaning up resources - "17:mov %%fs:0x0, %%rsi\n" + "18:mov %%fs:0x0, %%rsi\n" "mov 0xE8(%%rsi), %%rdi\n" // fd = threadFdPub "mov $3, %%eax\n" // NR_close "syscall\n" "mov %%rsi, %%rdi\n" // start = secure_mem - "mov $8192, %%esi\n" // length = 4096 + "mov $8192, %%esi\n" // length = 8192 "xor %%rdx, %%rdx\n" // prot = PROT_NONE "mov $10, %%eax\n" // NR_mprotect "syscall\n" @@ -291,6 +321,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "syscall\n" "mov %%rax, %%rdi\n" "test %%rax, %%rax\n" + "js 26f\n" // exit process "jne 21f\n" // reap helper, exit thread "jmp 22f\n" // unlock mutex @@ -305,20 +336,20 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // terminates the program. But if we ever support signal handling, // we have to be careful that the user cannot install a SIGSEGV // handler that gets executed with elevated privileges. 
- "18:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem + "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem "syscall\n" // calls NR_clone "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values - "jae 6b\n" + "jae 7b\n" // unlock mutex, return result "add $2, %%rbx\n" "test %%rax, %%rax\n" - "jne 14b\n" // return result + "jne 15b\n" // return result // In nascent thread, now. "sub $2, %%rbx\n" "xor %%r15, %%r15\n" // Request to return from clone() when done // Get thread id of nascent thread - "19:mov $186, %%eax\n" // NR_gettid + "20:mov $186, %%eax\n" // NR_gettid "syscall\n" "mov %%rax, %%r14\n" @@ -342,14 +373,6 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // the parent. For our purposes, it is sufficient to fail with a // fatal error. "jmp 25f\n" // exit process - "20:mov $56, %%eax\n" // NR_clone - "mov $17, %%rdi\n" // flags = SIGCHLD - "mov $1, %%rsi\n" // stack = 1 - "syscall\n" - "test %%rax, %%rax\n" - "js 25f\n" // exit process - "jz 22f\n" // unlock and exit - "mov %%rax, %%rdi\n" "21:xor %%rsi, %%rsi\n" "xor %%rdx, %%rdx\n" "xor %%r10, %%r10\n" @@ -374,7 +397,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "24:syscall\n" "25:mov $1, %%eax\n" // NR_write "mov $2, %%edi\n" // fd = stderr - "lea 100f(%%rip), %%rsi\n" + "lea 100f(%%rip), %%rsi\n" // "Sandbox violation detected" "mov $101f-100f, %%edx\n" // len = strlen(msg) "syscall\n" "mov $1, %%edi\n" @@ -426,7 +449,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $0x1001, %%edi\n" // option = ARCH_SET_GS "syscall\n" "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values - "jae 20b\n" // exit thread, unlock global mutex + "jae 25b\n" // exit process // Check whether this is the initial thread, or a newly created one. // At startup we run the same code as when we create a new thread. 
At @@ -524,7 +547,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $61, %%eax\n" // NR_wait4 "syscall\n" "cmp $-4, %%eax\n" // EINTR - "jz 30\n" + "jz 30b\n" // Release privileges by entering seccomp mode. "mov $157, %%eax\n" // NR_prctl @@ -578,7 +601,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, ".pushsection \".rodata\"\n" "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" - "101:\n" + "101:.ascii \"WARNING! This is an expensive system call\\n\"\n" + "102:\n" ".popsection\n" "999:pop %%rbp\n" @@ -613,7 +637,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "movd %%ebx, %%mm3\n" "xor %%ebx, %%ebx\n" // initial sequence number "movd %%ebx, %%mm2\n" - "jmp 19f\n" // create trusted thread + "jmp 20f\n" // create trusted thread // TODO(markus): Coalesce the read() operations by reading into a bigger // buffer. @@ -645,21 +669,20 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // 0x1C: fifth argument; passed to syscall in %edi // 0x20: sixth argument; passed to syscall in %ebp // 0x24: stored return address for clone() system call - // 0x28: second stored return address for clone() system call - // 0x2C: stored %ebp value for clone() system call - // 0x30: stored %edi value for clone() system call - // 0x34: stored %esi value for clone() system call - // 0x38: stored %edx value for clone() system call - // 0x3C: stored %ecx value for clone() system call - // 0x40: stored %ebx value for clone() system call - // 0x44: new shared memory for clone() - // 0x48: processFdPub for talking to trusted process - // 0x4C: cloneFdPub for talking to trusted process - // 0x50: set to non-zero, if in debugging mode - // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE) - // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE) - // 0x60: thread id (TLS_TID) - // 0x68: threadFdPub (TLS_THREAD_FD) + // 0x28: stored %ebp value for clone() system call + // 
0x2C: stored %edi value for clone() system call + // 0x30: stored %esi value for clone() system call + // 0x34: stored %edx value for clone() system call + // 0x38: stored %ecx value for clone() system call + // 0x3C: stored %ebx value for clone() system call + // 0x40: new shared memory for clone() + // 0x44: processFdPub for talking to trusted process + // 0x48: cloneFdPub for talking to trusted process + // 0x4C: set to non-zero, if in debugging mode + // 0x50: most recent SHM id returned by shmget(IPC_PRIVATE) + // 0x54: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0x5C: thread id (TLS_TID) + // 0x64: threadFdPub (TLS_THREAD_FD) // 0x200-0x1000: securely passed verified file name(s) // Layout of (untrusted) scratch space: @@ -674,6 +697,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // 0x20: RDTSCP result (%eax) // 0x24: RDTSCP result (%edx) // 0x28: RDTSCP result (%ecx) + // 0x2C: last system call (updated in syscall.cc) + // 0x30: number of consecutive calls to a time fnc. (e.g. 
gettimeofday) "0:xor %%esp, %%esp\n" "mov $2, %%eax\n" // %mm2 = initial sequence number @@ -738,26 +763,55 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "test %%eax, %%eax\n" "js 25f\n" // exit process "mov %%eax, %%ebx\n" - "jnz 7f\n" // wait for child, then return result + "jnz 8f\n" // wait for child, then return result "movd %%mm5, %%ebx\n" // start = secure_mem "mov $4096, %%ecx\n" // len = 4096 "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE "mov $125, %%eax\n" // NR_mprotect "int $0x80\n" - "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id + "mov %%ebp, 0x50(%%ebx)\n" // set most recently returned SysV shm id "xor %%ebx, %%ebx\n" + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "movd %%mm5, %%ecx\n" + "cmpw $0, 0x4C(%%ecx)\n" // debug mode + "jz 26f\n" + "mov $4, %%eax\n" // NR_write + "mov $2, %%ebx\n" // fd = stderr + "lea 101f, %%ecx\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "int $0x80\n" + "xor %%ebx, %%ebx\n" + #endif + "jmp 26f\n" // exit program, no message "4:int $0x80\n" - "jmp 14f\n" // return result + "jmp 15f\n" // return result // If syscall number is -2, execute locked system call from the // secure memory area - "5:jg 11f\n" + "5:jg 12f\n" "cmp $-2, %%eax\n" - "jnz 8f\n" + "jnz 9f\n" "movd %%mm2, %%ebp\n" "cmp %%ebp, 0x4-0x1000(%%ecx)\n" "jne 25f\n" // exit process + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "cmpw $0, 0x4C-0x1000(%%ecx)\n" + "jz 6f\n" // debug mode + "mov %%ecx, %%ebp\n" + "mov $4, %%eax\n" // NR_write + "mov $2, %%ebx\n" // fd = stderr + "lea 101f, %%ecx\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "int $0x80\n" + "mov %%ebp, %%ecx\n" + "6:" + #endif + "mov 0x08-0x1000(%%ecx), %%eax\n" "mov 0x0C-0x1000(%%ecx), %%ebx\n" "mov 0x14-0x1000(%%ecx), %%edx\n" @@ -774,11 +828,11 @@ void 
Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // clone() has unusual calling conventions and must be handled specially "cmp $120, %%eax\n" // NR_clone - "jz 18f\n" + "jz 19f\n" // exit() terminates trusted thread "cmp $1, %%eax\n" // NR_exit - "jz 17f\n" + "jz 18f\n" // Perform requested system call "movd %%mm4, %%edi\n" @@ -786,7 +840,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "int $0x80\n" // Unlock mutex - "6:movd %%mm2, %%ebp\n" + "7:movd %%mm2, %%ebp\n" "movd %%mm5, %%edi\n" "cmp %%ebp, 4(%%edi)\n" "jne 25f\n" // exit process @@ -801,38 +855,38 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "js 25f\n" // exit process "jz 22f\n" // unlock and exit "mov %%eax, %%ebx\n" - "7:xor %%ecx, %%ecx\n" + "8:xor %%ecx, %%ecx\n" "xor %%edx, %%edx\n" "mov $7, %%eax\n" // NR_waitpid "int $0x80\n" "cmp $-4, %%eax\n" // EINTR "jz 6\n" "mov %%ebp, %%eax\n" - "jmp 14f\n" // return result + "jmp 15f\n" // return result // If syscall number is -3, read the time stamp counter - "8:cmp $-3, %%eax\n" - "jnz 9f\n" + "9:cmp $-3, %%eax\n" + "jnz 10f\n" "rdtsc\n" // sets %edx:%eax "xor %%ecx, %%ecx\n" - "jmp 10f\n" - "9:cmp $-4, %%eax\n" - "jnz 11f\n" + "jmp 11f\n" + "10:cmp $-4, %%eax\n" + "jnz 12f\n" "rdtscp\n" // sets %edx:%eax and %ecx - "10:movd %%mm5, %%ebx\n" + "11:movd %%mm5, %%ebx\n" "add $0x1020, %%ebx\n" "mov %%eax, 0(%%ebx)\n" "mov %%edx, 4(%%ebx)\n" "mov %%ecx, 8(%%ebx)\n" "mov %%ebx, %%ecx\n" "mov $12, %%edx\n" - "jmp 15f\n" // return result + "jmp 16f\n" // return result // Check in syscallTable whether this system call is unrestricted - "11:mov %%eax, %%ebp\n" + "12:mov %%eax, %%ebp\n" #ifndef NDEBUG - "cmpw $0, 0x50-0x1000(%%ecx)\n" - "jnz 12f\n" // debug mode + "cmpw $0, 0x4C-0x1000(%%ecx)\n" + "jnz 13f\n" // debug mode #endif "cmp playground$maxSyscall, %%eax\n" "ja 25f\n" // exit process @@ -844,13 +898,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Default behavior 
for unrestricted system calls is to just execute // them. Read the remaining arguments first. - "12:mov $3, %%eax\n" // NR_read + "13:mov $3, %%eax\n" // NR_read "movd %%mm0, %%ebx\n" // fd = threadFd "add $4, %%ecx\n" // buf = &scratch + 4 "mov $24, %%edx\n" // len = 6*sizeof(void *) - "13:int $0x80\n" + "14:int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 13b\n" + "jz 14b\n" "cmp %%edx, %%eax\n" "jnz 25f\n" // exit process "mov %%ebp, %%eax\n" @@ -865,27 +919,27 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "int $0x80\n" // Return result of system call to sandboxed thread - "14:movd %%mm5, %%ecx\n" + "15:movd %%mm5, %%ecx\n" "add $0x101C, %%ecx\n" // buf = &scratch + 28 "mov %%eax, (%%ecx)\n" "mov $4, %%edx\n" // len = 4 - "15:movd %%mm0, %%ebx\n" // fd = threadFd + "16:movd %%mm0, %%ebx\n" // fd = threadFd "mov $4, %%eax\n" // NR_write - "16:int $0x80\n" + "17:int $0x80\n" "cmp %%edx, %%eax\n" "jz 1b\n" "cmp $-4, %%eax\n" // EINTR - "jz 16b\n" + "jz 17b\n" "jmp 25f\n" // exit process // NR_exit: // Exit trusted thread after cleaning up resources - "17:mov %%edi, %%ecx\n" - "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub + "18:mov %%edi, %%ecx\n" + "mov 0x64(%%ecx), %%ebx\n" // fd = threadFdPub "mov $6, %%eax\n" // NR_close "int $0x80\n" "mov %%ecx, %%ebx\n" // start = secure_mem - "mov $8192, %%ecx\n" // length = 4096 + "mov $8192, %%ecx\n" // length = 8192 "xor %%edx, %%edx\n" // prot = PROT_NONE "mov $125, %%eax\n" // NR_mprotect "int $0x80\n" @@ -898,6 +952,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "int $0x80\n" "mov %%eax, %%ebx\n" "test %%eax, %%eax\n" + "js 25f\n" // exit process "jne 21f\n" // reap helper, exit thread "jmp 22f\n" // unlock mutex @@ -912,17 +967,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // terminates the program. 
But if we ever support signal handling, // we have to be careful that the user cannot install a SIGSEGV // handler that gets executed with elevated privileges. - "18:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem + "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem "movd %%mm4, %%edi\n" "movd %%mm7, %%ebp\n" "int $0x80\n" // calls NR_clone "cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values - "jae 6b\n" + "jae 7b\n" // unlock mutex, return result "movd %%mm2, %%edi\n" "add $2, %%edi\n" "movd %%edi, %%mm2\n" "test %%eax, %%eax\n" - "jne 14b\n" // return result + "jne 15b\n" // return result // In nascent thread, now. "sub $2, %%edi\n" @@ -930,7 +985,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "movd %%eax, %%mm3\n" // Request to return from clone() when done // Get thread id of nascent thread - "19:mov $224, %%eax\n" // NR_gettid + "20:mov $224, %%eax\n" // NR_gettid "int $0x80\n" "movd %%eax, %%mm4\n" @@ -958,14 +1013,6 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // the parent. For our purposes, it is sufficient to fail with a // fatal error. 
"jmp 25f\n" // exit process - "20:mov $120, %%eax\n" // NR_clone - "mov $17, %%ebx\n" // flags = SIGCHLD - "mov $1, %%ecx\n" // stack = 1 - "int $0x80\n" - "test %%eax, %%eax\n" - "js 25f\n" // exit process - "jz 22f\n" // unlock and exit - "mov %%eax, %%ebx\n" "21:xor %%ecx, %%ecx\n" "xor %%edx, %%edx\n" "mov $7, %%eax\n" // NR_waitpid @@ -989,7 +1036,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "24:int $0x80\n" "25:mov $4, %%eax\n" // NR_write "mov $2, %%ebx\n" // fd = stderr - "lea 100f, %%ecx\n" + "lea 100f, %%ecx\n" // "Sandbox violation detected" "mov $101f-100f, %%edx\n" // len = strlen(msg) "int $0x80\n" "mov $1, %%ebx\n" @@ -998,7 +1045,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // The first page is mapped read-only for use as securely shared memory "27:movd %%mm6, %%ebp\n" - "mov 0x44(%%ebp), %%esi\n" + "mov 0x40(%%ebp), %%esi\n" "movd %%esi, %%mm5\n" // %mm5 = secure shared memory "movd %%mm2, %%edi\n" "cmp %%edi, 4(%%ebp)\n" @@ -1024,7 +1071,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $120, %%eax\n" // NR_clone "mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR "mov $1, %%ecx\n" // stack = 1 - "movd 0x48(%%ebp), %%mm1\n" // %mm1 = processFdPub + "movd 0x44(%%ebp), %%mm1\n" // %mm1 = processFdPub "cmp %%edi, 4(%%ebp)\n" "jne 25b\n" // exit process "int $0x80\n" @@ -1037,7 +1084,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "push %%eax\n" "mov $0xFFFFF, %%eax\n" // limit "push %%eax\n" - "add $0x58, %%esi\n" + "add $0x54, %%esi\n" "push %%esi\n" // base_addr = &secure_mem.TLS "mov %%fs, %%eax\n" "shr $3, %%eax\n" @@ -1080,8 +1127,6 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "push %%eax\n" "mov 0x3C(%%ebp), %%eax\n" "push %%eax\n" - "mov 0x40(%%ebp), %%eax\n" - "push %%eax\n" "cmp %%edi, 4(%%ebp)\n" "jne 25b\n" // exit process @@ -1111,7 +1156,7 @@ void Sandbox::createTrustedThread(int 
processFdPub, int cloneFdPub, "movd %%mm0, %%eax\n" // fd1 = threadFd "push %%eax\n" "push %%esi\n" // fd0 = threadFdPub - "mov 0x4C(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub() + "mov 0x48(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub() "cmp %%edi, 4(%%ebp)\n" "jne 25b\n" // exit process "push %%eax\n" @@ -1139,7 +1184,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $7, %%eax\n" // NR_waitpid "int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 30\n" + "jz 30b\n" // Release privileges by entering seccomp mode. "mov $172, %%eax\n" // NR_prctl @@ -1190,7 +1235,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, ".pushsection \".rodata\"\n" "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" - "101:\n" + "101:.ascii \"WARNING! This is an expensive system call\\n\"\n" + "102:\n" ".popsection\n" "999:pop %%ebp\n" |