diff options
Diffstat (limited to 'sandbox/linux')
-rw-r--r-- | sandbox/linux/seccomp/allocator.cc | 136 | ||||
-rw-r--r-- | sandbox/linux/seccomp/allocator.h | 88 | ||||
-rw-r--r-- | sandbox/linux/seccomp/clone.cc | 5 | ||||
-rw-r--r-- | sandbox/linux/seccomp/library.cc | 153 | ||||
-rw-r--r-- | sandbox/linux/seccomp/library.h | 34 | ||||
-rw-r--r-- | sandbox/linux/seccomp/maps.cc | 18 | ||||
-rw-r--r-- | sandbox/linux/seccomp/maps.h | 17 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox.cc | 29 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox_impl.h | 5 | ||||
-rw-r--r-- | sandbox/linux/seccomp/securemem.h | 14 | ||||
-rw-r--r-- | sandbox/linux/seccomp/syscall.cc | 90 | ||||
-rw-r--r-- | sandbox/linux/seccomp/trusted_thread.cc | 270 |
12 files changed, 641 insertions, 218 deletions
diff --git a/sandbox/linux/seccomp/allocator.cc b/sandbox/linux/seccomp/allocator.cc new file mode 100644 index 0000000..6e11a4a --- /dev/null +++ b/sandbox/linux/seccomp/allocator.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// The allocator is very simplistic. It requests memory pages directly from +// the system. Each page starts with a header describing the allocation. This +// makes sure that we can return the memory to the system when it is +// deallocated. +// For allocations that are smaller than a single page, we try to squeeze +// multiple of them into the same page. +// We expect to use this allocator for a moderate number of small allocations. +// In most cases, it will only need to ever make a single request to the +// operating system for the lifetime of the STL container object. +// We don't worry about memory fragmentation as the allocator is expected to +// be short-lived. + +#include <stdint.h> +#include <sys/mman.h> + +#include "allocator.h" +#include "linux_syscall_support.h" + +namespace playground { + +class SysCalls { + public: + #define SYS_CPLUSPLUS + #define SYS_ERRNO my_errno + #define SYS_INLINE inline + #define SYS_PREFIX -1 + #undef SYS_LINUX_SYSCALL_SUPPORT_H + #include "linux_syscall_support.h" + SysCalls() : my_errno(0) { } + int my_errno; +}; +#ifdef __NR_mmap2 + #define MMAP mmap2 + #define __NR_MMAP __NR_mmap2 +#else + #define MMAP mmap + #define __NR_MMAP __NR_mmap +#endif + +// We only ever keep track of the very last partial page that was used for +// allocations. This approach simplifies the code a lot. It can theoretically +// lead to more memory fragmentation, but for our use case that is unlikely +// to happen. +struct Header { + // The total amount of memory allocated for this chunk of memory. Typically, + // this would be a single page. + size_t total_len; + + // "used" keeps track of the number of bytes currently allocated in this + // page. Note that as elements are freed from this page, "used" is updated + // allowing us to track when the page is free. However, these holes in the + // page are never re-used, so "tail" is the only way to find out how much + // free space remains and when we need to request another chunk of memory + // from the system. + size_t used; + void *tail; +}; +static Header* last_alloc; + +void* SystemAllocatorHelper::sys_allocate(size_t size) { + // Number of bytes that need to be allocated + if (size + 3 < size) { + return NULL; + } + size_t len = (size + 3) & ~3; + + if (last_alloc) { + // Remaining space in the last chunk of memory allocated from system + size_t remainder = last_alloc->total_len - + (reinterpret_cast<char *>(last_alloc->tail) - + reinterpret_cast<char *>(last_alloc)); + + if (remainder >= len) { + void* ret = last_alloc->tail; + last_alloc->tail = reinterpret_cast<char *>(last_alloc->tail) + len; + last_alloc->used += len; + return ret; + } + } + + SysCalls sys; + if (sizeof(Header) + len + 4095 < len) { + return NULL; + } + size_t total_len = (sizeof(Header) + len + 4095) & ~4095; + Header* mem = reinterpret_cast<Header *>( + sys.MMAP(NULL, total_len, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); + if (mem == MAP_FAILED) { + return NULL; + } + + // If we were only asked to allocate a single page, then we will use any + // remaining space for other small allocations. + if (total_len - sizeof(Header) - len >= 4) { + last_alloc = mem; + } + mem->total_len = total_len; + mem->used = len; + char* ret = reinterpret_cast<char *>(mem) + sizeof(Header); + mem->tail = ret + len; + + return ret; +} + +void SystemAllocatorHelper::sys_deallocate(void* p, size_t size) { + // Number of bytes in this allocation + if (size + 3 < size) { + return; + } + size_t len = (size + 3) & ~3; + + // All allocations (small and large) have starting addresses in the + // first page that was allocated from the system. This page starts with + // a header that keeps track of how many bytes are currently used. The + // header can be found by truncating the last few bits of the address. + Header* header = reinterpret_cast<Header *>( + reinterpret_cast<uintptr_t>(p) & ~4095); + header->used -= len; + + // After the last allocation has been freed, return the page(s) to the + // system + if (!header->used) { + SysCalls sys; + sys.munmap(header, header->total_len); + if (last_alloc == header) { + last_alloc = NULL; + } + } +} + +} // namespace diff --git a/sandbox/linux/seccomp/allocator.h b/sandbox/linux/seccomp/allocator.h new file mode 100644 index 0000000..29e0065 --- /dev/null +++ b/sandbox/linux/seccomp/allocator.h @@ -0,0 +1,88 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Implement a very basic memory allocator that make direct system calls +// instead of relying on libc. +// This allocator is not thread-safe. + +#ifndef ALLOCATOR_H__ +#define ALLOCATOR_H__ + +#include <cstddef> + +namespace playground { + +class SystemAllocatorHelper { + protected: + static void *sys_allocate(size_t size); + static void sys_deallocate(void* p, size_t size); +}; + +template <class T> +class SystemAllocator : SystemAllocatorHelper { + public: + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef size_t size_type; + typedef std::ptrdiff_t difference_type; + + template <class U> + struct rebind { + typedef SystemAllocator<U> other; + }; + + pointer address(reference value) const { + return &value; + } + + const_pointer address(const_reference value) const { + return &value; + } + + SystemAllocator() throw() { } + SystemAllocator(const SystemAllocator& src) throw() { } + template <class U> SystemAllocator(const SystemAllocator<U>& src) throw() { } + ~SystemAllocator() throw() { } + + size_type max_size() const throw() { + return (1 << 30) / sizeof(T); + } + + pointer allocate(size_type num, const void* = 0) { + if (num > max_size()) { + return NULL; + } + return (pointer)sys_allocate(num * sizeof(T)); + } + + void construct(pointer p, const T& value) { + new(reinterpret_cast<void *>(p))T(value); + } + + void destroy(pointer p) { + p->~T(); + } + + void deallocate(pointer p, size_type num) { + sys_deallocate(p, num * sizeof(T)); + } +}; + +template <class T1, class T2> +bool operator== (const SystemAllocator<T1>&, const SystemAllocator<T2>&) + throw() { + return true; +} +template <class T1, class T2> +bool operator!= (const SystemAllocator<T1>&, const SystemAllocator<T2>&) + throw() { + return false; +} + +} // namespace + +#endif // ALLOCATOR_H__ diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc index 2b6703f..28a3584 100644 --- a/sandbox/linux/seccomp/clone.cc +++ b/sandbox/linux/seccomp/clone.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #include "debug.h" #include "sandbox_impl.h" @@ -84,7 +88,6 @@ bool Sandbox::process_clone(int parentMapsFd, int sandboxFd, int threadFdPub, mem->r14 = clone_req.regs64.r14; mem->r15 = clone_req.regs64.r15; #elif defined(__i386__) - mem->ret2 = clone_req.regs32.ret2; mem->ebp = clone_req.regs32.ebp; mem->edi = clone_req.regs32.edi; mem->esi = clone_req.regs32.esi; diff --git a/sandbox/linux/seccomp/library.cc b/sandbox/linux/seccomp/library.cc index cf7477b..1b06bc1 100644 --- a/sandbox/linux/seccomp/library.cc +++ b/sandbox/linux/seccomp/library.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #define XOPEN_SOURCE 500 #include <algorithm> #include <elf.h> @@ -16,6 +20,7 @@ #include <sys/stat.h> #include <sys/types.h> +#include "allocator.h" #include "debug.h" #include "library.h" #include "sandbox_impl.h" @@ -84,7 +89,11 @@ Library::~Library() { // found. Make sure to preserve any changes that we might have made since. Sandbox::SysCalls sys; sys.mprotect(image_, 4096, PROT_READ | PROT_WRITE); - memcpy(image_, memory_ranges_.rbegin()->second.start, 4096); + if (memcmp(image_, memory_ranges_.rbegin()->second.start, 4096)) { + // Only copy data, if we made any changes in this data. Otherwise there + // is no need to create another modified COW mapping. + memcpy(image_, memory_ranges_.rbegin()->second.start, 4096); + } sys.mprotect(image_, 4096, PROT_READ | PROT_EXEC); sys.mremap(image_, image_size_, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, memory_ranges_.rbegin()->second.start); @@ -173,7 +182,7 @@ char *Library::get(Elf_Addr offset, char *buf, size_t len) { return buf; } -std::string Library::get(Elf_Addr offset) { +Library::string Library::get(Elf_Addr offset) { if (!valid_) { return ""; } @@ -192,7 +201,7 @@ std::string Library::get(Elf_Addr offset) { while (*stop) { ++stop; } - std::string s = stop > start ? std::string(start, stop - start) : ""; + string s = stop > start ? string(start, stop - start) : ""; return s; } @@ -215,8 +224,21 @@ char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) { image_size_ = memory_ranges_.begin()->first + (reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) - reinterpret_cast<char *>(memory_ranges_.begin()->second.start)); + if (image_size_ < 8192) { + // It is possible to create a library that is only a single page in + // size. In that case, we have to make sure that we artificially map + // one extra page past the end of it, as our code relies on mremap() + // actually moving the mapping. + image_size_ = 8192; + } image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_, MREMAP_MAYMOVE)); + if (image_size_ == 8192 && image_ == start) { + // We really mean it, when we say we want the memory to be moved. + image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_, + MREMAP_MAYMOVE)); + sys.munmap(reinterpret_cast<char *>(start) + 4096, 4096); + } if (image_ == MAP_FAILED) { image_ = NULL; } else { @@ -250,7 +272,7 @@ char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) { return buf ? get(offset, buf, len) : NULL; } -std::string Library::getOriginal(Elf_Addr offset) { +Library::string Library::getOriginal(Elf_Addr offset) { if (!valid_) { return ""; } @@ -271,7 +293,7 @@ std::string Library::getOriginal(Elf_Addr offset) { getOriginal(stop - image_, NULL, 1); } } - return std::string(start, stop - start); + return string(start, stop - start); } return ""; } @@ -285,7 +307,7 @@ const Elf_Ehdr* Library::getEhdr() { return &ehdr_; } -const Elf_Shdr* Library::getSection(const std::string& section) { +const Elf_Shdr* Library::getSection(const string& section) { if (!valid_) { return NULL; } @@ -296,7 +318,7 @@ const Elf_Shdr* Library::getSection(const std::string& section) { return &iter->second.second; } -const int Library::getSectionIndex(const std::string& section) { +const int Library::getSectionIndex(const string& section) { if (!valid_) { return -1; } @@ -307,22 +329,6 @@ const int Library::getSectionIndex(const std::string& section) { return iter->second.first; } -void **Library::getRelocation(const std::string& symbol) { - PltTable::const_iterator iter = plt_entries_.find(symbol); - if (iter == plt_entries_.end()) { - return NULL; - } - return reinterpret_cast<void **>(asr_offset_ + iter->second); -} - -void *Library::getSymbol(const std::string& symbol) { - SymbolTable::const_iterator iter = symbols_.find(symbol); - if (iter == symbols_.end() || !iter->second.st_value) { - return NULL; - } - return asr_offset_ + iter->second.st_value; -} - void Library::makeWritable(bool state) const { for (RangeMap::const_iterator iter = memory_ranges_.begin(); iter != memory_ranges_.end(); ++iter) { @@ -380,7 +386,7 @@ char* Library::getScratchSpace(const Maps* maps, char* near, int needed, void Library::patchSystemCallsInFunction(const Maps* maps, char *start, char *end, char** extraSpace, int* extraLength) { - std::set<char *> branch_targets; + std::set<char *, std::less<char *>, SystemAllocator<char *> > branch_targets; for (char *ptr = start; ptr < end; ) { unsigned short insn = next_inst((const char **)&ptr, __WORDSIZE == 64); char *target; @@ -516,12 +522,21 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, } } // We now know, how many instructions neighboring the system call we - // can safely overwrite. We need five bytes to insert a JMP/CALL and a - // 32bit address. We then jump to a code fragment that safely forwards - // to our system call wrapper. On x86-64, this is complicated by - // the fact that the API allows up to 128 bytes of red-zones below the - // current stack pointer. So, we cannot write to the stack until we - // have adjusted the stack pointer. + // can safely overwrite. On x86-32 we need six bytes, and on x86-64 + // We need five bytes to insert a JMPQ and a 32bit address. We then + // jump to a code fragment that safely forwards to our system call + // wrapper. + // On x86-64, this is complicated by the fact that the API allows up + // to 128 bytes of red-zones below the current stack pointer. So, we + // cannot write to the stack until we have adjusted the stack + // pointer. + // On both x86-32 and x86-64 we take care to leave the stack unchanged + // while we are executing the preamble and postamble. This allows us + // to treat instructions that reference %esp/%rsp as safe for + // relocation. + // In particular, this means that on x86-32 we cannot use CALL, but + // have to use a PUSH/RET combination to change the instruction pointer. + // On x86-64, we can instead use a 32bit JMPQ. // // .. .. .. .. ; any leading instructions copied from original code // 48 81 EC 80 00 00 00 SUB $0x80, %rsp @@ -549,9 +564,10 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, // 68 .. .. .. .. PUSH $syscallWrapper // C3 RET // .. .. .. .. ; any trailing instructions copied from original code + // 68 .. .. .. .. PUSH return_addr // C3 RET // - // Total: 12 bytes + any bytes that were copied + // Total: 17 bytes + any bytes that were copied // // For indirect jumps from the VDSO to the VSyscall page, we instead // replace the following code (this is only necessary on x86-64). This @@ -575,7 +591,7 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, // // Total: 52 bytes + any bytes that were copied - if (length < 5) { + if (length < (__WORDSIZE == 32 ? 6 : 5)) { // There are a very small number of instruction sequences that we // cannot easily intercept, and that have been observed in real world // examples. Handle them here: @@ -648,7 +664,7 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, Sandbox::die("Cannot intercept system call"); } } - int needed = 5 - code[codeIdx].len; + int needed = (__WORDSIZE == 32 ? 6 : 5) - code[codeIdx].len; int first = codeIdx; while (needed > 0 && first != startIdx) { first = (first + (sizeof(code) / sizeof(struct Code)) - 1) % @@ -673,7 +689,7 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, needed = 52 + preamble + postamble; } #elif defined(__i386__) - needed = 12 + preamble + postamble; + needed = 17 + preamble + postamble; #else #error Unsupported target platform #endif @@ -752,7 +768,10 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, reinterpret_cast<void *>(&syscallWrapper); } #elif defined(__i386__) - *(dest + preamble + 11 + postamble) = '\xC3'; + *(dest + preamble + 11 + postamble) = '\x68'; // PUSH + *reinterpret_cast<char **>(dest + preamble + 12 + postamble) = + code[second].addr + code[second].len; + *(dest + preamble + 16 + postamble) = '\xC3'; // RET *reinterpret_cast<char **>(dest + preamble + 1) = dest + preamble + 11; *reinterpret_cast<void (**)()>(dest + preamble + 6) = syscallWrapper; @@ -766,14 +785,16 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start, // Replace the system call with an unconditional jump to our new code. #if defined(__x86_64__) - *code[first].addr = '\xE9'; // JMPQ + *code[first].addr = '\xE9'; // JMPQ + *reinterpret_cast<int *>(code[first].addr + 1) = + dest - (code[first].addr + 5); #elif defined(__i386__) - *code[first].addr = '\xE8'; // CALL + code[first].addr[0] = '\x68'; // PUSH + *reinterpret_cast<char **>(code[first].addr + 1) = dest; + code[first].addr[5] = '\xC3'; // RET #else #error Unsupported target platform #endif - *reinterpret_cast<int *>(code[first].addr + 1) = - dest - (code[first].addr + 5); } replaced: codeIdx = (codeIdx + 1) % (sizeof(code) / sizeof(struct Code)); @@ -1049,27 +1070,11 @@ bool Library::parseElf() { &str_shdr)) { // Not all memory mappings are necessarily ELF files. Skip memory // mappings that we cannot identify. + error: valid_ = false; return false; } - // Find PT_DYNAMIC segment. This is what our PLT entries and symbols will - // point to. This information is probably incorrect in the child, as it - // requires access to the original memory mappings. - for (int i = 0; i < ehdr_.e_phnum; i++) { - Elf_Phdr phdr; - if (getOriginal(ehdr_.e_phoff + i*ehdr_.e_phentsize, &phdr) && - phdr.p_type == PT_DYNAMIC) { - RangeMap::const_iterator iter = - memory_ranges_.lower_bound(phdr.p_offset); - if (iter != memory_ranges_.end()) { - asr_offset_ = reinterpret_cast<char *>(iter->second.start) - - (phdr.p_vaddr - (phdr.p_offset - iter->first)); - } - break; - } - } - // Parse section table and find all sections in this ELF file for (int i = 0; i < ehdr_.e_shnum; i++) { Elf_Shdr shdr; @@ -1081,6 +1086,38 @@ bool Library::parseElf() { std::make_pair(i, shdr))); } + // Compute the offset of entries in the .text segment + const Elf_Shdr* text = getSection(".text"); + if (text == NULL) { + // On x86-32, the VDSO is unusual in as much as it does not have a single + // ".text" section. Instead, it has one section per function. Each + // section name starts with ".text". We just need to pick an arbitrary + // one in order to find the asr_offset_ -- which would typically be zero + // for the VDSO. + for (SectionTable::const_iterator iter = section_table_.begin(); + iter != section_table_.end(); ++iter) { + if (!strncmp(iter->first.c_str(), ".text", 5)) { + text = &iter->second.second; + break; + } + } + } + + // Now that we know where the .text segment is located, we can compute the + // asr_offset_. + if (text) { + RangeMap::const_iterator iter = + memory_ranges_.lower_bound(text->sh_offset); + if (iter != memory_ranges_.end()) { + asr_offset_ = reinterpret_cast<char *>(iter->second.start) - + (text->sh_addr - (text->sh_offset - iter->first)); + } else { + goto error; + } + } else { + goto error; + } + return !isVDSO_ || parseSymbols(); } @@ -1128,7 +1165,7 @@ bool Library::parseSymbols() { valid_ = false; return false; } - std::string name = getOriginal(strtab.sh_offset + sym.st_name); + string name = getOriginal(strtab.sh_offset + sym.st_name); if (name.empty()) { continue; } @@ -1147,7 +1184,7 @@ bool Library::parseSymbols() { valid_ = false; return false; } - std::string name = getOriginal(strtab.sh_offset + sym.st_name); + string name = getOriginal(strtab.sh_offset + sym.st_name); if (name.empty()) { continue; } diff --git a/sandbox/linux/seccomp/library.h b/sandbox/linux/seccomp/library.h index 523652c..29a755e 100644 --- a/sandbox/linux/seccomp/library.h +++ b/sandbox/linux/seccomp/library.h @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #ifndef LIBRARY_H__ #define LIBRARY_H__ @@ -30,6 +34,8 @@ namespace playground { class Library { friend class Maps; public: + typedef Maps::string string; + Library() : valid_(false), isVDSO_(false), @@ -50,14 +56,24 @@ class Library { void addMemoryRange(void* start, void* stop, Elf_Addr offset, int prot, int isVDSO) { - memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot))); isVDSO_ = isVDSO; + RangeMap::const_iterator iter = memory_ranges_.find(offset); + if (iter != memory_ranges_.end()) { + // It is possible to have overlapping mappings. This is particularly + // likely to happen with very small programs or libraries. If it does + // happen, we really only care about the text segment. Look for a + // mapping that is mapped executable. + if ((prot & PROT_EXEC) == 0) { + return; + } + } + memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot))); } char *get(Elf_Addr offset, char *buf, size_t len); - std::string get(Elf_Addr offset); + string get(Elf_Addr offset); char *getOriginal(Elf_Addr offset, char *buf, size_t len); - std::string getOriginal(Elf_Addr offset); + string getOriginal(Elf_Addr offset); template<class T>T* get(Elf_Addr offset, T* t) { if (!valid_) { @@ -108,10 +124,8 @@ class Library { bool parseElf(); const Elf_Ehdr* getEhdr(); - const Elf_Shdr* getSection(const std::string& section); - const int getSectionIndex(const std::string& section); - void **getRelocation(const std::string& symbol); - void *getSymbol(const std::string& symbol); + const Elf_Shdr* getSection(const string& section); + const int getSectionIndex(const string& section); void makeWritable(bool state) const; void patchSystemCalls(); bool isVDSO() const { return isVDSO_; } @@ -136,9 +150,9 @@ class Library { }; typedef std::map<Elf_Addr, Range, GreaterThan> RangeMap; - typedef std::map<std::string, std::pair<int, Elf_Shdr> > SectionTable; - typedef std::map<std::string, Elf_Sym> SymbolTable; - typedef std::map<std::string, Elf_Addr> PltTable; + typedef std::map<string, std::pair<int, Elf_Shdr> > SectionTable; + typedef std::map<string, Elf_Sym> SymbolTable; + typedef std::map<string, Elf_Addr> PltTable; char* getBytes(char* dst, const char* src, ssize_t len); static bool isSafeInsn(unsigned short insn); diff --git a/sandbox/linux/seccomp/maps.cc b/sandbox/linux/seccomp/maps.cc index cb303e7..d18405a 100644 --- a/sandbox/linux/seccomp/maps.cc +++ b/sandbox/linux/seccomp/maps.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #include <errno.h> #include <fcntl.h> #include <iostream> @@ -42,18 +46,18 @@ Maps::Maps(int proc_self_maps) : while (*ptr == ' ' || *ptr == '\t') ++ptr; char *perm_ptr = ptr; while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; - std::string perm(perm_ptr, ptr - perm_ptr); + string perm(perm_ptr, ptr - perm_ptr); unsigned long offset = strtoul(ptr, &ptr, 16); while (*ptr == ' ' || *ptr == '\t') ++ptr; char *id_ptr = ptr; while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; while (*ptr == ' ' || *ptr == '\t') ++ptr; while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr; - std::string id(id_ptr, ptr - id_ptr); + string id(id_ptr, ptr - id_ptr); while (*ptr == ' ' || *ptr == '\t') ++ptr; char *library_ptr = ptr; while (*ptr && *ptr != ' ' && *ptr != '\t' && *ptr != '\n') ++ptr; - std::string library(library_ptr, ptr - library_ptr); + string library(library_ptr, ptr - library_ptr); bool isVDSO = false; if (library == "[vdso]") { // /proc/self/maps has a misleading file offset in the [vdso] entry. @@ -66,13 +70,13 @@ Maps::Maps(int proc_self_maps) : goto skip_entry; } int prot = 0; - if (perm.find('r') != std::string::npos) { + if (perm.find('r') != string::npos) { prot |= PROT_READ; } - if (perm.find('w') != std::string::npos) { + if (perm.find('w') != string::npos) { prot |= PROT_WRITE; } - if (perm.find('x') != std::string::npos) { + if (perm.find('x') != string::npos) { prot |= PROT_EXEC; } if ((prot & (PROT_EXEC | PROT_READ)) == 0) { @@ -146,7 +150,7 @@ bool Maps::Iterator::operator!=(const Maps::Iterator& iter) const { return !operator==(iter); } -std::string Maps::Iterator::name() const { +Maps::string Maps::Iterator::name() const { return getIterator()->first; } diff --git a/sandbox/linux/seccomp/maps.h b/sandbox/linux/seccomp/maps.h index 1d30506..5f51782 100644 --- a/sandbox/linux/seccomp/maps.h +++ b/sandbox/linux/seccomp/maps.h @@ -1,9 +1,16 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #ifndef MAPS_H__ #define MAPS_H__ #include <elf.h> +#include <functional> +#include <map> #include <string> -#include <vector> + +#include "allocator.h" #if defined(__x86_64__) typedef Elf64_Addr Elf_Addr; @@ -19,6 +26,9 @@ class Library; class Maps { friend class Library; public: + typedef std::basic_string<char, std::char_traits<char>, + SystemAllocator<char> > string; + Maps(int proc_self_maps); ~Maps() { } @@ -26,7 +36,8 @@ class Maps { // A map with all the libraries currently loaded into the application. // The key is a unique combination of device number, inode number, and // file name. It should be treated as opaque. - typedef std::map<std::string, Library> LibraryMap; + typedef std::map<string, Library, std::less<string>, + SystemAllocator<string> > LibraryMap; friend class Iterator; class Iterator { friend class Maps; @@ -44,7 +55,7 @@ class Maps { Library* operator*() const; bool operator==(const Iterator& iter) const; bool operator!=(const Iterator& iter) const; - std::string name() const; + string name() const; protected: mutable LibraryMap::iterator iter_; diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc index ff2b59e..12f0c0f 100644 --- a/sandbox/linux/seccomp/sandbox.cc +++ b/sandbox/linux/seccomp/sandbox.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #include "library.h" #include "sandbox_impl.h" #include "syscall_table.h" @@ -372,9 +376,10 @@ int Sandbox::supportsSeccompSandbox(int proc_fd) { case 0: { int devnull = sys.open("/dev/null", O_RDWR, 0); if (devnull >= 0) { - dup2(devnull, 0); - dup2(devnull, 1); - dup2(devnull, 2); + sys.dup2(devnull, 0); + sys.dup2(devnull, 1); + sys.dup2(devnull, 2); + sys.close(devnull); } if (proc_fd >= 0) { setProcSelfMaps(sys.openat(proc_fd, "self/maps", O_RDONLY, 0)); @@ -423,7 +428,7 @@ void Sandbox::startSandbox() { SysCalls sys; if (proc_self_maps_ < 0) { - proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0); + proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0); if (proc_self_maps_ < 0) { die("Cannot access \"/proc/self/maps\""); } @@ -431,21 +436,21 @@ void Sandbox::startSandbox() { // The pid is unchanged for the entire program, so we can retrieve it once // and store it in a global variable. - pid_ = sys.getpid(); + pid_ = sys.getpid(); // Block all signals, except for the RDTSC handler setupSignalHandlers(); // Get socketpairs for talking to the trusted process int pair[4]; - if (socketpair(AF_UNIX, SOCK_STREAM, 0, pair) || - socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) { + if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair) || + sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) { die("Failed to create trusted thread"); } - processFdPub_ = pair[0]; - cloneFdPub_ = pair[2]; - SecureMemArgs::Args* secureMem = createTrustedProcess(pair[0], pair[1], - pair[2], pair[3]); + processFdPub_ = pair[0]; + cloneFdPub_ = pair[2]; + SecureMemArgs* secureMem = createTrustedProcess(pair[0], pair[1], + pair[2], pair[3]); // We find all libraries that have system calls and redirect the system // calls to the sandbox. If we miss any system calls, the application will be @@ -454,7 +459,7 @@ void Sandbox::startSandbox() { // correctly. { Maps maps(proc_self_maps_); - const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL }; + const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL }; // Intercept system calls in the VDSO segment (if any). This has to happen // before intercepting system calls in any of the other libraries, as diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h index 79621d6..0a98283 100644 --- a/sandbox/linux/seccomp/sandbox_impl.h +++ b/sandbox/linux/seccomp/sandbox_impl.h @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #ifndef SANDBOX_IMPL_H__ #define SANDBOX_IMPL_H__ @@ -360,7 +364,6 @@ class Sandbox { void* edx; void* ecx; void* ebx; - void* ret2; } regs32 __attribute__((packed)); #else #error Unsupported target platform diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h index 4c208ce..f9a5c97 100644 --- a/sandbox/linux/seccomp/securemem.h +++ b/sandbox/linux/seccomp/securemem.h @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #ifndef SECURE_MEM_H__ #define SECURE_MEM_H__ @@ -50,7 +54,6 @@ class SecureMem { void* r14; void* r15; #elif defined(__i386__) - void* ret2; void* ebp; void* edi; void* esi; @@ -86,9 +89,9 @@ class SecureMem { char securePage[4096]; }; union { - // This scratch space is used by the trusted thread to read parameters - // for unrestricted system calls. struct { + // This scratch space is used by the trusted thread to read parameters + // for unrestricted system calls. long tmpSyscallNum; void* tmpArg1; void* tmpArg2; @@ -97,6 +100,11 @@ class SecureMem { void* tmpArg5; void* tmpArg6; void* tmpReturnValue; + + // We often have long sequences of calls to gettimeofday(). This is + // needlessly expensive. Coalesce them into a single call. + long lastSyscallNum; + int gettimeofdayCounter; } __attribute__((packed)); char scratchPage[4096]; }; diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc index e1e2547..d3dc7aa 100644 --- a/sandbox/linux/seccomp/syscall.cc +++ b/sandbox/linux/seccomp/syscall.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #include "debug.h" #include "sandbox_impl.h" #include "syscall_table.h" @@ -146,14 +150,76 @@ asm( // Check range of system call "cmp playground$maxSyscall, %eax\n" - "ja 1f\n" + "ja 5f\n" + + // We often have long sequences of calls to gettimeofday(). This is + // needlessly expensive. Coalesce them into a single call. + // + // We keep track of state in TLS storage that we can access through + // the %fs segment register. See trusted_thread.cc for the exact + // memory layout. + // + // TODO(markus): maybe, we should proactively call gettimeofday() and + // clock_gettime(), whenever we talk to the trusted thread? + // or maybe, if we have recently seen requests to compute + // the time. There might be a repeated pattern of those. + "cmp $78, %eax\n" // __NR_gettimeofday + "jnz 2f\n" + "cmp %eax, %fs:0x102C-0x54\n" // last system call + "jnz 0f\n" + + // This system call and the last system call prior to this one both are + // calls to gettimeofday(). Try to avoid making the new call and just + // return the same result as in the previous call. + // Just in case the caller is spinning on the result from gettimeofday(), + // every so often, call the actual system call. + "decl %fs:0x1030-0x54\n" // countdown calls to gettimofday() + "jz 0f\n" + + // Atomically read the 64bit word representing last-known timestamp and + // return it to the caller. On x86-32 this is a little more complicated and + // requires the use of the cmpxchg8b instruction. + "mov %ebx, %eax\n" + "mov %ecx, %edx\n" + "lock; cmpxchg8b 100f\n" + "mov %eax, 0(%ebx)\n" + "mov %edx, 4(%ebx)\n" + "xor %eax, %eax\n" + "add $28, %esp\n" + "jmp 4f\n" + + // This is a call to gettimeofday(), but we don't have a valid cached + // result, yet. + "0:mov %eax, %fs:0x102C-0x54\n" // remember syscall number + "movl $500, %fs:0x1030-0x54\n" // make system call, each 500 invocations + "call playground$defaultSystemCallHandler\n" + + // Returned from gettimeofday(). Remember return value, in case the + // application calls us again right away. + // Again, this has to happen atomically and requires cmpxchg8b. + "mov 4(%ebx), %ecx\n" + "mov 0(%ebx), %ebx\n" + "mov 100f, %eax\n" + "mov 101f, %edx\n" + "1:lock; cmpxchg8b 100f\n" + "jnz 1b\n" + "xor %eax, %eax\n" + "jmp 6f\n" + + // Remember the number of the last system call made. We deliberately do + // not remember calls to gettid(), as we have often seen long sequences + // of calls to just gettimeofday() and gettid(). In that situation, we + // would still like to coalesce the gettimeofday() calls. + "2:cmp $224, %eax\n" // __NR_gettid + "jz 3f\n" + "mov %eax, %fs:0x102C-0x54\n" // remember syscall number // Retrieve function call from system call table (c.f. syscall_table.c). // We have three different types of entries; zero for denied system calls, // that should be handled by the defaultSystemCallHandler(); minus one // for unrestricted system calls that need to be forwarded to the trusted // thread; and function pointers to specific handler functions. - "shl $3, %eax\n" + "3:shl $3, %eax\n" "lea playground$syscallTable, %ebx\n" "add %ebx, %eax\n" "mov 0(%eax), %eax\n" @@ -161,14 +227,13 @@ asm( // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise // jump to fallback handler. "cmp $1, %eax\n" - "jbe 1f\n" + "jbe 5f\n" "add $4, %esp\n" "call *%eax\n" "add $24, %esp\n" - "0:" // Restore CPU registers, except for %eax which was set by the system call. - "pop %ebp\n" + "4:pop %ebp\n" "pop %edi\n" "pop %esi\n" "pop %edx\n" @@ -178,13 +243,16 @@ asm( // Return to caller "ret\n" - "1:" // Call default handler. - "push $2f\n" - "push $playground$defaultSystemCallHandler\n" - "ret\n" - "2:add $28, %esp\n" - "jmp 0b\n" + "5:call playground$defaultSystemCallHandler\n" + "6:add $28, %esp\n" + "jmp 4b\n" + + ".pushsection \".bss\"\n" + ".balign 8\n" +"100:.byte 0, 0, 0, 0\n" +"101:.byte 0, 0, 0, 0\n" + ".popsection\n" #else #error Unsupported target platform diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc index 6edc05d..af2e913 100644 --- a/sandbox/linux/seccomp/trusted_thread.cc +++ b/sandbox/linux/seccomp/trusted_thread.cc @@ -1,3 +1,7 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + #include "sandbox_impl.h" #include "syscall_table.h" @@ -17,7 +21,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov %0, %%rbp\n" // %rbp = args "xor %%rbx, %%rbx\n" // initial sequence number "lea 999f(%%rip), %%r15\n" // continue in same thread - "jmp 19f\n" // create trusted thread + "jmp 20f\n" // create trusted thread // TODO(markus): Coalesce the read() operations by reading into a bigger // buffer. @@ -140,7 +144,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "test %%rax, %%rax\n" "js 25f\n" // exit process "mov %%rax, %%rdi\n" - "jnz 7f\n" // wait for child, then return result + "jnz 8f\n" // wait for child, then return result "mov %%fs:0x0, %%rdi\n" // start = secure_mem "mov $4096, %%esi\n" // len = 4096 "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE @@ -148,17 +152,43 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "syscall\n" "mov %%r8d, 0xD4(%%rdi)\n" // set most recently returned SysV shm id "xor %%rdi, %%rdi\n" + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "cmpw $0, %%fs:0xD0\n" // debug mode + "jz 26f\n" + "mov $1, %%eax\n" // NR_write + "mov $2, %%edi\n" // fd = stderr + "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "syscall\n" + "xor %%rdi, %%rdi\n" + #endif + "jmp 26f\n" // exit program, no message "4:syscall\n" - "jmp 14f\n" // return result + "jmp 15f\n" // return result // If syscall number is -2, execute locked system call from the // secure memory area - "5:jg 11f\n" + "5:jg 12f\n" "cmp $-2, %%eax\n" - "jnz 8f\n" + "jnz 9f\n" "cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "cmpw $0, %%fs:0xD0\n" // debug mode + "jz 6f\n" + "mov $1, %%eax\n" // NR_write + "mov $2, %%edi\n" // fd = stderr + "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "syscall\n" + "6:" + #endif + "mov %%fs:0x10, %%rax\n" "mov %%fs:0x18, %%rdi\n" "mov %%fs:0x20, %%rsi\n" @@ -171,17 +201,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // clone() has unusual calling conventions and must be handled specially "cmp $56, %%rax\n" // NR_clone - "jz 18f\n" + "jz 19f\n" // exit() terminates trusted thread "cmp $60, %%eax\n" // NR_exit - "jz 17f\n" + "jz 18f\n" // Perform requested system call "syscall\n" // Unlock mutex - "6:cmp %%rbx, %%fs:0x8\n" + "7:cmp %%rbx, %%fs:0x8\n" "jne 25f\n" // exit process "add $2, %%rbx\n" "mov %%rax, %%r8\n" @@ -193,37 +223,37 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "js 25f\n" // exit process "jz 22f\n" // unlock and exit "mov %%rax, %%rdi\n" - "7:xor %%rsi, %%rsi\n" + "8:xor %%rsi, %%rsi\n" "xor %%rdx, %%rdx\n" "xor %%r10, %%r10\n" "mov $61, %%eax\n" // NR_wait4 "syscall\n" "cmp $-4, %%eax\n" // EINTR - "jz 7b\n" + "jz 8b\n" "mov %%r8, %%rax\n" - "jmp 14f\n" // return result + "jmp 15f\n" // return result // If syscall number is -3, read the time stamp counter - "8:cmp $-3, %%eax\n" - "jnz 9f\n" + "9:cmp $-3, %%eax\n" + "jnz 10f\n" "rdtsc\n" // sets %edx:%eax "xor %%rcx, %%rcx\n" - "jmp 10f\n" - "9:cmp $-4, %%eax\n" - "jnz 11f\n" + "jmp 11f\n" + "10:cmp $-4, %%eax\n" + "jnz 12f\n" "rdtscp\n" // sets %edx:%eax and %ecx - "10:add $0x3C, %%rsi\n" + "11:add $0x3C, %%rsi\n" "mov %%eax, 0(%%rsi)\n" "mov %%edx, 4(%%rsi)\n" "mov %%ecx, 8(%%rsi)\n" "mov $12, %%edx\n" - "jmp 15f\n" // return result + "jmp 16f\n" // return result // Check in syscallTable whether this system call is unrestricted - "11:mov %%rax, %%r9\n" + "12:mov %%rax, %%r9\n" #ifndef NDEBUG "cmpw $0, %%fs:0xD0\n" // debug mode - "jnz 12f\n" + "jnz 13f\n" #endif "cmp playground$maxSyscall(%%rip), %%eax\n" "ja 25f\n" // exit process @@ -236,14 +266,14 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Default behavior for unrestricted system calls is to just execute // them. Read the remaining arguments first. - "12:mov %%rsi, %%r8\n" + "13:mov %%rsi, %%r8\n" "xor %%rax, %%rax\n" // NR_read "mov %%r13, %%rdi\n" // fd = threadFd "add $4, %%rsi\n" // buf = &scratch + 4 "mov $48, %%edx\n" // len = 6*sizeof(void *) - "13:syscall\n" + "14:syscall\n" "cmp $-4, %%rax\n" // EINTR - "jz 13b\n" + "jz 14b\n" "cmp %%rdx, %%rax\n" "jnz 25f\n" // exit process "mov %%r9, %%rax\n" @@ -258,27 +288,27 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "syscall\n" // Return result of system call to sandboxed thread - "14:mov %%fs:0x0, %%rsi\n" + "15:mov %%fs:0x0, %%rsi\n" "add $0x1034, %%rsi\n" // buf = &scratch + 52 "mov %%rax, (%%rsi)\n" "mov $8, %%edx\n" // len = 8 - "15:mov %%r13, %%rdi\n" // fd = threadFd + "16:mov %%r13, %%rdi\n" // fd = threadFd "mov $1, %%eax\n" // NR_write - "16:syscall\n" + "17:syscall\n" "cmp %%rdx, %%rax\n" "jz 1b\n" "cmp $-4, %%rax\n" // EINTR - "jz 16b\n" + "jz 17b\n" "jmp 25f\n" // exit process // NR_exit: // Exit trusted thread after cleaning up resources - "17:mov %%fs:0x0, %%rsi\n" + "18:mov %%fs:0x0, %%rsi\n" "mov 0xE8(%%rsi), %%rdi\n" // fd = threadFdPub "mov $3, %%eax\n" // NR_close "syscall\n" "mov %%rsi, %%rdi\n" // start = secure_mem - "mov $8192, %%esi\n" // length = 4096 + "mov $8192, %%esi\n" // length = 8192 "xor %%rdx, %%rdx\n" // prot = PROT_NONE "mov $10, %%eax\n" // NR_mprotect "syscall\n" @@ -291,6 +321,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "syscall\n" "mov %%rax, %%rdi\n" "test %%rax, %%rax\n" + "js 26f\n" // exit process "jne 21f\n" // reap helper, exit thread "jmp 22f\n" // unlock mutex @@ -305,20 +336,20 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // terminates the program. But if we ever support signal handling, // we have to be careful that the user cannot install a SIGSEGV // handler that gets executed with elevated privileges. - "18:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem + "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem "syscall\n" // calls NR_clone "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values - "jae 6b\n" + "jae 7b\n" // unlock mutex, return result "add $2, %%rbx\n" "test %%rax, %%rax\n" - "jne 14b\n" // return result + "jne 15b\n" // return result // In nascent thread, now. "sub $2, %%rbx\n" "xor %%r15, %%r15\n" // Request to return from clone() when done // Get thread id of nascent thread - "19:mov $186, %%eax\n" // NR_gettid + "20:mov $186, %%eax\n" // NR_gettid "syscall\n" "mov %%rax, %%r14\n" @@ -342,14 +373,6 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // the parent. For our purposes, it is sufficient to fail with a // fatal error. "jmp 25f\n" // exit process - "20:mov $56, %%eax\n" // NR_clone - "mov $17, %%rdi\n" // flags = SIGCHLD - "mov $1, %%rsi\n" // stack = 1 - "syscall\n" - "test %%rax, %%rax\n" - "js 25f\n" // exit process - "jz 22f\n" // unlock and exit - "mov %%rax, %%rdi\n" "21:xor %%rsi, %%rsi\n" "xor %%rdx, %%rdx\n" "xor %%r10, %%r10\n" @@ -374,7 +397,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "24:syscall\n" "25:mov $1, %%eax\n" // NR_write "mov $2, %%edi\n" // fd = stderr - "lea 100f(%%rip), %%rsi\n" + "lea 100f(%%rip), %%rsi\n" // "Sandbox violation detected" "mov $101f-100f, %%edx\n" // len = strlen(msg) "syscall\n" "mov $1, %%edi\n" @@ -426,7 +449,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $0x1001, %%edi\n" // option = ARCH_SET_GS "syscall\n" "cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values - "jae 20b\n" // exit thread, unlock global mutex + "jae 25b\n" // exit process // Check whether this is the initial thread, or a newly created one. // At startup we run the same code as when we create a new thread. At @@ -524,7 +547,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $61, %%eax\n" // NR_wait4 "syscall\n" "cmp $-4, %%eax\n" // EINTR - "jz 30\n" + "jz 30b\n" // Release privileges by entering seccomp mode. "mov $157, %%eax\n" // NR_prctl @@ -578,7 +601,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, ".pushsection \".rodata\"\n" "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" - "101:\n" + "101:.ascii \"WARNING! This is an expensive system call\\n\"\n" + "102:\n" ".popsection\n" "999:pop %%rbp\n" @@ -613,7 +637,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "movd %%ebx, %%mm3\n" "xor %%ebx, %%ebx\n" // initial sequence number "movd %%ebx, %%mm2\n" - "jmp 19f\n" // create trusted thread + "jmp 20f\n" // create trusted thread // TODO(markus): Coalesce the read() operations by reading into a bigger // buffer. @@ -645,21 +669,20 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // 0x1C: fifth argument; passed to syscall in %edi // 0x20: sixth argument; passed to syscall in %ebp // 0x24: stored return address for clone() system call - // 0x28: second stored return address for clone() system call - // 0x2C: stored %ebp value for clone() system call - // 0x30: stored %edi value for clone() system call - // 0x34: stored %esi value for clone() system call - // 0x38: stored %edx value for clone() system call - // 0x3C: stored %ecx value for clone() system call - // 0x40: stored %ebx value for clone() system call - // 0x44: new shared memory for clone() - // 0x48: processFdPub for talking to trusted process - // 0x4C: cloneFdPub for talking to trusted process - // 0x50: set to non-zero, if in debugging mode - // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE) - // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE) - // 0x60: thread id (TLS_TID) - // 0x68: threadFdPub (TLS_THREAD_FD) + // 0x28: stored %ebp value for clone() system call + // 0x2C: stored %edi value for clone() system call + // 0x30: stored %esi value for clone() system call + // 0x34: stored %edx value for clone() system call + // 0x38: stored %ecx value for clone() system call + // 0x3C: stored %ebx value for clone() system call + // 0x40: new shared memory for clone() + // 0x44: processFdPub for talking to trusted process + // 0x48: cloneFdPub for talking to trusted process + // 0x4C: set to non-zero, if in debugging mode + // 0x50: most recent SHM id returned by shmget(IPC_PRIVATE) + // 0x54: cookie assigned to us by the trusted process (TLS_COOKIE) + // 0x5C: thread id (TLS_TID) + // 0x64: threadFdPub (TLS_THREAD_FD) // 0x200-0x1000: securely passed verified file name(s) // Layout of (untrusted) scratch space: @@ -674,6 +697,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // 0x20: RDTSCP result (%eax) // 0x24: RDTSCP result (%edx) // 0x28: RDTSCP result (%ecx) + // 0x2C: last system call (updated in syscall.cc) + // 0x30: number of consecutive calls to a time fnc. (e.g. gettimeofday) "0:xor %%esp, %%esp\n" "mov $2, %%eax\n" // %mm2 = initial sequence number @@ -738,26 +763,55 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "test %%eax, %%eax\n" "js 25f\n" // exit process "mov %%eax, %%ebx\n" - "jnz 7f\n" // wait for child, then return result + "jnz 8f\n" // wait for child, then return result "movd %%mm5, %%ebx\n" // start = secure_mem "mov $4096, %%ecx\n" // len = 4096 "mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE "mov $125, %%eax\n" // NR_mprotect "int $0x80\n" - "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id + "mov %%ebp, 0x50(%%ebx)\n" // set most recently returned SysV shm id "xor %%ebx, %%ebx\n" + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "movd %%mm5, %%ecx\n" + "cmpw $0, 0x4C(%%ecx)\n" // debug mode + "jz 26f\n" + "mov $4, %%eax\n" // NR_write + "mov $2, %%ebx\n" // fd = stderr + "lea 101f, %%ecx\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "int $0x80\n" + "xor %%ebx, %%ebx\n" + #endif + "jmp 26f\n" // exit program, no message "4:int $0x80\n" - "jmp 14f\n" // return result + "jmp 15f\n" // return result // If syscall number is -2, execute locked system call from the // secure memory area - "5:jg 11f\n" + "5:jg 12f\n" "cmp $-2, %%eax\n" - "jnz 8f\n" + "jnz 9f\n" "movd %%mm2, %%ebp\n" "cmp %%ebp, 0x4-0x1000(%%ecx)\n" "jne 25f\n" // exit process + + // When debugging messages are enabled, warn about expensive system calls + #ifndef NDEBUG + "cmpw $0, 0x4C-0x1000(%%ecx)\n" + "jz 6f\n" // debug mode + "mov %%ecx, %%ebp\n" + "mov $4, %%eax\n" // NR_write + "mov $2, %%ebx\n" // fd = stderr + "lea 101f, %%ecx\n" // "This is an expensive system call" + "mov $102f-101f, %%edx\n" // len = strlen(msg) + "int $0x80\n" + "mov %%ebp, %%ecx\n" + "6:" + #endif + "mov 0x08-0x1000(%%ecx), %%eax\n" "mov 0x0C-0x1000(%%ecx), %%ebx\n" "mov 0x14-0x1000(%%ecx), %%edx\n" @@ -774,11 +828,11 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // clone() has unusual calling conventions and must be handled specially "cmp $120, %%eax\n" // NR_clone - "jz 18f\n" + "jz 19f\n" // exit() terminates trusted thread "cmp $1, %%eax\n" // NR_exit - "jz 17f\n" + "jz 18f\n" // Perform requested system call "movd %%mm4, %%edi\n" @@ -786,7 +840,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "int $0x80\n" // Unlock mutex - "6:movd %%mm2, %%ebp\n" + "7:movd %%mm2, %%ebp\n" "movd %%mm5, %%edi\n" "cmp %%ebp, 4(%%edi)\n" "jne 25f\n" // exit process @@ -801,38 +855,38 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "js 25f\n" // exit process "jz 22f\n" // unlock and exit "mov %%eax, %%ebx\n" - "7:xor %%ecx, %%ecx\n" + "8:xor %%ecx, %%ecx\n" "xor %%edx, %%edx\n" "mov $7, %%eax\n" // NR_waitpid "int $0x80\n" "cmp $-4, %%eax\n" // EINTR "jz 6\n" "mov %%ebp, %%eax\n" - "jmp 14f\n" // return result + "jmp 15f\n" // return result // If syscall number is -3, read the time stamp counter - "8:cmp $-3, %%eax\n" - "jnz 9f\n" + "9:cmp $-3, %%eax\n" + "jnz 10f\n" "rdtsc\n" // sets %edx:%eax "xor %%ecx, %%ecx\n" - "jmp 10f\n" - "9:cmp $-4, %%eax\n" - "jnz 11f\n" + "jmp 11f\n" + "10:cmp $-4, %%eax\n" + "jnz 12f\n" "rdtscp\n" // sets %edx:%eax and %ecx - "10:movd %%mm5, %%ebx\n" + "11:movd %%mm5, %%ebx\n" "add $0x1020, %%ebx\n" "mov %%eax, 0(%%ebx)\n" "mov %%edx, 4(%%ebx)\n" "mov %%ecx, 8(%%ebx)\n" "mov %%ebx, %%ecx\n" "mov $12, %%edx\n" - "jmp 15f\n" // return result + "jmp 16f\n" // return result // Check in syscallTable whether this system call is unrestricted - "11:mov %%eax, %%ebp\n" + "12:mov %%eax, %%ebp\n" #ifndef NDEBUG - "cmpw $0, 0x50-0x1000(%%ecx)\n" - "jnz 12f\n" // debug mode + "cmpw $0, 0x4C-0x1000(%%ecx)\n" + "jnz 13f\n" // debug mode #endif "cmp playground$maxSyscall, %%eax\n" "ja 25f\n" // exit process @@ -844,13 +898,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // Default behavior for unrestricted system calls is to just execute // them. Read the remaining arguments first. - "12:mov $3, %%eax\n" // NR_read + "13:mov $3, %%eax\n" // NR_read "movd %%mm0, %%ebx\n" // fd = threadFd "add $4, %%ecx\n" // buf = &scratch + 4 "mov $24, %%edx\n" // len = 6*sizeof(void *) - "13:int $0x80\n" + "14:int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 13b\n" + "jz 14b\n" "cmp %%edx, %%eax\n" "jnz 25f\n" // exit process "mov %%ebp, %%eax\n" @@ -865,27 +919,27 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "int $0x80\n" // Return result of system call to sandboxed thread - "14:movd %%mm5, %%ecx\n" + "15:movd %%mm5, %%ecx\n" "add $0x101C, %%ecx\n" // buf = &scratch + 28 "mov %%eax, (%%ecx)\n" "mov $4, %%edx\n" // len = 4 - "15:movd %%mm0, %%ebx\n" // fd = threadFd + "16:movd %%mm0, %%ebx\n" // fd = threadFd "mov $4, %%eax\n" // NR_write - "16:int $0x80\n" + "17:int $0x80\n" "cmp %%edx, %%eax\n" "jz 1b\n" "cmp $-4, %%eax\n" // EINTR - "jz 16b\n" + "jz 17b\n" "jmp 25f\n" // exit process // NR_exit: // Exit trusted thread after cleaning up resources - "17:mov %%edi, %%ecx\n" - "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub + "18:mov %%edi, %%ecx\n" + "mov 0x64(%%ecx), %%ebx\n" // fd = threadFdPub "mov $6, %%eax\n" // NR_close "int $0x80\n" "mov %%ecx, %%ebx\n" // start = secure_mem - "mov $8192, %%ecx\n" // length = 4096 + "mov $8192, %%ecx\n" // length = 8192 "xor %%edx, %%edx\n" // prot = PROT_NONE "mov $125, %%eax\n" // NR_mprotect "int $0x80\n" @@ -898,6 +952,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "int $0x80\n" "mov %%eax, %%ebx\n" "test %%eax, %%eax\n" + "js 25f\n" // exit process "jne 21f\n" // reap helper, exit thread "jmp 22f\n" // unlock mutex @@ -912,17 +967,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // terminates the program. But if we ever support signal handling, // we have to be careful that the user cannot install a SIGSEGV // handler that gets executed with elevated privileges. - "18:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem + "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem "movd %%mm4, %%edi\n" "movd %%mm7, %%ebp\n" "int $0x80\n" // calls NR_clone "cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values - "jae 6b\n" + "jae 7b\n" // unlock mutex, return result "movd %%mm2, %%edi\n" "add $2, %%edi\n" "movd %%edi, %%mm2\n" "test %%eax, %%eax\n" - "jne 14b\n" // return result + "jne 15b\n" // return result // In nascent thread, now. "sub $2, %%edi\n" @@ -930,7 +985,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "movd %%eax, %%mm3\n" // Request to return from clone() when done // Get thread id of nascent thread - "19:mov $224, %%eax\n" // NR_gettid + "20:mov $224, %%eax\n" // NR_gettid "int $0x80\n" "movd %%eax, %%mm4\n" @@ -958,14 +1013,6 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // the parent. For our purposes, it is sufficient to fail with a // fatal error. "jmp 25f\n" // exit process - "20:mov $120, %%eax\n" // NR_clone - "mov $17, %%ebx\n" // flags = SIGCHLD - "mov $1, %%ecx\n" // stack = 1 - "int $0x80\n" - "test %%eax, %%eax\n" - "js 25f\n" // exit process - "jz 22f\n" // unlock and exit - "mov %%eax, %%ebx\n" "21:xor %%ecx, %%ecx\n" "xor %%edx, %%edx\n" "mov $7, %%eax\n" // NR_waitpid @@ -989,7 +1036,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "24:int $0x80\n" "25:mov $4, %%eax\n" // NR_write "mov $2, %%ebx\n" // fd = stderr - "lea 100f, %%ecx\n" + "lea 100f, %%ecx\n" // "Sandbox violation detected" "mov $101f-100f, %%edx\n" // len = strlen(msg) "int $0x80\n" "mov $1, %%ebx\n" @@ -998,7 +1045,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, // The first page is mapped read-only for use as securely shared memory "27:movd %%mm6, %%ebp\n" - "mov 0x44(%%ebp), %%esi\n" + "mov 0x40(%%ebp), %%esi\n" "movd %%esi, %%mm5\n" // %mm5 = secure shared memory "movd %%mm2, %%edi\n" "cmp %%edi, 4(%%ebp)\n" @@ -1024,7 +1071,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $120, %%eax\n" // NR_clone "mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR "mov $1, %%ecx\n" // stack = 1 - "movd 0x48(%%ebp), %%mm1\n" // %mm1 = processFdPub + "movd 0x44(%%ebp), %%mm1\n" // %mm1 = processFdPub "cmp %%edi, 4(%%ebp)\n" "jne 25b\n" // exit process "int $0x80\n" @@ -1037,7 +1084,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "push %%eax\n" "mov $0xFFFFF, %%eax\n" // limit "push %%eax\n" - "add $0x58, %%esi\n" + "add $0x54, %%esi\n" "push %%esi\n" // base_addr = &secure_mem.TLS "mov %%fs, %%eax\n" "shr $3, %%eax\n" @@ -1080,8 +1127,6 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "push %%eax\n" "mov 0x3C(%%ebp), %%eax\n" "push %%eax\n" - "mov 0x40(%%ebp), %%eax\n" - "push %%eax\n" "cmp %%edi, 4(%%ebp)\n" "jne 25b\n" // exit process @@ -1111,7 +1156,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "movd %%mm0, %%eax\n" // fd1 = threadFd "push %%eax\n" "push %%esi\n" // fd0 = threadFdPub - "mov 0x4C(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub() + "mov 0x48(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub() "cmp %%edi, 4(%%ebp)\n" "jne 25b\n" // exit process "push %%eax\n" @@ -1139,7 +1184,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, "mov $7, %%eax\n" // NR_waitpid "int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 30\n" + "jz 30b\n" // Release privileges by entering seccomp mode. "mov $172, %%eax\n" // NR_prctl @@ -1190,7 +1235,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub, ".pushsection \".rodata\"\n" "100:.ascii \"Sandbox violation detected, program aborted\\n\"\n" - "101:\n" + "101:.ascii \"WARNING! This is an expensive system call\\n\"\n" + "102:\n" ".popsection\n" "999:pop %%ebp\n" |