summaryrefslogtreecommitdiffstats
path: root/sandbox/linux
diff options
context:
space:
mode:
Diffstat (limited to 'sandbox/linux')
-rw-r--r--sandbox/linux/seccomp/allocator.cc136
-rw-r--r--sandbox/linux/seccomp/allocator.h88
-rw-r--r--sandbox/linux/seccomp/clone.cc5
-rw-r--r--sandbox/linux/seccomp/library.cc153
-rw-r--r--sandbox/linux/seccomp/library.h34
-rw-r--r--sandbox/linux/seccomp/maps.cc18
-rw-r--r--sandbox/linux/seccomp/maps.h17
-rw-r--r--sandbox/linux/seccomp/sandbox.cc29
-rw-r--r--sandbox/linux/seccomp/sandbox_impl.h5
-rw-r--r--sandbox/linux/seccomp/securemem.h14
-rw-r--r--sandbox/linux/seccomp/syscall.cc90
-rw-r--r--sandbox/linux/seccomp/trusted_thread.cc270
12 files changed, 641 insertions, 218 deletions
diff --git a/sandbox/linux/seccomp/allocator.cc b/sandbox/linux/seccomp/allocator.cc
new file mode 100644
index 0000000..6e11a4a
--- /dev/null
+++ b/sandbox/linux/seccomp/allocator.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// The allocator is very simplistic. It requests memory pages directly from
+// the system. Each page starts with a header describing the allocation. This
+// makes sure that we can return the memory to the system when it is
+// deallocated.
+// For allocations that are smaller than a single page, we try to squeeze
+// multiple of them into the same page.
+// We expect to use this allocator for a moderate number of small allocations.
+// In most cases, it will only need to ever make a single request to the
+// operating system for the lifetime of the STL container object.
+// We don't worry about memory fragmentation as the allocator is expected to
+// be short-lived.
+
+#include <stdint.h>
+#include <sys/mman.h>
+
+#include "allocator.h"
+#include "linux_syscall_support.h"
+
+namespace playground {
+
+class SysCalls {  // Class-scoped copy of the linux_syscall_support.h wrappers, used as sys.<call>() below
+ public:
+ #define SYS_CPLUSPLUS
+ #define SYS_ERRNO my_errno  // presumably makes the wrappers store error codes in the member declared below
+ #define SYS_INLINE inline
+ #define SYS_PREFIX -1  // NOTE(review): appears to select unprefixed names (sys.munmap is called later); confirm
+ #undef SYS_LINUX_SYSCALL_SUPPORT_H  // presumably the header's include guard; cleared so the include below expands again
+ #include "linux_syscall_support.h"
+ SysCalls() : my_errno(0) { }
+ int my_errno;  // starts out as 0; named by the SYS_ERRNO define above
+};
+#ifdef __NR_mmap2
+ #define MMAP mmap2
+ #define __NR_MMAP __NR_mmap2
+#else
+ #define MMAP mmap
+ #define __NR_MMAP __NR_mmap
+#endif
+
+// We only ever keep track of the very last partial page that was used for
+// allocations. This approach simplifies the code a lot. It can theoretically
+// lead to more memory fragmentation, but for our use case that is unlikely
+// to happen.
+struct Header {
+ // The total size in bytes of this chunk of memory, including this header.
+ // Typically, this would be a single page.
+ size_t total_len;
+
+ // "used" keeps track of the number of bytes currently allocated in this
+ // chunk. As elements are freed, "used" is decremented, which lets us
+ // detect when the whole chunk is free. However, the holes left behind
+ // are never re-used, so "tail" is the only way to find out how much
+ // free space remains and when we need to request another chunk of memory
+ // from the system.
+ size_t used;
+ void *tail;  // First byte past the most recently handed-out allocation in this chunk
+};
+static Header* last_alloc;  // Chunk that may still have spare room for small requests; NULL when there is none
+
+// Returns a block of at least "size" bytes obtained directly from the
+// system, or NULL if the size computation would overflow or the mapping
+// fails. Requests are rounded up to a multiple of four bytes. Small
+// requests are carved out of the most recently mapped single-page chunk
+// while it still has room; everything else gets a freshly mapped chunk.
+void* SystemAllocatorHelper::sys_allocate(size_t size) {
+  // Number of bytes that need to be allocated. Refuse sizes that would
+  // wrap around when rounded up.
+  if (size + 3 < size) {
+    return NULL;
+  }
+  size_t len = (size + 3) & ~3;
+
+  if (last_alloc) {
+    // Remaining space in the last chunk of memory allocated from system
+    size_t remainder = last_alloc->total_len -
+        (reinterpret_cast<char *>(last_alloc->tail) -
+         reinterpret_cast<char *>(last_alloc));
+
+    if (remainder >= len) {
+      void* ret = last_alloc->tail;
+      last_alloc->tail = reinterpret_cast<char *>(last_alloc->tail) + len;
+      last_alloc->used += len;
+      return ret;
+    }
+  }
+
+  // Map a fresh chunk that holds the header plus the payload, rounded up to
+  // whole pages. Again, refuse sizes that would wrap around.
+  if (sizeof(Header) + len + 4095 < len) {
+    return NULL;
+  }
+  size_t total_len = (sizeof(Header) + len + 4095) & ~4095;
+  SysCalls sys;
+  Header* mem = reinterpret_cast<Header *>(
+      sys.MMAP(NULL, total_len, PROT_READ|PROT_WRITE,
+               MAP_PRIVATE|MAP_ANONYMOUS, -1, 0));
+  if (mem == MAP_FAILED) {
+    return NULL;
+  }
+
+  // Only if we were asked to allocate a single page do we use any remaining
+  // space for other small allocations. sys_deallocate() finds the header by
+  // truncating an allocation's address to a page boundary, so every
+  // allocation must start in the page that holds the header. Re-using the
+  // slack at the end of a multi-page chunk would hand out addresses in a
+  // later page and make that lookup land in the middle of live data.
+  if (total_len == 4096 && total_len - sizeof(Header) - len >= 4) {
+    last_alloc = mem;
+  }
+  mem->total_len = total_len;
+  mem->used = len;
+  char* ret = reinterpret_cast<char *>(mem) + sizeof(Header);
+  mem->tail = ret + len;
+
+  return ret;
+}
+
+void SystemAllocatorHelper::sys_deallocate(void* p, size_t size) {
+ // Number of bytes in this allocation, rounded the same way as in sys_allocate()
+ if (size + 3 < size) {
+ return;
+ }
+ size_t len = (size + 3) & ~3;
+
+ // The header is found by clearing the low 12 bits of the address, so this
+ // lookup is only correct for allocations whose starting address lies in
+ // the first page of their chunk. The header keeps track of how many bytes
+ // are currently used. NOTE(review): confirm sys_allocate() guarantees this.
+ Header* header = reinterpret_cast<Header *>(
+ reinterpret_cast<uintptr_t>(p) & ~4095);
+ header->used -= len;
+
+ // After the last allocation has been freed, return the page(s) to the
+ // system and forget the chunk if it was the one kept for small requests.
+ if (!header->used) {
+ SysCalls sys;
+ sys.munmap(header, header->total_len);
+ if (last_alloc == header) {
+ last_alloc = NULL;
+ }
+ }
+}
+
+} // namespace
diff --git a/sandbox/linux/seccomp/allocator.h b/sandbox/linux/seccomp/allocator.h
new file mode 100644
index 0000000..29e0065
--- /dev/null
+++ b/sandbox/linux/seccomp/allocator.h
@@ -0,0 +1,88 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Implements a very basic memory allocator that makes direct system calls
+// instead of relying on libc.
+// This allocator is not thread-safe.
+
+#ifndef ALLOCATOR_H__
+#define ALLOCATOR_H__
+
+#include <cstddef>
+
+namespace playground {
+
+class SystemAllocatorHelper {
+ protected:
+ static void *sys_allocate(size_t size);
+ static void sys_deallocate(void* p, size_t size);
+};
+
+template <class T>
+class SystemAllocator : SystemAllocatorHelper {  // STL-style allocator that forwards to sys_allocate()/sys_deallocate()
+ public:
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ template <class U>
+ struct rebind {  // names the same allocator template for a different element type
+ typedef SystemAllocator<U> other;
+ };
+
+ pointer address(reference value) const {
+ return &value;
+ }
+
+ const_pointer address(const_reference value) const {
+ return &value;
+ }
+
+ SystemAllocator() throw() { }
+ SystemAllocator(const SystemAllocator& src) throw() { }  // no data members, so there is nothing to copy
+ template <class U> SystemAllocator(const SystemAllocator<U>& src) throw() { }
+ ~SystemAllocator() throw() { }
+
+ size_type max_size() const throw() {
+ return (1 << 30) / sizeof(T);  // caps a single request at 2^30 bytes worth of elements
+ }
+
+ pointer allocate(size_type num, const void* = 0) {
+ if (num > max_size()) {
+ return NULL;  // oversized requests yield NULL rather than an exception
+ }
+ return (pointer)sys_allocate(num * sizeof(T));  // the product stays within 2^30 because of the check above
+ }
+
+ void construct(pointer p, const T& value) {
+ new(reinterpret_cast<void *>(p))T(value);  // placement new: copy-constructs into storage at p
+ }
+
+ void destroy(pointer p) {
+ p->~T();  // runs the destructor only; the storage is released by deallocate()
+ }
+
+ void deallocate(pointer p, size_type num) {
+ sys_deallocate(p, num * sizeof(T));  // NOTE(review): num presumably has to equal the count given to allocate()
+ }
+};
+
+template <class T1, class T2>
+bool operator== (const SystemAllocator<T1>&, const SystemAllocator<T2>&)
+ throw() {
+ return true;
+}
+template <class T1, class T2>
+bool operator!= (const SystemAllocator<T1>&, const SystemAllocator<T2>&)
+ throw() {
+ return false;
+}
+
+} // namespace
+
+#endif // ALLOCATOR_H__
diff --git a/sandbox/linux/seccomp/clone.cc b/sandbox/linux/seccomp/clone.cc
index 2b6703f..28a3584 100644
--- a/sandbox/linux/seccomp/clone.cc
+++ b/sandbox/linux/seccomp/clone.cc
@@ -1,3 +1,7 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#include "debug.h"
#include "sandbox_impl.h"
@@ -84,7 +88,6 @@ bool Sandbox::process_clone(int parentMapsFd, int sandboxFd, int threadFdPub,
mem->r14 = clone_req.regs64.r14;
mem->r15 = clone_req.regs64.r15;
#elif defined(__i386__)
- mem->ret2 = clone_req.regs32.ret2;
mem->ebp = clone_req.regs32.ebp;
mem->edi = clone_req.regs32.edi;
mem->esi = clone_req.regs32.esi;
diff --git a/sandbox/linux/seccomp/library.cc b/sandbox/linux/seccomp/library.cc
index cf7477b..1b06bc1 100644
--- a/sandbox/linux/seccomp/library.cc
+++ b/sandbox/linux/seccomp/library.cc
@@ -1,3 +1,7 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#define XOPEN_SOURCE 500
#include <algorithm>
#include <elf.h>
@@ -16,6 +20,7 @@
#include <sys/stat.h>
#include <sys/types.h>
+#include "allocator.h"
#include "debug.h"
#include "library.h"
#include "sandbox_impl.h"
@@ -84,7 +89,11 @@ Library::~Library() {
// found. Make sure to preserve any changes that we might have made since.
Sandbox::SysCalls sys;
sys.mprotect(image_, 4096, PROT_READ | PROT_WRITE);
- memcpy(image_, memory_ranges_.rbegin()->second.start, 4096);
+ if (memcmp(image_, memory_ranges_.rbegin()->second.start, 4096)) {
+ // Only copy data, if we made any changes in this data. Otherwise there
+ // is no need to create another modified COW mapping.
+ memcpy(image_, memory_ranges_.rbegin()->second.start, 4096);
+ }
sys.mprotect(image_, 4096, PROT_READ | PROT_EXEC);
sys.mremap(image_, image_size_, 4096, MREMAP_MAYMOVE | MREMAP_FIXED,
memory_ranges_.rbegin()->second.start);
@@ -173,7 +182,7 @@ char *Library::get(Elf_Addr offset, char *buf, size_t len) {
return buf;
}
-std::string Library::get(Elf_Addr offset) {
+Library::string Library::get(Elf_Addr offset) {
if (!valid_) {
return "";
}
@@ -192,7 +201,7 @@ std::string Library::get(Elf_Addr offset) {
while (*stop) {
++stop;
}
- std::string s = stop > start ? std::string(start, stop - start) : "";
+ string s = stop > start ? string(start, stop - start) : "";
return s;
}
@@ -215,8 +224,21 @@ char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) {
image_size_ = memory_ranges_.begin()->first +
(reinterpret_cast<char *>(memory_ranges_.begin()->second.stop) -
reinterpret_cast<char *>(memory_ranges_.begin()->second.start));
+ if (image_size_ < 8192) {
+ // It is possible to create a library that is only a single page in
+ // size. In that case, we have to make sure that we artificially map
+ // one extra page past the end of it, as our code relies on mremap()
+ // actually moving the mapping.
+ image_size_ = 8192;
+ }
image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_,
MREMAP_MAYMOVE));
+ if (image_size_ == 8192 && image_ == start) {
+ // We really mean it, when we say we want the memory to be moved.
+ image_ = reinterpret_cast<char *>(sys.mremap(start, 4096, image_size_,
+ MREMAP_MAYMOVE));
+ sys.munmap(reinterpret_cast<char *>(start) + 4096, 4096);
+ }
if (image_ == MAP_FAILED) {
image_ = NULL;
} else {
@@ -250,7 +272,7 @@ char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) {
return buf ? get(offset, buf, len) : NULL;
}
-std::string Library::getOriginal(Elf_Addr offset) {
+Library::string Library::getOriginal(Elf_Addr offset) {
if (!valid_) {
return "";
}
@@ -271,7 +293,7 @@ std::string Library::getOriginal(Elf_Addr offset) {
getOriginal(stop - image_, NULL, 1);
}
}
- return std::string(start, stop - start);
+ return string(start, stop - start);
}
return "";
}
@@ -285,7 +307,7 @@ const Elf_Ehdr* Library::getEhdr() {
return &ehdr_;
}
-const Elf_Shdr* Library::getSection(const std::string& section) {
+const Elf_Shdr* Library::getSection(const string& section) {
if (!valid_) {
return NULL;
}
@@ -296,7 +318,7 @@ const Elf_Shdr* Library::getSection(const std::string& section) {
return &iter->second.second;
}
-const int Library::getSectionIndex(const std::string& section) {
+const int Library::getSectionIndex(const string& section) {
if (!valid_) {
return -1;
}
@@ -307,22 +329,6 @@ const int Library::getSectionIndex(const std::string& section) {
return iter->second.first;
}
-void **Library::getRelocation(const std::string& symbol) {
- PltTable::const_iterator iter = plt_entries_.find(symbol);
- if (iter == plt_entries_.end()) {
- return NULL;
- }
- return reinterpret_cast<void **>(asr_offset_ + iter->second);
-}
-
-void *Library::getSymbol(const std::string& symbol) {
- SymbolTable::const_iterator iter = symbols_.find(symbol);
- if (iter == symbols_.end() || !iter->second.st_value) {
- return NULL;
- }
- return asr_offset_ + iter->second.st_value;
-}
-
void Library::makeWritable(bool state) const {
for (RangeMap::const_iterator iter = memory_ranges_.begin();
iter != memory_ranges_.end(); ++iter) {
@@ -380,7 +386,7 @@ char* Library::getScratchSpace(const Maps* maps, char* near, int needed,
void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
char *end, char** extraSpace,
int* extraLength) {
- std::set<char *> branch_targets;
+ std::set<char *, std::less<char *>, SystemAllocator<char *> > branch_targets;
for (char *ptr = start; ptr < end; ) {
unsigned short insn = next_inst((const char **)&ptr, __WORDSIZE == 64);
char *target;
@@ -516,12 +522,21 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
}
}
// We now know, how many instructions neighboring the system call we
- // can safely overwrite. We need five bytes to insert a JMP/CALL and a
- // 32bit address. We then jump to a code fragment that safely forwards
- // to our system call wrapper. On x86-64, this is complicated by
- // the fact that the API allows up to 128 bytes of red-zones below the
- // current stack pointer. So, we cannot write to the stack until we
- // have adjusted the stack pointer.
+ // can safely overwrite. On x86-32 we need six bytes to insert a
+ // PUSH/RET pair, and on x86-64 we need five bytes to insert a JMPQ
+ // and a 32bit address. We then jump to a code fragment that safely
+ // forwards to our system call wrapper.
+ // On x86-64, this is complicated by the fact that the API allows up
+ // to 128 bytes of red-zones below the current stack pointer. So, we
+ // cannot write to the stack until we have adjusted the stack
+ // pointer.
+ // On both x86-32 and x86-64 we take care to leave the stack unchanged
+ // while we are executing the preamble and postamble. This allows us
+ // to treat instructions that reference %esp/%rsp as safe for
+ // relocation.
+ // In particular, this means that on x86-32 we cannot use CALL, but
+ // have to use a PUSH/RET combination to change the instruction pointer.
+ // On x86-64, we can instead use a 32bit JMPQ.
//
// .. .. .. .. ; any leading instructions copied from original code
// 48 81 EC 80 00 00 00 SUB $0x80, %rsp
@@ -549,9 +564,10 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
// 68 .. .. .. .. PUSH $syscallWrapper
// C3 RET
// .. .. .. .. ; any trailing instructions copied from original code
+ // 68 .. .. .. .. PUSH return_addr
// C3 RET
//
- // Total: 12 bytes + any bytes that were copied
+ // Total: 17 bytes + any bytes that were copied
//
// For indirect jumps from the VDSO to the VSyscall page, we instead
// replace the following code (this is only necessary on x86-64). This
@@ -575,7 +591,7 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
//
// Total: 52 bytes + any bytes that were copied
- if (length < 5) {
+ if (length < (__WORDSIZE == 32 ? 6 : 5)) {
// There are a very small number of instruction sequences that we
// cannot easily intercept, and that have been observed in real world
// examples. Handle them here:
@@ -648,7 +664,7 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
Sandbox::die("Cannot intercept system call");
}
}
- int needed = 5 - code[codeIdx].len;
+ int needed = (__WORDSIZE == 32 ? 6 : 5) - code[codeIdx].len;
int first = codeIdx;
while (needed > 0 && first != startIdx) {
first = (first + (sizeof(code) / sizeof(struct Code)) - 1) %
@@ -673,7 +689,7 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
needed = 52 + preamble + postamble;
}
#elif defined(__i386__)
- needed = 12 + preamble + postamble;
+ needed = 17 + preamble + postamble;
#else
#error Unsupported target platform
#endif
@@ -752,7 +768,10 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
reinterpret_cast<void *>(&syscallWrapper);
}
#elif defined(__i386__)
- *(dest + preamble + 11 + postamble) = '\xC3';
+ *(dest + preamble + 11 + postamble) = '\x68'; // PUSH
+ *reinterpret_cast<char **>(dest + preamble + 12 + postamble) =
+ code[second].addr + code[second].len;
+ *(dest + preamble + 16 + postamble) = '\xC3'; // RET
*reinterpret_cast<char **>(dest + preamble + 1) =
dest + preamble + 11;
*reinterpret_cast<void (**)()>(dest + preamble + 6) = syscallWrapper;
@@ -766,14 +785,16 @@ void Library::patchSystemCallsInFunction(const Maps* maps, char *start,
// Replace the system call with an unconditional jump to our new code.
#if defined(__x86_64__)
- *code[first].addr = '\xE9'; // JMPQ
+ *code[first].addr = '\xE9'; // JMPQ
+ *reinterpret_cast<int *>(code[first].addr + 1) =
+ dest - (code[first].addr + 5);
#elif defined(__i386__)
- *code[first].addr = '\xE8'; // CALL
+ code[first].addr[0] = '\x68'; // PUSH
+ *reinterpret_cast<char **>(code[first].addr + 1) = dest;
+ code[first].addr[5] = '\xC3'; // RET
#else
#error Unsupported target platform
#endif
- *reinterpret_cast<int *>(code[first].addr + 1) =
- dest - (code[first].addr + 5);
}
replaced:
codeIdx = (codeIdx + 1) % (sizeof(code) / sizeof(struct Code));
@@ -1049,27 +1070,11 @@ bool Library::parseElf() {
&str_shdr)) {
// Not all memory mappings are necessarily ELF files. Skip memory
// mappings that we cannot identify.
+ error:
valid_ = false;
return false;
}
- // Find PT_DYNAMIC segment. This is what our PLT entries and symbols will
- // point to. This information is probably incorrect in the child, as it
- // requires access to the original memory mappings.
- for (int i = 0; i < ehdr_.e_phnum; i++) {
- Elf_Phdr phdr;
- if (getOriginal(ehdr_.e_phoff + i*ehdr_.e_phentsize, &phdr) &&
- phdr.p_type == PT_DYNAMIC) {
- RangeMap::const_iterator iter =
- memory_ranges_.lower_bound(phdr.p_offset);
- if (iter != memory_ranges_.end()) {
- asr_offset_ = reinterpret_cast<char *>(iter->second.start) -
- (phdr.p_vaddr - (phdr.p_offset - iter->first));
- }
- break;
- }
- }
-
// Parse section table and find all sections in this ELF file
for (int i = 0; i < ehdr_.e_shnum; i++) {
Elf_Shdr shdr;
@@ -1081,6 +1086,38 @@ bool Library::parseElf() {
std::make_pair(i, shdr)));
}
+ // Compute the offset of entries in the .text segment
+ const Elf_Shdr* text = getSection(".text");
+ if (text == NULL) {
+ // On x86-32, the VDSO is unusual in as much as it does not have a single
+ // ".text" section. Instead, it has one section per function. Each
+ // section name starts with ".text". We just need to pick an arbitrary
+ // one in order to find the asr_offset_ -- which would typically be zero
+ // for the VDSO.
+ for (SectionTable::const_iterator iter = section_table_.begin();
+ iter != section_table_.end(); ++iter) {
+ if (!strncmp(iter->first.c_str(), ".text", 5)) {
+ text = &iter->second.second;
+ break;
+ }
+ }
+ }
+
+ // Now that we know where the .text segment is located, we can compute the
+ // asr_offset_.
+ if (text) {
+ RangeMap::const_iterator iter =
+ memory_ranges_.lower_bound(text->sh_offset);
+ if (iter != memory_ranges_.end()) {
+ asr_offset_ = reinterpret_cast<char *>(iter->second.start) -
+ (text->sh_addr - (text->sh_offset - iter->first));
+ } else {
+ goto error;
+ }
+ } else {
+ goto error;
+ }
+
return !isVDSO_ || parseSymbols();
}
@@ -1128,7 +1165,7 @@ bool Library::parseSymbols() {
valid_ = false;
return false;
}
- std::string name = getOriginal(strtab.sh_offset + sym.st_name);
+ string name = getOriginal(strtab.sh_offset + sym.st_name);
if (name.empty()) {
continue;
}
@@ -1147,7 +1184,7 @@ bool Library::parseSymbols() {
valid_ = false;
return false;
}
- std::string name = getOriginal(strtab.sh_offset + sym.st_name);
+ string name = getOriginal(strtab.sh_offset + sym.st_name);
if (name.empty()) {
continue;
}
diff --git a/sandbox/linux/seccomp/library.h b/sandbox/linux/seccomp/library.h
index 523652c..29a755e 100644
--- a/sandbox/linux/seccomp/library.h
+++ b/sandbox/linux/seccomp/library.h
@@ -1,3 +1,7 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#ifndef LIBRARY_H__
#define LIBRARY_H__
@@ -30,6 +34,8 @@ namespace playground {
class Library {
friend class Maps;
public:
+ typedef Maps::string string;
+
Library() :
valid_(false),
isVDSO_(false),
@@ -50,14 +56,24 @@ class Library {
+  // Records the mapping [start, stop) of this library found at file offset
+  // "offset". If a mapping is already recorded for the same offset, a
+  // non-executable newcomer is ignored, while an executable one replaces the
+  // existing entry.
void addMemoryRange(void* start, void* stop, Elf_Addr offset,
int prot, int isVDSO) {
- memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot)));
isVDSO_ = isVDSO;
+    RangeMap::iterator iter = memory_ranges_.find(offset);
+    if (iter != memory_ranges_.end()) {
+      // It is possible to have overlapping mappings. This is particularly
+      // likely to happen with very small programs or libraries. If it does
+      // happen, we really only care about the text segment. Look for a
+      // mapping that is mapped executable.
+      if ((prot & PROT_EXEC) == 0) {
+        return;
+      }
+      // std::map::insert() leaves an existing key untouched, so the old
+      // entry has to be removed first. Otherwise, the executable mapping
+      // that we just decided to prefer would be silently dropped.
+      memory_ranges_.erase(iter);
+    }
+    memory_ranges_.insert(std::make_pair(offset, Range(start, stop, prot)));
}
char *get(Elf_Addr offset, char *buf, size_t len);
- std::string get(Elf_Addr offset);
+ string get(Elf_Addr offset);
char *getOriginal(Elf_Addr offset, char *buf, size_t len);
- std::string getOriginal(Elf_Addr offset);
+ string getOriginal(Elf_Addr offset);
template<class T>T* get(Elf_Addr offset, T* t) {
if (!valid_) {
@@ -108,10 +124,8 @@ class Library {
bool parseElf();
const Elf_Ehdr* getEhdr();
- const Elf_Shdr* getSection(const std::string& section);
- const int getSectionIndex(const std::string& section);
- void **getRelocation(const std::string& symbol);
- void *getSymbol(const std::string& symbol);
+ const Elf_Shdr* getSection(const string& section);
+ const int getSectionIndex(const string& section);
void makeWritable(bool state) const;
void patchSystemCalls();
bool isVDSO() const { return isVDSO_; }
@@ -136,9 +150,9 @@ class Library {
};
typedef std::map<Elf_Addr, Range, GreaterThan> RangeMap;
- typedef std::map<std::string, std::pair<int, Elf_Shdr> > SectionTable;
- typedef std::map<std::string, Elf_Sym> SymbolTable;
- typedef std::map<std::string, Elf_Addr> PltTable;
+ typedef std::map<string, std::pair<int, Elf_Shdr> > SectionTable;
+ typedef std::map<string, Elf_Sym> SymbolTable;
+ typedef std::map<string, Elf_Addr> PltTable;
char* getBytes(char* dst, const char* src, ssize_t len);
static bool isSafeInsn(unsigned short insn);
diff --git a/sandbox/linux/seccomp/maps.cc b/sandbox/linux/seccomp/maps.cc
index cb303e7..d18405a 100644
--- a/sandbox/linux/seccomp/maps.cc
+++ b/sandbox/linux/seccomp/maps.cc
@@ -1,3 +1,7 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#include <errno.h>
#include <fcntl.h>
#include <iostream>
@@ -42,18 +46,18 @@ Maps::Maps(int proc_self_maps) :
while (*ptr == ' ' || *ptr == '\t') ++ptr;
char *perm_ptr = ptr;
while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr;
- std::string perm(perm_ptr, ptr - perm_ptr);
+ string perm(perm_ptr, ptr - perm_ptr);
unsigned long offset = strtoul(ptr, &ptr, 16);
while (*ptr == ' ' || *ptr == '\t') ++ptr;
char *id_ptr = ptr;
while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr;
while (*ptr == ' ' || *ptr == '\t') ++ptr;
while (*ptr && *ptr != ' ' && *ptr != '\t') ++ptr;
- std::string id(id_ptr, ptr - id_ptr);
+ string id(id_ptr, ptr - id_ptr);
while (*ptr == ' ' || *ptr == '\t') ++ptr;
char *library_ptr = ptr;
while (*ptr && *ptr != ' ' && *ptr != '\t' && *ptr != '\n') ++ptr;
- std::string library(library_ptr, ptr - library_ptr);
+ string library(library_ptr, ptr - library_ptr);
bool isVDSO = false;
if (library == "[vdso]") {
// /proc/self/maps has a misleading file offset in the [vdso] entry.
@@ -66,13 +70,13 @@ Maps::Maps(int proc_self_maps) :
goto skip_entry;
}
int prot = 0;
- if (perm.find('r') != std::string::npos) {
+ if (perm.find('r') != string::npos) {
prot |= PROT_READ;
}
- if (perm.find('w') != std::string::npos) {
+ if (perm.find('w') != string::npos) {
prot |= PROT_WRITE;
}
- if (perm.find('x') != std::string::npos) {
+ if (perm.find('x') != string::npos) {
prot |= PROT_EXEC;
}
if ((prot & (PROT_EXEC | PROT_READ)) == 0) {
@@ -146,7 +150,7 @@ bool Maps::Iterator::operator!=(const Maps::Iterator& iter) const {
return !operator==(iter);
}
-std::string Maps::Iterator::name() const {
+Maps::string Maps::Iterator::name() const {
return getIterator()->first;
}
diff --git a/sandbox/linux/seccomp/maps.h b/sandbox/linux/seccomp/maps.h
index 1d30506..5f51782 100644
--- a/sandbox/linux/seccomp/maps.h
+++ b/sandbox/linux/seccomp/maps.h
@@ -1,9 +1,16 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#ifndef MAPS_H__
#define MAPS_H__
#include <elf.h>
+#include <functional>
+#include <map>
#include <string>
-#include <vector>
+
+#include "allocator.h"
#if defined(__x86_64__)
typedef Elf64_Addr Elf_Addr;
@@ -19,6 +26,9 @@ class Library;
class Maps {
friend class Library;
public:
+ typedef std::basic_string<char, std::char_traits<char>,
+ SystemAllocator<char> > string;
+
Maps(int proc_self_maps);
~Maps() { }
@@ -26,7 +36,8 @@ class Maps {
// A map with all the libraries currently loaded into the application.
// The key is a unique combination of device number, inode number, and
// file name. It should be treated as opaque.
- typedef std::map<std::string, Library> LibraryMap;
+ typedef std::map<string, Library, std::less<string>,
+ SystemAllocator<string> > LibraryMap;
friend class Iterator;
class Iterator {
friend class Maps;
@@ -44,7 +55,7 @@ class Maps {
Library* operator*() const;
bool operator==(const Iterator& iter) const;
bool operator!=(const Iterator& iter) const;
- std::string name() const;
+ string name() const;
protected:
mutable LibraryMap::iterator iter_;
diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc
index ff2b59e..12f0c0f 100644
--- a/sandbox/linux/seccomp/sandbox.cc
+++ b/sandbox/linux/seccomp/sandbox.cc
@@ -1,3 +1,7 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#include "library.h"
#include "sandbox_impl.h"
#include "syscall_table.h"
@@ -372,9 +376,10 @@ int Sandbox::supportsSeccompSandbox(int proc_fd) {
case 0: {
int devnull = sys.open("/dev/null", O_RDWR, 0);
if (devnull >= 0) {
- dup2(devnull, 0);
- dup2(devnull, 1);
- dup2(devnull, 2);
+ sys.dup2(devnull, 0);
+ sys.dup2(devnull, 1);
+ sys.dup2(devnull, 2);
+ sys.close(devnull);
}
if (proc_fd >= 0) {
setProcSelfMaps(sys.openat(proc_fd, "self/maps", O_RDONLY, 0));
@@ -423,7 +428,7 @@ void Sandbox::startSandbox() {
SysCalls sys;
if (proc_self_maps_ < 0) {
- proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0);
+ proc_self_maps_ = sys.open("/proc/self/maps", O_RDONLY, 0);
if (proc_self_maps_ < 0) {
die("Cannot access \"/proc/self/maps\"");
}
@@ -431,21 +436,21 @@ void Sandbox::startSandbox() {
// The pid is unchanged for the entire program, so we can retrieve it once
// and store it in a global variable.
- pid_ = sys.getpid();
+ pid_ = sys.getpid();
// Block all signals, except for the RDTSC handler
setupSignalHandlers();
// Get socketpairs for talking to the trusted process
int pair[4];
- if (socketpair(AF_UNIX, SOCK_STREAM, 0, pair) ||
- socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) {
+ if (sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair) ||
+ sys.socketpair(AF_UNIX, SOCK_STREAM, 0, pair+2)) {
die("Failed to create trusted thread");
}
- processFdPub_ = pair[0];
- cloneFdPub_ = pair[2];
- SecureMemArgs::Args* secureMem = createTrustedProcess(pair[0], pair[1],
- pair[2], pair[3]);
+ processFdPub_ = pair[0];
+ cloneFdPub_ = pair[2];
+ SecureMemArgs* secureMem = createTrustedProcess(pair[0], pair[1],
+ pair[2], pair[3]);
// We find all libraries that have system calls and redirect the system
// calls to the sandbox. If we miss any system calls, the application will be
@@ -454,7 +459,7 @@ void Sandbox::startSandbox() {
// correctly.
{
Maps maps(proc_self_maps_);
- const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL };
+ const char *libs[] = { "ld", "libc", "librt", "libpthread", NULL };
// Intercept system calls in the VDSO segment (if any). This has to happen
// before intercepting system calls in any of the other libraries, as
diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h
index 79621d6..0a98283 100644
--- a/sandbox/linux/seccomp/sandbox_impl.h
+++ b/sandbox/linux/seccomp/sandbox_impl.h
@@ -1,3 +1,7 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#ifndef SANDBOX_IMPL_H__
#define SANDBOX_IMPL_H__
@@ -360,7 +364,6 @@ class Sandbox {
void* edx;
void* ecx;
void* ebx;
- void* ret2;
} regs32 __attribute__((packed));
#else
#error Unsupported target platform
diff --git a/sandbox/linux/seccomp/securemem.h b/sandbox/linux/seccomp/securemem.h
index 4c208ce..f9a5c97 100644
--- a/sandbox/linux/seccomp/securemem.h
+++ b/sandbox/linux/seccomp/securemem.h
@@ -1,3 +1,7 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#ifndef SECURE_MEM_H__
#define SECURE_MEM_H__
@@ -50,7 +54,6 @@ class SecureMem {
void* r14;
void* r15;
#elif defined(__i386__)
- void* ret2;
void* ebp;
void* edi;
void* esi;
@@ -86,9 +89,9 @@ class SecureMem {
char securePage[4096];
};
union {
- // This scratch space is used by the trusted thread to read parameters
- // for unrestricted system calls.
struct {
+ // This scratch space is used by the trusted thread to read parameters
+ // for unrestricted system calls.
long tmpSyscallNum;
void* tmpArg1;
void* tmpArg2;
@@ -97,6 +100,11 @@ class SecureMem {
void* tmpArg5;
void* tmpArg6;
void* tmpReturnValue;
+
+ // We often have long sequences of calls to gettimeofday(). This is
+ // needlessly expensive. Coalesce them into a single call.
+ long lastSyscallNum;
+ int gettimeofdayCounter;
} __attribute__((packed));
char scratchPage[4096];
};
diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc
index e1e2547..d3dc7aa 100644
--- a/sandbox/linux/seccomp/syscall.cc
+++ b/sandbox/linux/seccomp/syscall.cc
@@ -1,3 +1,7 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#include "debug.h"
#include "sandbox_impl.h"
#include "syscall_table.h"
@@ -146,14 +150,76 @@ asm(
// Check range of system call
"cmp playground$maxSyscall, %eax\n"
- "ja 1f\n"
+ "ja 5f\n"
+
+ // We often have long sequences of calls to gettimeofday(). This is
+ // needlessly expensive. Coalesce them into a single call.
+ //
+ // We keep track of state in TLS storage that we can access through
+ // the %fs segment register. See trusted_thread.cc for the exact
+ // memory layout.
+ //
+ // TODO(markus): Maybe we should proactively call gettimeofday() and
+ // clock_gettime() whenever we talk to the trusted thread?
+ // Or maybe only if we have recently seen requests to
+ // compute the time; there might be a repeated pattern.
+ "cmp $78, %eax\n" // __NR_gettimeofday
+ "jnz 2f\n"
+ "cmp %eax, %fs:0x102C-0x54\n" // last system call
+ "jnz 0f\n"
+
+ // This system call and the last system call prior to this one both are
+ // calls to gettimeofday(). Try to avoid making the new call and just
+ // return the same result as in the previous call.
+ // Just in case the caller is spinning on the result from gettimeofday(),
+ // every so often, call the actual system call.
+ "decl %fs:0x1030-0x54\n" // countdown calls to gettimeofday()
+ "jz 0f\n"
+
+ // Atomically read the 64bit word representing last-known timestamp and
+ // return it to the caller. On x86-32 this is a little more complicated and
+ // requires the use of the cmpxchg8b instruction.
+ "mov %ebx, %eax\n"
+ "mov %ecx, %edx\n"
+ "lock; cmpxchg8b 100f\n"
+ "mov %eax, 0(%ebx)\n"
+ "mov %edx, 4(%ebx)\n"
+ "xor %eax, %eax\n"
+ "add $28, %esp\n"
+ "jmp 4f\n"
+
+ // This is a call to gettimeofday(), but we don't have a valid cached
+ // result yet.
+ "0:mov %eax, %fs:0x102C-0x54\n" // remember syscall number
+ "movl $500, %fs:0x1030-0x54\n" // make the real system call every 500 invocations
+ "call playground$defaultSystemCallHandler\n"
+
+ // Returned from gettimeofday(). Remember return value, in case the
+ // application calls us again right away.
+ // Again, this has to happen atomically and requires cmpxchg8b.
+ "mov 4(%ebx), %ecx\n"
+ "mov 0(%ebx), %ebx\n"
+ "mov 100f, %eax\n"
+ "mov 101f, %edx\n"
+ "1:lock; cmpxchg8b 100f\n"
+ "jnz 1b\n"
+ "xor %eax, %eax\n"
+ "jmp 6f\n"
+
+ // Remember the number of the last system call made. We deliberately do
+ // not remember calls to gettid(), as we have often seen long sequences
+ // of calls to just gettimeofday() and gettid(). In that situation, we
+ // would still like to coalesce the gettimeofday() calls.
+ "2:cmp $224, %eax\n" // __NR_gettid
+ "jz 3f\n"
+ "mov %eax, %fs:0x102C-0x54\n" // remember syscall number
// Retrieve function call from system call table (c.f. syscall_table.c).
// We have three different types of entries; zero for denied system calls,
// that should be handled by the defaultSystemCallHandler(); minus one
// for unrestricted system calls that need to be forwarded to the trusted
// thread; and function pointers to specific handler functions.
- "shl $3, %eax\n"
+ "3:shl $3, %eax\n"
"lea playground$syscallTable, %ebx\n"
"add %ebx, %eax\n"
"mov 0(%eax), %eax\n"
@@ -161,14 +227,13 @@ asm(
// Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
// jump to fallback handler.
"cmp $1, %eax\n"
- "jbe 1f\n"
+ "jbe 5f\n"
"add $4, %esp\n"
"call *%eax\n"
"add $24, %esp\n"
- "0:"
// Restore CPU registers, except for %eax which was set by the system call.
- "pop %ebp\n"
+ "4:pop %ebp\n"
"pop %edi\n"
"pop %esi\n"
"pop %edx\n"
@@ -178,13 +243,16 @@ asm(
// Return to caller
"ret\n"
- "1:"
// Call default handler.
- "push $2f\n"
- "push $playground$defaultSystemCallHandler\n"
- "ret\n"
- "2:add $28, %esp\n"
- "jmp 0b\n"
+ "5:call playground$defaultSystemCallHandler\n"
+ "6:add $28, %esp\n"
+ "jmp 4b\n"
+
+ ".pushsection \".bss\"\n"
+ ".balign 8\n"
+"100:.byte 0, 0, 0, 0\n"
+"101:.byte 0, 0, 0, 0\n"
+ ".popsection\n"
#else
#error Unsupported target platform
diff --git a/sandbox/linux/seccomp/trusted_thread.cc b/sandbox/linux/seccomp/trusted_thread.cc
index 6edc05d..af2e913 100644
--- a/sandbox/linux/seccomp/trusted_thread.cc
+++ b/sandbox/linux/seccomp/trusted_thread.cc
@@ -1,3 +1,7 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
#include "sandbox_impl.h"
#include "syscall_table.h"
@@ -17,7 +21,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov %0, %%rbp\n" // %rbp = args
"xor %%rbx, %%rbx\n" // initial sequence number
"lea 999f(%%rip), %%r15\n" // continue in same thread
- "jmp 19f\n" // create trusted thread
+ "jmp 20f\n" // create trusted thread
// TODO(markus): Coalesce the read() operations by reading into a bigger
// buffer.
@@ -140,7 +144,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"test %%rax, %%rax\n"
"js 25f\n" // exit process
"mov %%rax, %%rdi\n"
- "jnz 7f\n" // wait for child, then return result
+ "jnz 8f\n" // wait for child, then return result
"mov %%fs:0x0, %%rdi\n" // start = secure_mem
"mov $4096, %%esi\n" // len = 4096
"mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
@@ -148,17 +152,43 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"syscall\n"
"mov %%r8d, 0xD4(%%rdi)\n" // set most recently returned SysV shm id
"xor %%rdi, %%rdi\n"
+
+ // When debugging messages are enabled, warn about expensive system calls
+ #ifndef NDEBUG
+ "cmpw $0, %%fs:0xD0\n" // debug mode
+ "jz 26f\n"
+ "mov $1, %%eax\n" // NR_write
+ "mov $2, %%edi\n" // fd = stderr
+ "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call"
+ "mov $102f-101f, %%edx\n" // len = strlen(msg)
+ "syscall\n"
+ "xor %%rdi, %%rdi\n"
+ #endif
+
"jmp 26f\n" // exit program, no message
"4:syscall\n"
- "jmp 14f\n" // return result
+ "jmp 15f\n" // return result
// If syscall number is -2, execute locked system call from the
// secure memory area
- "5:jg 11f\n"
+ "5:jg 12f\n"
"cmp $-2, %%eax\n"
- "jnz 8f\n"
+ "jnz 9f\n"
"cmp %%rbx, %%fs:0x8\n"
"jne 25f\n" // exit process
+
+ // When debugging messages are enabled, warn about expensive system calls
+ #ifndef NDEBUG
+ "cmpw $0, %%fs:0xD0\n" // debug mode
+ "jz 6f\n"
+ "mov $1, %%eax\n" // NR_write
+ "mov $2, %%edi\n" // fd = stderr
+ "lea 101f(%%rip), %%rsi\n" // "This is an expensive system call"
+ "mov $102f-101f, %%edx\n" // len = strlen(msg)
+ "syscall\n"
+ "6:"
+ #endif
+
"mov %%fs:0x10, %%rax\n"
"mov %%fs:0x18, %%rdi\n"
"mov %%fs:0x20, %%rsi\n"
@@ -171,17 +201,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// clone() has unusual calling conventions and must be handled specially
"cmp $56, %%rax\n" // NR_clone
- "jz 18f\n"
+ "jz 19f\n"
// exit() terminates trusted thread
"cmp $60, %%eax\n" // NR_exit
- "jz 17f\n"
+ "jz 18f\n"
// Perform requested system call
"syscall\n"
// Unlock mutex
- "6:cmp %%rbx, %%fs:0x8\n"
+ "7:cmp %%rbx, %%fs:0x8\n"
"jne 25f\n" // exit process
"add $2, %%rbx\n"
"mov %%rax, %%r8\n"
@@ -193,37 +223,37 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"js 25f\n" // exit process
"jz 22f\n" // unlock and exit
"mov %%rax, %%rdi\n"
- "7:xor %%rsi, %%rsi\n"
+ "8:xor %%rsi, %%rsi\n"
"xor %%rdx, %%rdx\n"
"xor %%r10, %%r10\n"
"mov $61, %%eax\n" // NR_wait4
"syscall\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 7b\n"
+ "jz 8b\n"
"mov %%r8, %%rax\n"
- "jmp 14f\n" // return result
+ "jmp 15f\n" // return result
// If syscall number is -3, read the time stamp counter
- "8:cmp $-3, %%eax\n"
- "jnz 9f\n"
+ "9:cmp $-3, %%eax\n"
+ "jnz 10f\n"
"rdtsc\n" // sets %edx:%eax
"xor %%rcx, %%rcx\n"
- "jmp 10f\n"
- "9:cmp $-4, %%eax\n"
- "jnz 11f\n"
+ "jmp 11f\n"
+ "10:cmp $-4, %%eax\n"
+ "jnz 12f\n"
"rdtscp\n" // sets %edx:%eax and %ecx
- "10:add $0x3C, %%rsi\n"
+ "11:add $0x3C, %%rsi\n"
"mov %%eax, 0(%%rsi)\n"
"mov %%edx, 4(%%rsi)\n"
"mov %%ecx, 8(%%rsi)\n"
"mov $12, %%edx\n"
- "jmp 15f\n" // return result
+ "jmp 16f\n" // return result
// Check in syscallTable whether this system call is unrestricted
- "11:mov %%rax, %%r9\n"
+ "12:mov %%rax, %%r9\n"
#ifndef NDEBUG
"cmpw $0, %%fs:0xD0\n" // debug mode
- "jnz 12f\n"
+ "jnz 13f\n"
#endif
"cmp playground$maxSyscall(%%rip), %%eax\n"
"ja 25f\n" // exit process
@@ -236,14 +266,14 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Default behavior for unrestricted system calls is to just execute
// them. Read the remaining arguments first.
- "12:mov %%rsi, %%r8\n"
+ "13:mov %%rsi, %%r8\n"
"xor %%rax, %%rax\n" // NR_read
"mov %%r13, %%rdi\n" // fd = threadFd
"add $4, %%rsi\n" // buf = &scratch + 4
"mov $48, %%edx\n" // len = 6*sizeof(void *)
- "13:syscall\n"
+ "14:syscall\n"
"cmp $-4, %%rax\n" // EINTR
- "jz 13b\n"
+ "jz 14b\n"
"cmp %%rdx, %%rax\n"
"jnz 25f\n" // exit process
"mov %%r9, %%rax\n"
@@ -258,27 +288,27 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"syscall\n"
// Return result of system call to sandboxed thread
- "14:mov %%fs:0x0, %%rsi\n"
+ "15:mov %%fs:0x0, %%rsi\n"
"add $0x1034, %%rsi\n" // buf = &scratch + 52
"mov %%rax, (%%rsi)\n"
"mov $8, %%edx\n" // len = 8
- "15:mov %%r13, %%rdi\n" // fd = threadFd
+ "16:mov %%r13, %%rdi\n" // fd = threadFd
"mov $1, %%eax\n" // NR_write
- "16:syscall\n"
+ "17:syscall\n"
"cmp %%rdx, %%rax\n"
"jz 1b\n"
"cmp $-4, %%rax\n" // EINTR
- "jz 16b\n"
+ "jz 17b\n"
"jmp 25f\n" // exit process
// NR_exit:
// Exit trusted thread after cleaning up resources
- "17:mov %%fs:0x0, %%rsi\n"
+ "18:mov %%fs:0x0, %%rsi\n"
"mov 0xE8(%%rsi), %%rdi\n" // fd = threadFdPub
"mov $3, %%eax\n" // NR_close
"syscall\n"
"mov %%rsi, %%rdi\n" // start = secure_mem
- "mov $8192, %%esi\n" // length = 4096
+ "mov $8192, %%esi\n" // length = 8192
"xor %%rdx, %%rdx\n" // prot = PROT_NONE
"mov $10, %%eax\n" // NR_mprotect
"syscall\n"
@@ -291,6 +321,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"syscall\n"
"mov %%rax, %%rdi\n"
"test %%rax, %%rax\n"
+ "js 26f\n" // exit process
"jne 21f\n" // reap helper, exit thread
"jmp 22f\n" // unlock mutex
@@ -305,20 +336,20 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// terminates the program. But if we ever support signal handling,
// we have to be careful that the user cannot install a SIGSEGV
// handler that gets executed with elevated privileges.
- "18:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem
+ "19:mov %%fs:0x0, %%rbp\n" // %rbp = old_shared_mem
"syscall\n" // calls NR_clone
"cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values
- "jae 6b\n"
+ "jae 7b\n" // unlock mutex, return result
"add $2, %%rbx\n"
"test %%rax, %%rax\n"
- "jne 14b\n" // return result
+ "jne 15b\n" // return result
// In nascent thread, now.
"sub $2, %%rbx\n"
"xor %%r15, %%r15\n" // Request to return from clone() when done
// Get thread id of nascent thread
- "19:mov $186, %%eax\n" // NR_gettid
+ "20:mov $186, %%eax\n" // NR_gettid
"syscall\n"
"mov %%rax, %%r14\n"
@@ -342,14 +373,6 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// the parent. For our purposes, it is sufficient to fail with a
// fatal error.
"jmp 25f\n" // exit process
- "20:mov $56, %%eax\n" // NR_clone
- "mov $17, %%rdi\n" // flags = SIGCHLD
- "mov $1, %%rsi\n" // stack = 1
- "syscall\n"
- "test %%rax, %%rax\n"
- "js 25f\n" // exit process
- "jz 22f\n" // unlock and exit
- "mov %%rax, %%rdi\n"
"21:xor %%rsi, %%rsi\n"
"xor %%rdx, %%rdx\n"
"xor %%r10, %%r10\n"
@@ -374,7 +397,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"24:syscall\n"
"25:mov $1, %%eax\n" // NR_write
"mov $2, %%edi\n" // fd = stderr
- "lea 100f(%%rip), %%rsi\n"
+ "lea 100f(%%rip), %%rsi\n" // "Sandbox violation detected"
"mov $101f-100f, %%edx\n" // len = strlen(msg)
"syscall\n"
"mov $1, %%edi\n"
@@ -426,7 +449,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $0x1001, %%edi\n" // option = ARCH_SET_GS
"syscall\n"
"cmp $-4095, %%rax\n" // return codes -1..-4095 are errno values
- "jae 20b\n" // exit thread, unlock global mutex
+ "jae 25b\n" // exit process
// Check whether this is the initial thread, or a newly created one.
// At startup we run the same code as when we create a new thread. At
@@ -524,7 +547,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $61, %%eax\n" // NR_wait4
"syscall\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 30\n"
+ "jz 30b\n"
// Release privileges by entering seccomp mode.
"mov $157, %%eax\n" // NR_prctl
@@ -578,7 +601,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
".pushsection \".rodata\"\n"
"100:.ascii \"Sandbox violation detected, program aborted\\n\"\n"
- "101:\n"
+ "101:.ascii \"WARNING! This is an expensive system call\\n\"\n"
+ "102:\n"
".popsection\n"
"999:pop %%rbp\n"
@@ -613,7 +637,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"movd %%ebx, %%mm3\n"
"xor %%ebx, %%ebx\n" // initial sequence number
"movd %%ebx, %%mm2\n"
- "jmp 19f\n" // create trusted thread
+ "jmp 20f\n" // create trusted thread
// TODO(markus): Coalesce the read() operations by reading into a bigger
// buffer.
@@ -645,21 +669,20 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// 0x1C: fifth argument; passed to syscall in %edi
// 0x20: sixth argument; passed to syscall in %ebp
// 0x24: stored return address for clone() system call
- // 0x28: second stored return address for clone() system call
- // 0x2C: stored %ebp value for clone() system call
- // 0x30: stored %edi value for clone() system call
- // 0x34: stored %esi value for clone() system call
- // 0x38: stored %edx value for clone() system call
- // 0x3C: stored %ecx value for clone() system call
- // 0x40: stored %ebx value for clone() system call
- // 0x44: new shared memory for clone()
- // 0x48: processFdPub for talking to trusted process
- // 0x4C: cloneFdPub for talking to trusted process
- // 0x50: set to non-zero, if in debugging mode
- // 0x54: most recent SHM id returned by shmget(IPC_PRIVATE)
- // 0x58: cookie assigned to us by the trusted process (TLS_COOKIE)
- // 0x60: thread id (TLS_TID)
- // 0x68: threadFdPub (TLS_THREAD_FD)
+ // 0x28: stored %ebp value for clone() system call
+ // 0x2C: stored %edi value for clone() system call
+ // 0x30: stored %esi value for clone() system call
+ // 0x34: stored %edx value for clone() system call
+ // 0x38: stored %ecx value for clone() system call
+ // 0x3C: stored %ebx value for clone() system call
+ // 0x40: new shared memory for clone()
+ // 0x44: processFdPub for talking to trusted process
+ // 0x48: cloneFdPub for talking to trusted process
+ // 0x4C: set to non-zero, if in debugging mode
+ // 0x50: most recent SHM id returned by shmget(IPC_PRIVATE)
+ // 0x54: cookie assigned to us by the trusted process (TLS_COOKIE)
+ // 0x5C: thread id (TLS_TID)
+ // 0x64: threadFdPub (TLS_THREAD_FD)
// 0x200-0x1000: securely passed verified file name(s)
// Layout of (untrusted) scratch space:
@@ -674,6 +697,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// 0x20: RDTSCP result (%eax)
// 0x24: RDTSCP result (%edx)
// 0x28: RDTSCP result (%ecx)
+ // 0x2C: last system call (updated in syscall.cc)
+ // 0x30: countdown for consecutive calls to a time function (e.g. gettimeofday)
"0:xor %%esp, %%esp\n"
"mov $2, %%eax\n" // %mm2 = initial sequence number
@@ -738,26 +763,55 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"test %%eax, %%eax\n"
"js 25f\n" // exit process
"mov %%eax, %%ebx\n"
- "jnz 7f\n" // wait for child, then return result
+ "jnz 8f\n" // wait for child, then return result
"movd %%mm5, %%ebx\n" // start = secure_mem
"mov $4096, %%ecx\n" // len = 4096
"mov $3, %%edx\n" // prot = PROT_READ | PROT_WRITE
"mov $125, %%eax\n" // NR_mprotect
"int $0x80\n"
- "mov %%ebp, 0x54(%%ebx)\n" // set most recently returned SysV shm id
+ "mov %%ebp, 0x50(%%ebx)\n" // set most recently returned SysV shm id
"xor %%ebx, %%ebx\n"
+
+ // When debugging messages are enabled, warn about expensive system calls
+ #ifndef NDEBUG
+ "movd %%mm5, %%ecx\n"
+ "cmpw $0, 0x4C(%%ecx)\n" // debug mode
+ "jz 26f\n"
+ "mov $4, %%eax\n" // NR_write
+ "mov $2, %%ebx\n" // fd = stderr
+ "lea 101f, %%ecx\n" // "This is an expensive system call"
+ "mov $102f-101f, %%edx\n" // len = strlen(msg)
+ "int $0x80\n"
+ "xor %%ebx, %%ebx\n"
+ #endif
+
"jmp 26f\n" // exit program, no message
"4:int $0x80\n"
- "jmp 14f\n" // return result
+ "jmp 15f\n" // return result
// If syscall number is -2, execute locked system call from the
// secure memory area
- "5:jg 11f\n"
+ "5:jg 12f\n"
"cmp $-2, %%eax\n"
- "jnz 8f\n"
+ "jnz 9f\n"
"movd %%mm2, %%ebp\n"
"cmp %%ebp, 0x4-0x1000(%%ecx)\n"
"jne 25f\n" // exit process
+
+ // When debugging messages are enabled, warn about expensive system calls
+ #ifndef NDEBUG
+ "cmpw $0, 0x4C-0x1000(%%ecx)\n"
+ "jz 6f\n" // debug mode
+ "mov %%ecx, %%ebp\n"
+ "mov $4, %%eax\n" // NR_write
+ "mov $2, %%ebx\n" // fd = stderr
+ "lea 101f, %%ecx\n" // "This is an expensive system call"
+ "mov $102f-101f, %%edx\n" // len = strlen(msg)
+ "int $0x80\n"
+ "mov %%ebp, %%ecx\n"
+ "6:"
+ #endif
+
"mov 0x08-0x1000(%%ecx), %%eax\n"
"mov 0x0C-0x1000(%%ecx), %%ebx\n"
"mov 0x14-0x1000(%%ecx), %%edx\n"
@@ -774,11 +828,11 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// clone() has unusual calling conventions and must be handled specially
"cmp $120, %%eax\n" // NR_clone
- "jz 18f\n"
+ "jz 19f\n"
// exit() terminates trusted thread
"cmp $1, %%eax\n" // NR_exit
- "jz 17f\n"
+ "jz 18f\n"
// Perform requested system call
"movd %%mm4, %%edi\n"
@@ -786,7 +840,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"int $0x80\n"
// Unlock mutex
- "6:movd %%mm2, %%ebp\n"
+ "7:movd %%mm2, %%ebp\n"
"movd %%mm5, %%edi\n"
"cmp %%ebp, 4(%%edi)\n"
"jne 25f\n" // exit process
@@ -801,38 +855,38 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"js 25f\n" // exit process
"jz 22f\n" // unlock and exit
"mov %%eax, %%ebx\n"
- "7:xor %%ecx, %%ecx\n"
+ "8:xor %%ecx, %%ecx\n"
"xor %%edx, %%edx\n"
"mov $7, %%eax\n" // NR_waitpid
"int $0x80\n"
"cmp $-4, %%eax\n" // EINTR
"jz 6\n"
"mov %%ebp, %%eax\n"
- "jmp 14f\n" // return result
+ "jmp 15f\n" // return result
// If syscall number is -3, read the time stamp counter
- "8:cmp $-3, %%eax\n"
- "jnz 9f\n"
+ "9:cmp $-3, %%eax\n"
+ "jnz 10f\n"
"rdtsc\n" // sets %edx:%eax
"xor %%ecx, %%ecx\n"
- "jmp 10f\n"
- "9:cmp $-4, %%eax\n"
- "jnz 11f\n"
+ "jmp 11f\n"
+ "10:cmp $-4, %%eax\n"
+ "jnz 12f\n"
"rdtscp\n" // sets %edx:%eax and %ecx
- "10:movd %%mm5, %%ebx\n"
+ "11:movd %%mm5, %%ebx\n"
"add $0x1020, %%ebx\n"
"mov %%eax, 0(%%ebx)\n"
"mov %%edx, 4(%%ebx)\n"
"mov %%ecx, 8(%%ebx)\n"
"mov %%ebx, %%ecx\n"
"mov $12, %%edx\n"
- "jmp 15f\n" // return result
+ "jmp 16f\n" // return result
// Check in syscallTable whether this system call is unrestricted
- "11:mov %%eax, %%ebp\n"
+ "12:mov %%eax, %%ebp\n"
#ifndef NDEBUG
- "cmpw $0, 0x50-0x1000(%%ecx)\n"
- "jnz 12f\n" // debug mode
+ "cmpw $0, 0x4C-0x1000(%%ecx)\n"
+ "jnz 13f\n" // debug mode
#endif
"cmp playground$maxSyscall, %%eax\n"
"ja 25f\n" // exit process
@@ -844,13 +898,13 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// Default behavior for unrestricted system calls is to just execute
// them. Read the remaining arguments first.
- "12:mov $3, %%eax\n" // NR_read
+ "13:mov $3, %%eax\n" // NR_read
"movd %%mm0, %%ebx\n" // fd = threadFd
"add $4, %%ecx\n" // buf = &scratch + 4
"mov $24, %%edx\n" // len = 6*sizeof(void *)
- "13:int $0x80\n"
+ "14:int $0x80\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 13b\n"
+ "jz 14b\n"
"cmp %%edx, %%eax\n"
"jnz 25f\n" // exit process
"mov %%ebp, %%eax\n"
@@ -865,27 +919,27 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"int $0x80\n"
// Return result of system call to sandboxed thread
- "14:movd %%mm5, %%ecx\n"
+ "15:movd %%mm5, %%ecx\n"
"add $0x101C, %%ecx\n" // buf = &scratch + 28
"mov %%eax, (%%ecx)\n"
"mov $4, %%edx\n" // len = 4
- "15:movd %%mm0, %%ebx\n" // fd = threadFd
+ "16:movd %%mm0, %%ebx\n" // fd = threadFd
"mov $4, %%eax\n" // NR_write
- "16:int $0x80\n"
+ "17:int $0x80\n"
"cmp %%edx, %%eax\n"
"jz 1b\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 16b\n"
+ "jz 17b\n"
"jmp 25f\n" // exit process
// NR_exit:
// Exit trusted thread after cleaning up resources
- "17:mov %%edi, %%ecx\n"
- "mov 0x68(%%ecx), %%ebx\n" // fd = threadFdPub
+ "18:mov %%edi, %%ecx\n"
+ "mov 0x64(%%ecx), %%ebx\n" // fd = threadFdPub
"mov $6, %%eax\n" // NR_close
"int $0x80\n"
"mov %%ecx, %%ebx\n" // start = secure_mem
- "mov $8192, %%ecx\n" // length = 4096
+ "mov $8192, %%ecx\n" // length = 8192
"xor %%edx, %%edx\n" // prot = PROT_NONE
"mov $125, %%eax\n" // NR_mprotect
"int $0x80\n"
@@ -898,6 +952,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"int $0x80\n"
"mov %%eax, %%ebx\n"
"test %%eax, %%eax\n"
+ "js 25f\n" // exit process
"jne 21f\n" // reap helper, exit thread
"jmp 22f\n" // unlock mutex
@@ -912,17 +967,17 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// terminates the program. But if we ever support signal handling,
// we have to be careful that the user cannot install a SIGSEGV
// handler that gets executed with elevated privileges.
- "18:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem
+ "19:movd %%edi, %%mm6\n" // %mm6 = old_shared_mem
"movd %%mm4, %%edi\n"
"movd %%mm7, %%ebp\n"
"int $0x80\n" // calls NR_clone
"cmp $-4095, %%eax\n" // return codes -1..-4095 are errno values
- "jae 6b\n"
+ "jae 7b\n" // unlock mutex, return result
"movd %%mm2, %%edi\n"
"add $2, %%edi\n"
"movd %%edi, %%mm2\n"
"test %%eax, %%eax\n"
- "jne 14b\n" // return result
+ "jne 15b\n" // return result
// In nascent thread, now.
"sub $2, %%edi\n"
@@ -930,7 +985,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"movd %%eax, %%mm3\n" // Request to return from clone() when done
// Get thread id of nascent thread
- "19:mov $224, %%eax\n" // NR_gettid
+ "20:mov $224, %%eax\n" // NR_gettid
"int $0x80\n"
"movd %%eax, %%mm4\n"
@@ -958,14 +1013,6 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// the parent. For our purposes, it is sufficient to fail with a
// fatal error.
"jmp 25f\n" // exit process
- "20:mov $120, %%eax\n" // NR_clone
- "mov $17, %%ebx\n" // flags = SIGCHLD
- "mov $1, %%ecx\n" // stack = 1
- "int $0x80\n"
- "test %%eax, %%eax\n"
- "js 25f\n" // exit process
- "jz 22f\n" // unlock and exit
- "mov %%eax, %%ebx\n"
"21:xor %%ecx, %%ecx\n"
"xor %%edx, %%edx\n"
"mov $7, %%eax\n" // NR_waitpid
@@ -989,7 +1036,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"24:int $0x80\n"
"25:mov $4, %%eax\n" // NR_write
"mov $2, %%ebx\n" // fd = stderr
- "lea 100f, %%ecx\n"
+ "lea 100f, %%ecx\n" // "Sandbox violation detected"
"mov $101f-100f, %%edx\n" // len = strlen(msg)
"int $0x80\n"
"mov $1, %%ebx\n"
@@ -998,7 +1045,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
// The first page is mapped read-only for use as securely shared memory
"27:movd %%mm6, %%ebp\n"
- "mov 0x44(%%ebp), %%esi\n"
+ "mov 0x40(%%ebp), %%esi\n"
"movd %%esi, %%mm5\n" // %mm5 = secure shared memory
"movd %%mm2, %%edi\n"
"cmp %%edi, 4(%%ebp)\n"
@@ -1024,7 +1071,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $120, %%eax\n" // NR_clone
"mov $0x850F00, %%ebx\n" // flags = VM|FS|FILES|SIGH|THR|SYSV|UTR
"mov $1, %%ecx\n" // stack = 1
- "movd 0x48(%%ebp), %%mm1\n" // %mm1 = processFdPub
+ "movd 0x44(%%ebp), %%mm1\n" // %mm1 = processFdPub
"cmp %%edi, 4(%%ebp)\n"
"jne 25b\n" // exit process
"int $0x80\n"
@@ -1037,7 +1084,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"push %%eax\n"
"mov $0xFFFFF, %%eax\n" // limit
"push %%eax\n"
- "add $0x58, %%esi\n"
+ "add $0x54, %%esi\n"
"push %%esi\n" // base_addr = &secure_mem.TLS
"mov %%fs, %%eax\n"
"shr $3, %%eax\n"
@@ -1080,8 +1127,6 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"push %%eax\n"
"mov 0x3C(%%ebp), %%eax\n"
"push %%eax\n"
- "mov 0x40(%%ebp), %%eax\n"
- "push %%eax\n"
"cmp %%edi, 4(%%ebp)\n"
"jne 25b\n" // exit process
@@ -1111,7 +1156,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"movd %%mm0, %%eax\n" // fd1 = threadFd
"push %%eax\n"
"push %%esi\n" // fd0 = threadFdPub
- "mov 0x4C(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub()
+ "mov 0x48(%%ebp), %%eax\n" // transport = Sandbox::cloneFdPub()
"cmp %%edi, 4(%%ebp)\n"
"jne 25b\n" // exit process
"push %%eax\n"
@@ -1139,7 +1184,7 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
"mov $7, %%eax\n" // NR_waitpid
"int $0x80\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 30\n"
+ "jz 30b\n"
// Release privileges by entering seccomp mode.
"mov $172, %%eax\n" // NR_prctl
@@ -1190,7 +1235,8 @@ void Sandbox::createTrustedThread(int processFdPub, int cloneFdPub,
".pushsection \".rodata\"\n"
"100:.ascii \"Sandbox violation detected, program aborted\\n\"\n"
- "101:\n"
+ "101:.ascii \"WARNING! This is an expensive system call\\n\"\n"
+ "102:\n"
".popsection\n"
"999:pop %%ebp\n"