#define XOPEN_SOURCE 500 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "debug.h" #include "library.h" #include "sandbox_impl.h" #include "syscall.h" #include "syscall_table.h" #include "x86_decode.h" #if defined(__x86_64__) typedef Elf64_Phdr Elf_Phdr; typedef Elf64_Rela Elf_Rel; typedef Elf64_Half Elf_Half; typedef Elf64_Word Elf_Word; typedef Elf64_Sword Elf_Sword; typedef Elf64_Xword Elf_Xword; typedef Elf64_Sxword Elf_Sxword; typedef Elf64_Off Elf_Off; typedef Elf64_Section Elf_Section; typedef Elf64_Versym Elf_Versym; #define ELF_ST_BIND ELF64_ST_BIND #define ELF_ST_TYPE ELF64_ST_TYPE #define ELF_ST_INFO ELF64_ST_INFO #define ELF_R_SYM ELF64_R_SYM #define ELF_R_TYPE ELF64_R_TYPE #define ELF_R_INFO ELF64_R_INFO #define ELF_REL_PLT ".rela.plt" #define ELF_JUMP_SLOT R_X86_64_JUMP_SLOT #elif defined(__i386__) typedef Elf32_Phdr Elf_Phdr; typedef Elf32_Rel Elf_Rel; typedef Elf32_Half Elf_Half; typedef Elf32_Word Elf_Word; typedef Elf32_Sword Elf_Sword; typedef Elf32_Xword Elf_Xword; typedef Elf32_Sxword Elf_Sxword; typedef Elf32_Off Elf_Off; typedef Elf32_Section Elf_Section; typedef Elf32_Versym Elf_Versym; #define ELF_ST_BIND ELF32_ST_BIND #define ELF_ST_TYPE ELF32_ST_TYPE #define ELF_ST_INFO ELF32_ST_INFO #define ELF_R_SYM ELF32_R_SYM #define ELF_R_TYPE ELF32_R_TYPE #define ELF_R_INFO ELF32_R_INFO #define ELF_REL_PLT ".rel.plt" #define ELF_JUMP_SLOT R_386_JMP_SLOT #else #error Unsupported target platform #endif namespace playground { char* Library::__kernel_vsyscall; char* Library::__kernel_sigreturn; char* Library::__kernel_rt_sigreturn; Library::~Library() { if (image_size_) { // We no longer need access to a full mapping of the underlying library // file. Move the temporarily extended mapping back to where we originally // found. Make sure to preserve any changes that we might have made since. Sandbox::SysCalls sys; sys.mprotect(image_, 4096, PROT_READ | PROT_WRITE); memcpy(image_, memory_ranges_.rbegin()->second.start, 4096); sys.mprotect(image_, 4096, PROT_READ | PROT_EXEC); sys.mremap(image_, image_size_, 4096, MREMAP_MAYMOVE | MREMAP_FIXED, memory_ranges_.rbegin()->second.start); } } char* Library::getBytes(char* dst, const char* src, ssize_t len) { // Some kernels don't allow accessing the VDSO from write() if (isVDSO_ && src >= memory_ranges_.begin()->second.start && src <= memory_ranges_.begin()->second.stop) { ssize_t max = reinterpret_cast(memory_ranges_.begin()->second.stop) - src; if (len > max) { len = max; } memcpy(dst, src, len); return dst; } // Read up to "len" bytes from "src" and copy them to "dst". Short // copies are possible, if we are at the end of a mapping. Returns // NULL, if the operation failed completely. static int helper_socket[2]; Sandbox::SysCalls sys; if (!helper_socket[0] && !helper_socket[1]) { // Copy data through a socketpair, as this allows us to access it // without incurring a segmentation fault. sys.socketpair(AF_UNIX, SOCK_STREAM, 0, helper_socket); } char* ptr = dst; int inc = 4096; while (len > 0) { ssize_t l = inc == 1 ? inc : 4096 - (reinterpret_cast(src) & 0xFFF); if (l > len) { l = len; } l = NOINTR_SYS(sys.write(helper_socket[0], src, l)); if (l == -1) { if (sys.my_errno == EFAULT) { if (inc == 1) { if (ptr == dst) { return NULL; } break; } inc = 1; continue; } else { return NULL; } } l = sys.read(helper_socket[1], ptr, l); if (l <= 0) { return NULL; } ptr += l; src += l; len -= l; } return dst; } char *Library::get(Elf_Addr offset, char *buf, size_t len) { if (!valid_) { memset(buf, 0, len); return NULL; } RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset); if (iter == memory_ranges_.end()) { memset(buf, 0, len); return NULL; } offset -= iter->first; long size = reinterpret_cast(iter->second.stop) - reinterpret_cast(iter->second.start); if (offset > size - len) { memset(buf, 0, len); return NULL; } char *src = reinterpret_cast(iter->second.start) + offset; memset(buf, 0, len); if (!getBytes(buf, src, len)) { return NULL; } return buf; } std::string Library::get(Elf_Addr offset) { if (!valid_) { return ""; } RangeMap::const_iterator iter = memory_ranges_.lower_bound(offset); if (iter == memory_ranges_.end()) { return ""; } offset -= iter->first; const char *start = reinterpret_cast(iter->second.start) + offset; const char *stop = reinterpret_cast(iter->second.stop) + offset; char buf[4096] = { 0 }; getBytes(buf, start, stop - start >= (int)sizeof(buf) ? sizeof(buf) - 1 : stop - start); start = buf; stop = buf; while (*stop) { ++stop; } std::string s = stop > start ? std::string(start, stop - start) : ""; return s; } char *Library::getOriginal(Elf_Addr offset, char *buf, size_t len) { if (!valid_) { memset(buf, 0, len); return NULL; } Sandbox::SysCalls sys; if (!image_ && !isVDSO_ && !memory_ranges_.empty() && memory_ranges_.rbegin()->first == 0) { // Extend the mapping of the very first page of the underlying library // file. This way, we can read the original file contents of the entire // library. // We have to be careful, because doing so temporarily removes the first // 4096 bytes of the library from memory. And we don't want to accidentally // unmap code that we are executing. So, only use functions that can be // inlined. void* start = memory_ranges_.rbegin()->second.start; image_size_ = memory_ranges_.begin()->first + (reinterpret_cast(memory_ranges_.begin()->second.stop) - reinterpret_cast(memory_ranges_.begin()->second.start)); image_ = reinterpret_cast(sys.mremap(start, 4096, image_size_, MREMAP_MAYMOVE)); if (image_ == MAP_FAILED) { image_ = NULL; } else { sys.MMAP(start, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); for (int i = 4096 / sizeof(long); --i; reinterpret_cast(start)[i] = reinterpret_cast(image_)[i]); } } if (image_) { if (offset + len > image_size_) { // It is quite likely that we initially did not map the entire file as // we did not know how large it is. So, if necessary, try to extend the // mapping. size_t new_size = (offset + len + 4095) & ~4095; char* tmp = reinterpret_cast(sys.mremap(image_, image_size_, new_size, MREMAP_MAYMOVE)); if (tmp != MAP_FAILED) { image_ = tmp; image_size_ = new_size; } } if (buf && offset + len <= image_size_) { return reinterpret_cast(memcpy(buf, image_ + offset, len)); } return NULL; } return buf ? get(offset, buf, len) : NULL; } std::string Library::getOriginal(Elf_Addr offset) { if (!valid_) { return ""; } // Make sure we actually have a mapping that we can access. If the string // is located at the end of the image, we might not yet have extended the // mapping sufficiently. if (!image_ || image_size_ <= offset) { getOriginal(offset, NULL, 1); } if (image_) { if (offset < image_size_) { char* start = image_ + offset; char* stop = start; while (stop < image_ + image_size_ && *stop) { ++stop; if (stop >= image_ + image_size_) { getOriginal(stop - image_, NULL, 1); } } return std::string(start, stop - start); } return ""; } return get(offset); } const Elf_Ehdr* Library::getEhdr() { if (!valid_) { return NULL; } return &ehdr_; } const Elf_Shdr* Library::getSection(const std::string& section) { if (!valid_) { return NULL; } SectionTable::const_iterator iter = section_table_.find(section); if (iter == section_table_.end()) { return NULL; } return &iter->second.second; } const int Library::getSectionIndex(const std::string& section) { if (!valid_) { return -1; } SectionTable::const_iterator iter = section_table_.find(section); if (iter == section_table_.end()) { return -1; } return iter->second.first; } void **Library::getRelocation(const std::string& symbol) { PltTable::const_iterator iter = plt_entries_.find(symbol); if (iter == plt_entries_.end()) { return NULL; } return reinterpret_cast(asr_offset_ + iter->second); } void *Library::getSymbol(const std::string& symbol) { SymbolTable::const_iterator iter = symbols_.find(symbol); if (iter == symbols_.end() || !iter->second.st_value) { return NULL; } return asr_offset_ + iter->second.st_value; } void Library::makeWritable(bool state) const { for (RangeMap::const_iterator iter = memory_ranges_.begin(); iter != memory_ranges_.end(); ++iter) { const Range& range = iter->second; long length = reinterpret_cast(range.stop) - reinterpret_cast(range.start); Sandbox::SysCalls sys; sys.mprotect(range.start, length, range.prot | (state ? PROT_WRITE : 0)); } } bool Library::isSafeInsn(unsigned short insn) { // Check if the instruction has no unexpected side-effects. If so, it can // be safely relocated from the function that we are patching into the // out-of-line scratch space that we are setting up. This is often necessary // to make room for the JMP into the scratch space. return ((insn & 0x7) < 0x6 && (insn & 0xF0) < 0x40 /* ADD, OR, ADC, SBB, AND, SUB, XOR, CMP */) || #if defined(__x86_64__) insn == 0x63 /* MOVSXD */ || #endif (insn >= 0x80 && insn <= 0x8E /* ADD, OR, ADC, SBB, AND, SUB, XOR, CMP, TEST, XCHG, MOV, LEA */) || (insn == 0x90) || /* NOP */ (insn >= 0xA0 && insn <= 0xA9) /* MOV, TEST */ || (insn >= 0xB0 && insn <= 0xBF /* MOV */) || (insn >= 0xC0 && insn <= 0xC1) || /* Bit Shift */ (insn >= 0xD0 && insn <= 0xD3) || /* Bit Shift */ (insn >= 0xC6 && insn <= 0xC7 /* MOV */) || (insn == 0xF7) /* TEST, NOT, NEG, MUL, IMUL, DIV, IDIV */; } char* Library::getScratchSpace(const Maps* maps, char* near, int needed, char** extraSpace, int* extraLength) { if (needed > *extraLength || labs(*extraSpace - reinterpret_cast(near)) > (1536 << 20)) { if (*extraSpace) { // Start a new scratch page and mark any previous page as write-protected Sandbox::SysCalls sys; sys.mprotect(*extraSpace, 4096, PROT_READ|PROT_EXEC); } // Our new scratch space is initially executable and writable. *extraLength = 4096; *extraSpace = maps->allocNearAddr(near, *extraLength, PROT_READ|PROT_WRITE|PROT_EXEC); } if (*extraSpace) { *extraLength -= needed; return *extraSpace + *extraLength; } Sandbox::die("Insufficient space to intercept system call"); } void Library::patchSystemCallsInFunction(const Maps* maps, char *start, char *end, char** extraSpace, int* extraLength) { std::set branch_targets; for (char *ptr = start; ptr < end; ) { unsigned short insn = next_inst((const char **)&ptr, __WORDSIZE == 64); char *target; if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ || insn == 0xEB /* JMP */) { target = ptr + (reinterpret_cast(ptr))[-1]; } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ || (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) { target = ptr + (reinterpret_cast(ptr))[-1]; } else { continue; } branch_targets.insert(target); } struct Code { char* addr; int len; unsigned short insn; bool is_ip_relative; } code[5] = { { 0 } }; int codeIdx = 0; char* ptr = start; while (ptr < end) { // Keep a ring-buffer of the last few instruction in order to find the // correct place to patch the code. char *mod_rm; code[codeIdx].addr = ptr; code[codeIdx].insn = next_inst((const char **)&ptr, __WORDSIZE == 64, 0, 0, &mod_rm, 0, 0); code[codeIdx].len = ptr - code[codeIdx].addr; code[codeIdx].is_ip_relative = #if defined(__x86_64__) mod_rm && (*mod_rm & 0xC7) == 0x5; #else false; #endif // Whenever we find a system call, we patch it with a jump to out-of-line // code that redirects to our system call wrapper. bool is_syscall = true; #if defined(__x86_64__) bool is_indirect_call = false; if (code[codeIdx].insn == 0x0F05 /* SYSCALL */ || // In addition, on x86-64, we need to redirect all CALLs between the // VDSO and the VSyscalls page. We want these to jump to our own // modified copy of the VSyscalls. As we know that the VSyscalls are // always more than 2GB away from the VDSO, the compiler has to // generate some form of indirect jumps. We can find all indirect // CALLs and redirect them to a separate scratch area, where we can // inspect the destination address. If it indeed points to the // VSyscall area, we then adjust the destination address accordingly. (is_indirect_call = (isVDSO_ && vsys_offset_ && code[codeIdx].insn == 0xFF && !code[codeIdx].is_ip_relative && mod_rm && (*mod_rm & 0x38) == 0x10 /* CALL (indirect) */))) { is_syscall = !is_indirect_call; #elif defined(__i386__) bool is_gs_call = false; if (code[codeIdx].len == 7 && code[codeIdx].insn == 0xFF && code[codeIdx].addr[2] == '\x15' /* CALL (indirect) */ && code[codeIdx].addr[0] == '\x65' /* %gs prefix */) { char* target; asm volatile("mov %%gs:(%1), %0\n" : "=a"(target) : "c"(*reinterpret_cast(code[codeIdx].addr+3))); if (target == __kernel_vsyscall) { is_gs_call = true; // TODO(markus): also handle the other vsyscalls } } if (is_gs_call || (code[codeIdx].insn == 0xCD && code[codeIdx].addr[1] == '\x80' /* INT $0x80 */)) { #else #error Unsupported target platform #endif // Found a system call. Search backwards to figure out how to redirect // the code. We will need to overwrite a couple of instructions and, // of course, move these instructions somewhere else. int startIdx = codeIdx; int endIdx = codeIdx; int length = code[codeIdx].len; for (int idx = codeIdx; (idx = (idx + (sizeof(code) / sizeof(struct Code)) - 1) % (sizeof(code) / sizeof(struct Code))) != codeIdx; ) { std::set::const_iterator iter = std::upper_bound(branch_targets.begin(), branch_targets.end(), code[idx].addr); if (iter != branch_targets.end() && *iter < ptr) { // Found a branch pointing to somewhere past our instruction. This // instruction cannot be moved safely. Leave it in place. break; } if (code[idx].addr && !code[idx].is_ip_relative && isSafeInsn(code[idx].insn)) { // These are all benign instructions with no side-effects and no // dependency on the program counter. We should be able to safely // relocate them. startIdx = idx; length = ptr - code[startIdx].addr; } else { break; } } // Search forward past the system call, too. Sometimes, we can only // find relocatable instructions following the system call. #if defined(__i386__) findEndIdx: #endif char *next = ptr; for (int i = codeIdx; next < end && (i = (i + 1) % (sizeof(code) / sizeof(struct Code))) != startIdx; ) { std::set::const_iterator iter = std::lower_bound(branch_targets.begin(), branch_targets.end(), next); if (iter != branch_targets.end() && *iter == next) { // Found branch target pointing to our instruction break; } char *tmp_rm; code[i].addr = next; code[i].insn = next_inst((const char **)&next, __WORDSIZE == 64, 0, 0, &tmp_rm, 0, 0); code[i].len = next - code[i].addr; code[i].is_ip_relative = tmp_rm && (*tmp_rm & 0xC7) == 0x5; if (!code[i].is_ip_relative && isSafeInsn(code[i].insn)) { endIdx = i; length = next - code[startIdx].addr; } else { break; } } // We now know, how many instructions neighboring the system call we // can safely overwrite. We need five bytes to insert a JMP/CALL and a // 32bit address. We then jump to a code fragment that safely forwards // to our system call wrapper. On x86-64, this is complicated by // the fact that the API allows up to 128 bytes of red-zones below the // current stack pointer. So, we cannot write to the stack until we // have adjusted the stack pointer. // // .. .. .. .. ; any leading instructions copied from original code // 48 81 EC 80 00 00 00 SUB $0x80, %rsp // 50 PUSH %rax // 48 8D 05 .. .. .. .. LEA ...(%rip), %rax // 50 PUSH %rax // 48 B8 .. .. .. .. MOV $syscallWrapper, %rax // .. .. .. .. // 50 PUSH %rax // 48 8D 05 06 00 00 00 LEA 6(%rip), %rax // 48 87 44 24 10 XCHG %rax, 16(%rsp) // C3 RETQ // 48 81 C4 80 00 00 00 ADD $0x80, %rsp // .. .. .. .. ; any trailing instructions copied from original code // E9 .. .. .. .. JMPQ ... // // Total: 52 bytes + any bytes that were copied // // On x86-32, the stack is available and we can do: // // TODO(markus): Try to maintain frame pointers on x86-32 // // .. .. .. .. ; any leading instructions copied from original code // 68 .. .. .. .. PUSH return_addr // 68 .. .. .. .. PUSH $syscallWrapper // C3 RET // .. .. .. .. ; any trailing instructions copied from original code // C3 RET // // Total: 12 bytes + any bytes that were copied // // For indirect jumps from the VDSO to the VSyscall page, we instead // replace the following code (this is only necessary on x86-64). This // time, we don't have to worry about red zones: // // .. .. .. .. ; any leading instructions copied from original code // E8 00 00 00 00 CALL . // 48 83 04 24 .. ADDQ $.., (%rsp) // FF .. .. .. .. .. PUSH .. ; from original CALL instruction // 48 81 3C 24 00 00 00 FF CMPQ $0xFFFFFFFFFF000000, 0(%rsp) // 72 10 JB . + 16 // 81 2C 24 .. .. .. .. SUBL ..., 0(%rsp) // C7 44 24 04 00 00 00 00 MOVL $0, 4(%rsp) // C3 RETQ // 48 87 04 24 XCHG %rax,(%rsp) // 48 89 44 24 08 MOV %rax,0x8(%rsp) // 58 POP %rax // C3 RETQ // .. .. .. .. ; any trailing instructions copied from original code // E9 .. .. .. .. JMPQ ... // // Total: 52 bytes + any bytes that were copied if (length < 5) { // There are a very small number of instruction sequences that we // cannot easily intercept, and that have been observed in real world // examples. Handle them here: #if defined(__i386__) int diff; if (!memcmp(code[codeIdx].addr, "\xCD\x80\xEB", 3) && (diff = *reinterpret_cast( code[codeIdx].addr + 3)) < 0 && diff >= -6) { // We have seen... // for (;;) { // _exit(0); // } // ..get compiled to: // B8 01 00 00 00 MOV $__NR_exit, %eax // 66 90 XCHG %ax, %ax // 31 DB 0:XOR %ebx, %ebx // CD 80 INT $0x80 // EB FA JMP 0b // The JMP is really superfluous as the system call never returns. // And there are in fact no returning system calls that need to be // unconditionally repeated in an infinite loop. // If we replace the JMP with NOPs, the system call can successfully // be intercepted. *reinterpret_cast(code[codeIdx].addr + 2) = 0x9090; goto findEndIdx; } #elif defined(__x86_64__) std::set::const_iterator iter; #endif // If we cannot figure out any other way to intercept this system call, // we replace it with a call to INT0. This causes a SEGV which we then // handle in the signal handler. That's a lot slower than rewriting the // instruction with a jump, but it should only happen very rarely. if (is_syscall) { memcpy(code[codeIdx].addr, "\xCD", 2); if (code[codeIdx].len > 2) { memset(code[codeIdx].addr + 2, 0x90, code[codeIdx].len - 2); } goto replaced; } #if defined(__x86_64__) // On x86-64, we occasionally see code like this in the VDSO: // 48 8B 05 CF FE FF FF MOV -0x131(%rip),%rax // FF 50 20 CALLQ *0x20(%rax) // By default, we would not replace the MOV instruction, as it is // IP relative. But if the following instruction is also IP relative, // we are left with only three bytes which is not enough to insert a // jump. // We recognize this particular situation, and as long as the CALLQ // is not a branch target, we decide to still relocate the entire // sequence. We just have to make sure that we then patch up the // IP relative addressing. else if (is_indirect_call && startIdx == codeIdx && code[startIdx = (startIdx + (sizeof(code) / sizeof(struct Code)) - 1) % (sizeof(code) / sizeof(struct Code))].addr && ptr - code[startIdx].addr >= 5 && code[startIdx].is_ip_relative && isSafeInsn(code[startIdx].insn) && ((iter = std::upper_bound(branch_targets.begin(), branch_targets.end(), code[startIdx].addr)) == branch_targets.end() || *iter >= ptr)) { // We changed startIdx to include the IP relative instruction. // When copying this preamble, we make sure to patch up the // offset. } #endif else { Sandbox::die("Cannot intercept system call"); } } int needed = 5 - code[codeIdx].len; int first = codeIdx; while (needed > 0 && first != startIdx) { first = (first + (sizeof(code) / sizeof(struct Code)) - 1) % (sizeof(code) / sizeof(struct Code)); needed -= code[first].len; } int second = codeIdx; while (needed > 0) { second = (second + 1) % (sizeof(code) / sizeof(struct Code)); needed -= code[second].len; } int preamble = code[codeIdx].addr - code[first].addr; int postamble = code[second].addr + code[second].len - code[codeIdx].addr - code[codeIdx].len; // The following is all the code that construct the various bits of // assembly code. #if defined(__x86_64__) if (is_indirect_call) { needed = 52 + preamble + code[codeIdx].len + postamble; } else { needed = 52 + preamble + postamble; } #elif defined(__i386__) needed = 12 + preamble + postamble; #else #error Unsupported target platform #endif // Allocate scratch space and copy the preamble of code that was moved // from the function that we are patching. char* dest = getScratchSpace(maps, code[first].addr, needed, extraSpace, extraLength); memcpy(dest, code[first].addr, preamble); // For jumps from the VDSO to the VSyscalls we sometimes allow exactly // one IP relative instruction in the preamble. if (code[first].is_ip_relative) { *reinterpret_cast(dest + (code[codeIdx].addr - code[first].addr) - 4) -= dest - code[first].addr; } // For indirect calls, we need to copy the actual CALL instruction and // turn it into a PUSH instruction. #if defined(__x86_64__) if (is_indirect_call) { memcpy(dest + preamble, "\xE8\x00\x00\x00\x00\x48\x83\x04\x24", 9); dest[preamble + 9] = code[codeIdx].len + 42; memcpy(dest + preamble + 10, code[codeIdx].addr, code[codeIdx].len); // Convert CALL -> PUSH dest[preamble + 10 + (mod_rm - code[codeIdx].addr)] |= 0x20; preamble += 10 + code[codeIdx].len; } #endif // Copy the static body of the assembly code. memcpy(dest + preamble, #if defined(__x86_64__) is_indirect_call ? "\x48\x81\x3C\x24\x00\x00\x00\xFF\x72\x10\x81\x2C\x24\x00\x00\x00" "\x00\xC7\x44\x24\x04\x00\x00\x00\x00\xC3\x48\x87\x04\x24\x48\x89" "\x44\x24\x08\x58\xC3" : "\x48\x81\xEC\x80\x00\x00\x00\x50\x48\x8D\x05\x00\x00\x00\x00\x50" "\x48\xB8\x00\x00\x00\x00\x00\x00\x00\x00\x50\x48\x8D\x05\x06\x00" "\x00\x00\x48\x87\x44\x24\x10\xC3\x48\x81\xC4\x80\x00\x00", is_indirect_call ? 37 : 47 #elif defined(__i386__) "\x68\x00\x00\x00\x00\x68\x00\x00\x00\x00\xC3", 11 #else #error Unsupported target platform #endif ); // Copy the postamble that was moved from the function that we are // patching. memcpy(dest + preamble + #if defined(__x86_64__) (is_indirect_call ? 37 : 47), #elif defined(__i386__) 11, #else #error Unsupported target platform #endif code[codeIdx].addr + code[codeIdx].len, postamble); // Patch up the various computed values #if defined(__x86_64__) int post = preamble + (is_indirect_call ? 37 : 47) + postamble; dest[post] = '\xE9'; *reinterpret_cast(dest + post + 1) = (code[second].addr + code[second].len) - (dest + post + 5); if (is_indirect_call) { *reinterpret_cast(dest + preamble + 13) = vsys_offset_; } else { *reinterpret_cast(dest + preamble + 11) = (code[second].addr + code[second].len) - (dest + preamble + 15); *reinterpret_cast(dest + preamble + 18) = reinterpret_cast(&syscallWrapper); } #elif defined(__i386__) *(dest + preamble + 11 + postamble) = '\xC3'; *reinterpret_cast(dest + preamble + 1) = dest + preamble + 11; *reinterpret_cast(dest + preamble + 6) = syscallWrapper; #else #error Unsupported target platform #endif // Pad unused space in the original function with NOPs memset(code[first].addr, 0x90 /* NOP */, code[second].addr + code[second].len - code[first].addr); // Replace the system call with an unconditional jump to our new code. #if defined(__x86_64__) *code[first].addr = '\xE9'; // JMPQ #elif defined(__i386__) *code[first].addr = '\xE8'; // CALL #else #error Unsupported target platform #endif *reinterpret_cast(code[first].addr + 1) = dest - (code[first].addr + 5); } replaced: codeIdx = (codeIdx + 1) % (sizeof(code) / sizeof(struct Code)); } } void Library::patchVDSO(char** extraSpace, int* extraLength){ #if defined(__i386__) Sandbox::SysCalls sys; if (!__kernel_vsyscall || sys.mprotect(reinterpret_cast( reinterpret_cast(__kernel_vsyscall) & ~0xFFF), 4096, PROT_READ|PROT_WRITE|PROT_EXEC)) { return; } // x86-32 has a small number of well-defined functions in the VDSO library. // These functions do not easily lend themselves to be rewritten by the // automatic code. Instead, we explicitly find new definitions for them. // // We don't bother with optimizing the syscall instruction instead always // use INT $0x80, no matter whether the hardware supports more modern // calling conventions. // // TODO(markus): Investigate whether it is worthwhile to optimize this // code path and use the platform-specific entry code. if (__kernel_vsyscall) { // Replace the kernel entry point with: // // E9 .. .. .. .. JMP syscallWrapper *__kernel_vsyscall = '\xE9'; *reinterpret_cast(__kernel_vsyscall + 1) = reinterpret_cast(&syscallWrapper) - reinterpret_cast(__kernel_vsyscall + 5); } if (__kernel_sigreturn) { // Replace the sigreturn() system call with a jump to code that does: // // 58 POP %eax // B8 77 00 00 00 MOV $0x77, %eax // E9 .. .. .. .. JMP syscallWrapper char* dest = getScratchSpace(maps_, __kernel_sigreturn, 11, extraSpace, extraLength); memcpy(dest, "\x58\xB8\x77\x00\x00\x00\xE9", 7); *reinterpret_cast(dest + 7) = reinterpret_cast(&syscallWrapper) - reinterpret_cast(dest + 11); *__kernel_sigreturn = '\xE9'; *reinterpret_cast(__kernel_sigreturn + 1) = dest - reinterpret_cast(__kernel_sigreturn + 5); } if (__kernel_rt_sigreturn) { // Replace the rt_sigreturn() system call with a jump to code that does: // // B8 AD 00 00 00 MOV $0xAD, %eax // E9 .. .. .. .. JMP syscallWrapper char* dest = getScratchSpace(maps_, __kernel_rt_sigreturn, 10, extraSpace, extraLength); memcpy(dest, "\xB8\xAD\x00\x00\x00\xE9", 6); *reinterpret_cast(dest + 6) = reinterpret_cast(&syscallWrapper) - reinterpret_cast(dest + 10); *__kernel_rt_sigreturn = '\xE9'; *reinterpret_cast(__kernel_rt_sigreturn + 1) = dest - reinterpret_cast(__kernel_rt_sigreturn + 5); } #endif } int Library::patchVSystemCalls() { #if defined(__x86_64__) // VSyscalls live in a shared 4kB page at the top of the address space. This // page cannot be unmapped nor remapped. We have to create a copy within // 2GB of the page, and rewrite all IP-relative accesses to shared variables. // As the top of the address space is not accessible by mmap(), this means // that we need to wrap around addresses to the bottom 2GB of the address // space. // Only x86-64 has VSyscalls. if (maps_->vsyscall()) { char* copy = maps_->allocNearAddr(maps_->vsyscall(), 0x1000, PROT_READ|PROT_WRITE); char* extraSpace = copy; int extraLength = 0x1000; memcpy(copy, maps_->vsyscall(), 0x1000); long adjust = (long)maps_->vsyscall() - (long)copy; for (int vsys = 0; vsys < 0x1000; vsys += 0x400) { char* start = copy + vsys; char* end = start + 0x400; // There can only be up to four VSyscalls starting at an offset of // n*0x1000, each. VSyscalls are invoked by functions in the VDSO // and provide fast implementations of a time source. We don't exactly // know where the code and where the data is in the VSyscalls page. // So, we disassemble the code for each function and find all branch // targets within the function in order to find the last address of // function. for (char *last = start, *vars = end, *ptr = start; ptr < end; ) { new_function: char* mod_rm; unsigned short insn = next_inst((const char **)&ptr, true, 0, 0, &mod_rm, 0, 0); if (mod_rm && (*mod_rm & 0xC7) == 0x5) { // Instruction has IP relative addressing mode. Adjust to reference // the variables in the original VSyscall segment. long offset = *reinterpret_cast(mod_rm + 1); char* var = ptr + offset; if (var >= ptr && var < vars) { // Variables are stored somewhere past all the functions. Remember // the first variable in the VSyscall slot, so that we stop // scanning for instructions once we reach that address. vars = var; } offset += adjust; if ((offset >> 32) && (offset >> 32) != -1) { Sandbox::die("Cannot patch [vsystemcall]"); } *reinterpret_cast(mod_rm + 1) = offset; } // Check for jump targets to higher addresses (but within our own // VSyscall slot). They extend the possible end-address of this // function. char *target = 0; if ((insn >= 0x70 && insn <= 0x7F) /* Jcc */ || insn == 0xEB /* JMP */) { target = ptr + (reinterpret_cast(ptr))[-1]; } else if (insn == 0xE8 /* CALL */ || insn == 0xE9 /* JMP */ || (insn >= 0x0F80 && insn <= 0x0F8F) /* Jcc */) { target = ptr + (reinterpret_cast(ptr))[-1]; } // The function end is found, once the loop reaches the last valid // address in the VSyscall slot, or once it finds a RET instruction // that is not followed by any jump targets. Unconditional jumps that // point backwards are treated the same as a RET instruction. if (insn == 0xC3 /* RET */ || (target < ptr && (insn == 0xEB /* JMP */ || insn == 0xE9 /* JMP */))) { if (last >= ptr) { continue; } else { // The function can optionally be followed by more functions in // the same VSyscall slot. Allow for alignment to a 16 byte // boundary. If we then find more non-zero bytes, and if this is // not the known start of the variables, assume a new function // started. for (; ptr < vars; ++ptr) { if ((long)ptr & 0xF) { if (*ptr && *ptr != '\x90' /* NOP */) { goto new_function; } *ptr = '\x90'; // NOP } else { if (*ptr && *ptr != '\x90' /* NOP */) { goto new_function; } break; } } // Translate all SYSCALLs to jumps into our system call handler. patchSystemCallsInFunction(NULL, start, ptr, &extraSpace, &extraLength); break; } } // Adjust assumed end address for this function, if a valid jump // target has been found that originates from the current instruction. if (target > last && target < start + 0x100) { last = target; } } } // We are done. Write-protect our code and make it executable. Sandbox::SysCalls sys; sys.mprotect(copy, 0x1000, PROT_READ|PROT_EXEC); return maps_->vsyscall() - copy; } #endif return 0; } void Library::patchSystemCalls() { if (!valid_) { return; } int extraLength = 0; char* extraSpace = NULL; if (isVDSO_) { // patchVDSO() calls patchSystemCallsInFunction() which needs vsys_offset_ // iff processing the VDSO library. So, make sure we call // patchVSystemCalls() first. vsys_offset_ = patchVSystemCalls(); #if defined(__i386__) patchVDSO(&extraSpace, &extraLength); return; #endif } SectionTable::const_iterator iter; if ((iter = section_table_.find(".text")) == section_table_.end()) { return; } const Elf_Shdr& shdr = iter->second.second; char* start = reinterpret_cast(shdr.sh_addr + asr_offset_); char* stop = start + shdr.sh_size; char* func = start; int nopcount = 0; bool has_syscall = false; for (char *ptr = start; ptr < stop; ptr++) { #if defined(__x86_64__) if ((*ptr == '\x0F' && ptr[1] == '\x05' /* SYSCALL */) || (isVDSO_ && *ptr == '\xFF')) { #elif defined(__i386__) if ((*ptr == '\xCD' && ptr[1] == '\x80' /* INT $0x80 */) || (*ptr == '\x65' && ptr[1] == '\xFF' && ptr[2] == '\x15' /* CALL %gs:.. */)) { #else #error Unsupported target platform #endif ptr++; has_syscall = true; nopcount = 0; } else if (*ptr == '\x90' /* NOP */) { nopcount++; } else if (!(reinterpret_cast(ptr) & 0xF)) { if (nopcount > 2) { // This is very likely the beginning of a new function. Functions // are aligned on 16 byte boundaries and the preceding function is // padded out with NOPs. // // For performance reasons, we quickly scan the entire text segment // for potential SYSCALLs, and then patch the code in increments of // individual functions. if (has_syscall) { has_syscall = false; // Our quick scan of the function found a potential system call. // Do a more thorough scan, now. patchSystemCallsInFunction(maps_, func, ptr, &extraSpace, &extraLength); } func = ptr; } nopcount = 0; } else { nopcount = 0; } } if (has_syscall) { // Patch any remaining system calls that were in the last function before // the loop terminated. patchSystemCallsInFunction(maps_, func, stop, &extraSpace, &extraLength); } // Mark our scratch space as write-protected and executable. if (extraSpace) { Sandbox::SysCalls sys; sys.mprotect(extraSpace, 4096, PROT_READ|PROT_EXEC); } } bool Library::parseElf() { valid_ = true; // Verify ELF header Elf_Shdr str_shdr; if (!getOriginal(0, &ehdr_) || ehdr_.e_ehsize < sizeof(Elf_Ehdr) || ehdr_.e_phentsize < sizeof(Elf_Phdr) || ehdr_.e_shentsize < sizeof(Elf_Shdr) || !getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize, &str_shdr)) { // Not all memory mappings are necessarily ELF files. Skip memory // mappings that we cannot identify. valid_ = false; return false; } // Find PT_DYNAMIC segment. This is what our PLT entries and symbols will // point to. This information is probably incorrect in the child, as it // requires access to the original memory mappings. for (int i = 0; i < ehdr_.e_phnum; i++) { Elf_Phdr phdr; if (getOriginal(ehdr_.e_phoff + i*ehdr_.e_phentsize, &phdr) && phdr.p_type == PT_DYNAMIC) { RangeMap::const_iterator iter = memory_ranges_.lower_bound(phdr.p_offset); if (iter != memory_ranges_.end()) { asr_offset_ = reinterpret_cast(iter->second.start) - (phdr.p_vaddr - (phdr.p_offset - iter->first)); } break; } } // Parse section table and find all sections in this ELF file for (int i = 0; i < ehdr_.e_shnum; i++) { Elf_Shdr shdr; if (!getOriginal(ehdr_.e_shoff + i*ehdr_.e_shentsize, &shdr)) { continue; } section_table_.insert( std::make_pair(getOriginal(str_shdr.sh_offset + shdr.sh_name), std::make_pair(i, shdr))); } return !isVDSO_ || parseSymbols(); } bool Library::parseSymbols() { if (!valid_) { return false; } Elf_Shdr str_shdr; getOriginal(ehdr_.e_shoff + ehdr_.e_shstrndx * ehdr_.e_shentsize, &str_shdr); // Find PLT and symbol tables const Elf_Shdr* plt = getSection(ELF_REL_PLT); const Elf_Shdr* symtab = getSection(".dynsym"); Elf_Shdr strtab = { 0 }; if (symtab) { if (symtab->sh_link >= ehdr_.e_shnum || !getOriginal(ehdr_.e_shoff + symtab->sh_link * ehdr_.e_shentsize, &strtab)) { Debug::message("Cannot find valid symbol table\n"); valid_ = false; return false; } } if (plt && symtab) { // Parse PLT table and add its entries for (int i = plt->sh_size/sizeof(Elf_Rel); --i >= 0; ) { Elf_Rel rel; if (!getOriginal(plt->sh_offset + i * sizeof(Elf_Rel), &rel) || ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym) >= symtab->sh_size) { Debug::message("Encountered invalid plt entry\n"); valid_ = false; return false; } if (ELF_R_TYPE(rel.r_info) != ELF_JUMP_SLOT) { continue; } Elf_Sym sym; if (!getOriginal(symtab->sh_offset + ELF_R_SYM(rel.r_info)*sizeof(Elf_Sym), &sym) || sym.st_shndx >= ehdr_.e_shnum) { Debug::message("Encountered invalid symbol for plt entry\n"); valid_ = false; return false; } std::string name = getOriginal(strtab.sh_offset + sym.st_name); if (name.empty()) { continue; } plt_entries_.insert(std::make_pair(name, rel.r_offset)); } } if (symtab) { // Parse symbol table and add its entries for (Elf_Addr addr = 0; addr < symtab->sh_size; addr += sizeof(Elf_Sym)) { Elf_Sym sym; if (!getOriginal(symtab->sh_offset + addr, &sym) || (sym.st_shndx >= ehdr_.e_shnum && sym.st_shndx < SHN_LORESERVE)) { Debug::message("Encountered invalid symbol\n"); valid_ = false; return false; } std::string name = getOriginal(strtab.sh_offset + sym.st_name); if (name.empty()) { continue; } symbols_.insert(std::make_pair(name, sym)); } } SymbolTable::const_iterator iter = symbols_.find("__kernel_vsyscall"); if (iter != symbols_.end() && iter->second.st_value) { __kernel_vsyscall = asr_offset_ + iter->second.st_value; } iter = symbols_.find("__kernel_sigreturn"); if (iter != symbols_.end() && iter->second.st_value) { __kernel_sigreturn = asr_offset_ + iter->second.st_value; } iter = symbols_.find("__kernel_rt_sigreturn"); if (iter != symbols_.end() && iter->second.st_value) { __kernel_rt_sigreturn = asr_offset_ + iter->second.st_value; } return true; } } // namespace