// Copyright (c) 2009 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // TODO // - Make capturing system call arguments optional and the number configurable. // - Lots of places depend on the ABI so that we can modify EAX or EDX, this // is safe, but these could be moved to be saved and restored anyway. // - Understand the loader better, and make some more meaningful hooks with // proper data collection and durations. Right now it's just noise. // - Get the returned pointer from AllocateHeap. #include #include #include #include #include "assembler.h" #include "logging.h" #include "rdtsc.h" #include "sym_resolver.h" #include "syscall_map.h" #include "sidestep/mini_disassembler.h" namespace { std::string JSONString(const std::string& str) { static const char hextable[] = "0123456789abcdef"; std::string out; out.push_back('"'); for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) { unsigned char c = static_cast(*it); switch (c) { case '\\': case '"': case '\'': out.push_back('\\'); out.push_back(c); break; default: if (c < 20 || c >= 127) { out.push_back('\\'); out.push_back('x'); out.push_back(hextable[c >> 4]); out.push_back(hextable[c & 0xf]); } else { // Unescaped. out.push_back(c); } break; } } out.push_back('"'); return out; } } // namespace class Playground { public: static const int kPlaygroundSize = 64 * 1024 * 1024; // Encapsulate the configuration options to the playground. class Options { public: Options() : stack_unwind_depth_(0), log_heap_(false), log_lock_(false), vista_(false) { } // The maximum amount of frames we should unwind from the call stack. int stack_unwind_depth() { return stack_unwind_depth_; } void set_stack_unwind_depth(int depth) { stack_unwind_depth_ = depth; } // Whether we should log heap operations (alloc / free). bool log_heap() { return log_heap_; } void set_log_heap(bool x) { log_heap_ = x; } // Whether we should log lock (critical section) operations. bool log_lock() { return log_lock_; } void set_log_lock(bool x) { log_lock_ = x; } // Whether we are running on Vista. bool vista() { return vista_; } void set_vista(bool x) { vista_ = x; } private: int stack_unwind_depth_; bool log_heap_; bool log_lock_; bool vista_; }; Playground(HANDLE proc, const Options& options) : proc_(proc), remote_addr_(NULL), resolver_("ntdll.dll"), options_(options) { // We copy the entire playground into the remote process, and we have // fields that we expect to be zero. TODO this could be a lot better. memset(buf_, 0, sizeof(buf_)); } void AllocateInRemote() { // Try to get something out of the way and easy to debug. static void* kPlaygroundAddr = reinterpret_cast(0x66660000); // Allocate our playground memory in the target process. This is a big // slab of read/write/execute memory that we use for our code // instrumentation, and the memory for writing out our logging events. remote_addr_ = reinterpret_cast( VirtualAllocEx(proc_, kPlaygroundAddr, kPlaygroundSize, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)); if (remote_addr_ == NULL || remote_addr_ != kPlaygroundAddr) { NOTREACHED("Falied to allocate playground: 0x%08x", remote_addr_); } } void CopyToRemote() { WriteProcessMemory(proc_, remote_addr_, buf_, sizeof(buf_), NULL); } void CopyFromRemote() { SIZE_T size = 0; ReadProcessMemory(proc_, remote_addr_, buf_, sizeof(buf_), &size); } enum EventRecordType { EVENT_TYPE_LDR = 0, EVENT_TYPE_THREADBEGIN = 1, EVENT_TYPE_THREADNAME = 2, EVENT_TYPE_EXCEPTION = 3, EVENT_TYPE_PROCESSEXIT = 4, EVENT_TYPE_CREATETHREAD = 5, EVENT_TYPE_THREADEXIT = 6, EVENT_TYPE_ALLOCHEAP = 7, EVENT_TYPE_FREEHEAP = 8, EVENT_TYPE_SYSCALL = 9, EVENT_TYPE_ENTER_CS = 10, EVENT_TYPE_TRYENTER_CS = 11, EVENT_TYPE_LEAVE_CS = 12, EVENT_TYPE_APC = 13 }; static const int kThreadNameBufSize = 64; static const int kLdrBufSize = 512; // Looks like internal buffer is 512. static const int kCodeBlockSize = 256; static const int kOffLdrCode = 0 * kCodeBlockSize; static const int kOffCreateThreadCode = 1 * kCodeBlockSize; static const int kOffThreadCode = 2 * kCodeBlockSize; static const int kOffExpCode = 3 * kCodeBlockSize; static const int kOffExitCode = 4 * kCodeBlockSize; static const int kOffThreadExitCode = 5 * kCodeBlockSize; static const int kOffAllocHeapCode = 6 * kCodeBlockSize; static const int kOffFreeHeapCode = 7 * kCodeBlockSize; static const int kOffSyscallCode = 8 * kCodeBlockSize; static const int kOffEnterCritSecCode = 9 * kCodeBlockSize; static const int kOffTryEnterCritSecCode = 10 * kCodeBlockSize; static const int kOffLeaveCritSecCode = 11 * kCodeBlockSize; static const int kOffApcDispCode = 12 * kCodeBlockSize; static const int kOffLogAreaPtr = 4096; static const int kOffLogAreaData = 4096 + 4; static const int kRecordHeaderSize = 8 + 4 + 4 + 4; // Given the address to the start of a function, patch the function to jump // to a given offset into the playground. This function will try to take // advantage of hotpatch code, if the function is prefixed with 5 0x90 bytes. // Returns a std::string of any assembly instructions that must be relocated, // as they were overwritten during patching. std::string PatchPreamble(int func_addr, int playground_off) { sidestep::MiniDisassembler disas; int stub_addr = reinterpret_cast(remote_addr_ + playground_off); std::string instrs; char buf[15]; if (ReadProcessMemory(proc_, reinterpret_cast(func_addr - 5), buf, sizeof(buf), NULL) == 0) { NOTREACHED("ReadProcessMemory(0x%08x) failed: %d", func_addr - 5, GetLastError()); } // TODO(deanm): It seems in more recent updates the compiler is generating // complicated sequences for padding / alignment. For example: // 00000000 8DA42400000000 lea esp,[esp+0x0] // 00000007 8D4900 lea ecx,[ecx+0x0] // is used for a 16 byte alignment. We need a better way of handling this. if (memcmp(buf, "\x90\x90\x90\x90\x90", 5) == 0 || memcmp(buf, "\x00\x8D\x64\x24\x00", 5) == 0 || memcmp(buf, "\x00\x00\x8D\x49\x00", 5) == 0) { unsigned int instr_bytes = 0; // We might have a hotpatch no-op of mov edi, edi "\x8b\xff". It is a // bit of a waste to relocate it, but it makes everything simpler. while (instr_bytes < 2) { if (disas.Disassemble( reinterpret_cast(buf + 5 + instr_bytes), &instr_bytes) != sidestep::IT_GENERIC) { NOTREACHED("Could not disassemble or relocate instruction."); } // We only read 10 bytes worth of instructions. CHECK(instr_bytes < 10); } instrs.assign(buf + 5, instr_bytes); // We have a hotpatch prefix of 5 nop bytes. We can use this for our // long jump, and then overwrite the first 2 bytes to jump back to there. CodeBuffer patch(buf); int off = stub_addr - func_addr; patch.jmp_rel(off); patch.jmp_rel_short(-2 - 5); } else { // We need a full 5 bytes for the jump. unsigned int instr_bytes = 0; while (instr_bytes < 5) { if (disas.Disassemble( reinterpret_cast(buf + 5 + instr_bytes), &instr_bytes) != sidestep::IT_GENERIC) { NOTREACHED("Could not disassemble or relocate instruction."); } // We only read 10 bytes worth of instructions. CHECK(instr_bytes < 10); } instrs.assign(buf + 5, instr_bytes); // Overwrite the first 5 bytes with a relative jump to our stub. CodeBuffer patch(buf + 5); int off = stub_addr - (func_addr + 5); patch.jmp_rel(off); } // Write back the bytes, we are really probably writing more back than we // need to, but it shouldn't really matter. if (WriteProcessMemory(proc_, reinterpret_cast(func_addr - 5), buf, sizeof(buf), NULL) == 0) { NOTREACHED("WriteProcessMemory(0x%08x) failed: %d", func_addr - 5, GetLastError()); } return instrs; } std::string PatchPreamble(const char* func_name, int playground_off) { return PatchPreamble( reinterpret_cast(resolver_.Resolve(func_name)), playground_off); } // Restore any instructions that needed to be moved to make space for our // patch and jump back to the original code. void ResumeOriginalFunction(const char* func_name, const std::string& moved_instructions, int stub_offset, CodeBuffer* cb) { cb->emit_bytes(moved_instructions); int off = resolver_.Resolve(func_name) + moved_instructions.size() - (remote_addr_ + stub_offset + cb->size() + 5); cb->jmp_rel(off); } // Makes a call to NtQueryPerformanceCounter, writing the timestamp to the // buffer pointed to by EDI. EDI it not incremented. EAX is not preserved. void AssembleQueryPerformanceCounter(CodeBuffer* cb) { // Make a call to NtQueryPerformanceCounter and write the result into // the log area. The buffer we write to should be aligned, but we should // garantee that anyway for the logging area for performance. cb->push_imm(0); // PerformanceFrequency cb->push(EDI); // PerformanceCounter cb->mov_imm(EAX, reinterpret_cast( resolver_.Resolve("ntdll!NtQueryPerformanceCounter"))); cb->call(EAX); } // This is the common log setup routine. It will allocate a new log entry, // and write out the common log header to the event entry. The header is: // is [ 64bit QPC ] [ 32bit cpu id ] [ 32bit thread id ] [ 32bit rec id ] // EDI will be left pointing to the log entry, with |space| bytes left for // type specific data. All other registers should not be clobbered. void AssembleHeaderCode(CodeBuffer* cb, EventRecordType rt, int space) { cb->push(EAX); cb->push(EDX); cb->push(ECX); cb->push(ESI); int unwind_depth = options_.stack_unwind_depth(); // Load EDI with the number of bytes we want for our log entry, this will // be used in the atomic increment to allocate the log entry. cb->mov_imm(EDI, kRecordHeaderSize + (unwind_depth * 4) + space); // Do the increment and have EDI point to our log entry buffer space. cb->mov_imm(EDX, reinterpret_cast(remote_addr_ + kOffLogAreaPtr)); cb->inc_atomic(EDX, EDI); // EDI is the buffer offset, make it a pointer to the record entry. cb->add_imm(EDI, reinterpret_cast(remote_addr_ + kOffLogAreaData)); AssembleQueryPerformanceCounter(cb); cb->add_imm(EDI, 8); cb->which_cpu(); cb->stosd(); cb->which_thread(); cb->stosd(); // Stack unwinding, follow EBP to the maximum number of frames, and make // sure that it stays on the stack (between ESP and TEB.StackBase). if (unwind_depth > 0) { cb->mov_imm(ECX, unwind_depth); cb->fs(); cb->mov(EDX, Operand(0x04)); // get TEB.StackBase // Start at EBP. cb->mov(EAX, EBP); Label unwind_loop, bail; cb->bind(&unwind_loop); // Bail if (EAX < ESP) (below the stack) cb->cmp(EAX, ESP); cb->jcc(below, &bail); // Bail if (EAX >= EDX) (above the stack) cb->cmp(EAX, EDX); cb->jcc(above_equal, &bail); // We have a valid stack pointer, it should point to something like: // [ saved frame pointer ] [ return address ] [ arguments ... ] cb->mov(ESI, EAX); cb->lodsd(); // Get the new stack pointer to follow in EAX cb->movsd(); // Copy the return address to the log area. cb->loop(&unwind_loop); cb->bind(&bail); // If we did managed to unwind to the max, fill the rest with 0 (really // we just want to inc EDI to the end, and this is an easy way). cb->mov_imm(EAX, 0); // TODO use an xor cb->rep(); cb->stosd(); } // Store the type for this record entry. cb->mov_imm(EAX, rt); cb->stosd(); cb->pop(ESI); cb->pop(ECX); cb->pop(EDX); cb->pop(EAX); } void PatchLoader() { static const EventRecordType kRecordType = EVENT_TYPE_LDR; static const char* kFuncName = "ntdll!DebugPrint"; static const int kStubOffset = kOffLdrCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); // Set ShowSnaps to one to get the print routines to be called. char enabled = 1; WriteProcessMemory( proc_, resolver_.Resolve("ntdll!ShowSnaps"), &enabled, 1, NULL); CodeBuffer cb(buf_ + kStubOffset); cb.pop(EDX); // return address cb.pop(EAX); // First param in eax cb.push(ESI); cb.push(EDI); cb.push(EDX); cb.mov(ESI, EAX); // ESI points at the string structure. // We used to do variable length based on the length supplied in the str // structure, but it's easier (and sloppier) to just copy a fixed amount. AssembleHeaderCode(&cb, kRecordType, kLdrBufSize); cb.lodsd(); // Load the character count cb.lodsd(); // Load the char* cb.mov(ESI, EAX); cb.mov_imm(ECX, kLdrBufSize / 4); // load the char count as the rep count cb.rep(); cb.movsb(); // Copy the string to the logging buffer // Return cb.pop(EDX); cb.pop(EDI); cb.pop(ESI); cb.pop(ECX); // don't care cb.pop(ECX); // don't care cb.jmp(EDX); } void PatchCreateThread() { static const EventRecordType kRecordType = EVENT_TYPE_CREATETHREAD; static const char* kFuncName = options_.vista() ? "ntdll!NtCreateThreadEx" : "ntdll!NtCreateThread"; static const int kStubOffset = kOffCreateThreadCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); CodeBuffer cb(buf_ + kStubOffset); cb.push(EDI); cb.push(ESI); AssembleHeaderCode(&cb, kRecordType, 8); cb.mov(EAX, Operand(ESP, 0x18 + 8)); // Super ugly hack. To coorrelate between creating a thread and the new // thread running, we stash something to identify the creating event when // we log the created event. We just use a pointer to the event log data // since this will be unique and can tie the two events together. We pass // it by writing into the context structure, so it will be passed in ESI. cb.add_imm(EAX, 0xa0); cb.push(EDI); cb.mov(EDI, EAX); cb.pop(EAX); cb.push(EAX); cb.stosd(); // Get and save CONTEXT.Eip cb.mov(ESI, EDI); cb.add_imm(ESI, 20); cb.pop(EDI); cb.mov(EAX, EDI); cb.stosd(); // Record the event identifier to tie together the events. cb.movsd(); // write Eip to the log event cb.pop(ESI); cb.pop(EDI); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); } void PatchThreadBegin() { static const EventRecordType kRecordType = EVENT_TYPE_THREADBEGIN; static const char* kFuncName = "ntdll!CsrNewThread"; static const int kStubOffset = kOffThreadCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); CodeBuffer cb(buf_ + kStubOffset); cb.push(EDI); AssembleHeaderCode(&cb, kRecordType, 8); cb.mov(EAX, ESI); // We stashed the creator's eventid in the context ESI. cb.stosd(); // TODO(deanm): The pointer is going to point into the CRT or something, // should we dig deeper to get more information about the real entry? cb.mov(EAX, Operand(EBP, 0x8)); cb.stosd(); cb.pop(EDI); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); } void PatchThreadBeginVista() { static const EventRecordType kRecordType = EVENT_TYPE_THREADBEGIN; static const char* kFuncName = "ntdll!_RtlUserThreadStart"; static const int kStubOffset = kOffThreadCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); CodeBuffer cb(buf_ + kStubOffset); cb.push(EDI); AssembleHeaderCode(&cb, kRecordType, 8); cb.mov(EAX, ESI); // We stashed the creator's eventid in the context ESI. cb.stosd(); // TODO(deanm): The pointer is going to point into the CRT or something, // should we dig deeper to get more information about the real entry? //cb.mov(EAX, Operand(EBP, 0x8)); cb.mov_imm(EAX, 0); cb.stosd(); cb.pop(EDI); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); } // Intercept exception dispatching so we can catch when threads set a thread // name (which is an exception with a special code). TODO it could be // useful to log all exceptions. void PatchSetThreadName() { static const EventRecordType kRecordType = EVENT_TYPE_THREADNAME; static const char* kFuncName = "ntdll!RtlDispatchException"; static const int kStubOffset = kOffExpCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); CodeBuffer cb(buf_ + kStubOffset); cb.pop(EDX); // return address cb.pop(EAX); // ExceptionRecord cb.push(EAX); cb.push(EDX); cb.push(ESI); cb.mov(ESI, EAX); cb.lodsd(); Label bail; // exception code cb.cmp_imm(EAX, 0x406D1388); cb.jcc(not_equal, &bail); cb.push(EDI); AssembleHeaderCode(&cb, kRecordType, kThreadNameBufSize); // Fetch the second parameter. for (int i = 0; i < 6; ++i) { cb.lodsd(); } // TODO This is sloppy and we could run into unmapped memory... cb.mov(ESI, EAX); cb.mov_imm(ECX, kThreadNameBufSize / 4); cb.rep(); cb.movsd(); cb.pop(EDI); cb.bind(&bail); cb.pop(ESI); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); } void PatchThreadExit() { static const EventRecordType kRecordType = EVENT_TYPE_THREADEXIT; static const char* kFuncName = "ntdll!LdrShutdownThread"; static const int kStubOffset = kOffThreadExitCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); CodeBuffer cb(buf_ + kStubOffset); cb.push(EDI); AssembleHeaderCode(&cb, kRecordType, 0); cb.pop(EDI); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); } void PatchAllocateHeap() { static const EventRecordType kRecordType = EVENT_TYPE_ALLOCHEAP; static const char* kFuncName = "ntdll!RtlAllocateHeap"; static const int kStubOffset = kOffAllocHeapCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); CodeBuffer cb(buf_ + kStubOffset); cb.push(EDI); cb.push(ESI); AssembleHeaderCode(&cb, kRecordType, 12); cb.mov(ESI, ESP); cb.add_imm(ESI, 12); // Skip over our saved and the return address cb.movsd(); cb.movsd(); cb.movsd(); // Copy the 3 parameters cb.pop(ESI); cb.pop(EDI); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); } void PatchFreeHeap() { static const EventRecordType kRecordType = EVENT_TYPE_FREEHEAP; static const char* kFuncName = "ntdll!RtlFreeHeap"; static const int kStubOffset = kOffFreeHeapCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); CodeBuffer cb(buf_ + kStubOffset); cb.push(EDI); cb.push(ESI); AssembleHeaderCode(&cb, kRecordType, 12); cb.mov(ESI, ESP); cb.add_imm(ESI, 12); // Skip over our saved and the return address cb.movsd(); cb.movsd(); cb.movsd(); // Copy the 3 parameters cb.pop(ESI); cb.pop(EDI); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); } // Don't even bother going back to the original code, just implement our // own KiFastSystemCall. The original looks like: // .text:7C90EB8B mov edx, esp // .text:7C90EB8D sysenter // .text:7C90EB8F nop // .text:7C90EB90 nop // .text:7C90EB91 nop // .text:7C90EB92 nop // .text:7C90EB93 nop void PatchSyscall() { static const EventRecordType kRecordType = EVENT_TYPE_SYSCALL; static const char* kFuncName = "ntdll!KiFastSystemCall"; static const int kStubOffset = kOffSyscallCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); { CodeBuffer cb(buf_ + kStubOffset); Label skip; // Skip 0xa5 which is QueryPerformanceCounter, to make sure we don't log // our own logging's QPC. Disabled for now, using ret addr check... // cb.cmp_imm(EAX, 0xa5); // cb.jcc(equal, &skip); // Check if the return address is from 0x6666 (our code region). // 66817C24066666 cmp word [esp+0x6],0x6666 cb.emit(0x66); cb.emit(0x81); cb.emit(0x7C); cb.emit(0x24); cb.emit(0x06); cb.emit(0x66); cb.emit(0x66); cb.jcc(equal, &skip); // This is all a bit shit. Originally I thought I could store some state // on the stack above ESP, however, it seems that when APCs, etc are // queued, they will use the stack above ESP. Well, not above ESP, above // what was passed in as EDX into the systemcall, not matter if ESP was // different than this :(. So we need to store our state in the event // log record, and then we stick a pointer to that over a ret addr... // Our stack starts like: // [ ret addr ] [ ret addr 2 ] [ arguments ] // We will update it to look like // [ ret stub addr ] [ event entry ptr ] [ arguments ] cb.push(EDI); // save EDI since we're using it AssembleHeaderCode(&cb, kRecordType, 16 + 16 + 8); cb.mov(EDX, EAX); // Save EAX... cb.stosd(); // eax is the syscall number cb.pop(EAX); cb.stosd(); // store the saved EDI cb.pop(EAX); cb.stosd(); // store the real return address cb.pop(EAX); cb.stosd(); // store the real (secondary) return address; cb.push(ESI); cb.mov(ESI, ESP); cb.lodsd(); cb.movsd(); // argument 1 cb.movsd(); // argument 2 cb.movsd(); // argument 3 cb.pop(ESI); cb.push(EDI); // store our event ptr over the secondary ret addr. cb.push_imm(reinterpret_cast(remote_addr_ + kOffSyscallCode + 200)); cb.mov(EAX, EDX); // restore EAX cb.bind(&skip); cb.mov(EDX, ESP); cb.sysenter(); if (cb.size() > 200) { NOTREACHED("code too big: %d", cb.size()); } } { CodeBuffer cb(buf_ + kStubOffset + 200); // TODO share the QPC code, this is a copy and paste... cb.pop(EDI); // get the log area cb.stosd(); // Log the system call return value. // QPC will clobber EAX, and it's very important to save it since it // is the return value from the system call. TODO validate if there is // anything else we need to save... cb.push(EAX); AssembleQueryPerformanceCounter(&cb); cb.pop(EAX); // We need to: // - Restore the original "seconary" return address // - Restore the original value of the EDI register // - Jump control flow to the original return address // All 3 of these values are stored in the log record... // [ syscall num ] [ saved edi ] [ real rets ] [ args ] [ retval ] [ ts ] // currently edi points here ----^ cb.push(Operand(EDI, -4 - 16)); // push the real 2nd ret cb.push(Operand(EDI, -8 - 16)); // push the real ret cb.push(Operand(EDI, -12 - 16)); // push the saved EDI cb.pop(EDI); // restore EDI that was saved in the record cb.ret(); // jmp back to the real ret ... if (cb.size() > 56) { NOTREACHED("ug"); } } } // Patch lock (criticial section) holding. void PatchEnterCriticalSection() { static const EventRecordType kRecordType = EVENT_TYPE_ENTER_CS; static const char* kFuncName = "ntdll!RtlEnterCriticalSection"; static const int kStubOffset = kOffEnterCritSecCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); // We just want to capture the return address and original argument, so // we know when EnterCriticalSection returned, we don't want to know when // it entered because it could sit waiting. We want to know when the lock // actually started being held. The compiler will sometimes generated code // that overwrites arguments, so we'll keep a copy of the argument just in // case code like this is ever generated in the future. TODO is it enough // to just assume a LPCRITICAL_SECTION uniquely identifies the lock, or // can the same lock have multiple different copies, I would assume not. { CodeBuffer cb(buf_ + kStubOffset); // Set up an additional frame so that we capture the return. // TODO use memory instructions instead of using registers. cb.pop(EAX); // return address cb.pop(EDX); // first argument (critical section pointer) cb.push(EDX); cb.push(EAX); cb.push(EDX); cb.push_imm( reinterpret_cast(remote_addr_ + kStubOffset + 40)); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); CHECK(cb.size() < 40); } { CodeBuffer cb(buf_ + kStubOffset + 40); cb.push(ESI); cb.mov(ESI, ESP); cb.push(EAX); cb.push(EDI); AssembleHeaderCode(&cb, kRecordType, 4); cb.lodsd(); // Skip over our saved ESI cb.lodsd(); // Skip over the return address cb.movsd(); // Write the CRITICAL_SECTION* to the event record. cb.pop(EDI); cb.pop(EAX); cb.pop(ESI); cb.ret(0x04); } } void PatchTryEnterCriticalSection() { static const EventRecordType kRecordType = EVENT_TYPE_TRYENTER_CS; static const char* kFuncName = "ntdll!RtlTryEnterCriticalSection"; static const int kStubOffset = kOffTryEnterCritSecCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); { CodeBuffer cb(buf_ + kStubOffset); // Set up an additional frame so that we capture the return. // TODO use memory instructions instead of using registers. cb.pop(EAX); // return address cb.pop(EDX); // first argument (critical section pointer) cb.push(EDX); cb.push(EAX); cb.push(EDX); cb.push_imm(reinterpret_cast(remote_addr_ + kStubOffset + 40)); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); CHECK(cb.size() < 40); } { CodeBuffer cb(buf_ + kStubOffset + 40); cb.push(ESI); cb.mov(ESI, ESP); cb.push(EDI); cb.push(EAX); AssembleHeaderCode(&cb, kRecordType, 8); cb.lodsd(); // Skip over our saved ESI cb.lodsd(); // Skip over the return address cb.movsd(); // Write the CRITICAL_SECTION* to the event record. cb.pop(EAX); cb.stosd(); // Write the return value to the event record. cb.pop(EDI); cb.pop(ESI); cb.ret(0x04); } } void PatchLeaveCriticalSection() { static const EventRecordType kRecordType = EVENT_TYPE_LEAVE_CS; static const char* kFuncName = "ntdll!RtlLeaveCriticalSection"; static const int kStubOffset = kOffLeaveCritSecCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); CodeBuffer cb(buf_ + kStubOffset); // TODO use memory instructions instead of using registers. cb.pop(EDX); // return address cb.pop(EAX); // first argument (critical section pointer) cb.push(EAX); cb.push(EDX); cb.push(EDI); AssembleHeaderCode(&cb, kRecordType, 4); cb.stosd(); // Write the CRITICAL_SECTION* to the event record. cb.pop(EDI); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); } // Patch APC dispatching. This is a bit hacky, since the return to kernel // mode is done with NtContinue, we have to shim in a stub return address to // catch when the callback is finished. It is probably a bit fragile. void PatchApcDispatcher() { static const EventRecordType kRecordType = EVENT_TYPE_APC; static const char* kFuncName = "ntdll!KiUserApcDispatcher"; static const int kStubOffset = kOffApcDispCode; std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); { CodeBuffer cb(buf_ + kStubOffset); // We don't really need to preserve these since we're the first thing // executing from the kernel dispatch, but yeah, it is good practice. cb.push(EDI); cb.push(EAX); AssembleHeaderCode(&cb, kRecordType, 4 + 4 + 8); cb.mov_imm(EAX, reinterpret_cast(remote_addr_ + kStubOffset + 140)); cb.xchg(EAX, Operand(ESP, 8)); // Swap the callback address with ours. cb.stosd(); // Store the original callback function address. // TODO for now we're lazy and depend that ESI will be preserved, and we // use it to store the pointer into our log record. EDI isn't preserved. cb.mov(ESI, EDI); cb.pop(EAX); cb.pop(EDI); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); CHECK(cb.size() < 140); } { CodeBuffer cb(buf_ + kStubOffset + 140); // This is our shim, we need to call the original callback function, then // we can catch the return and log when it was completed. cb.pop(EAX); // The real return address, safe to use EAX w/ the ABI? cb.push(EDI); cb.mov(EDI, ESI); cb.stosd(); // Store the real return address, we'll need it. cb.add_imm(ESI, -4); cb.lodsd(); // Load the real callback address. cb.mov(ESI, EDI); cb.pop(EDI); cb.call(EAX); // Call the original callback address. cb.push(EAX); cb.push(EDI); cb.mov(EDI, ESI); AssembleQueryPerformanceCounter(&cb); cb.pop(EDI); cb.pop(EAX); cb.push(Operand(ESI, -4)); // Push the real return address. cb.ret(); // Return back to the APC Dispatcher. CHECK(cb.size() < 50); } } // We need to hook into process shutdown for two reasons. Most importantly, // we need to copy the playground back from the process before the address // space goes away. We could avoid this with shared memory, however, there // is a reason two. In order to capture symbols for all of the libraries // loaded into arbitrary applications, on shutdown we do an instrusive load // of symbols into the traced process. // // ntdll!LdrShutdownProcess // - NtSetEvent(event, 0); // - NtWaitForSingleObject(event, FALSE, NULL); // - jmp back void PatchExit(HANDLE exiting, HANDLE exited) { static const EventRecordType kRecordType = EVENT_TYPE_PROCESSEXIT; static const char* kFuncName = "ntdll!LdrShutdownProcess"; static const int kStubOffset = kOffExitCode; HANDLE rexiting, rexited; if (!DuplicateHandle(::GetCurrentProcess(), exiting, proc_, &rexiting, 0, FALSE, DUPLICATE_SAME_ACCESS)) { NOTREACHED(""); } if (!DuplicateHandle(::GetCurrentProcess(), exited, proc_, &rexited, 0, FALSE, DUPLICATE_SAME_ACCESS)) { NOTREACHED(""); } std::string moved_instructions = PatchPreamble(kFuncName, kStubOffset); CodeBuffer cb(buf_ + kStubOffset); cb.push(EDI); AssembleHeaderCode(&cb, kRecordType, 0); cb.pop(EDI); // NtSetEvent(exiting, 0); cb.push_imm(0); cb.push_imm(reinterpret_cast(rexiting)); cb.mov_imm(EAX, reinterpret_cast( resolver_.Resolve("ntdll!NtSetEvent"))); cb.call(EAX); // NtWaitForSingleObject(exited, FALSE, INFINITE); cb.push_imm(0); cb.push_imm(0); cb.push_imm(reinterpret_cast(rexited)); cb.mov_imm(EAX, reinterpret_cast( resolver_.Resolve("ntdll!NtWaitForSingleObject"))); cb.call(EAX); ResumeOriginalFunction(kFuncName, moved_instructions, kStubOffset, &cb); } void Patch() { if (options_.vista()) { // TODO(deanm): Make PatchCreateThread work on Vista. PatchThreadBeginVista(); } else { PatchCreateThread(); PatchThreadBegin(); } PatchThreadExit(); PatchSetThreadName(); PatchSyscall(); PatchApcDispatcher(); // The loader logging needs to be improved a bit to really be useful. //PatchLoader(); // These are interesting, but will collect a ton of data: if (options_.log_heap()) { PatchAllocateHeap(); PatchFreeHeap(); } if (options_.log_lock()) { PatchEnterCriticalSection(); PatchTryEnterCriticalSection(); PatchLeaveCriticalSection(); } } // Dump the event records from the playground to stdout in a JSON format. // TODO: Drop RDTSCNormalizer, it was from old code that tried to use the // rdtsc counters from the CPU, and this required a bunch of normalization // to account for non-syncronized timestamps across different cores, etc. void DumpJSON(RDTSCNormalizer* rdn, SymResolver* res) { int pos = kOffLogAreaPtr; int i = IntAt(pos); pos += 4; std::map syscalls = CreateSyscallMap(); printf("parseEvents([\n"); for (int end = pos + i; pos < end; ) { printf("{\n"); __int64 ts = Int64At(pos); pos += 8; void* cpuid = reinterpret_cast(IntAt(pos)); pos += 4; printf("'ms': %f,\n", rdn->MsFromStart(cpuid, ts)); printf("'cpu': 0x%x,\n'thread': 0x%x,\n", cpuid, IntAt(pos)); pos += 4; if (options_.stack_unwind_depth() > 0) { printf("'stacktrace': [\n"); for (int i = 0; i < options_.stack_unwind_depth(); ++i) { int retaddr = IntAt(pos + (i * 4)); if (!retaddr) break; printf(" [ 0x%x, %s ],\n", retaddr, res ? JSONString(res->Unresolve(retaddr)).c_str() : "\"\""); } printf("],\n"); pos += (options_.stack_unwind_depth() * 4); } EventRecordType rt = static_cast(IntAt(pos)); pos += 4; switch (rt) { case EVENT_TYPE_LDR: { printf("'eventtype': 'EVENT_TYPE_LDR',\n"); std::string str(&buf_[pos], kLdrBufSize); str = str.substr(0, str.find('\0')); printf("'ldrinfo': %s,\n", JSONString(str).c_str()); pos += kLdrBufSize; break; } case EVENT_TYPE_CREATETHREAD: { printf("'eventtype': 'EVENT_TYPE_CREATETHREAD',\n" "'eventid': 0x%x,\n" "'startaddr': 0x%x,\n", IntAt(pos), IntAt(pos+4)); pos += 8; break; } case EVENT_TYPE_THREADBEGIN: { printf("'eventtype': 'EVENT_TYPE_THREADBEGIN',\n" "'parenteventid': 0x%x,\n" "'startaddr': 0x%x,\n", IntAt(pos), IntAt(pos+4)); pos += 8; break; } case EVENT_TYPE_THREADNAME: { std::string str(&buf_[pos], kThreadNameBufSize); str = str.substr(0, str.find('\0')); printf("'eventtype': 'EVENT_TYPE_THREADNAME',\n" "'threadname': %s,\n", JSONString(str).c_str()); pos += kThreadNameBufSize; break; } case EVENT_TYPE_PROCESSEXIT: { printf("'eventtype': 'EVENT_TYPE_PROCESSEXIT',\n"); break; } case EVENT_TYPE_THREADEXIT: { printf("'eventtype': 'EVENT_TYPE_THREADEXIT',\n"); break; } case EVENT_TYPE_ALLOCHEAP: { printf("'eventtype': 'EVENT_TYPE_ALLOCHEAP',\n" "'heaphandle': 0x%x,\n" "'heapflags': 0x%x,\n" "'heapsize': %d,\n", IntAt(pos), IntAt(pos+4), IntAt(pos+8)); pos += 12; break; } case EVENT_TYPE_FREEHEAP: { printf("'eventtype': 'EVENT_TYPE_FREEHEAP',\n" "'heaphandle': 0x%x,\n" "'heapflags': 0x%x,\n" "'heapptr': %d,\n", IntAt(pos), IntAt(pos+4), IntAt(pos+8)); pos += 12; break; } case EVENT_TYPE_SYSCALL: { int syscall = IntAt(pos); printf("'eventtype': 'EVENT_TYPE_SYSCALL',\n" "'syscall': 0x%x,\n", syscall); pos += 16; printf("'syscallargs': [\n"); for (int i = 0; i < 3; ++i) { printf(" 0x%x,\n", IntAt(pos)); pos += 4; } printf("],\n"); printf("'retval': 0x%x,\n" "'done': %f,\n", IntAt(pos), rdn->MsFromStart(0, Int64At(pos+4))); pos += 12; if (syscalls.count(syscall) == 1) { std::string sname = syscalls[syscall]; printf("'syscallname': %s,\n", JSONString(sname).c_str()); // Mark system calls that we should consider "waiting" system // calls, where we are not actually active. if (sname.find("WaitFor") != std::string::npos || sname.find("RemoveIoCompletion") != std::string::npos) { printf("'waiting': 1,\n"); } } break; } case EVENT_TYPE_ENTER_CS: { printf("'eventtype': 'EVENT_TYPE_ENTER_CS',\n" "'critical_section': 0x%x,\n", IntAt(pos)); pos += 4; break; } case EVENT_TYPE_TRYENTER_CS: { printf("'eventtype': 'EVENT_TYPE_TRYENTER_CS',\n" "'critical_section': 0x%x,\n" "'retval': 0x%x,\n", IntAt(pos), IntAt(pos+4)); pos += 8; break; } case EVENT_TYPE_LEAVE_CS: { printf("'eventtype': 'EVENT_TYPE_LEAVE_CS',\n" "'critical_section': 0x%x,\n", IntAt(pos)); pos += 4; break; } case EVENT_TYPE_APC: { int func_addr = IntAt(pos); printf("'eventtype': 'EVENT_TYPE_APC',\n" "'func_addr': 0x%x,\n" "'func_addr_name': %s,\n" "'ret_addr': 0x%x,\n" "'done': %f,\n", func_addr, res ? JSONString(res->Unresolve(func_addr)).c_str() : "\"\"", IntAt(pos+4), rdn->MsFromStart(0, Int64At(pos+8))); pos += 16; break; } default: NOTREACHED("Unknown event type: %d", rt); break; } printf("},\n"); } printf("]);"); } int IntAt(int pos) { return *reinterpret_cast(&buf_[pos]); } __int64 Int64At(int pos) { return *reinterpret_cast<__int64*>(&buf_[pos]); } private: // Handle the process we install into or read back from. HANDLE proc_; // The address where we will keep our playground in the remote process. char* remote_addr_; // Lookup addresses from symbol names for ntdll.dll. SymResolver resolver_; Options options_; // A local copy of the playground data, we copy it into the remote process. char buf_[kPlaygroundSize]; }; int main(int argc, char** argv) { std::string command_line; bool use_symbols = false; bool attaching = false; bool launched = false; bool manual_quit = false; Playground::Options options; PROCESS_INFORMATION info = {0}; argc--; argv++; while (argc > 0) { if (std::string("--symbols") == argv[0]) { use_symbols = true; } else if (std::string("--vista") == argv[0]) { options.set_vista(true); } else if (std::string("--log-heap") == argv[0]) { options.set_log_heap(true); } else if (std::string("--log-lock") == argv[0]) { options.set_log_lock(true); } else if (std::string("--manual-quit") == argv[0]) { manual_quit = true; } else if (argc >= 2 && std::string("--unwind") == argv[0]) { options.set_stack_unwind_depth(atoi(argv[1])); argc--; argv++; } else if (argc >= 2 && !launched && std::string("--attach") == argv[0]) { attaching = true; info.hProcess = OpenProcess(PROCESS_ALL_ACCESS, FALSE, atoi(argv[1])); launched = true; argc--; argv++; } else if (!launched) { STARTUPINFOA start_info = {0}; start_info.cb = sizeof(start_info); if (!CreateProcessA(NULL, argv[0], NULL, NULL, FALSE, CREATE_SUSPENDED, NULL, NULL, &start_info, &info)) { NOTREACHED("Failed to launch \"%s\": %d\n", argv[0], GetLastError()); return 1; } launched = true; } else { NOTREACHED("error parsing command line."); } argc--; argv++; } if (!launched) { printf("usage: traceline.exe \"app.exe my arguments\"\n" " --attach 123: buggy support for attaching to a process\n" " --unwind 16: unwind the stack to the specified max depth\n" " --symbols: use symbols for stacktraces\n" " --log-heap: log heap operations (alloc / free).\n" " --log-lock: log lock (critical section) operations.\n"); return 1; } HANDLE exiting = CreateEvent(NULL, FALSE, FALSE, NULL); HANDLE exited = CreateEvent(NULL, FALSE, FALSE, NULL); // The playground object is big (32MB), dynamically alloc. Playground* pg = new Playground(info.hProcess, options); pg->AllocateInRemote(); pg->Patch(); pg->PatchExit(exiting, exited); pg->CopyToRemote(); RDTSCNormalizer rdn; rdn.Start(); if (!attaching) ResumeThread(info.hThread); // Wait until we have been notified that it's exiting. if (manual_quit) { fprintf(stderr, "Press enter when you want stop tracing and collect.\n"); fflush(stderr); getchar(); } else { HANDLE whs[] = {exiting, info.hProcess}; if (WaitForMultipleObjects(2, whs, FALSE, INFINITE) != WAIT_OBJECT_0) { NOTREACHED("Failed to correctly capture process shutdown."); } } pg->CopyFromRemote(); if (use_symbols) { // Break in and get the symbols... SymResolver res(NULL, info.hProcess); pg->DumpJSON(&rdn, &res); } else { pg->DumpJSON(&rdn, NULL); } // Notify that it can exit now, we are done. SetEvent(exited); CloseHandle(info.hProcess); CloseHandle(info.hThread); delete pg; }