diff options
Diffstat (limited to 'sandbox/linux')
-rw-r--r-- | sandbox/linux/seccomp/ioctl.cc | 7 | ||||
-rw-r--r-- | sandbox/linux/seccomp/library.cc | 26 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox.cc | 219 | ||||
-rw-r--r-- | sandbox/linux/seccomp/sandbox_impl.h | 2 | ||||
-rw-r--r-- | sandbox/linux/seccomp/syscall.cc | 112 |
5 files changed, 239 insertions, 127 deletions
diff --git a/sandbox/linux/seccomp/ioctl.cc b/sandbox/linux/seccomp/ioctl.cc index 07f1aa3..4d2b3c5c5 100644 --- a/sandbox/linux/seccomp/ioctl.cc +++ b/sandbox/linux/seccomp/ioctl.cc @@ -48,8 +48,11 @@ bool Sandbox::process_ioctl(int parentMapsFd, int sandboxFd, int threadFdPub, ioctl_req.d, ioctl_req.req, ioctl_req.arg); return true; default: - std::cerr << "Unsupported ioctl: 0x" << std::hex << ioctl_req.req << - std::endl; + if (Debug::isEnabled()) { + char buf[80]; + sprintf(buf, "Unsupported ioctl: 0x%04X\n", ioctl_req.req); + Debug::message(buf); + } SecureMem::abandonSystemCall(threadFd, rc); return false; } diff --git a/sandbox/linux/seccomp/library.cc b/sandbox/linux/seccomp/library.cc index e882ba4..68ff974 100644 --- a/sandbox/linux/seccomp/library.cc +++ b/sandbox/linux/seccomp/library.cc @@ -835,31 +835,29 @@ void Library::patchVDSO(char** extraSpace, int* extraLength){ // // 58 POP %eax // B8 77 00 00 00 MOV $0x77, %eax - // E9 .. .. .. .. JMP syscallWrapper + // E8 .. .. .. .. CALL syscallWrapper char* dest = getScratchSpace(maps_, __kernel_sigreturn, 11, extraSpace, extraLength); - memcpy(dest, "\x58\xB8\x77\x00\x00\x00\xE9", 7); - *reinterpret_cast<char *>(dest + 7) = - reinterpret_cast<char *>(&syscallWrapper) - - reinterpret_cast<char *>(dest + 11); + memcpy(dest, "\x58\xB8\x77\x00\x00\x00\xE8", 7); + *reinterpret_cast<long *>(dest + 7) = + reinterpret_cast<char *>(&syscallWrapper) - dest - 11;; *__kernel_sigreturn = '\xE9'; - *reinterpret_cast<char *>(__kernel_sigreturn + 1) = - dest - reinterpret_cast<char *>(__kernel_sigreturn + 5); + *reinterpret_cast<long *>(__kernel_sigreturn + 1) = + dest - reinterpret_cast<char *>(__kernel_sigreturn) - 5; } if (__kernel_rt_sigreturn) { // Replace the rt_sigreturn() system call with a jump to code that does: // // B8 AD 00 00 00 MOV $0xAD, %eax - // E9 .. .. .. .. JMP syscallWrapper + // E8 .. .. .. .. 
CALL syscallWrapper char* dest = getScratchSpace(maps_, __kernel_rt_sigreturn, 10, extraSpace, extraLength); - memcpy(dest, "\xB8\xAD\x00\x00\x00\xE9", 6); - *reinterpret_cast<char *>(dest + 6) = - reinterpret_cast<char *>(&syscallWrapper) - - reinterpret_cast<char *>(dest + 10); + memcpy(dest, "\xB8\xAD\x00\x00\x00\xE8", 6); + *reinterpret_cast<long *>(dest + 6) = + reinterpret_cast<char *>(&syscallWrapper) - dest - 10; *__kernel_rt_sigreturn = '\xE9'; - *reinterpret_cast<char *>(__kernel_rt_sigreturn + 1) = - dest - reinterpret_cast<char *>(__kernel_rt_sigreturn + 5); + *reinterpret_cast<long *>(__kernel_rt_sigreturn + 1) = + dest - reinterpret_cast<char *>(__kernel_rt_sigreturn) - 5; } #endif } diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc index b7a249e..93ce12e 100644 --- a/sandbox/linux/seccomp/sandbox.cc +++ b/sandbox/linux/seccomp/sandbox.cc @@ -121,7 +121,8 @@ void Sandbox::setupSignalHandlers() { // Set up SEGV handler for dealing with RDTSC instructions, system calls // that have been rewritten to use INT0, and for sigpending() emulation. - sa.sa_handler_ = segv(); + sa.sa_sigaction_ = segv(); + sa.sa_flags = SA_SIGINFO; sys.sigaction(SIGSEGV, &sa, NULL); // Unblock SIGSEGV and SIGCHLD @@ -131,8 +132,8 @@ void Sandbox::setupSignalHandlers() { sys.sigprocmask(SIG_UNBLOCK, &mask, 0); } -void (*Sandbox::segv())(int signo) { - void (*fnc)(int signo); +void (*Sandbox::segv())(int signo, SysCalls::siginfo *context, void *unused) { + void (*fnc)(int signo, SysCalls::siginfo *context, void *unused); asm volatile( "call 999f\n" #if defined(__x86_64__) @@ -198,7 +199,7 @@ void (*Sandbox::segv())(int signo) { // rewrite the system call instruction. Retrieve the CPU register // at the time of the segmentation fault and invoke syscallWrapper(). 
"8:cmpw $0x00CD, (%%r15)\n" // INT $0x0 - "jnz 14f\n" + "jnz 16f\n" #ifndef NDEBUG "lea 200f(%%rip), %%rdi\n" "call playground$debugMessage\n" @@ -239,10 +240,18 @@ void (*Sandbox::segv())(int signo) { "mov %%r10, 0(%%rdx)\n" // old_set "jmp 7b\n" + // Handle rt_sigreturn() + "12:cmp $15, %%rax\n" // NR_rt_sigreturn + "jnz 14f\n" + "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault + "13:syscall\n" // rt_sigreturn() is unrestricted + "mov $66, %%edi\n" // rt_sigreturn() should never return + "mov $231, %%eax\n" // NR_exit_group + "jmp 13b\n" // Copy signal frame onto new stack. See clone.cc for details - "12:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000 - "jnz 13f\n" + "14:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000 + "jnz 15f\n" "mov 0xA8(%%rsp), %%rcx\n" // %rsp at time of segmentation fault "sub %%rsp, %%rcx\n" // %rcx = size of stack frame "sub $8, %%rcx\n" // skip return address @@ -256,7 +265,7 @@ void (*Sandbox::segv())(int signo) { "jmp 7b\n" // Forward system call to syscallWrapper() - "13:lea 7b(%%rip), %%rcx\n" + "15:lea 7b(%%rip), %%rcx\n" "push %%rcx\n" "push 0xB8(%%rsp)\n" // %rip at time of segmentation fault "lea playground$syscallWrapper(%%rip), %%rcx\n" @@ -265,7 +274,7 @@ void (*Sandbox::segv())(int signo) { // This was a genuine segmentation fault. Trigger the kernel's default // signal disposition. The only way we can do this from seccomp mode // is by blocking the signal and retriggering it. - "14:mov $2, %%edi\n" // stderr + "16:mov $2, %%edi\n" // stderr "lea 300f(%%rip), %%rsi\n" // "Segmentation fault\n" "mov $301f-300f, %%edx\n" "mov $1, %%eax\n" // NR_write @@ -277,13 +286,13 @@ void (*Sandbox::segv())(int signo) { // happened. If it is RDTSC, forward the request to the trusted // thread. 
"mov $-3, %%ebx\n" // request for RDTSC - "mov 0x40(%%esp), %%ebp\n" // %eip at time of segmentation fault + "mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault "cmpw $0x310F, (%%ebp)\n" // RDTSC "jz 0f\n" - "cmpw $0x010F, (%%ebp)\n" - "jnz 8f\n" + "cmpw $0x010F, (%%ebp)\n" // RDTSCP + "jnz 9f\n" "cmpb $0xF9, 2(%%ebp)\n" - "jnz 8f\n" + "jnz 9f\n" "mov $-4, %%ebx\n" // request for RDTSCP "0:" #ifndef NDEBUG @@ -292,7 +301,7 @@ void (*Sandbox::segv())(int signo) { "call playground$debugMessage\n" "sub $4, %%esp\n" #else - "sub $8, %%esp\n" + "sub $8, %%esp\n" // allocate buffer for receiving timestamp #endif "push %%ebx\n" "mov %%fs:16, %%ebx\n" // fd = threadFdPub @@ -301,126 +310,178 @@ void (*Sandbox::segv())(int signo) { "1:mov %%edx, %%eax\n" // NR_write "int $0x80\n" "cmp %%eax, %%edx\n" - "jz 5f\n" + "jz 7f\n" "cmp $-4, %%eax\n" // EINTR "jz 1b\n" - "2:add $12, %%esp\n" - "movl $0, 0x34(%%esp)\n" // %eax at time of segmentation fault - "movl $0, 0x2C(%%esp)\n" // %edx at time of segmentation fault + "2:add $12, %%esp\n" // remove temporary buffer from stack + "xor %%eax, %%eax\n" + "movl $0, 0xC8(%%esp)\n" // %edx at time of segmentation fault "cmpw $0x310F, (%%ebp)\n" // RDTSC "jz 3f\n" - "movl $0, 0x30(%%esp)\n" // %ecx at time of segmentation fault - "3:addl $2, 0x40(%%esp)\n" // %eip at time of segmentation fault - "mov 0x40(%%esp), %%ebp\n" // %eip at time of segmentation fault - "cmpw $0x010F, (%%ebp)\n" // RDTSC - "jnz 4f\n" - "addl $1, 0x40(%%esp)\n" // %eip at time of segmentation fault - "4:ret\n" - "5:mov $12, %%edx\n" // len = 3*sizeof(int) - "6:mov $3, %%eax\n" // NR_read + "movl $0, 0xCC(%%esp)\n" // %ecx at time of segmentation fault + "3:mov %%eax, 0xD0(%%esp)\n" // %eax at time of segmentation fault + "4:mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault + "addl $2, 0xDC(%%esp)\n" // %eip at time of segmentation fault + "cmpw $0x010F, (%%ebp)\n" // RDTSCP + "jnz 5f\n" + "addl $1, 0xDC(%%esp)\n" // %eip at time of 
segmentation fault + "5:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger + "mov 0x1CC(%%esp), %%eax\n" // push signal number + "push %%eax\n" + "lea 0x270(%%esp), %%esi\n" // copy siginfo register values + "lea 0x4(%%esp), %%edi\n" // into new location + "mov $22, %%ecx\n" + "cld\n" + "rep movsl\n" + "mov 0x2C8(%%esp), %%ebx\n" // copy first half of signal mask + "mov %%ebx, 0x54(%%esp)\n" + "lea 6f, %%esi\n" // copy "magic" restorer function + "push %%esi\n" // push restorer function + "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers + "movb $2, %%cl\n" + "rep movsl\n" + "ret\n" // return to restorer function + + // The restorer function is sometimes used by gdb as a magic marker to + // recognize signal stack frames. Don't change any of the next three + // instructions. + "6:pop %%eax\n" // remove dummy argument (signo) + "mov $119, %%eax\n" // NR_sigreturn + "int $0x80\n" + "7:mov $12, %%edx\n" // len = 3*sizeof(int) + "8:mov $3, %%eax\n" // NR_read "int $0x80\n" "cmp $-4, %%eax\n" // EINTR - "jz 6b\n" + "jz 8b\n" "cmp %%eax, %%edx\n" "jnz 2b\n" "pop %%eax\n" "pop %%edx\n" "pop %%ecx\n" - "mov %%edx, 0x2C(%%esp)\n" // %edx at time of segmentation fault + "mov %%edx, 0xC8(%%esp)\n" // %edx at time of segmentation fault "cmpw $0x310F, (%%ebp)\n" // RDTSC - "jz 7f\n" - "mov %%ecx, 0x30(%%esp)\n" // %ecx at time of segmentation fault - "7:mov %%eax, 0x34(%%esp)\n" // %eax at time of segmentation fault + "jz 3b\n" + "mov %%ecx, 0xCC(%%esp)\n" // %ecx at time of segmentation fault "jmp 3b\n" // If the instruction is INT 0, then this was probably the result // of playground::Library being unable to find a way to safely // rewrite the system call instruction. Retrieve the CPU register // at the time of the segmentation fault and invoke syscallWrapper(). 
- "8:cmpw $0x00CD, (%%ebp)\n" // INT $0x0 - "jnz 16f\n" + "9:cmpw $0x00CD, (%%ebp)\n" // INT $0x0 + "jnz 20f\n" #ifndef NDEBUG "lea 200f, %%eax\n" "push %%eax\n" "call playground$debugMessage\n" "add $0x4, %%esp\n" #endif - "mov 0x34(%%esp), %%eax\n" // %eax at time of segmentation fault - "mov 0x28(%%esp), %%ebx\n" // %ebx at time of segmentation fault - "mov 0x30(%%esp), %%ecx\n" // %ecx at time of segmentation fault - "mov 0x2C(%%esp), %%edx\n" // %edx at time of segmentation fault - "mov 0x1C(%%esp), %%esi\n" // %esi at time of segmentation fault - "mov 0x18(%%esp), %%edi\n" // %edi at time of segmentation fault - "mov 0x20(%%esp), %%ebp\n" // %ebp at time of segmentation fault + "mov 0xD0(%%esp), %%eax\n" // %eax at time of segmentation fault + "mov 0xC4(%%esp), %%ebx\n" // %ebx at time of segmentation fault + "mov 0xCC(%%esp), %%ecx\n" // %ecx at time of segmentation fault + "mov 0xC8(%%esp), %%edx\n" // %edx at time of segmentation fault + "mov 0xB8(%%esp), %%esi\n" // %esi at time of segmentation fault + "mov 0xB4(%%esp), %%edi\n" // %edi at time of segmentation fault + "mov 0xB2(%%esp), %%ebp\n" // %ebp at time of segmentation fault // Handle sigprocmask() and rt_sigprocmask() "cmp $175, %%eax\n" // NR_rt_sigprocmask - "jnz 9f\n" + "jnz 10f\n" "mov $-22, %%eax\n" // -EINVAL "cmp $8, %%esi\n" // %esi = sigsetsize (8 bytes = 64 signals) - "jl 7b\n" - "jmp 10f\n" - "9:cmp $126, %%eax\n" // NR_sigprocmask - "jnz 14f\n" + "jl 3b\n" + "jmp 11f\n" + "10:cmp $126, %%eax\n" // NR_sigprocmask + "jnz 15f\n" "mov $-22, %%eax\n" - "10:mov 0x58(%%esp), %%edi\n" // signal mask at time of segmentation fault - "mov 0x5C(%%esp), %%ebp\n" + "11:mov 0xFC(%%esp), %%edi\n" // signal mask at time of segmentation fault + "mov 0x100(%%esp), %%ebp\n" "test %%ecx, %%ecx\n" // only set mask, if set is non-NULL - "jz 13f\n" + "jz 14f\n" "mov 0(%%ecx), %%esi\n" "mov 4(%%ecx), %%ecx\n" "cmp $0, %%ebx\n" // %ebx = how (SIG_BLOCK) - "jnz 11f\n" - "or %%esi, 0x58(%%esp)\n" // signal mask 
at time of segmentation fault - "or %%ecx, 0x5C(%%esp)\n" - "jmp 13f\n" - "11:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK) "jnz 12f\n" + "or %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault + "or %%ecx, 0x100(%%esp)\n" + "jmp 14f\n" + "12:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK) + "jnz 13f\n" "xor $-1, %%esi\n" "xor $-1, %%ecx\n" - "and %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault - "and %%ecx, 0x5C(%%esp)\n" - "jmp 13f\n" - "12:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK) - "jnz 7b\n" - "mov %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault - "mov %%ecx, 0x5C(%%esp)\n" - "13:xor %%eax, %%eax\n" + "and %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault + "and %%ecx, 0x100(%%esp)\n" + "jmp 14f\n" + "13:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK) + "jnz 3b\n" + "mov %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault + "mov %%ecx, 0x100(%%esp)\n" + "14:xor %%eax, %%eax\n" "test %%edx, %%edx\n" // only return old mask, if set is non-NULL - "jz 7b\n" + "jz 3b\n" "mov %%edi, 0(%%edx)\n" // old_set "mov %%ebp, 4(%%edx)\n" - "jmp 7b\n" + "jmp 3b\n" - // Copy signal frame onto new stack. See clone.cc for details - "14:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000 - "jnz 15f\n" - "mov 0x24(%%esp), %%ecx\n" // %esp at time of segmentation fault - "sub %%esp, %%ecx\n" // %ecx = size of stack frame - "sub $8, %%ecx\n" // skip return address and dummy - "mov %%ecx, %%eax\n" // return size of signal stack frame + // Handle sigreturn() and rt_sigreturn() + // See syscall.cc for a discussion on how we can emulate rt_sigreturn() + // by calling sigreturn() with a suitably adjusted stack. 
+ "15:cmp $119, %%eax\n" // NR_sigreturn + "jnz 17f\n" + "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault + "16:int $0x80\n" // sigreturn() is unrestricted + "17:cmp $173, %%eax\n" // NR_rt_sigreturn + "jnz 18f\n" + "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault + "sub $4, %%esp\n" // add fake return address + "jmp 4b\n" + + // Copy signal frame onto new stack. In the process, we have to convert + // it from an RT signal frame to a legacy signal frame. + // See clone.cc for details + "18:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000 + "jnz 19f\n" + "mov 0xC0(%%esp), %%ecx\n" // %esp at time of segmentation fault + "sub %%esp, %%ecx\n" // %ecx = size of RT stack frame + "mov %%ecx, %%eax\n" + "add $0x1C8, %%eax\n" // adjust for size of legacy stack frame + "sub $0x100, %%ecx\n" "mov 0(%%edx), %%edi\n" // stack for newly clone()'d thread "sub %%ecx, %%edi\n" // copy onto new stack - "mov %%edi, 0(%%edx)\n" // allocate space on new stack - "lea 8(%%esp), %%esi\n" // copy from current stack + "lea 0x100(%%esp), %%esi\n" "cld\n" - "rep movsb\n" - "jmp 7b\n" + "rep movsb\n" // copy parts of RT stack(sigmask, FP state) + "mov 0xF0(%%esp), %%ebx\n" // adjust pointer to fpstate + "sub %%esi, %%ebx\n" + "add %%edi, %%ebx\n" + "sub %%eax, %%edi\n" + "mov %%edi, 0(%%edx)\n" // allocate space on new stack + "lea 0xA4(%%esp), %%esi\n" // copy sigcontext from current stack + "mov $0x16, %%ecx\n" + "rep movsl\n" + "mov %%ebx, -0xC(%%edi)\n" // set pointer to fpstate + "mov 0xFC(%%esp), %%ebx\n" // copy first half of signal mask + "mov %%ebx, -0x8(%%edi)\n" + "mov %%eax, -0x2C(%%edi)\n" // return size of stack frame in %%eax + "addl $2, -0x20(%%edi)\n" // adjust %eip + "mov 0(%%edx), %%esp\n" + "mov $119, %%eax\n" // NR_sigreturn + "int $0x80\n" // Forward system call to syscallWrapper() - "15:call playground$syscallWrapper\n" - "jmp 7b\n" + "19:call playground$syscallWrapper\n" + "jmp 3b\n" // This was a genuine segmentation fault. 
Trigger the kernel's default // signal disposition. The only way we can do this from seccomp mode // is by blocking the signal and retriggering it. - "16:mov $2, %%ebx\n" // stderr + "20:mov $2, %%ebx\n" // stderr "lea 300f, %%ecx\n" // "Segmentation fault\n" "mov $301f-300f, %%edx\n" "mov $4, %%eax\n" // NR_write "int $0x80\n" - "orb $4, 0x59(%%esp)\n" // signal mask at time of segmentation fault - "ret\n" + "orb $4, 0xFD(%%esp)\n" // signal mask at time of segmentation fault + "jmp 4b\n" #else #error Unsupported target platform #endif diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h index 38a1803..9c49ffc 100644 --- a/sandbox/linux/seccomp/sandbox_impl.h +++ b/sandbox/linux/seccomp/sandbox_impl.h @@ -598,7 +598,7 @@ class Sandbox { // The SEGV handler knows how to handle RDTSC instructions static void setupSignalHandlers(); - static void (*segv())(int signo); + static void (*segv())(int signo, SysCalls::siginfo *context, void *unused); // If no specific handler has been registered for a system call, call this // function which asks the trusted thread to perform the call. This is used diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc index 76e96e4..681fec9 100644 --- a/sandbox/linux/seccomp/syscall.cc +++ b/sandbox/linux/seccomp/syscall.cc @@ -46,8 +46,17 @@ asm( ".globl playground$syscallWrapper\n" ".type playground$syscallWrapper, @function\n" #if defined(__x86_64__) + // Check for rt_sigreturn(). It needs to be handled specially. 
+ "cmp $15, %rax\n" // NR_rt_sigreturn + "jnz 1f\n" + "add $0x90, %rsp\n" // pop return addresses and red zone + "0:syscall\n" // rt_sigreturn() is unrestricted + "mov $66, %edi\n" // rt_sigreturn() should never return + "mov $231, %eax\n" // NR_exit_group + "jmp 0b\n" + // Save all registers - "push %rbp\n" + "1:push %rbp\n" "mov %rsp, %rbp\n" "push %rbx\n" "push %rcx\n" @@ -70,7 +79,7 @@ asm( // Check range of system call "cmp playground$maxSyscall(%rip), %eax\n" - "ja 1f\n" + "ja 3f\n" // Retrieve function call from system call table (c.f. syscall_table.c). // We have three different types of entries; zero for denied system calls, @@ -86,9 +95,9 @@ asm( // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise // jump to fallback handler. "cmp $1, %r10\n" - "jbe 1f\n" + "jbe 3f\n" "call *%r10\n" - "0:" + "2:" // Restore CPU registers, except for %rax which was set by the system call. "pop %r15\n" @@ -113,7 +122,7 @@ asm( // Return to caller "ret\n" - "1:" + "3:" // If we end up calling a specific handler, we don't need to know the // system call number. However, in the generic case, we do. Shift // registers so that the system call number becomes visible as the @@ -129,10 +138,55 @@ asm( // Call default handler. "call playground$defaultSystemCallHandler\n" "pop %r9\n" - "jmp 0b\n" + "jmp 2b\n" #elif defined(__i386__) + "cmp $119, %eax\n" // NR_sigreturn + "jnz 1f\n" + "add $0x4, %esp\n" // pop return address + "0:int $0x80\n" // sigreturn() is unrestricted + "mov $66, %ebx\n" // sigreturn() should never return + "mov %ebx, %eax\n" // NR_exit + "jmp 0b\n" + "1:cmp $173, %eax\n" // NR_rt_sigreturn + "jnz 3f\n" + + // Convert rt_sigframe into sigframe, allowing us to call sigreturn(). + // This is possible since the first part of signal stack frames have + // stayed very stable since the earliest kernel versions. 
While never + // officially documented, lots of user space applications rely on this + // part of the ABI, and kernel developers have been careful to maintain + // backwards compatibility. + // In general, the rt_sigframe includes a lot of extra information that + // the signal handler can look at. Most notably, this means a complete + // siginfo record. + // Fortunately though, the kernel doesn't look at any of this extra data + // when returning from a signal handler. So, we can safely convert an + // rt_sigframe to a legacy sigframe, discarding the extra data in the + // process. Interestingly, the legacy signal frame is actually larger than + // the rt signal frame, as it includes a lot more padding. + "sub $0x1C8, %esp\n" // a legacy signal stack is much larger + "mov 0x1CC(%esp), %eax\n" // push signal number + "push %eax\n" + "lea 0x270(%esp), %esi\n" // copy siginfo register values + "lea 0x4(%esp), %edi\n" // into new location + "mov $0x16, %ecx\n" + "cld\n" + "rep movsl\n" + "mov 0x2C8(%esp), %ebx\n" // copy first half of signal mask + "mov %ebx, 0x54(%esp)\n" + "lea 2f, %esi\n" + "push %esi\n" // push restorer function + "lea 0x2D4(%esp), %edi\n" // patch up retcode magic numbers + "movb $2, %cl\n" + "rep movsl\n" + "ret\n" // return to restorer function + "2:pop %eax\n" // remove dummy argument (signo) + "mov $119, %eax\n" // NR_sigreturn + "int $0x80\n" + + // Preserve all registers - "push %ebx\n" + "3:push %ebx\n" "push %ecx\n" "push %edx\n" "push %esi\n" @@ -150,7 +204,7 @@ asm( // Check range of system call "cmp playground$maxSyscall, %eax\n" - "ja 5f\n" + "ja 9f\n" // We often have long sequences of calls to gettimeofday(). This is // needlessly expensive. Coalesce them into a single call. @@ -164,9 +218,9 @@ asm( // or maybe, if we have recently seen requests to compute // the time. There might be a repeated pattern of those. 
"cmp $78, %eax\n" // __NR_gettimeofday - "jnz 2f\n" + "jnz 6f\n" "cmp %eax, %fs:0x102C-0x58\n" // last system call - "jnz 0f\n" + "jnz 4f\n" // This system call and the last system call prior to this one both are // calls to gettimeofday(). Try to avoid making the new call and just @@ -174,7 +228,7 @@ asm( // Just in case the caller is spinning on the result from gettimeofday(), // every so often, call the actual system call. "decl %fs:0x1030-0x58\n" // countdown calls to gettimofday() - "jz 0f\n" + "jz 4f\n" // Atomically read the 64bit word representing last-known timestamp and // return it to the caller. On x86-32 this is a little more complicated and @@ -186,11 +240,11 @@ asm( "mov %edx, 4(%ebx)\n" "xor %eax, %eax\n" "add $28, %esp\n" - "jmp 4f\n" + "jmp 8f\n" // This is a call to gettimeofday(), but we don't have a valid cached // result, yet. - "0:mov %eax, %fs:0x102C-0x58\n" // remember syscall number + "4:mov %eax, %fs:0x102C-0x58\n" // remember syscall number "movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations "call playground$defaultSystemCallHandler\n" @@ -201,17 +255,17 @@ asm( "mov 0(%ebx), %ebx\n" "mov 100f, %eax\n" "mov 101f, %edx\n" - "1:lock; cmpxchg8b 100f\n" - "jnz 1b\n" + "5:lock; cmpxchg8b 100f\n" + "jnz 5b\n" "xor %eax, %eax\n" - "jmp 6f\n" + "jmp 10f\n" // Remember the number of the last system call made. We deliberately do // not remember calls to gettid(), as we have often seen long sequences // of calls to just gettimeofday() and gettid(). In that situation, we // would still like to coalesce the gettimeofday() calls. - "2:cmp $224, %eax\n" // __NR_gettid - "jz 3f\n" + "6:cmp $224, %eax\n" // __NR_gettid + "jz 7f\n" "mov %eax, %fs:0x102C-0x58\n" // remember syscall number // Retrieve function call from system call table (c.f. syscall_table.c). 
@@ -219,7 +273,7 @@ asm( // that should be handled by the defaultSystemCallHandler(); minus one // for unrestricted system calls that need to be forwarded to the trusted // thread; and function pointers to specific handler functions. - "3:shl $3, %eax\n" + "7:shl $3, %eax\n" "lea playground$syscallTable, %ebx\n" "add %ebx, %eax\n" "mov 0(%eax), %eax\n" @@ -227,13 +281,13 @@ asm( // Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise // jump to fallback handler. "cmp $1, %eax\n" - "jbe 5f\n" + "jbe 9f\n" "add $4, %esp\n" "call *%eax\n" "add $24, %esp\n" // Restore CPU registers, except for %eax which was set by the system call. - "4:pop %ebp\n" + "8:pop %ebp\n" "pop %edi\n" "pop %esi\n" "pop %edx\n" @@ -244,9 +298,9 @@ asm( "ret\n" // Call default handler. - "5:call playground$defaultSystemCallHandler\n" - "6:add $28, %esp\n" - "jmp 4b\n" + "9:call playground$defaultSystemCallHandler\n" + "10:add $28, %esp\n" + "jmp 8b\n" ".pushsection \".bss\"\n" ".balign 8\n" @@ -267,9 +321,9 @@ void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1, void* arg5) { // TODO(markus): The following comment is currently not true, we do intercept these system calls. Try to fix that. - // We try to avoid intercepting read(), write(), and sigreturn(), as - // these system calls are not restricted in Seccomp mode. But depending on - // the exact instruction sequence in libc, we might not be able to reliably + // We try to avoid intercepting read(), and write(), as these system calls + // are not restricted in Seccomp mode. But depending on the exact + // instruction sequence in libc, we might not be able to reliably // filter out these system calls at the time when we instrument the code. 
SysCalls sys; long rc; @@ -283,10 +337,6 @@ void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1, Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call"); rc = sys.write((long)arg0, arg1, (size_t)arg2); break; - case __NR_rt_sigreturn: - Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call"); - rc = sys.rt_sigreturn((unsigned long)arg0); - break; default: if (Debug::isEnabled()) { // In debug mode, prevent stderr from being closed |