summaryrefslogtreecommitdiffstats
path: root/sandbox/linux
diff options
context:
space:
mode:
Diffstat (limited to 'sandbox/linux')
-rw-r--r-- sandbox/linux/seccomp/ioctl.cc 7
-rw-r--r-- sandbox/linux/seccomp/library.cc 26
-rw-r--r-- sandbox/linux/seccomp/sandbox.cc 219
-rw-r--r-- sandbox/linux/seccomp/sandbox_impl.h 2
-rw-r--r-- sandbox/linux/seccomp/syscall.cc 112
5 files changed, 239 insertions, 127 deletions
diff --git a/sandbox/linux/seccomp/ioctl.cc b/sandbox/linux/seccomp/ioctl.cc
index 07f1aa3..4d2b3c5c5 100644
--- a/sandbox/linux/seccomp/ioctl.cc
+++ b/sandbox/linux/seccomp/ioctl.cc
@@ -48,8 +48,11 @@ bool Sandbox::process_ioctl(int parentMapsFd, int sandboxFd, int threadFdPub,
ioctl_req.d, ioctl_req.req, ioctl_req.arg);
return true;
default:
- std::cerr << "Unsupported ioctl: 0x" << std::hex << ioctl_req.req <<
- std::endl;
+ if (Debug::isEnabled()) {
+ char buf[80];
+ sprintf(buf, "Unsupported ioctl: 0x%04X\n", ioctl_req.req);
+ Debug::message(buf);
+ }
SecureMem::abandonSystemCall(threadFd, rc);
return false;
}
diff --git a/sandbox/linux/seccomp/library.cc b/sandbox/linux/seccomp/library.cc
index e882ba4..68ff974 100644
--- a/sandbox/linux/seccomp/library.cc
+++ b/sandbox/linux/seccomp/library.cc
@@ -835,31 +835,29 @@ void Library::patchVDSO(char** extraSpace, int* extraLength){
//
// 58 POP %eax
// B8 77 00 00 00 MOV $0x77, %eax
- // E9 .. .. .. .. JMP syscallWrapper
+ // E8 .. .. .. .. CALL syscallWrapper
char* dest = getScratchSpace(maps_, __kernel_sigreturn, 11, extraSpace,
extraLength);
- memcpy(dest, "\x58\xB8\x77\x00\x00\x00\xE9", 7);
- *reinterpret_cast<char *>(dest + 7) =
- reinterpret_cast<char *>(&syscallWrapper) -
- reinterpret_cast<char *>(dest + 11);
+ memcpy(dest, "\x58\xB8\x77\x00\x00\x00\xE8", 7); // POP %eax; MOV $0x77, %eax; CALL opcode
+ *reinterpret_cast<long *>(dest + 7) =
+ reinterpret_cast<char *>(&syscallWrapper) - dest - 11; // displacement measured from the end of the 11-byte stub
*__kernel_sigreturn = '\xE9';
- *reinterpret_cast<char *>(__kernel_sigreturn + 1) =
- dest - reinterpret_cast<char *>(__kernel_sigreturn + 5);
+ *reinterpret_cast<long *>(__kernel_sigreturn + 1) =
+ dest - reinterpret_cast<char *>(__kernel_sigreturn) - 5;
}
if (__kernel_rt_sigreturn) {
// Replace the rt_sigreturn() system call with a jump to code that does:
//
// B8 AD 00 00 00 MOV $0xAD, %eax
- // E9 .. .. .. .. JMP syscallWrapper
+ // E8 .. .. .. .. CALL syscallWrapper
char* dest = getScratchSpace(maps_, __kernel_rt_sigreturn, 10, extraSpace,
extraLength);
- memcpy(dest, "\xB8\xAD\x00\x00\x00\xE9", 6);
- *reinterpret_cast<char *>(dest + 6) =
- reinterpret_cast<char *>(&syscallWrapper) -
- reinterpret_cast<char *>(dest + 10);
+ memcpy(dest, "\xB8\xAD\x00\x00\x00\xE8", 6);
+ *reinterpret_cast<long *>(dest + 6) =
+ reinterpret_cast<char *>(&syscallWrapper) - dest - 10;
*__kernel_rt_sigreturn = '\xE9';
- *reinterpret_cast<char *>(__kernel_rt_sigreturn + 1) =
- dest - reinterpret_cast<char *>(__kernel_rt_sigreturn + 5);
+ *reinterpret_cast<long *>(__kernel_rt_sigreturn + 1) =
+ dest - reinterpret_cast<char *>(__kernel_rt_sigreturn) - 5;
}
#endif
}
diff --git a/sandbox/linux/seccomp/sandbox.cc b/sandbox/linux/seccomp/sandbox.cc
index b7a249e..93ce12e 100644
--- a/sandbox/linux/seccomp/sandbox.cc
+++ b/sandbox/linux/seccomp/sandbox.cc
@@ -121,7 +121,8 @@ void Sandbox::setupSignalHandlers() {
// Set up SEGV handler for dealing with RDTSC instructions, system calls
// that have been rewritten to use INT0, and for sigpending() emulation.
- sa.sa_handler_ = segv();
+ sa.sa_sigaction_ = segv();
+ sa.sa_flags = SA_SIGINFO;
sys.sigaction(SIGSEGV, &sa, NULL);
// Unblock SIGSEGV and SIGCHLD
@@ -131,8 +132,8 @@ void Sandbox::setupSignalHandlers() {
sys.sigprocmask(SIG_UNBLOCK, &mask, 0);
}
-void (*Sandbox::segv())(int signo) {
- void (*fnc)(int signo);
+void (*Sandbox::segv())(int signo, SysCalls::siginfo *context, void *unused) {
+ void (*fnc)(int signo, SysCalls::siginfo *context, void *unused);
asm volatile(
"call 999f\n"
#if defined(__x86_64__)
@@ -198,7 +199,7 @@ void (*Sandbox::segv())(int signo) {
// rewrite the system call instruction. Retrieve the CPU register
// at the time of the segmentation fault and invoke syscallWrapper().
"8:cmpw $0x00CD, (%%r15)\n" // INT $0x0
- "jnz 14f\n"
+ "jnz 16f\n"
#ifndef NDEBUG
"lea 200f(%%rip), %%rdi\n"
"call playground$debugMessage\n"
@@ -239,10 +240,18 @@ void (*Sandbox::segv())(int signo) {
"mov %%r10, 0(%%rdx)\n" // old_set
"jmp 7b\n"
+ // Handle rt_sigreturn()
+ "12:cmp $15, %%rax\n" // NR_rt_sigreturn
+ "jnz 14f\n"
+ "mov 0xA8(%%rsp), %%rsp\n" // %rsp at time of segmentation fault
+ "13:syscall\n" // rt_sigreturn() is unrestricted
+ "mov $66, %%edi\n" // rt_sigreturn() should never return
+ "mov $231, %%eax\n" // NR_exit_group
+ "jmp 13b\n"
// Copy signal frame onto new stack. See clone.cc for details
- "12:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000
- "jnz 13f\n"
+ "14:cmp $56+0xF000, %%rax\n" // NR_clone + 0xF000
+ "jnz 15f\n"
"mov 0xA8(%%rsp), %%rcx\n" // %rsp at time of segmentation fault
"sub %%rsp, %%rcx\n" // %rcx = size of stack frame
"sub $8, %%rcx\n" // skip return address
@@ -256,7 +265,7 @@ void (*Sandbox::segv())(int signo) {
"jmp 7b\n"
// Forward system call to syscallWrapper()
- "13:lea 7b(%%rip), %%rcx\n"
+ "15:lea 7b(%%rip), %%rcx\n"
"push %%rcx\n"
"push 0xB8(%%rsp)\n" // %rip at time of segmentation fault
"lea playground$syscallWrapper(%%rip), %%rcx\n"
@@ -265,7 +274,7 @@ void (*Sandbox::segv())(int signo) {
// This was a genuine segmentation fault. Trigger the kernel's default
// signal disposition. The only way we can do this from seccomp mode
// is by blocking the signal and retriggering it.
- "14:mov $2, %%edi\n" // stderr
+ "16:mov $2, %%edi\n" // stderr
"lea 300f(%%rip), %%rsi\n" // "Segmentation fault\n"
"mov $301f-300f, %%edx\n"
"mov $1, %%eax\n" // NR_write
@@ -277,13 +286,13 @@ void (*Sandbox::segv())(int signo) {
// happened. If it is RDTSC, forward the request to the trusted
// thread.
"mov $-3, %%ebx\n" // request for RDTSC
- "mov 0x40(%%esp), %%ebp\n" // %eip at time of segmentation fault
+ "mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault
"cmpw $0x310F, (%%ebp)\n" // RDTSC
"jz 0f\n"
- "cmpw $0x010F, (%%ebp)\n"
- "jnz 8f\n"
+ "cmpw $0x010F, (%%ebp)\n" // RDTSCP
+ "jnz 9f\n"
"cmpb $0xF9, 2(%%ebp)\n"
- "jnz 8f\n"
+ "jnz 9f\n"
"mov $-4, %%ebx\n" // request for RDTSCP
"0:"
#ifndef NDEBUG
@@ -292,7 +301,7 @@ void (*Sandbox::segv())(int signo) {
"call playground$debugMessage\n"
"sub $4, %%esp\n"
#else
- "sub $8, %%esp\n"
+ "sub $8, %%esp\n" // allocate buffer for receiving timestamp
#endif
"push %%ebx\n"
"mov %%fs:16, %%ebx\n" // fd = threadFdPub
@@ -301,126 +310,178 @@ void (*Sandbox::segv())(int signo) {
"1:mov %%edx, %%eax\n" // NR_write
"int $0x80\n"
"cmp %%eax, %%edx\n"
- "jz 5f\n"
+ "jz 7f\n"
"cmp $-4, %%eax\n" // EINTR
"jz 1b\n"
- "2:add $12, %%esp\n"
- "movl $0, 0x34(%%esp)\n" // %eax at time of segmentation fault
- "movl $0, 0x2C(%%esp)\n" // %edx at time of segmentation fault
+ "2:add $12, %%esp\n" // remove temporary buffer from stack
+ "xor %%eax, %%eax\n"
+ "movl $0, 0xC8(%%esp)\n" // %edx at time of segmentation fault
"cmpw $0x310F, (%%ebp)\n" // RDTSC
"jz 3f\n"
- "movl $0, 0x30(%%esp)\n" // %ecx at time of segmentation fault
- "3:addl $2, 0x40(%%esp)\n" // %eip at time of segmentation fault
- "mov 0x40(%%esp), %%ebp\n" // %eip at time of segmentation fault
- "cmpw $0x010F, (%%ebp)\n" // RDTSC
- "jnz 4f\n"
- "addl $1, 0x40(%%esp)\n" // %eip at time of segmentation fault
- "4:ret\n"
- "5:mov $12, %%edx\n" // len = 3*sizeof(int)
- "6:mov $3, %%eax\n" // NR_read
+ "movl $0, 0xCC(%%esp)\n" // %ecx at time of segmentation fault
+ "3:mov %%eax, 0xD0(%%esp)\n" // %eax at time of segmentation fault
+ "4:mov 0xDC(%%esp), %%ebp\n" // %eip at time of segmentation fault
+ "addl $2, 0xDC(%%esp)\n" // %eip at time of segmentation fault
+ "cmpw $0x010F, (%%ebp)\n" // RDTSCP
+ "jnz 5f\n"
+ "addl $1, 0xDC(%%esp)\n" // %eip at time of segmentation fault
+ "5:sub $0x1C8, %%esp\n" // a legacy signal stack is much larger
+ "mov 0x1CC(%%esp), %%eax\n" // push signal number
+ "push %%eax\n"
+ "lea 0x270(%%esp), %%esi\n" // copy siginfo register values
+ "lea 0x4(%%esp), %%edi\n" // into new location
+ "mov $22, %%ecx\n"
+ "cld\n"
+ "rep movsl\n"
+ "mov 0x2C8(%%esp), %%ebx\n" // copy first half of signal mask
+ "mov %%ebx, 0x54(%%esp)\n"
+ "lea 6f, %%esi\n" // copy "magic" restorer function
+ "push %%esi\n" // push restorer function
+ "lea 0x2D4(%%esp), %%edi\n" // patch up retcode magic numbers
+ "movb $2, %%cl\n"
+ "rep movsl\n"
+ "ret\n" // return to restorer function
+
+ // The restorer function is sometimes used by gdb as a magic marker to
+ // recognize signal stack frames. Don't change any of the next three
+ // instructions.
+ "6:pop %%eax\n" // remove dummy argument (signo)
+ "mov $119, %%eax\n" // NR_sigreturn
+ "int $0x80\n"
+ "7:mov $12, %%edx\n" // len = 3*sizeof(int)
+ "8:mov $3, %%eax\n" // NR_read
"int $0x80\n"
"cmp $-4, %%eax\n" // EINTR
- "jz 6b\n"
+ "jz 8b\n"
"cmp %%eax, %%edx\n"
"jnz 2b\n"
"pop %%eax\n"
"pop %%edx\n"
"pop %%ecx\n"
- "mov %%edx, 0x2C(%%esp)\n" // %edx at time of segmentation fault
+ "mov %%edx, 0xC8(%%esp)\n" // %edx at time of segmentation fault
"cmpw $0x310F, (%%ebp)\n" // RDTSC
- "jz 7f\n"
- "mov %%ecx, 0x30(%%esp)\n" // %ecx at time of segmentation fault
- "7:mov %%eax, 0x34(%%esp)\n" // %eax at time of segmentation fault
+ "jz 3b\n"
+ "mov %%ecx, 0xCC(%%esp)\n" // %ecx at time of segmentation fault
"jmp 3b\n"
// If the instruction is INT 0, then this was probably the result
// of playground::Library being unable to find a way to safely
// rewrite the system call instruction. Retrieve the CPU register
// at the time of the segmentation fault and invoke syscallWrapper().
- "8:cmpw $0x00CD, (%%ebp)\n" // INT $0x0
- "jnz 16f\n"
+ "9:cmpw $0x00CD, (%%ebp)\n" // INT $0x0
+ "jnz 20f\n"
#ifndef NDEBUG
"lea 200f, %%eax\n"
"push %%eax\n"
"call playground$debugMessage\n"
"add $0x4, %%esp\n"
#endif
- "mov 0x34(%%esp), %%eax\n" // %eax at time of segmentation fault
- "mov 0x28(%%esp), %%ebx\n" // %ebx at time of segmentation fault
- "mov 0x30(%%esp), %%ecx\n" // %ecx at time of segmentation fault
- "mov 0x2C(%%esp), %%edx\n" // %edx at time of segmentation fault
- "mov 0x1C(%%esp), %%esi\n" // %esi at time of segmentation fault
- "mov 0x18(%%esp), %%edi\n" // %edi at time of segmentation fault
- "mov 0x20(%%esp), %%ebp\n" // %ebp at time of segmentation fault
+ "mov 0xD0(%%esp), %%eax\n" // %eax at time of segmentation fault
+ "mov 0xC4(%%esp), %%ebx\n" // %ebx at time of segmentation fault
+ "mov 0xCC(%%esp), %%ecx\n" // %ecx at time of segmentation fault
+ "mov 0xC8(%%esp), %%edx\n" // %edx at time of segmentation fault
+ "mov 0xB8(%%esp), %%esi\n" // %esi at time of segmentation fault
+ "mov 0xB4(%%esp), %%edi\n" // %edi at time of segmentation fault
+ "mov 0xBC(%%esp), %%ebp\n" // %ebp at time of segmentation fault (slot between %esi at 0xB8 and %esp at 0xC0)
// Handle sigprocmask() and rt_sigprocmask()
"cmp $175, %%eax\n" // NR_rt_sigprocmask
- "jnz 9f\n"
+ "jnz 10f\n"
"mov $-22, %%eax\n" // -EINVAL
"cmp $8, %%esi\n" // %esi = sigsetsize (8 bytes = 64 signals)
- "jl 7b\n"
- "jmp 10f\n"
- "9:cmp $126, %%eax\n" // NR_sigprocmask
- "jnz 14f\n"
+ "jl 3b\n"
+ "jmp 11f\n"
+ "10:cmp $126, %%eax\n" // NR_sigprocmask
+ "jnz 15f\n"
"mov $-22, %%eax\n"
- "10:mov 0x58(%%esp), %%edi\n" // signal mask at time of segmentation fault
- "mov 0x5C(%%esp), %%ebp\n"
+ "11:mov 0xFC(%%esp), %%edi\n" // signal mask at time of segmentation fault
+ "mov 0x100(%%esp), %%ebp\n"
"test %%ecx, %%ecx\n" // only set mask, if set is non-NULL
- "jz 13f\n"
+ "jz 14f\n"
"mov 0(%%ecx), %%esi\n"
"mov 4(%%ecx), %%ecx\n"
"cmp $0, %%ebx\n" // %ebx = how (SIG_BLOCK)
- "jnz 11f\n"
- "or %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault
- "or %%ecx, 0x5C(%%esp)\n"
- "jmp 13f\n"
- "11:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK)
"jnz 12f\n"
+ "or %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault
+ "or %%ecx, 0x100(%%esp)\n"
+ "jmp 14f\n"
+ "12:cmp $1, %%ebx\n" // %ebx = how (SIG_UNBLOCK)
+ "jnz 13f\n"
"xor $-1, %%esi\n"
"xor $-1, %%ecx\n"
- "and %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault
- "and %%ecx, 0x5C(%%esp)\n"
- "jmp 13f\n"
- "12:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK)
- "jnz 7b\n"
- "mov %%esi, 0x58(%%esp)\n" // signal mask at time of segmentation fault
- "mov %%ecx, 0x5C(%%esp)\n"
- "13:xor %%eax, %%eax\n"
+ "and %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault
+ "and %%ecx, 0x100(%%esp)\n"
+ "jmp 14f\n"
+ "13:cmp $2, %%ebx\n" // %ebx = how (SIG_SETMASK)
+ "jnz 3b\n"
+ "mov %%esi, 0xFC(%%esp)\n" // signal mask at time of segmentation fault
+ "mov %%ecx, 0x100(%%esp)\n"
+ "14:xor %%eax, %%eax\n"
"test %%edx, %%edx\n" // only return old mask, if old_set is non-NULL
- "jz 7b\n"
+ "jz 3b\n"
"mov %%edi, 0(%%edx)\n" // old_set
"mov %%ebp, 4(%%edx)\n"
- "jmp 7b\n"
+ "jmp 3b\n"
- // Copy signal frame onto new stack. See clone.cc for details
- "14:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000
- "jnz 15f\n"
- "mov 0x24(%%esp), %%ecx\n" // %esp at time of segmentation fault
- "sub %%esp, %%ecx\n" // %ecx = size of stack frame
- "sub $8, %%ecx\n" // skip return address and dummy
- "mov %%ecx, %%eax\n" // return size of signal stack frame
+ // Handle sigreturn() and rt_sigreturn()
+ // See syscall.cc for a discussion on how we can emulate rt_sigreturn()
+ // by calling sigreturn() with a suitably adjusted stack.
+ "15:cmp $119, %%eax\n" // NR_sigreturn
+ "jnz 17f\n"
+ "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault
+ "16:int $0x80\n" // sigreturn() is unrestricted
+ "17:cmp $173, %%eax\n" // NR_rt_sigreturn
+ "jnz 18f\n"
+ "mov 0xC0(%%esp), %%esp\n" // %esp at time of segmentation fault
+ "sub $4, %%esp\n" // add fake return address
+ "jmp 4b\n"
+
+ // Copy signal frame onto new stack. In the process, we have to convert
+ // it from an RT signal frame to a legacy signal frame.
+ // See clone.cc for details
+ "18:cmp $120+0xF000, %%eax\n" // NR_clone + 0xF000
+ "jnz 19f\n"
+ "mov 0xC0(%%esp), %%ecx\n" // %esp at time of segmentation fault
+ "sub %%esp, %%ecx\n" // %ecx = size of RT stack frame
+ "mov %%ecx, %%eax\n"
+ "add $0x1C8, %%eax\n" // adjust for size of legacy stack frame
+ "sub $0x100, %%ecx\n"
"mov 0(%%edx), %%edi\n" // stack for newly clone()'d thread
"sub %%ecx, %%edi\n" // copy onto new stack
- "mov %%edi, 0(%%edx)\n" // allocate space on new stack
- "lea 8(%%esp), %%esi\n" // copy from current stack
+ "lea 0x100(%%esp), %%esi\n"
"cld\n"
- "rep movsb\n"
- "jmp 7b\n"
+ "rep movsb\n" // copy parts of RT stack(sigmask, FP state)
+ "mov 0xF0(%%esp), %%ebx\n" // adjust pointer to fpstate
+ "sub %%esi, %%ebx\n"
+ "add %%edi, %%ebx\n"
+ "sub %%eax, %%edi\n"
+ "mov %%edi, 0(%%edx)\n" // allocate space on new stack
+ "lea 0xA4(%%esp), %%esi\n" // copy sigcontext from current stack
+ "mov $0x16, %%ecx\n"
+ "rep movsl\n"
+ "mov %%ebx, -0xC(%%edi)\n" // set pointer to fpstate
+ "mov 0xFC(%%esp), %%ebx\n" // copy first half of signal mask
+ "mov %%ebx, -0x8(%%edi)\n"
+ "mov %%eax, -0x2C(%%edi)\n" // return size of stack frame in %%eax
+ "addl $2, -0x20(%%edi)\n" // adjust %eip
+ "mov 0(%%edx), %%esp\n"
+ "mov $119, %%eax\n" // NR_sigreturn
+ "int $0x80\n"
// Forward system call to syscallWrapper()
- "15:call playground$syscallWrapper\n"
- "jmp 7b\n"
+ "19:call playground$syscallWrapper\n"
+ "jmp 3b\n"
// This was a genuine segmentation fault. Trigger the kernel's default
// signal disposition. The only way we can do this from seccomp mode
// is by blocking the signal and retriggering it.
- "16:mov $2, %%ebx\n" // stderr
+ "20:mov $2, %%ebx\n" // stderr
"lea 300f, %%ecx\n" // "Segmentation fault\n"
"mov $301f-300f, %%edx\n"
"mov $4, %%eax\n" // NR_write
"int $0x80\n"
- "orb $4, 0x59(%%esp)\n" // signal mask at time of segmentation fault
- "ret\n"
+ "orb $4, 0xFD(%%esp)\n" // signal mask at time of segmentation fault
+ "jmp 4b\n"
#else
#error Unsupported target platform
#endif
diff --git a/sandbox/linux/seccomp/sandbox_impl.h b/sandbox/linux/seccomp/sandbox_impl.h
index 38a1803..9c49ffc 100644
--- a/sandbox/linux/seccomp/sandbox_impl.h
+++ b/sandbox/linux/seccomp/sandbox_impl.h
@@ -598,7 +598,7 @@ class Sandbox {
// The SEGV handler knows how to handle RDTSC instructions
static void setupSignalHandlers();
- static void (*segv())(int signo);
+ static void (*segv())(int signo, SysCalls::siginfo *context, void *unused);
// If no specific handler has been registered for a system call, call this
// function which asks the trusted thread to perform the call. This is used
diff --git a/sandbox/linux/seccomp/syscall.cc b/sandbox/linux/seccomp/syscall.cc
index 76e96e4..681fec9 100644
--- a/sandbox/linux/seccomp/syscall.cc
+++ b/sandbox/linux/seccomp/syscall.cc
@@ -46,8 +46,17 @@ asm(
".globl playground$syscallWrapper\n"
".type playground$syscallWrapper, @function\n"
#if defined(__x86_64__)
+ // Check for rt_sigreturn(). It needs to be handled specially.
+ "cmp $15, %rax\n" // NR_rt_sigreturn
+ "jnz 1f\n"
+ "add $0x90, %rsp\n" // pop return addresses and red zone
+ "0:syscall\n" // rt_sigreturn() is unrestricted
+ "mov $66, %edi\n" // rt_sigreturn() should never return
+ "mov $231, %eax\n" // NR_exit_group
+ "jmp 0b\n"
+
// Save all registers
- "push %rbp\n"
+ "1:push %rbp\n"
"mov %rsp, %rbp\n"
"push %rbx\n"
"push %rcx\n"
@@ -70,7 +79,7 @@ asm(
// Check range of system call
"cmp playground$maxSyscall(%rip), %eax\n"
- "ja 1f\n"
+ "ja 3f\n"
// Retrieve function call from system call table (c.f. syscall_table.c).
// We have three different types of entries; zero for denied system calls,
@@ -86,9 +95,9 @@ asm(
// Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
// jump to fallback handler.
"cmp $1, %r10\n"
- "jbe 1f\n"
+ "jbe 3f\n"
"call *%r10\n"
- "0:"
+ "2:"
// Restore CPU registers, except for %rax which was set by the system call.
"pop %r15\n"
@@ -113,7 +122,7 @@ asm(
// Return to caller
"ret\n"
- "1:"
+ "3:"
// If we end up calling a specific handler, we don't need to know the
// system call number. However, in the generic case, we do. Shift
// registers so that the system call number becomes visible as the
@@ -129,10 +138,55 @@ asm(
// Call default handler.
"call playground$defaultSystemCallHandler\n"
"pop %r9\n"
- "jmp 0b\n"
+ "jmp 2b\n"
#elif defined(__i386__)
+ "cmp $119, %eax\n" // NR_sigreturn
+ "jnz 1f\n"
+ "add $0x4, %esp\n" // pop return address
+ "0:int $0x80\n" // sigreturn() is unrestricted
+ "mov $66, %ebx\n" // sigreturn() should never return; if it does, exit with status 66
+ "mov $1, %eax\n" // NR_exit (the syscall number, not the exit status held in %ebx)
+ "jmp 0b\n"
+ "1:cmp $173, %eax\n" // NR_rt_sigreturn
+ "jnz 3f\n"
+
+ // Convert rt_sigframe into sigframe, allowing us to call sigreturn().
+ // This is possible since the first part of signal stack frames have
+ // stayed very stable since the earliest kernel versions. While never
+ // officially documented, lots of user space applications rely on this
+ // part of the ABI, and kernel developers have been careful to maintain
+ // backwards compatibility.
+ // In general, the rt_sigframe includes a lot of extra information that
+ // the signal handler can look at. Most notably, this means a complete
+ // siginfo record.
+ // Fortunately though, the kernel doesn't look at any of this extra data
+ // when returning from a signal handler. So, we can safely convert an
+ // rt_sigframe to a legacy sigframe, discarding the extra data in the
+ // process. Interestingly, the legacy signal frame is actually larger than
+ // the rt signal frame, as it includes a lot more padding.
+ "sub $0x1C8, %esp\n" // a legacy signal stack is much larger
+ "mov 0x1CC(%esp), %eax\n" // push signal number
+ "push %eax\n"
+ "lea 0x270(%esp), %esi\n" // copy siginfo register values
+ "lea 0x4(%esp), %edi\n" // into new location
+ "mov $0x16, %ecx\n"
+ "cld\n"
+ "rep movsl\n"
+ "mov 0x2C8(%esp), %ebx\n" // copy first half of signal mask
+ "mov %ebx, 0x54(%esp)\n"
+ "lea 2f, %esi\n"
+ "push %esi\n" // push restorer function
+ "lea 0x2D4(%esp), %edi\n" // patch up retcode magic numbers
+ "movb $2, %cl\n"
+ "rep movsl\n"
+ "ret\n" // return to restorer function
+ "2:pop %eax\n" // remove dummy argument (signo)
+ "mov $119, %eax\n" // NR_sigreturn
+ "int $0x80\n"
+
+
// Preserve all registers
- "push %ebx\n"
+ "3:push %ebx\n"
"push %ecx\n"
"push %edx\n"
"push %esi\n"
@@ -150,7 +204,7 @@ asm(
// Check range of system call
"cmp playground$maxSyscall, %eax\n"
- "ja 5f\n"
+ "ja 9f\n"
// We often have long sequences of calls to gettimeofday(). This is
// needlessly expensive. Coalesce them into a single call.
@@ -164,9 +218,9 @@ asm(
// or maybe, if we have recently seen requests to compute
// the time. There might be a repeated pattern of those.
"cmp $78, %eax\n" // __NR_gettimeofday
- "jnz 2f\n"
+ "jnz 6f\n"
"cmp %eax, %fs:0x102C-0x58\n" // last system call
- "jnz 0f\n"
+ "jnz 4f\n"
// This system call and the last system call prior to this one both are
// calls to gettimeofday(). Try to avoid making the new call and just
@@ -174,7 +228,7 @@ asm(
// Just in case the caller is spinning on the result from gettimeofday(),
// every so often, call the actual system call.
"decl %fs:0x1030-0x58\n" // countdown calls to gettimeofday()
- "jz 0f\n"
+ "jz 4f\n"
// Atomically read the 64bit word representing last-known timestamp and
// return it to the caller. On x86-32 this is a little more complicated and
@@ -186,11 +240,11 @@ asm(
"mov %edx, 4(%ebx)\n"
"xor %eax, %eax\n"
"add $28, %esp\n"
- "jmp 4f\n"
+ "jmp 8f\n"
// This is a call to gettimeofday(), but we don't have a valid cached
// result, yet.
- "0:mov %eax, %fs:0x102C-0x58\n" // remember syscall number
+ "4:mov %eax, %fs:0x102C-0x58\n" // remember syscall number
"movl $500, %fs:0x1030-0x58\n" // make system call, each 500 invocations
"call playground$defaultSystemCallHandler\n"
@@ -201,17 +255,17 @@ asm(
"mov 0(%ebx), %ebx\n"
"mov 100f, %eax\n"
"mov 101f, %edx\n"
- "1:lock; cmpxchg8b 100f\n"
- "jnz 1b\n"
+ "5:lock; cmpxchg8b 100f\n"
+ "jnz 5b\n"
"xor %eax, %eax\n"
- "jmp 6f\n"
+ "jmp 10f\n"
// Remember the number of the last system call made. We deliberately do
// not remember calls to gettid(), as we have often seen long sequences
// of calls to just gettimeofday() and gettid(). In that situation, we
// would still like to coalesce the gettimeofday() calls.
- "2:cmp $224, %eax\n" // __NR_gettid
- "jz 3f\n"
+ "6:cmp $224, %eax\n" // __NR_gettid
+ "jz 7f\n"
"mov %eax, %fs:0x102C-0x58\n" // remember syscall number
// Retrieve function call from system call table (c.f. syscall_table.c).
@@ -219,7 +273,7 @@ asm(
// that should be handled by the defaultSystemCallHandler(); minus one
// for unrestricted system calls that need to be forwarded to the trusted
// thread; and function pointers to specific handler functions.
- "3:shl $3, %eax\n"
+ "7:shl $3, %eax\n"
"lea playground$syscallTable, %ebx\n"
"add %ebx, %eax\n"
"mov 0(%eax), %eax\n"
@@ -227,13 +281,13 @@ asm(
// Jump to function if non-null and not UNRESTRICTED_SYSCALL, otherwise
// jump to fallback handler.
"cmp $1, %eax\n"
- "jbe 5f\n"
+ "jbe 9f\n"
"add $4, %esp\n"
"call *%eax\n"
"add $24, %esp\n"
// Restore CPU registers, except for %eax which was set by the system call.
- "4:pop %ebp\n"
+ "8:pop %ebp\n"
"pop %edi\n"
"pop %esi\n"
"pop %edx\n"
@@ -244,9 +298,9 @@ asm(
"ret\n"
// Call default handler.
- "5:call playground$defaultSystemCallHandler\n"
- "6:add $28, %esp\n"
- "jmp 4b\n"
+ "9:call playground$defaultSystemCallHandler\n"
+ "10:add $28, %esp\n"
+ "jmp 8b\n"
".pushsection \".bss\"\n"
".balign 8\n"
@@ -267,9 +321,9 @@ void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,
void* arg5) {
// TODO(markus): The following comment is currently not true; we do intercept these system calls. Try to fix that.
- // We try to avoid intercepting read(), write(), and sigreturn(), as
- // these system calls are not restricted in Seccomp mode. But depending on
- // the exact instruction sequence in libc, we might not be able to reliably
+ // We try to avoid intercepting read(), and write(), as these system calls
+ // are not restricted in Seccomp mode. But depending on the exact
+ // instruction sequence in libc, we might not be able to reliably
// filter out these system calls at the time when we instrument the code.
SysCalls sys;
long rc;
@@ -283,10 +337,6 @@ void* Sandbox::defaultSystemCallHandler(int syscallNum, void* arg0, void* arg1,
Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
rc = sys.write((long)arg0, arg1, (size_t)arg2);
break;
- case __NR_rt_sigreturn:
- Debug::syscall(&tm, syscallNum, "Allowing unrestricted system call");
- rc = sys.rt_sigreturn((unsigned long)arg0);
- break;
default:
if (Debug::isEnabled()) {
// In debug mode, prevent stderr from being closed