base: wait for children to terminate.

I screwed up and assumed that you can't race the kernel. But it turns out that it's possible to obverse a child in the process of dying. The kernel will close the child's file descriptors before waitpid knows that the child is dead. Rather than rewrite everything (in an area of the code which has had lots of issues in the past), I'm opting to do a blocking wait for the child to die. The code higher up already copes with waitpid() failing, but we'll miss some crashes if it does. With this patch, we'll wait up to 250ms for the termination information on a child. We are, no doubt, blocking a latency sensitive thread with this. However, the race window in the kernel is small, so doesn't really block for > a millisecond or so. We do have to consider the case where a buggy child closes its file descriptor without dying. In that case, we'll block for the full 250ms but that indicates a serious bug in the child. BUG=63531 TEST=nagivating to about:gpucrash on linux and checking the we know that the child crashed. http://codereview.chromium.org/5377001/ git-svn-id: svn://svn.chromium.org/chrome/trunk/src@67572 0039d316-1c4b-4281-b951-d872f2087c98
author: agl@chromium.org <agl@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-11-29 18:56:19 +0000
committer: agl@chromium.org <agl@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-11-29 18:56:19 +0000
commit: 774f220e3b8714626b2ae0c75df1fb3035707eb1 (patch)
tree: e482bdfb90b6ae75d7a502f0196f0206b22897c9 /base/process_util_posix.cc
parent: ae7aaede33c296418f0b41aa8b287e6f3082dc4f (diff)
download: chromium_src-774f220e3b8714626b2ae0c75df1fb3035707eb1.zip
chromium_src-774f220e3b8714626b2ae0c75df1fb3035707eb1.tar.gz
chromium_src-774f220e3b8714626b2ae0c75df1fb3035707eb1.tar.bz2
1 files changed, 55 insertions, 43 deletions
diff --git a/base/process_util_posix.cc b/base/process_util_posix.cc
index 8a97dcd..d53ab99 100644
--- a/base/process_util_posix.cc
+++ b/base/process_util_posix.cc
@@ -40,8 +40,10 @@ namespace base {
 
 namespace {
 
-int WaitpidWithTimeout(ProcessHandle handle, int64 wait_milliseconds,
-                       bool* success) {
+// WaitpidWithTimeout performs a timed waitpid() call. The arguments and return
+// value match waitpid(2).
+pid_t WaitpidWithTimeout(ProcessHandle handle, int* status,
+                         int64 wait_milliseconds) {
   // This POSIX version of this function only guarantees that we wait no less
   // than |wait_milliseconds| for the process to exit.  The child process may
   // exit sometime before the timeout has ended but we may still block for up
@@ -63,13 +65,15 @@ int WaitpidWithTimeout(ProcessHandle handle, int64 wait_milliseconds,
   // has been installed.  This means that when a SIGCHLD is sent, it will exit
   // depending on behavior external to this function.
   //
-  // This function is used primarily for unit tests, if we want to use it in
-  // the application itself it would probably be best to examine other routes.
-  int status = -1;
-  pid_t ret_pid = HANDLE_EINTR(waitpid(handle, &status, WNOHANG));
+  // This function is used in unit tests and by DidProcessCrash. See the
+  // comments in DidProcessCrash about its use in Chrome itself.
   static const int64 kMaxSleepInMicroseconds = 1 << 18;  // ~256 milliseconds.
   int64 max_sleep_time_usecs = 1 << 10;  // ~1 milliseconds.
-  int64 double_sleep_time = 0;
+  int double_sleep_time = 0;
+
+  pid_t ret_pid = HANDLE_EINTR(waitpid(handle, status, WNOHANG));
+  if (ret_pid != 0)
+    return ret_pid;
 
   // If the process hasn't exited yet, then sleep and try again.
   Time wakeup_time = Time::Now() +
@@ -87,7 +91,7 @@ int WaitpidWithTimeout(ProcessHandle handle, int64 wait_milliseconds,
     // usleep() will return 0 and set errno to EINTR on receipt of a signal
     // such as SIGCHLD.
     usleep(sleep_time_usecs);
-    ret_pid = HANDLE_EINTR(waitpid(handle, &status, WNOHANG));
+    ret_pid = HANDLE_EINTR(waitpid(handle, status, WNOHANG));
 
     if ((max_sleep_time_usecs < kMaxSleepInMicroseconds) &&
         (double_sleep_time++ % 4 == 0)) {
@@ -95,10 +99,7 @@ int WaitpidWithTimeout(ProcessHandle handle, int64 wait_milliseconds,
     }
   }
 
-  if (success)
-    *success = (ret_pid != -1);
-
-  return status;
+  return ret_pid;
 }
 
 void StackDumpSignalHandler(int signal, siginfo_t* info, ucontext_t* context) {
@@ -631,15 +632,27 @@ void RaiseProcessToHighPriority() {
 }
 
 bool DidProcessCrash(bool* child_exited, ProcessHandle handle) {
+  // We call DidProcessCrash when we see an EOF from a socket to that child.
+  // Sadly, the kernel is raceable: it will close a dead process's file
+  // descriptors before marking the process as dead. Therefore it's possible to
+  // see the EOF and still have a subsequent waitpid() return zero. (Had I
+  // known that from the beginning, child processing would have been designed
+  // differently.)
+  //
+  // Everything higher up copes with the case where waitpid fails, but we might
+  // lose some crash notifications. In order to minimise this we waitpid() in a
+  // loop with a timeout to try and catch the exit code. In the typical case,
+  // we should only be blocking for microseconds here.
+  //
+  // It's possible for a bad child to close its socket without exiting. Because
+  // we don't want to hang the browser in this case we have a timeout.
   int status;
-  const pid_t result = HANDLE_EINTR(waitpid(handle, &status, WNOHANG));
-  if (result == -1) {
-    PLOG(ERROR) << "waitpid(" << handle << ")";
-    if (child_exited)
-      *child_exited = false;
-    return false;
-  } else if (result == 0) {
-    // the child hasn't exited yet.
+  pid_t pid = WaitpidWithTimeout(handle, &status, 250 /* milliseconds */);
+
+  if (pid == 0) {
+    // We waited for the full timeout and the child still isn't dead.
+    LOG(ERROR) << "We janked because we expected child "
+               << handle << " to have exited.";
     if (child_exited)
       *child_exited = false;
     return false;
@@ -685,12 +698,10 @@ bool WaitForExitCode(ProcessHandle handle, int* exit_code) {
 
 bool WaitForExitCodeWithTimeout(ProcessHandle handle, int* exit_code,
                                 int64 timeout_milliseconds) {
-  bool waitpid_success = false;
-  int status = WaitpidWithTimeout(handle, timeout_milliseconds,
-                                  &waitpid_success);
-  if (status == -1)
-    return false;
-  if (!waitpid_success)
+  int status;
+  pid_t pid = WaitpidWithTimeout(handle, &status, timeout_milliseconds);
+
+  if (pid <= 0)
     return false;
   if (!WIFEXITED(status))
     return false;
@@ -703,30 +714,31 @@ bool WaitForExitCodeWithTimeout(ProcessHandle handle, int* exit_code,
 }
 
 bool WaitForSingleProcess(ProcessHandle handle, int64 wait_milliseconds) {
-  bool waitpid_success;
   int status;
-  if (wait_milliseconds == base::kNoTimeout)
-    waitpid_success = (HANDLE_EINTR(waitpid(handle, &status, 0)) != -1);
-  else
-    status = WaitpidWithTimeout(handle, wait_milliseconds, &waitpid_success);
-  if (status != -1) {
-    DCHECK(waitpid_success);
-    return WIFEXITED(status);
+  pid_t pid;
+  if (wait_milliseconds == base::kNoTimeout) {
+    pid = (HANDLE_EINTR(waitpid(handle, &status, 0)) == handle);
   } else {
-    return false;
+    pid = WaitpidWithTimeout(handle, &status, wait_milliseconds);
   }
+
+  if (pid <= 0)
+    return false;
+
+  return WIFEXITED(status);
 }
 
 bool CrashAwareSleep(ProcessHandle handle, int64 wait_milliseconds) {
-  bool waitpid_success;
-  int status = WaitpidWithTimeout(handle, wait_milliseconds, &waitpid_success);
-  if (status != -1) {
-    DCHECK(waitpid_success);
-    return !(WIFEXITED(status) || WIFSIGNALED(status));
+  int status;
+  pid_t pid = WaitpidWithTimeout(handle, &status, wait_milliseconds);
+  if (pid < 0) {
+    // If waitpid failed then the process probably didn't exist prior to the
+    // call.
+    return true;
+  } else if (pid == 0) {
+    return false;
   } else {
-    // If waitpid returned with an error, then the process doesn't exist
-    // (which most probably means it didn't exist before our call).
-    return waitpid_success;
+    return !(WIFEXITED(status) || WIFSIGNALED(status));
   }
 }
author	agl@chromium.org <agl@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-11-29 18:56:19 +0000
committer	agl@chromium.org <agl@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-11-29 18:56:19 +0000
commit	774f220e3b8714626b2ae0c75df1fb3035707eb1 (patch)
tree	e482bdfb90b6ae75d7a502f0196f0206b22897c9 /base/process_util_posix.cc
parent	ae7aaede33c296418f0b41aa8b287e6f3082dc4f (diff)
download	chromium_src-774f220e3b8714626b2ae0c75df1fb3035707eb1.zip chromium_src-774f220e3b8714626b2ae0c75df1fb3035707eb1.tar.gz chromium_src-774f220e3b8714626b2ae0c75df1fb3035707eb1.tar.bz2