Switched GPU watchdog timeout to be based on main thread's user + kernel time rather than wall clock time on Windows.

Only on Windows because it is the only platform on which bug 58396 appears to be happening. My latest harebrained theory is that it is timing out and aborting while Windows boxes are resuming from hibernation. In that case wall clock time might incorporate a lot of I/O time for paging in data from the swapfile. This is an attempt to hide that I/O time, counting only active CPU time. It catches hangs like this: for (;;) { } and this: for (;;) { Sleep(0); } but not this: for (;;) { Sleep(1000); } because that just makes the thread largely idle. It also does not catch deadlocks. Also fixed null dereference in GPU watchdog termination code. BUG=64648, 58396 TEST=test GPU watchdog locally, try Review URL: http://codereview.chromium.org/5301007 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@67891 0039d316-1c4b-4281-b951-d872f2087c98
author: apatrick@chromium.org <apatrick@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-12-01 20:09:24 +0000
committer: apatrick@chromium.org <apatrick@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-12-01 20:09:24 +0000
commit: 981c1c5d6f46c1435bbcf466e35a8aa047a367a9 (patch)
tree: 6fa889c4075ca978cf194703c9640c0ffc8677a1 /chrome/gpu
parent: 38f57abb36fa6c91d9939a2393f9998dd40c51ba (diff)
download: chromium_src-981c1c5d6f46c1435bbcf466e35a8aa047a367a9.zip
chromium_src-981c1c5d6f46c1435bbcf466e35a8aa047a367a9.tar.gz
chromium_src-981c1c5d6f46c1435bbcf466e35a8aa047a367a9.tar.bz2
4 files changed, 109 insertions, 36 deletions
diff --git a/chrome/gpu/gpu_main.cc b/chrome/gpu/gpu_main.cc
index d125de9..29e59ec 100644
--- a/chrome/gpu/gpu_main.cc
+++ b/chrome/gpu/gpu_main.cc
@@ -130,8 +130,7 @@ int GpuMain(const MainFunctionParams& parameters) {
   // consuming has completed, otherwise the process is liable to be aborted.
   scoped_refptr<GpuWatchdogThread> watchdog_thread;
   if (enable_watchdog) {
-    watchdog_thread = new GpuWatchdogThread(MessageLoop::current(),
-                                            watchdog_timeout * 1000);
+    watchdog_thread = new GpuWatchdogThread(watchdog_timeout * 1000);
     watchdog_thread->Start();
   }
 
diff --git a/chrome/gpu/gpu_thread.cc b/chrome/gpu/gpu_thread.cc
index d81a09a..09c91f2 100644
--- a/chrome/gpu/gpu_thread.cc
+++ b/chrome/gpu/gpu_thread.cc
@@ -129,8 +129,10 @@ void GpuThread::OnCrash() {
 }
 
 void GpuThread::OnHang() {
-  for (;;)
-    PlatformThread::Sleep(1000);
+  for (;;) {
+    // Do not sleep here. The GPU watchdog timer tracks the amount of user
+    // time this thread is using and it doesn't use much while calling Sleep.
+  }
 }
 
 #if defined(OS_WIN)
diff --git a/chrome/gpu/gpu_watchdog_thread.cc b/chrome/gpu/gpu_watchdog_thread.cc
index 36f7e6b..fb90ffe 100644
--- a/chrome/gpu/gpu_watchdog_thread.cc
+++ b/chrome/gpu/gpu_watchdog_thread.cc
@@ -18,16 +18,32 @@ void DoNothing() {
 }
 }
 
-GpuWatchdogThread::GpuWatchdogThread(MessageLoop* watched_message_loop,
-                                     int timeout)
+GpuWatchdogThread::GpuWatchdogThread(int timeout)
     : base::Thread("Watchdog"),
-      watched_message_loop_(watched_message_loop),
+      watched_message_loop_(MessageLoop::current()),
       timeout_(timeout),
       armed_(false),
+#if defined(OS_WIN)
+      watched_thread_handle_(0),
+      arm_time_(0),
+#endif
       ALLOW_THIS_IN_INITIALIZER_LIST(task_observer_(this)) {
-  DCHECK(watched_message_loop);
   DCHECK(timeout >= 0);
 
+#if defined(OS_WIN)
+  // GetCurrentThread returns a pseudo-handle that cannot be used by one thread
+  // to identify another. DuplicateHandle creates a "real" handle that can be
+  // used for this purpose.
+  BOOL result = DuplicateHandle(GetCurrentProcess(),
+                                GetCurrentThread(),
+                                GetCurrentProcess(),
+                                &watched_thread_handle_,
+                                THREAD_QUERY_INFORMATION,
+                                FALSE,
+                                0);
+  DCHECK(result);
+#endif
+
   watched_message_loop_->AddTaskObserver(&task_observer_);
 }
 
@@ -36,6 +52,10 @@ GpuWatchdogThread::~GpuWatchdogThread() {
   // implicitly by the destructor, CleanUp() will not be called.
   DCHECK(!method_factory_.get());
 
+#if defined(OS_WIN)
+  CloseHandle(watched_thread_handle_);
+#endif
+
   watched_message_loop_->RemoveTaskObserver(&task_observer_);
 }
 
@@ -59,9 +79,6 @@ void GpuWatchdogThread::CleanUp() {
   // The method factory must be destroyed on the watchdog thread.
   method_factory_->RevokeAll();
   method_factory_.reset();
-
-  // Prevent any more delayed tasks from being posted.
-  watched_message_loop_ = NULL;
 }
 
 GpuWatchdogThread::GpuWatchdogTaskObserver::GpuWatchdogTaskObserver(
@@ -106,39 +123,87 @@ void GpuWatchdogThread::OnAcknowledge() {
   armed_ = false;
 
   // The monitored thread has responded. Post a task to check it again.
-  if (watched_message_loop_) {
-    message_loop()->PostDelayedTask(
-        FROM_HERE,
-        method_factory_->NewRunnableMethod(&GpuWatchdogThread::OnCheck),
-        kCheckPeriod);
-  }
+  message_loop()->PostDelayedTask(
+      FROM_HERE,
+      method_factory_->NewRunnableMethod(&GpuWatchdogThread::OnCheck),
+      kCheckPeriod);
 }
 
+#if defined(OS_WIN)
+int64 GpuWatchdogThread::GetWatchedThreadTime() {
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME user_time;
+  FILETIME kernel_time;
+  BOOL result = GetThreadTimes(watched_thread_handle_,
+                               &creation_time,
+                               &exit_time,
+                               &kernel_time,
+                               &user_time);
+  DCHECK(result);
+
+  ULARGE_INTEGER user_time64;
+  user_time64.HighPart = user_time.dwHighDateTime;
+  user_time64.LowPart = user_time.dwLowDateTime;
+
+  ULARGE_INTEGER kernel_time64;
+  kernel_time64.HighPart = kernel_time.dwHighDateTime;
+  kernel_time64.LowPart = kernel_time.dwLowDateTime;
+
+  // Time is reported in units of 100 nanoseconds. Kernel and user time are
+  // summed to deal with two kinds of hangs. One is where the GPU process is
+  // stuck in user level, never calling into the kernel and kernel time is
+  // not increasing. The other is where either the kernel hangs and never
+  // returns to user level or where user level code
+  // calls into kernel level repeatedly, giving up its quanta before it is
+  // tracked, for example a loop that repeatedly Sleeps.
+  return static_cast<int64>(
+      (user_time64.QuadPart + kernel_time64.QuadPart) / 10000);
+}
+#endif
+
 void GpuWatchdogThread::OnCheck() {
-  if (watched_message_loop_) {
-    // Must set armed before posting the task. This task might be the only task
-    // that will activate the TaskObserver on the watched thread and it must not
-    // miss the false -> true transition.
-    armed_ = true;
-
-    // Post a task to the monitored thread that does nothing but wake up the
-    // TaskObserver. Any other tasks that are pending on the watched thread will
-    // also wake up the observer. This simply ensures there is at least one.
-    watched_message_loop_->PostTask(
-        FROM_HERE,
-        NewRunnableFunction(DoNothing));
+  if (armed_)
+    return;
+
+  // Must set armed before posting the task. This task might be the only task
+  // that will activate the TaskObserver on the watched thread and it must not
+  // miss the false -> true transition.
+  armed_ = true;
+
+#if defined(OS_WIN)
+  arm_time_ = GetWatchedThreadTime();
+#endif
+
+  // Post a task to the monitored thread that does nothing but wake up the
+  // TaskObserver. Any other tasks that are pending on the watched thread will
+  // also wake up the observer. This simply ensures there is at least one.
+  watched_message_loop_->PostTask(
+      FROM_HERE,
+      NewRunnableFunction(DoNothing));
+
+  // Post a task to the watchdog thread to exit if the monitored thread does
+  // not respond in time.
+  message_loop()->PostDelayedTask(
+      FROM_HERE,
+      method_factory_->NewRunnableMethod(&GpuWatchdogThread::OnExit),
+      timeout_);
+}
 
-    // Post a task to the watchdog thread to exit if the monitored thread does
-    // not respond in time.
+// Use the --disable-gpu-watchdog command line switch to disable this.
+void GpuWatchdogThread::OnExit() {
+#if defined(OS_WIN)
+  // Defer termination until a certain amount of user time has elapsed.
+  int64 time_since_arm = GetWatchedThreadTime() - arm_time_;
+  if (time_since_arm < timeout_) {
     message_loop()->PostDelayedTask(
         FROM_HERE,
         method_factory_->NewRunnableMethod(&GpuWatchdogThread::OnExit),
-        timeout_);
+        timeout_ - time_since_arm);
+    return;
   }
-}
+#endif
 
-// Use the --disable-gpu-watchdog command line switch to disable this.
-void GpuWatchdogThread::OnExit() {
   // Make sure the timeout period is on the stack before crashing.
   volatile int timeout = timeout_;
 
diff --git a/chrome/gpu/gpu_watchdog_thread.h b/chrome/gpu/gpu_watchdog_thread.h
index 19cc41f..343e8d1 100644
--- a/chrome/gpu/gpu_watchdog_thread.h
+++ b/chrome/gpu/gpu_watchdog_thread.h
@@ -16,7 +16,7 @@
 class GpuWatchdogThread : public base::Thread,
                           public base::RefCountedThreadSafe<GpuWatchdogThread> {
  public:
-  GpuWatchdogThread(MessageLoop* watched_message_loop, int timeout);
+  explicit GpuWatchdogThread(int timeout);
   virtual ~GpuWatchdogThread();
 
   // Accessible on watched thread but only modified by watchdog thread.
@@ -50,11 +50,18 @@ class GpuWatchdogThread : public base::Thread,
   void OnExit();
   void Disable();
 
+  int64 GetWatchedThreadTime();
+
   MessageLoop* watched_message_loop_;
   int timeout_;
   volatile bool armed_;
   GpuWatchdogTaskObserver task_observer_;
 
+#if defined(OS_WIN)
+  void* watched_thread_handle_;
+  int64 arm_time_;
+#endif
+
   typedef ScopedRunnableMethodFactory<GpuWatchdogThread> MethodFactory;
   scoped_ptr<MethodFactory> method_factory_;
author	apatrick@chromium.org <apatrick@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-12-01 20:09:24 +0000
committer	apatrick@chromium.org <apatrick@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-12-01 20:09:24 +0000
commit	981c1c5d6f46c1435bbcf466e35a8aa047a367a9 (patch)
tree	6fa889c4075ca978cf194703c9640c0ffc8677a1 /chrome/gpu
parent	38f57abb36fa6c91d9939a2393f9998dd40c51ba (diff)
download	chromium_src-981c1c5d6f46c1435bbcf466e35a8aa047a367a9.zip chromium_src-981c1c5d6f46c1435bbcf466e35a8aa047a367a9.tar.gz chromium_src-981c1c5d6f46c1435bbcf466e35a8aa047a367a9.tar.bz2