author    Mathieu Chartier <mathieuc@google.com>    2013-12-03 14:24:05 -0800
committer Mathieu Chartier <mathieuc@google.com>    2013-12-05 12:57:43 -0800
commit    5f51d4b80058236759fea1d932470a57f348c199 (patch)
tree      096ce11ba8dc096fd27d2fbed84459080c7d3b1e
parent    d83d4c86c89357a74e94963994ad0c42ea7299c3 (diff)
Fix races in thread list Unregister.
First race: We were releasing the thin_lock_id in Unregister before waiting for the thread to no longer be suspended. This could cause problems in SuspendThreadByThreadId since there was a small window of time during which two threads could share the same thread id. This race caused an occasional check failure in SuspendThreadByThreadId.

Second race: We were setting the thin_lock_thread_id_ to 0 in Unregister before waiting to not be suspended. This caused another race in SuspendThreadByThreadId where we modified the thread's suspend count and busy waited, but didn't find the thread on the next iteration. We then returned null even though we had modified the suspend count, and the count never got decremented because the caller didn't know it had been increased. Removing the self->thin_lock_thread_id_ = 0 in ThreadList::Unregister fixes this race.

Added additional checks and logging to prevent these issues from resurfacing, plus other misc cleanup. Added thread names to the threads in ThreadStress.

Bug: 11319866
Change-Id: I48e3a0700193b72079e450be1e924a2f88cf52e2
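To make the first race concrete, here is a minimal standalone C++ sketch of the ordering bug and its fix. All names (FakeThread, FakeThreadList, UnregisterBuggy, UnregisterFixed) are hypothetical illustrations, not ART code:

#include <cstdint>
#include <iostream>
#include <list>
#include <mutex>
#include <set>

struct FakeThread { uint32_t id; };

class FakeThreadList {
 public:
  FakeThread* Register() {
    std::lock_guard<std::mutex> lock(mu_);
    uint32_t id = 1;
    while (ids_in_use_.count(id) != 0) {
      ++id;  // Hand out the smallest free id, as an id allocator would.
    }
    ids_in_use_.insert(id);
    list_.push_back(new FakeThread{id});
    return list_.back();
  }

  // Old (buggy) ordering: the id is freed while *self is still in list_,
  // so a concurrent Register() can hand the same id to a second thread.
  void UnregisterBuggy(FakeThread* self) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      ids_in_use_.erase(self->id);
    }
    // <-- window: two live list entries may briefly share one id here.
    std::lock_guard<std::mutex> lock(mu_);
    list_.remove(self);
    delete self;
  }

  // New ordering per this patch: remove and delete first, release the id last.
  void UnregisterFixed(FakeThread* self) {
    uint32_t id = self->id;
    {
      std::lock_guard<std::mutex> lock(mu_);
      list_.remove(self);
      delete self;
    }
    std::lock_guard<std::mutex> lock(mu_);
    ids_in_use_.erase(id);
  }

 private:
  std::mutex mu_;
  std::set<uint32_t> ids_in_use_;
  std::list<FakeThread*> list_;
};

int main() {
  FakeThreadList tl;
  FakeThread* t = tl.Register();  // Receives id 1.
  tl.UnregisterFixed(t);          // Id 1 becomes reusable only after deletion.
  std::cout << "done\n";
  return 0;
}

The patch applies the same reordering to ThreadList::Unregister below: ReleaseThreadId(nullptr, thin_lock_id) now runs only after the thread has been removed from list_ and deleted.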
-rw-r--r--    runtime/monitor.cc                       3
-rw-r--r--    runtime/thread.cc                       11
-rw-r--r--    runtime/thread_list.cc                 102
-rw-r--r--    test/ThreadStress/ThreadStress.java     10
4 files changed, 64 insertions, 62 deletions
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index af93a56..ef9a9ce 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -633,8 +633,7 @@ void Monitor::InflateThinLocked(Thread* self, SirtRef<mirror::Object>& obj, Lock
ScopedThreadStateChange tsc(self, kBlocked);
if (lock_word == obj->GetLockWord()) { // If lock word hasn't changed.
bool timed_out;
- Thread* owner = thread_list->SuspendThreadByThreadId(lock_word.ThinLockOwner(), false,
- &timed_out);
+ Thread* owner = thread_list->SuspendThreadByThreadId(owner_thread_id, false, &timed_out);
if (owner != nullptr) {
// We succeeded in suspending the thread, check the lock's status didn't change.
lock_word = obj->GetLockWord();
diff --git a/runtime/thread.cc b/runtime/thread.cc
index d816ca1..297fa45 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -586,11 +586,11 @@ bool Thread::RequestCheckpoint(Closure* function) {
if ((old_state_and_flags.as_struct.flags & kCheckpointRequest) != 0) {
return false; // Fail, already a checkpoint pending.
}
- CHECK(checkpoint_function_ == NULL);
+ CHECK(checkpoint_function_ == nullptr);
checkpoint_function_ = function;
// Checkpoint function installed now install flag bit.
// We must be runnable to request a checkpoint.
- old_state_and_flags.as_struct.state = kRunnable;
+ DCHECK_EQ(old_state_and_flags.as_struct.state, kRunnable);
union StateAndFlags new_state_and_flags = old_state_and_flags;
new_state_and_flags.as_struct.flags |= kCheckpointRequest;
int succeeded = android_atomic_cmpxchg(old_state_and_flags.as_int, new_state_and_flags.as_int,
@@ -2120,12 +2120,11 @@ void Thread::VisitRoots(RootVisitor* visitor, void* arg) {
opeer_ = visitor(opeer_, arg);
}
if (exception_ != nullptr) {
- exception_ = reinterpret_cast<mirror::Throwable*>(visitor(exception_, arg));
+ exception_ = down_cast<mirror::Throwable*>(visitor(exception_, arg));
}
throw_location_.VisitRoots(visitor, arg);
if (class_loader_override_ != nullptr) {
- class_loader_override_ = reinterpret_cast<mirror::ClassLoader*>(
- visitor(class_loader_override_, arg));
+ class_loader_override_ = down_cast<mirror::ClassLoader*>(visitor(class_loader_override_, arg));
}
jni_env_->locals.VisitRoots(visitor, arg);
jni_env_->monitors.VisitRoots(visitor, arg);
@@ -2144,7 +2143,7 @@ void Thread::VisitRoots(RootVisitor* visitor, void* arg) {
frame.this_object_ = visitor(frame.this_object_, arg);
}
DCHECK(frame.method_ != nullptr);
- frame.method_ = reinterpret_cast<mirror::ArtMethod*>(visitor(frame.method_, arg));
+ frame.method_ = down_cast<mirror::ArtMethod*>(visitor(frame.method_, arg));
}
}
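The thread.cc hunks above also replace reinterpret_cast with ART's down_cast when narrowing the RootVisitor result. As a rough sketch of what a checked downcast buys over reinterpret_cast (an approximation assuming a polymorphic base class with RTTI available; not ART's actual helper, which builds without RTTI):

#include <cassert>
#include <type_traits>

template <typename To, typename From>
inline To down_cast(From* f) {  // Call as down_cast<Derived*>(base_pointer).
  // Refuses to compile for unrelated types, unlike reinterpret_cast.
  static_assert(
      std::is_base_of<From, typename std::remove_pointer<To>::type>::value,
      "down_cast is only allowed within a class hierarchy");
  // Debug-only check that the object really has the destination type;
  // assumes From is polymorphic so dynamic_cast is usable.
  assert(f == nullptr || dynamic_cast<To>(f) != nullptr);
  return static_cast<To>(f);
}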
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index dd3f11c..aed8c77 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -162,6 +162,35 @@ static void UnsafeLogFatalForThreadSuspendAllTimeout(Thread* self) NO_THREAD_SAF
}
#endif
+// Unlike suspending all threads where we can wait to acquire the mutator_lock_, suspending an
+// individual thread requires polling. delay_us is the requested sleep and total_delay_us
+// accumulates the total time spent sleeping for timeouts. The first sleep is just a yield,
+// subsequently sleeps increase delay_us from 1ms to 500ms by doubling.
+static void ThreadSuspendSleep(Thread* self, useconds_t* delay_us, useconds_t* total_delay_us,
+ bool holding_locks) {
+ if (!holding_locks) {
+ for (int i = kLockLevelCount - 1; i >= 0; --i) {
+ BaseMutex* held_mutex = self->GetHeldMutex(static_cast<LockLevel>(i));
+ if (held_mutex != NULL) {
+ LOG(FATAL) << "Holding " << held_mutex->GetName() << " while sleeping for thread suspension";
+ }
+ }
+ }
+ useconds_t new_delay_us = (*delay_us) * 2;
+ CHECK_GE(new_delay_us, *delay_us);
+ if (new_delay_us < 500000) { // Don't allow sleeping to be more than 0.5s.
+ *delay_us = new_delay_us;
+ }
+ if (*delay_us == 0) {
+ sched_yield();
+ // Default to 1 milliseconds (note that this gets multiplied by 2 before the first sleep).
+ *delay_us = 500;
+ } else {
+ usleep(*delay_us);
+ *total_delay_us += *delay_us;
+ }
+}
+
size_t ThreadList::RunCheckpoint(Closure* checkpoint_function) {
Thread* self = Thread::Current();
if (kIsDebugBuild) {
@@ -208,17 +237,15 @@ size_t ThreadList::RunCheckpoint(Closure* checkpoint_function) {
for (const auto& thread : suspended_count_modified_threads) {
if (!thread->IsSuspended()) {
// Wait until the thread is suspended.
- uint64_t start = NanoTime();
+ useconds_t total_delay_us = 0;
do {
- // Sleep for 100us.
- usleep(100);
+ useconds_t delay_us = 100;
+ ThreadSuspendSleep(self, &delay_us, &total_delay_us, true);
} while (!thread->IsSuspended());
- uint64_t end = NanoTime();
- // Shouldn't need to wait for longer than 1 millisecond.
- const uint64_t threshold = 1;
- if (NsToMs(end - start) > threshold) {
- LOG(INFO) << "Warning: waited longer than " << threshold
- << " ms for thread suspend\n";
+ // Shouldn't need to wait for longer than 1000 microseconds.
+ constexpr useconds_t kLongWaitThresholdUS = 1000;
+ if (UNLIKELY(total_delay_us > kLongWaitThresholdUS)) {
+ LOG(WARNING) << "Waited " << total_delay_us << " us for thread suspend!";
}
}
// We know for sure that the thread is suspended at this point.
@@ -354,34 +381,6 @@ static void ThreadSuspendByPeerWarning(Thread* self, int level, const char* mess
}
}
-// Unlike suspending all threads where we can wait to acquire the mutator_lock_, suspending an
-// individual thread requires polling. delay_us is the requested sleep and total_delay_us
-// accumulates the total time spent sleeping for timeouts. The first sleep is just a yield,
-// subsequently sleeps increase delay_us from 1ms to 500ms by doubling.
-static void ThreadSuspendSleep(Thread* self, useconds_t* delay_us, useconds_t* total_delay_us) {
- for (int i = kLockLevelCount - 1; i >= 0; --i) {
- BaseMutex* held_mutex = self->GetHeldMutex(static_cast<LockLevel>(i));
- if (held_mutex != NULL) {
- LOG(FATAL) << "Holding " << held_mutex->GetName() << " while sleeping for thread suspension";
- }
- }
- {
- useconds_t new_delay_us = (*delay_us) * 2;
- CHECK_GE(new_delay_us, *delay_us);
- if (new_delay_us < 500000) { // Don't allow sleeping to be more than 0.5s.
- *delay_us = new_delay_us;
- }
- }
- if ((*delay_us) == 0) {
- sched_yield();
- // Default to 1 milliseconds (note that this gets multiplied by 2 before the first sleep).
- (*delay_us) = 500;
- } else {
- usleep(*delay_us);
- (*total_delay_us) += (*delay_us);
- }
-}
-
Thread* ThreadList::SuspendThreadByPeer(jobject peer, bool request_suspension,
bool debug_suspension, bool* timed_out) {
static const useconds_t kTimeoutUs = 30 * 1000000; // 30s.
@@ -432,7 +431,7 @@ Thread* ThreadList::SuspendThreadByPeer(jobject peer, bool request_suspension,
}
// Release locks and come out of runnable state.
}
- ThreadSuspendSleep(self, &delay_us, &total_delay_us);
+ ThreadSuspendSleep(self, &delay_us, &total_delay_us, false);
}
}
@@ -445,13 +444,13 @@ Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id, bool debug_suspe
static const useconds_t kTimeoutUs = 30 * 1000000; // 30s.
useconds_t total_delay_us = 0;
useconds_t delay_us = 0;
- bool did_suspend_request = false;
*timed_out = false;
+ Thread* suspended_thread = nullptr;
Thread* self = Thread::Current();
CHECK_NE(thread_id, kInvalidThreadId);
while (true) {
- Thread* thread = NULL;
{
+ Thread* thread = NULL;
ScopedObjectAccess soa(self);
MutexLock mu(self, *Locks::thread_list_lock_);
for (const auto& it : list_) {
@@ -460,17 +459,20 @@ Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id, bool debug_suspe
break;
}
}
- if (thread == NULL) {
+ if (thread == nullptr) {
+ CHECK(suspended_thread == nullptr) << "Suspended thread " << suspended_thread
+ << " no longer in thread list";
// There's a race in inflating a lock and the owner giving up ownership and then dying.
ThreadSuspendByThreadIdWarning(WARNING, "No such thread id for suspend", thread_id);
return NULL;
}
{
MutexLock mu(self, *Locks::thread_suspend_count_lock_);
- if (!did_suspend_request) {
+ if (suspended_thread == nullptr) {
thread->ModifySuspendCount(self, +1, debug_suspension);
- did_suspend_request = true;
+ suspended_thread = thread;
} else {
+ CHECK_EQ(suspended_thread, thread);
// If the caller isn't requesting suspension, a suspension should have already occurred.
CHECK_GT(thread->GetSuspendCount(), 0);
}
@@ -487,7 +489,7 @@ Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id, bool debug_suspe
}
if (total_delay_us >= kTimeoutUs) {
ThreadSuspendByThreadIdWarning(WARNING, "Thread suspension timed out", thread_id);
- if (did_suspend_request) {
+ if (suspended_thread != nullptr) {
thread->ModifySuspendCount(soa.Self(), -1, debug_suspension);
}
*timed_out = true;
@@ -496,7 +498,7 @@ Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id, bool debug_suspe
}
// Release locks and come out of runnable state.
}
- ThreadSuspendSleep(self, &delay_us, &total_delay_us);
+ ThreadSuspendSleep(self, &delay_us, &total_delay_us, false);
}
}
@@ -719,9 +721,7 @@ void ThreadList::Unregister(Thread* self) {
self->Destroy();
uint32_t thin_lock_id = self->thin_lock_thread_id_;
- self->thin_lock_thread_id_ = 0;
- ReleaseThreadId(self, thin_lock_id);
- while (self != NULL) {
+ while (self != nullptr) {
// Remove and delete the Thread* while holding the thread_list_lock_ and
// thread_suspend_count_lock_ so that the unregistering thread cannot be suspended.
// Note: deliberately not using MutexLock that could hold a stale self pointer.
@@ -732,10 +732,14 @@ void ThreadList::Unregister(Thread* self) {
if (!self->IsSuspended()) {
list_.remove(self);
delete self;
- self = NULL;
+ self = nullptr;
}
Locks::thread_list_lock_->ExclusiveUnlock(self);
}
+ // Release the thread ID after the thread is finished and deleted to avoid cases where we can
+ // temporarily have multiple threads with the same thread id. When this occurs, it causes
+ // problems in FindThreadByThreadId / SuspendThreadByThreadId.
+ ReleaseThreadId(nullptr, thin_lock_id);
// Clear the TLS data, so that the underlying native thread is recognizably detached.
// (It may wish to reattach later.)
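The ThreadSuspendSleep helper hoisted to the top of thread_list.cc (and now shared with RunCheckpoint) implements a yield-then-exponential-backoff poll. Below is a simplified standalone C++ sketch of that pattern, minus the held-mutex checking; BackoffSleep and Done are hypothetical names:

#include <sched.h>
#include <unistd.h>

static void BackoffSleep(useconds_t* delay_us, useconds_t* total_delay_us) {
  useconds_t new_delay_us = *delay_us * 2;  // Exponential growth.
  if (new_delay_us < 500000) {              // Cap individual sleeps at 0.5 s.
    *delay_us = new_delay_us;
  }
  if (*delay_us == 0) {                     // First call: just yield the CPU.
    sched_yield();
    *delay_us = 500;  // Seed; doubled to 1 ms before the first real sleep.
  } else {
    usleep(*delay_us);
    *total_delay_us += *delay_us;           // Accumulate for timeout checks.
  }
}

// Typical polling loop, shaped like SuspendThreadByThreadId:
//   useconds_t delay_us = 0, total_delay_us = 0;
//   while (!Done()) {
//     if (total_delay_us >= kTimeoutUs) { /* give up, report timeout */ break; }
//     BackoffSleep(&delay_us, &total_delay_us);
//   }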
diff --git a/test/ThreadStress/ThreadStress.java b/test/ThreadStress/ThreadStress.java
index 8d8135d..795c790 100644
--- a/test/ThreadStress/ThreadStress.java
+++ b/test/ThreadStress/ThreadStress.java
@@ -128,13 +128,13 @@ class ThreadStress implements Runnable {
Thread[] runners = new Thread[numberOfThreads];
for (int r = 0; r < runners.length; r++) {
final ThreadStress ts = threadStresses[r];
- runners[r] = new Thread() {
+ runners[r] = new Thread("Runner thread " + r) {
final ThreadStress threadStress = ts;
public void run() {
int id = threadStress.id;
- System.out.println("Starting runner for " + id);
+ System.out.println("Starting worker for " + id);
while (threadStress.nextOperation < operationsPerThread) {
- Thread thread = new Thread(ts);
+ Thread thread = new Thread(ts, "Worker thread " + id);
thread.start();
try {
thread.join();
@@ -144,14 +144,14 @@ class ThreadStress implements Runnable {
+ (operationsPerThread - threadStress.nextOperation)
+ " operations remaining.");
}
- System.out.println("Finishing runner for " + id);
+ System.out.println("Finishing worker for " + id);
}
};
}
// The notifier thread is a daemon just loops forever to wake
// up threads in Operation.WAIT
- Thread notifier = new Thread() {
+ Thread notifier = new Thread("Notifier") {
public void run() {
while (true) {
synchronized (lock) {