From 8e84bfcc1c33fc0b63392d48ee8288d73f4a7675 Mon Sep 17 00:00:00 2001 From: stanisc Date: Mon, 20 Oct 2014 15:11:35 -0700 Subject: Sync: Avoid deadlock in SyncBackendRegistrar / ModelSafeWorker on sync backend shutdown. This deadlock involves SyncBackendRegistrar running on the sync thread and at least two workers - one of them (worker A) running on UI thread and another (worker B) running on some other background thread. In this particular case it was a password worker running on Password model thread. The deadlock occurs when the SyncBackendRegistrar shutdown code running in the sync thread blocks on a waitable event set by worker B, at the same time the worker B's thread is blocked for whatever reason waiting to make a synchronous call into the UI thread, and at the same time the worker A is blocked on a lock owned by the sync thread while it attempts to unregister itself with SyncBackendRegistrar. I think the main issue here is that the sync thread might block on a worker. This is possible in only one situation - when that worker hasn't have a chance to run any tasks because its own thread was blocked doing something else. The fix removes this dependency. The only reason this blocking occurs is because ModelSafeWorker has a mechanism for subscribing / unsubscribing to the MessageLoop descruction which has to be done in the right thread context. In order to unsubscribe the worker needs to remember the message loop it runs on which is done in SetWorkingLoopToCurrent callback, which is called from a task posted on the worker's thread. There is also a waitable event which is signalled at that time. The even is used to block ModelSafeWorker::UnregisterForLoopDestruction from delegating the call on an uninitialized message loop which is possible only when SetWorkingLoopToCurrent has never been called. There is no need to block in UnregisterForLoopDestruction. If the registration for even loop destruction hasn't been been done yet (because the thread is blocked) then there really no reason to wait for that and then immediately unsubscribe. We can skip both subscription to the even loop notification altogether and let the registrar know that it is OK to remove the worker. That's essentially what the fix does. I added a test that verifies that SyncBackendRegistrar::Shutdown doesn't block on a blocked model type thread anymore. BUG=423078 Review URL: https://codereview.chromium.org/637413003 Cr-Commit-Position: refs/heads/master@{#300350} --- .../sync/glue/browser_thread_model_worker.cc | 1 - chrome/browser/sync/glue/history_model_worker.cc | 1 - chrome/browser/sync/glue/password_model_worker.cc | 1 - .../sync/glue/sync_backend_registrar_unittest.cc | 85 ++++++++++++++++++++++ chrome/browser/sync/glue/ui_model_worker.cc | 1 - .../public/engine/model_safe_worker.cc | 52 ++++++++----- .../internal_api/public/engine/model_safe_worker.h | 8 +- .../public/engine/passive_model_worker.cc | 1 - 8 files changed, 126 insertions(+), 24 deletions(-) diff --git a/chrome/browser/sync/glue/browser_thread_model_worker.cc b/chrome/browser/sync/glue/browser_thread_model_worker.cc index 7f4024a..093d282 100644 --- a/chrome/browser/sync/glue/browser_thread_model_worker.cc +++ b/chrome/browser/sync/glue/browser_thread_model_worker.cc @@ -50,7 +50,6 @@ BrowserThreadModelWorker::~BrowserThreadModelWorker() {} void BrowserThreadModelWorker::RegisterForLoopDestruction() { if (BrowserThread::CurrentlyOn(thread_)) { - base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } else { BrowserThread::PostTask( diff --git a/chrome/browser/sync/glue/history_model_worker.cc b/chrome/browser/sync/glue/history_model_worker.cc index 8241bdf..87b5470 100644 --- a/chrome/browser/sync/glue/history_model_worker.cc +++ b/chrome/browser/sync/glue/history_model_worker.cc @@ -100,7 +100,6 @@ void HistoryModelWorker::RegisterForLoopDestruction() { } void HistoryModelWorker::RegisterOnDBThread() { - base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } diff --git a/chrome/browser/sync/glue/password_model_worker.cc b/chrome/browser/sync/glue/password_model_worker.cc index 6ad5cfd..17ba62e 100644 --- a/chrome/browser/sync/glue/password_model_worker.cc +++ b/chrome/browser/sync/glue/password_model_worker.cc @@ -64,7 +64,6 @@ void PasswordModelWorker::CallDoWorkAndSignalTask( } void PasswordModelWorker::RegisterForPasswordLoopDestruction() { - base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } diff --git a/chrome/browser/sync/glue/sync_backend_registrar_unittest.cc b/chrome/browser/sync/glue/sync_backend_registrar_unittest.cc index cd2d29d..0bb25fa 100644 --- a/chrome/browser/sync/glue/sync_backend_registrar_unittest.cc +++ b/chrome/browser/sync/glue/sync_backend_registrar_unittest.cc @@ -256,6 +256,91 @@ TEST_F(SyncBackendRegistrarTest, ActivateDeactivateNonUIDataType) { TriggerChanges(registrar_.get(), AUTOFILL); } +class SyncBackendRegistrarShutdownTest : public testing::Test { + public: + void BlockDBThread() { + EXPECT_FALSE(db_thread_lock_.Try()); + + db_thread_blocked_.Signal(); + base::AutoLock l(db_thread_lock_); + } + + protected: + friend class TestRegistrar; + + SyncBackendRegistrarShutdownTest() + : thread_bundle_(content::TestBrowserThreadBundle::REAL_DB_THREAD | + content::TestBrowserThreadBundle::REAL_FILE_THREAD | + content::TestBrowserThreadBundle::REAL_IO_THREAD), + db_thread_blocked_(false, false), + registrar_destroyed_(false, false) {} + + virtual ~SyncBackendRegistrarShutdownTest() {} + + content::TestBrowserThreadBundle thread_bundle_; + base::WaitableEvent db_thread_blocked_; + base::Lock db_thread_lock_; + base::WaitableEvent registrar_destroyed_; +}; + +// Wrap SyncBackendRegistrar so that we can monitor its lifetime. +class TestRegistrar : public SyncBackendRegistrar { + public: + explicit TestRegistrar(Profile* profile, + SyncBackendRegistrarShutdownTest* test) + : SyncBackendRegistrar("test", profile, scoped_ptr()), + test_(test) {} + + virtual ~TestRegistrar() { test_->registrar_destroyed_.Signal(); } + + private: + SyncBackendRegistrarShutdownTest* test_; +}; + +TEST_F(SyncBackendRegistrarShutdownTest, BlockingShutdown) { + // Take ownership of |db_thread_lock_| so that the DB thread can't acquire it. + db_thread_lock_.Acquire(); + + // This will block the DB thread by waiting on |db_thread_lock_|. + BrowserThread::PostTask( + BrowserThread::DB, + FROM_HERE, + base::Bind(&SyncBackendRegistrarShutdownTest::BlockDBThread, + base::Unretained(this))); + + TestingProfile profile; + scoped_ptr registrar(new TestRegistrar(&profile, this)); + base::Thread* sync_thread = registrar->sync_thread(); + + // Stop here until the DB thread gets a chance to run and block on the lock. + // Please note that since the task above didn't finish, the task to + // initialize the worker on the DB thread hasn't had a chance to run yet too. + // Which means ModelSafeWorker::SetWorkingLoopToCurrent hasn't been called + // for the DB worker. + db_thread_blocked_.Wait(); + + registrar->SetInitialTypes(ModelTypeSet()); + + // Start the shutdown. + registrar->RequestWorkerStopOnUIThread(); + sync_thread->message_loop()->PostTask( + FROM_HERE, + base::Bind(&SyncBackendRegistrar::Shutdown, + base::Unretained(registrar.release()))); + + // The test verifies that the sync thread doesn't block because + // of the blocked DB thread and can finish the shutdown. + sync_thread->message_loop()->RunUntilIdle(); + + db_thread_lock_.Release(); + + base::MessageLoop::current()->RunUntilIdle(); + + // This verifies that all workers have been removed and the registrar + // destroyed. + registrar_destroyed_.Wait(); +} + } // namespace } // namespace browser_sync diff --git a/chrome/browser/sync/glue/ui_model_worker.cc b/chrome/browser/sync/glue/ui_model_worker.cc index 2d8bc7d..726deb6 100644 --- a/chrome/browser/sync/glue/ui_model_worker.cc +++ b/chrome/browser/sync/glue/ui_model_worker.cc @@ -47,7 +47,6 @@ UIModelWorker::UIModelWorker(syncer::WorkerLoopDestructionObserver* observer) void UIModelWorker::RegisterForLoopDestruction() { CHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); - base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } diff --git a/sync/internal_api/public/engine/model_safe_worker.cc b/sync/internal_api/public/engine/model_safe_worker.cc index eb3fc1f..2aea5cf 100644 --- a/sync/internal_api/public/engine/model_safe_worker.cc +++ b/sync/internal_api/public/engine/model_safe_worker.cc @@ -75,8 +75,8 @@ ModelSafeWorker::ModelSafeWorker(WorkerLoopDestructionObserver* observer) : stopped_(false), work_done_or_stopped_(false, false), observer_(observer), - working_loop_(NULL), - working_loop_set_wait_(true, false) {} + working_loop_(NULL) { +} ModelSafeWorker::~ModelSafeWorker() {} @@ -135,26 +135,42 @@ void ModelSafeWorker::WillDestroyCurrentMessageLoop() { void ModelSafeWorker::SetWorkingLoopToCurrent() { base::AutoLock l(working_loop_lock_); DCHECK(!working_loop_); - working_loop_ = base::MessageLoop::current(); - working_loop_set_wait_.Signal(); + + if (unregister_done_callback_.is_null()) { + // Expected case - UnregisterForLoopDestruction hasn't been called yet. + base::MessageLoop::current()->AddDestructionObserver(this); + working_loop_ = base::MessageLoop::current(); + } else { + // Rare case which is possible when the model type thread remains + // blocked for the entire session and UnregisterForLoopDestruction ends + // up being called before this method. This method is posted unlike + // UnregisterForLoopDestruction - that's why they can end up being called + // out of order. + // In this case we skip the destruction observer registration + // and just invoke the callback stored at UnregisterForLoopDestruction. + DCHECK(stopped_); + unregister_done_callback_.Run(GetModelSafeGroup()); + } } void ModelSafeWorker::UnregisterForLoopDestruction( base::Callback unregister_done_callback) { - // Ok to wait until |working_loop_| is set because this is called on sync - // loop. - working_loop_set_wait_.Wait(); - - { - base::AutoLock l(working_loop_lock_); - if (working_loop_ != NULL) { - // Should be called on sync loop. - DCHECK_NE(base::MessageLoop::current(), working_loop_); - working_loop_->PostTask( - FROM_HERE, - base::Bind(&ModelSafeWorker::UnregisterForLoopDestructionAsync, - this, unregister_done_callback)); - } + base::AutoLock l(working_loop_lock_); + if (working_loop_ != NULL) { + // Normal case - observer registration has been already done. + // Delegate to the sync thread to do the actual unregistration in + // UnregisterForLoopDestructionAsync. + DCHECK_NE(base::MessageLoop::current(), working_loop_); + working_loop_->PostTask( + FROM_HERE, + base::Bind(&ModelSafeWorker::UnregisterForLoopDestructionAsync, + this, + unregister_done_callback)); + } else { + // The working loop is still unknown, probably because the model type + // thread is blocked. Store the callback to be called from + // SetWorkingLoopToCurrent. + unregister_done_callback_ = unregister_done_callback; } } diff --git a/sync/internal_api/public/engine/model_safe_worker.h b/sync/internal_api/public/engine/model_safe_worker.h index 02f3ca1..df5bba9 100644 --- a/sync/internal_api/public/engine/model_safe_worker.h +++ b/sync/internal_api/public/engine/model_safe_worker.h @@ -133,7 +133,13 @@ class SYNC_EXPORT ModelSafeWorker // observation from sync thread when shutting down sync. base::Lock working_loop_lock_; base::MessageLoop* working_loop_; - base::WaitableEvent working_loop_set_wait_; + + // Callback passed with UnregisterForLoopDestruction. Normally this + // remains unset/unused and is stored only if |working_loop_| isn't + // initialized by the time UnregisterForLoopDestruction is called. + // It is safe to copy and thread safe. + // See comments in model_safe_worker.cc for more details. + base::Callback unregister_done_callback_; }; // A map that details which ModelSafeGroup each ModelType diff --git a/sync/internal_api/public/engine/passive_model_worker.cc b/sync/internal_api/public/engine/passive_model_worker.cc index 02036d5..3117194 100644 --- a/sync/internal_api/public/engine/passive_model_worker.cc +++ b/sync/internal_api/public/engine/passive_model_worker.cc @@ -18,7 +18,6 @@ PassiveModelWorker::~PassiveModelWorker() { } void PassiveModelWorker::RegisterForLoopDestruction() { - base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } -- cgit v1.1