From 141df6d109aad65adba7a58f6522af415a586b7c Mon Sep 17 00:00:00 2001 From: "rdevlin.cronin" Date: Tue, 21 Oct 2014 09:34:54 -0700 Subject: Revert "Sync: Avoid deadlock in SyncBackendRegistrar / ModelSafeWorker on sync backend shutdown." This reverts commit 8e84bfcc1c33fc0b63392d48ee8288d73f4a7675. Original CL: https://codereview.chromium.org/637413003 Reason for Revert: New test SyncBackendRegistrarShutdownTest.BlockingShutdown fails repeatedly on android and causes other tests to hang. Also fails on other bots. http://chromium-build-logs.appspot.com/gtest_query?gtest_query=SyncBackendRegistrarShutdownTest.BlockingShutdown Failures are timeouts, and don't have any useful logs or stack traces (sorry). Reason for hand-revert: Conflict with crrev.com/0cd9ff92b2dbc83fe8c3664b48041c7ced683c9c NOTRY=true TBR=stanisc@chromium.org TBR=zea@chromium.org -- Original CL Description -- Sync: Avoid deadlock in SyncBackendRegistrar / ModelSafeWorker on sync backend shutdown. This deadlock involves SyncBackendRegistrar running on the sync thread and at least two workers - one of them (worker A) running on UI thread and another (worker B) running on some other background thread. In this particular case it was a password worker running on Password model thread. The deadlock occurs when the SyncBackendRegistrar shutdown code running in the sync thread blocks on a waitable event set by worker B, at the same time the worker B's thread is blocked for whatever reason waiting to make a synchronous call into the UI thread, and at the same time the worker A is blocked on a lock owned by the sync thread while it attempts to unregister itself with SyncBackendRegistrar. I think the main issue here is that the sync thread might block on a worker. This is possible in only one situation - when that worker hasn't have a chance to run any tasks because its own thread was blocked doing something else. The fix removes this dependency. The only reason this blocking occurs is because ModelSafeWorker has a mechanism for subscribing / unsubscribing to the MessageLoop descruction which has to be done in the right thread context. In order to unsubscribe the worker needs to remember the message loop it runs on which is done in SetWorkingLoopToCurrent callback, which is called from a task posted on the worker's thread. There is also a waitable event which is signalled at that time. The even is used to block ModelSafeWorker::UnregisterForLoopDestruction from delegating the call on an uninitialized message loop which is possible only when SetWorkingLoopToCurrent has never been called. There is no need to block in UnregisterForLoopDestruction. If the registration for even loop destruction hasn't been been done yet (because the thread is blocked) then there really no reason to wait for that and then immediately unsubscribe. We can skip both subscription to the even loop notification altogether and let the registrar know that it is OK to remove the worker. That's essentially what the fix does. I added a test that verifies that SyncBackendRegistrar::Shutdown doesn't block on a blocked model type thread anymore. BUG=423078 Committed: https://crrev.com/8e84bfcc1c33fc0b63392d48ee8288d73f4a7675 Cr-Commit-Position: refs/heads/master@{#300350} Review URL: https://codereview.chromium.org/667573007 Cr-Commit-Position: refs/heads/master@{#300498} --- .../sync/glue/browser_thread_model_worker.cc | 1 + chrome/browser/sync/glue/history_model_worker.cc | 1 + chrome/browser/sync/glue/password_model_worker.cc | 1 + .../sync/glue/sync_backend_registrar_unittest.cc | 85 ---------------------- chrome/browser/sync/glue/ui_model_worker.cc | 1 + .../public/engine/model_safe_worker.cc | 52 +++++-------- .../internal_api/public/engine/model_safe_worker.h | 8 +- .../public/engine/passive_model_worker.cc | 1 + 8 files changed, 24 insertions(+), 126 deletions(-) diff --git a/chrome/browser/sync/glue/browser_thread_model_worker.cc b/chrome/browser/sync/glue/browser_thread_model_worker.cc index 093d282..7f4024a 100644 --- a/chrome/browser/sync/glue/browser_thread_model_worker.cc +++ b/chrome/browser/sync/glue/browser_thread_model_worker.cc @@ -50,6 +50,7 @@ BrowserThreadModelWorker::~BrowserThreadModelWorker() {} void BrowserThreadModelWorker::RegisterForLoopDestruction() { if (BrowserThread::CurrentlyOn(thread_)) { + base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } else { BrowserThread::PostTask( diff --git a/chrome/browser/sync/glue/history_model_worker.cc b/chrome/browser/sync/glue/history_model_worker.cc index d9f8754..eb00c92 100644 --- a/chrome/browser/sync/glue/history_model_worker.cc +++ b/chrome/browser/sync/glue/history_model_worker.cc @@ -100,6 +100,7 @@ void HistoryModelWorker::RegisterForLoopDestruction() { } void HistoryModelWorker::RegisterOnDBThread() { + base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } diff --git a/chrome/browser/sync/glue/password_model_worker.cc b/chrome/browser/sync/glue/password_model_worker.cc index 17ba62e..6ad5cfd 100644 --- a/chrome/browser/sync/glue/password_model_worker.cc +++ b/chrome/browser/sync/glue/password_model_worker.cc @@ -64,6 +64,7 @@ void PasswordModelWorker::CallDoWorkAndSignalTask( } void PasswordModelWorker::RegisterForPasswordLoopDestruction() { + base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } diff --git a/chrome/browser/sync/glue/sync_backend_registrar_unittest.cc b/chrome/browser/sync/glue/sync_backend_registrar_unittest.cc index f1a62ee0..cd2d29d 100644 --- a/chrome/browser/sync/glue/sync_backend_registrar_unittest.cc +++ b/chrome/browser/sync/glue/sync_backend_registrar_unittest.cc @@ -256,91 +256,6 @@ TEST_F(SyncBackendRegistrarTest, ActivateDeactivateNonUIDataType) { TriggerChanges(registrar_.get(), AUTOFILL); } -class SyncBackendRegistrarShutdownTest : public testing::Test { - public: - void BlockDBThread() { - EXPECT_FALSE(db_thread_lock_.Try()); - - db_thread_blocked_.Signal(); - base::AutoLock l(db_thread_lock_); - } - - protected: - friend class TestRegistrar; - - SyncBackendRegistrarShutdownTest() - : thread_bundle_(content::TestBrowserThreadBundle::REAL_DB_THREAD | - content::TestBrowserThreadBundle::REAL_FILE_THREAD | - content::TestBrowserThreadBundle::REAL_IO_THREAD), - db_thread_blocked_(false, false), - registrar_destroyed_(false, false) {} - - virtual ~SyncBackendRegistrarShutdownTest() {} - - content::TestBrowserThreadBundle thread_bundle_; - base::WaitableEvent db_thread_blocked_; - base::Lock db_thread_lock_; - base::WaitableEvent registrar_destroyed_; -}; - -// Wrap SyncBackendRegistrar so that we can monitor its lifetime. -class TestRegistrar : public SyncBackendRegistrar { - public: - explicit TestRegistrar(Profile* profile, - SyncBackendRegistrarShutdownTest* test) - : SyncBackendRegistrar("test", profile, scoped_ptr()), - test_(test) {} - - ~TestRegistrar() override { test_->registrar_destroyed_.Signal(); } - - private: - SyncBackendRegistrarShutdownTest* test_; -}; - -TEST_F(SyncBackendRegistrarShutdownTest, BlockingShutdown) { - // Take ownership of |db_thread_lock_| so that the DB thread can't acquire it. - db_thread_lock_.Acquire(); - - // This will block the DB thread by waiting on |db_thread_lock_|. - BrowserThread::PostTask( - BrowserThread::DB, - FROM_HERE, - base::Bind(&SyncBackendRegistrarShutdownTest::BlockDBThread, - base::Unretained(this))); - - TestingProfile profile; - scoped_ptr registrar(new TestRegistrar(&profile, this)); - base::Thread* sync_thread = registrar->sync_thread(); - - // Stop here until the DB thread gets a chance to run and block on the lock. - // Please note that since the task above didn't finish, the task to - // initialize the worker on the DB thread hasn't had a chance to run yet too. - // Which means ModelSafeWorker::SetWorkingLoopToCurrent hasn't been called - // for the DB worker. - db_thread_blocked_.Wait(); - - registrar->SetInitialTypes(ModelTypeSet()); - - // Start the shutdown. - registrar->RequestWorkerStopOnUIThread(); - sync_thread->message_loop()->PostTask( - FROM_HERE, - base::Bind(&SyncBackendRegistrar::Shutdown, - base::Unretained(registrar.release()))); - - // The test verifies that the sync thread doesn't block because - // of the blocked DB thread and can finish the shutdown. - sync_thread->message_loop()->RunUntilIdle(); - - db_thread_lock_.Release(); - - base::MessageLoop::current()->RunUntilIdle(); - - // This verifies that all workers have been removed and the registrar - // destroyed. - registrar_destroyed_.Wait(); -} - } // namespace } // namespace browser_sync diff --git a/chrome/browser/sync/glue/ui_model_worker.cc b/chrome/browser/sync/glue/ui_model_worker.cc index 726deb6..2d8bc7d 100644 --- a/chrome/browser/sync/glue/ui_model_worker.cc +++ b/chrome/browser/sync/glue/ui_model_worker.cc @@ -47,6 +47,7 @@ UIModelWorker::UIModelWorker(syncer::WorkerLoopDestructionObserver* observer) void UIModelWorker::RegisterForLoopDestruction() { CHECK(BrowserThread::CurrentlyOn(BrowserThread::UI)); + base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } diff --git a/sync/internal_api/public/engine/model_safe_worker.cc b/sync/internal_api/public/engine/model_safe_worker.cc index 2aea5cf..eb3fc1f 100644 --- a/sync/internal_api/public/engine/model_safe_worker.cc +++ b/sync/internal_api/public/engine/model_safe_worker.cc @@ -75,8 +75,8 @@ ModelSafeWorker::ModelSafeWorker(WorkerLoopDestructionObserver* observer) : stopped_(false), work_done_or_stopped_(false, false), observer_(observer), - working_loop_(NULL) { -} + working_loop_(NULL), + working_loop_set_wait_(true, false) {} ModelSafeWorker::~ModelSafeWorker() {} @@ -135,42 +135,26 @@ void ModelSafeWorker::WillDestroyCurrentMessageLoop() { void ModelSafeWorker::SetWorkingLoopToCurrent() { base::AutoLock l(working_loop_lock_); DCHECK(!working_loop_); - - if (unregister_done_callback_.is_null()) { - // Expected case - UnregisterForLoopDestruction hasn't been called yet. - base::MessageLoop::current()->AddDestructionObserver(this); - working_loop_ = base::MessageLoop::current(); - } else { - // Rare case which is possible when the model type thread remains - // blocked for the entire session and UnregisterForLoopDestruction ends - // up being called before this method. This method is posted unlike - // UnregisterForLoopDestruction - that's why they can end up being called - // out of order. - // In this case we skip the destruction observer registration - // and just invoke the callback stored at UnregisterForLoopDestruction. - DCHECK(stopped_); - unregister_done_callback_.Run(GetModelSafeGroup()); - } + working_loop_ = base::MessageLoop::current(); + working_loop_set_wait_.Signal(); } void ModelSafeWorker::UnregisterForLoopDestruction( base::Callback unregister_done_callback) { - base::AutoLock l(working_loop_lock_); - if (working_loop_ != NULL) { - // Normal case - observer registration has been already done. - // Delegate to the sync thread to do the actual unregistration in - // UnregisterForLoopDestructionAsync. - DCHECK_NE(base::MessageLoop::current(), working_loop_); - working_loop_->PostTask( - FROM_HERE, - base::Bind(&ModelSafeWorker::UnregisterForLoopDestructionAsync, - this, - unregister_done_callback)); - } else { - // The working loop is still unknown, probably because the model type - // thread is blocked. Store the callback to be called from - // SetWorkingLoopToCurrent. - unregister_done_callback_ = unregister_done_callback; + // Ok to wait until |working_loop_| is set because this is called on sync + // loop. + working_loop_set_wait_.Wait(); + + { + base::AutoLock l(working_loop_lock_); + if (working_loop_ != NULL) { + // Should be called on sync loop. + DCHECK_NE(base::MessageLoop::current(), working_loop_); + working_loop_->PostTask( + FROM_HERE, + base::Bind(&ModelSafeWorker::UnregisterForLoopDestructionAsync, + this, unregister_done_callback)); + } } } diff --git a/sync/internal_api/public/engine/model_safe_worker.h b/sync/internal_api/public/engine/model_safe_worker.h index df5bba9..02f3ca1 100644 --- a/sync/internal_api/public/engine/model_safe_worker.h +++ b/sync/internal_api/public/engine/model_safe_worker.h @@ -133,13 +133,7 @@ class SYNC_EXPORT ModelSafeWorker // observation from sync thread when shutting down sync. base::Lock working_loop_lock_; base::MessageLoop* working_loop_; - - // Callback passed with UnregisterForLoopDestruction. Normally this - // remains unset/unused and is stored only if |working_loop_| isn't - // initialized by the time UnregisterForLoopDestruction is called. - // It is safe to copy and thread safe. - // See comments in model_safe_worker.cc for more details. - base::Callback unregister_done_callback_; + base::WaitableEvent working_loop_set_wait_; }; // A map that details which ModelSafeGroup each ModelType diff --git a/sync/internal_api/public/engine/passive_model_worker.cc b/sync/internal_api/public/engine/passive_model_worker.cc index 3117194..02036d5 100644 --- a/sync/internal_api/public/engine/passive_model_worker.cc +++ b/sync/internal_api/public/engine/passive_model_worker.cc @@ -18,6 +18,7 @@ PassiveModelWorker::~PassiveModelWorker() { } void PassiveModelWorker::RegisterForLoopDestruction() { + base::MessageLoop::current()->AddDestructionObserver(this); SetWorkingLoopToCurrent(); } -- cgit v1.1