author     rlarocque@chromium.org <rlarocque@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2012-02-28 21:50:32 +0000
committer  rlarocque@chromium.org <rlarocque@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>   2012-02-28 21:50:32 +0000
commit     7e9305c9e30cebfae259702a5c60bc45afb0bb97 (patch)
tree       bc3dfbe1a2021529a60745db3006b949fa5a9639
parent     32d582db027b7388280fe78f8e4c9181c2175022 (diff)
Don't retry failed attempts to poll sync server
This commit doesn't change our intended behaviour much, but it does
change the details of how we achieve it.
A failed poll used to schedule a retry job. However, because that retry
was itself considered a POLL job, and we don't run POLL jobs while we're
in backoff, the scheduler would reject it. Nothing reset the backoff
timer after that, so this rejection happened at most once per failed
poll attempt.
This change makes it so we never back off in response to a failed poll.
This might matter in some unusual edge cases (e.g. a poll recently
failed and then we receive an IP address change notification), but any
behavioural change there should be an improvement, and the common use
cases are unaffected. Users will likely never notice the effects of this
change.
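In the same simplified model as the sketch above (again with
hypothetical names; the actual change is the early return at the top of
ScheduleNextSync() in the first hunk of the diff below), the post-patch
decision reduces to:

enum class JobPurpose { NUDGE, POLL };

// Post-patch policy sketch: failed polls are dropped outright; only
// failed nudges feed the backoff/retry machinery.
void ScheduleNextSync(JobPurpose failed_job_purpose) {
  if (failed_job_purpose == JobPurpose::POLL)
    return;  // Never back off or retry for a failed poll; the next
             // regularly scheduled poll will simply try again.
  // Failed NUDGE jobs still start or extend backoff, and canary jobs
  // (e.g. after an IP address change) may still run during backoff.
}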
Our tests, however, do care about these sorts of details. This commit
fixes some existing tests and adds one new one.
BUG=112954
TEST=SyncSchedulerTest in sync_unit_test
Review URL: http://codereview.chromium.org/9372051
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@124034 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--  chrome/browser/sync/engine/sync_scheduler.cc           | 25
-rw-r--r--  chrome/browser/sync/engine/sync_scheduler.h            |  1
-rw-r--r--  chrome/browser/sync/engine/sync_scheduler_unittest.cc  | 90
3 files changed, 78 insertions(+), 38 deletions(-)
diff --git a/chrome/browser/sync/engine/sync_scheduler.cc b/chrome/browser/sync/engine/sync_scheduler.cc
index f98341c..eaee0d4 100644
--- a/chrome/browser/sync/engine/sync_scheduler.cc
+++ b/chrome/browser/sync/engine/sync_scheduler.cc
@@ -899,33 +899,36 @@ void SyncScheduler::ScheduleNextSync(const SyncSessionJob& old_job) {
     return;
   }
 
+  if (old_job.purpose == SyncSessionJob::POLL) {
+    return;  // We don't retry POLL jobs.
+  }
+
   // TODO(rlarocque): There's no reason why we should blindly backoff and retry
   // if we don't succeed. Some types of errors are not likely to disappear on
   // their own. With the return values now available in the old_job.session, we
   // should be able to detect such errors and only retry when we detect
   // transient errors.
 
-  // We are in backoff mode and our time did not run out. That means we had
-  // a local change, notification from server or a network connection change
-  // notification. In any case set had_nudge = true so we dont retry next
-  // nudge. Note: we will keep retrying network connection changes though as
-  // they are treated as canary jobs. Also we check the mode here because
-  // we want to do this only in normal mode. For config mode jobs we dont
-  // have anything similar to had_nudge.
   if (IsBackingOff() && wait_interval_->timer.IsRunning() &&
       mode_ == NORMAL_MODE) {
+    // When in normal mode, we allow up to one nudge per backoff interval. It
+    // appears that this was our nudge for this interval, and it failed.
+    //
+    // Note: This does not prevent us from running canary jobs. For example, an
+    // IP address change might still result in another nudge being executed
+    // during this backoff interval.
     SDVLOG(2) << "A nudge during backoff failed";
-    // We weren't continuing but we're in backoff; must have been a nudge.
+
     DCHECK_EQ(SyncSessionJob::NUDGE, old_job.purpose);
     DCHECK(!wait_interval_->had_nudge);
+
     wait_interval_->had_nudge = true;
-    // Old job did not finish. So make it the pending job.
     InitOrCoalescePendingJob(old_job);
-    // Resume waiting.
     RestartWaiting();
   } else {
+    // Either this is the first failure or a consecutive failure after our
+    // backoff timer expired. We handle it the same way in either case.
     SDVLOG(2) << "Non-'backoff nudge' SyncShare job failed";
-    // We don't seem to have made forward progress. Start or extend backoff.
     HandleContinuationError(old_job);
   }
 }
diff --git a/chrome/browser/sync/engine/sync_scheduler.h b/chrome/browser/sync/engine/sync_scheduler.h
index ef86364..ecf3002 100644
--- a/chrome/browser/sync/engine/sync_scheduler.h
+++ b/chrome/browser/sync/engine/sync_scheduler.h
@@ -197,6 +197,7 @@ class SyncScheduler : public sessions::SyncSession::Delegate {
                            ContinueCanaryJobConfig);
   FRIEND_TEST_ALL_PREFIXES(SyncSchedulerWhiteboxTest,
                            ContinueNudgeWhileExponentialBackOff);
+  FRIEND_TEST_ALL_PREFIXES(SyncSchedulerTest, TransientPollFailure);
 
   // A component used to get time delays associated with exponential backoff.
   // Encapsulated into a class to facilitate testing.
diff --git a/chrome/browser/sync/engine/sync_scheduler_unittest.cc b/chrome/browser/sync/engine/sync_scheduler_unittest.cc
index 99799fb..a6c48d6 100644
--- a/chrome/browser/sync/engine/sync_scheduler_unittest.cc
+++ b/chrome/browser/sync/engine/sync_scheduler_unittest.cc
@@ -859,16 +859,14 @@ TEST_F(SyncSchedulerTest, BackoffDropsJobs) {
   StartSyncScheduler(SyncScheduler::NORMAL_MODE);
   RunLoop();
 
-  // Run again to wait for polling.
+  // This nudge should fail and put us into backoff. Thanks to our mock
+  // GetDelay() setup above, this will be a long backoff.
+  scheduler()->ScheduleNudge(zero(), NUDGE_SOURCE_LOCAL, types, FROM_HERE);
   RunLoop();
 
-  // Pump loop to get rid of nudge.
-  PumpLoop();
-
   Mock::VerifyAndClearExpectations(syncer());
   ASSERT_EQ(1U, r.snapshots.size());
-  EXPECT_EQ(GetUpdatesCallerInfo::PERIODIC,
-            r.snapshots[0]->source.updates_source);
+  EXPECT_EQ(GetUpdatesCallerInfo::LOCAL, r.snapshots[0]->source.updates_source);
 
   EXPECT_CALL(*syncer(), SyncShare(_,_,_)).Times(1)
       .WillOnce(DoAll(Invoke(sessions::test_util::SimulateCommitFailed),
@@ -876,15 +874,13 @@ TEST_F(SyncSchedulerTest, BackoffDropsJobs) {
 
   // We schedule a nudge with enough delay (10X poll interval) that at least
   // one or two polls would have taken place. The nudge should succeed.
-  scheduler()->ScheduleNudge(
-      poll * 10, NUDGE_SOURCE_LOCAL, types, FROM_HERE);
+  scheduler()->ScheduleNudge(poll * 10, NUDGE_SOURCE_LOCAL, types, FROM_HERE);
   RunLoop();
 
   Mock::VerifyAndClearExpectations(syncer());
   Mock::VerifyAndClearExpectations(delay());
   ASSERT_EQ(2U, r.snapshots.size());
-  EXPECT_EQ(GetUpdatesCallerInfo::LOCAL,
-            r.snapshots[1]->source.updates_source);
+  EXPECT_EQ(GetUpdatesCallerInfo::LOCAL, r.snapshots[1]->source.updates_source);
 
   EXPECT_CALL(*syncer(), SyncShare(_,_,_)).Times(0);
   EXPECT_CALL(*delay(), GetDelay(_)).Times(0);
@@ -947,47 +943,87 @@ TEST_F(SyncSchedulerTest, BackoffElevation) {
   EXPECT_GE(r.times[4] - r.times[3], fifth);
 }
 
-// Test that things go back to normal once a canary task makes forward progress
-// following a succession of failures.
-//
-// TODO(rlarocque, 112954): The code this was intended to test is broken.
-TEST_F(SyncSchedulerTest, DISABLED_BackoffRelief) {
+// Test that things go back to normal once a retry makes forward progress.
+TEST_F(SyncSchedulerTest, BackoffRelief) {
   SyncShareRecords r;
   const TimeDelta poll(TimeDelta::FromMilliseconds(10));
   scheduler()->OnReceivedLongPollIntervalUpdate(poll);
   UseMockDelayProvider();
 
-  const TimeDelta backoff = TimeDelta::FromMilliseconds(100);
+  const TimeDelta backoff = TimeDelta::FromMilliseconds(5);
 
   EXPECT_CALL(*syncer(), SyncShare(_,_,_))
-      .WillOnce(Invoke(sessions::test_util::SimulateCommitFailed))
-      .WillOnce(Invoke(sessions::test_util::SimulateCommitFailed))
+      .WillOnce(DoAll(Invoke(sessions::test_util::SimulateCommitFailed),
+                      RecordSyncShareMultiple(&r, kMinNumSamples)))
       .WillRepeatedly(DoAll(Invoke(sessions::test_util::SimulateSuccess),
-                            RecordSyncShareMultiple(&r, kMinNumSamples)));
+                            RecordSyncShareMultiple(&r, kMinNumSamples)));
   EXPECT_CALL(*delay(), GetDelay(_)).WillOnce(Return(backoff));
 
   // Optimal start for the post-backoff poll party.
-  TimeTicks optimal_start = TimeTicks::Now() + poll + backoff;
+  TimeTicks optimal_start = TimeTicks::Now();
 
   StartSyncScheduler(SyncScheduler::NORMAL_MODE);
   RunLoop();
 
   // Run again to wait for polling.
+  scheduler()->ScheduleNudge(zero(), NUDGE_SOURCE_LOCAL,
+                             ModelTypeSet(), FROM_HERE);
   RunLoop();
 
   StopSyncScheduler();
 
-  // Check for healthy polling after backoff is relieved.
-  // Can't use AnalyzePollRun because first sync is a continuation. Bleh.
-  for (size_t i = 0; i < r.times.size(); i++) {
+  EXPECT_EQ(kMinNumSamples, r.times.size());
+
+  // The first nudge ran as soon as possible. It failed.
+  TimeTicks optimal_job_time = optimal_start;
+  EXPECT_GE(r.times[0], optimal_job_time);
+  EXPECT_EQ(GetUpdatesCallerInfo::LOCAL,
+            r.snapshots[0]->source.updates_source);
+
+  // It was followed by a successful retry nudge shortly afterward.
+  optimal_job_time = optimal_job_time + backoff;
+  EXPECT_GE(r.times[1], optimal_job_time);
+  EXPECT_EQ(GetUpdatesCallerInfo::LOCAL,
+            r.snapshots[1]->source.updates_source);
+
+  // After that, we went back to polling.
+  for (size_t i = 2; i < r.snapshots.size(); i++) {
+    optimal_job_time = optimal_job_time + poll;
     SCOPED_TRACE(testing::Message() << "SyncShare # (" << i << ")");
-    TimeTicks optimal_next_sync = optimal_start + poll * i;
-    EXPECT_GE(r.times[i], optimal_next_sync);
-    EXPECT_EQ(i == 0 ? GetUpdatesCallerInfo::SYNC_CYCLE_CONTINUATION
-                     : GetUpdatesCallerInfo::PERIODIC,
+    EXPECT_GE(r.times[i], optimal_job_time);
+    EXPECT_EQ(GetUpdatesCallerInfo::PERIODIC,
               r.snapshots[i]->source.updates_source);
   }
 }
 
+// Test that poll failures are ignored. They should have no effect on
+// subsequent poll attempts, nor should they trigger a backoff/retry.
+TEST_F(SyncSchedulerTest, TransientPollFailure) {
+  SyncShareRecords r;
+  const TimeDelta poll_interval(TimeDelta::FromMilliseconds(10));
+  scheduler()->OnReceivedLongPollIntervalUpdate(poll_interval);
+  UseMockDelayProvider();  // Will cause test failure if backoff is initiated.
+
+  EXPECT_CALL(*syncer(), SyncShare(_,_,_))
+      .WillOnce(DoAll(Invoke(sessions::test_util::SimulateCommitFailed),
+                      RecordSyncShare(&r)))
+      .WillOnce(DoAll(Invoke(sessions::test_util::SimulateSuccess),
+                      RecordSyncShare(&r)));
+
+  StartSyncScheduler(SyncScheduler::NORMAL_MODE);
+  RunLoop();
+
+  // Run the unsuccessful poll. The failed poll should not trigger backoff.
+  RunLoop();
+  EXPECT_FALSE(scheduler()->IsBackingOff());
+
+  // Run the successful poll.
+  RunLoop();
+  EXPECT_FALSE(scheduler()->IsBackingOff());
+
+  // Verify that the two SyncShare() calls were made one poll interval apart.
+  ASSERT_EQ(2U, r.snapshots.size());
+  EXPECT_GE(r.times[1] - r.times[0], poll_interval);
+}
+
 TEST_F(SyncSchedulerTest, GetRecommendedDelay) {
   EXPECT_LE(TimeDelta::FromSeconds(0),
             SyncScheduler::GetRecommendedDelay(TimeDelta::FromSeconds(0)));
+ optimal_job_time = optimal_job_time + backoff; + EXPECT_GE(r.times[1], optimal_job_time); + EXPECT_EQ(GetUpdatesCallerInfo::LOCAL, + r.snapshots[1]->source.updates_source); + // After that, we went back to polling. + for (size_t i = 2; i < r.snapshots.size(); i++) { + optimal_job_time = optimal_job_time + poll; SCOPED_TRACE(testing::Message() << "SyncShare # (" << i << ")"); - TimeTicks optimal_next_sync = optimal_start + poll * i; - EXPECT_GE(r.times[i], optimal_next_sync); - EXPECT_EQ(i == 0 ? GetUpdatesCallerInfo::SYNC_CYCLE_CONTINUATION - : GetUpdatesCallerInfo::PERIODIC, + EXPECT_GE(r.times[i], optimal_job_time); + EXPECT_EQ(GetUpdatesCallerInfo::PERIODIC, r.snapshots[i]->source.updates_source); } } +// Test that poll failures are ignored. They should have no effect on +// subsequent poll attempts, nor should they trigger a backoff/retry. +TEST_F(SyncSchedulerTest, TransientPollFailure) { + SyncShareRecords r; + const TimeDelta poll_interval(TimeDelta::FromMilliseconds(10)); + scheduler()->OnReceivedLongPollIntervalUpdate(poll_interval); + UseMockDelayProvider(); // Will cause test failure if backoff is initiated. + + EXPECT_CALL(*syncer(), SyncShare(_,_,_)) + .WillOnce(DoAll(Invoke(sessions::test_util::SimulateCommitFailed), + RecordSyncShare(&r))) + .WillOnce(DoAll(Invoke(sessions::test_util::SimulateSuccess), + RecordSyncShare(&r))); + + StartSyncScheduler(SyncScheduler::NORMAL_MODE); + RunLoop(); + + // Run the unsucessful poll. The failed poll should not trigger backoff. + RunLoop(); + EXPECT_FALSE(scheduler()->IsBackingOff()); + + // Run the successful poll. + RunLoop(); + EXPECT_FALSE(scheduler()->IsBackingOff()); + + // Verify the the two SyncShare() calls were made one poll interval apart. + ASSERT_EQ(2U, r.snapshots.size()); + EXPECT_GE(r.times[1] - r.times[0], poll_interval); +} + TEST_F(SyncSchedulerTest, GetRecommendedDelay) { EXPECT_LE(TimeDelta::FromSeconds(0), SyncScheduler::GetRecommendedDelay(TimeDelta::FromSeconds(0))); |