author     robertocn <robertocn@chromium.org>      2014-10-23 14:33:26 -0700
committer  Commit bot <commit-bot@chromium.org>    2014-10-23 21:33:40 +0000
commit     d0efba359b185fe140635b6269442a81e49e4c69 (patch)
tree       94a44adf51aad9ae621068f25694ca7909ae81fb /tools
parent     63b390d81aa08c4c490522cc7aefbd282fb33bc7 (diff)
Re-applying reverted changes plus fix. ConfidenceScore now takes flat
lists and the previously offending code now checks whether the lists need
to be flattened before being passed to the function.

BUG=422727

Review URL: https://codereview.chromium.org/644323002
Cr-Commit-Position: refs/heads/master@{#300196}

Review URL: https://codereview.chromium.org/665893003
Cr-Commit-Position: refs/heads/master@{#300953}
Diffstat (limited to 'tools')
-rwxr-xr-x  tools/auto_bisect/bisect_perf_regression.py       75
-rw-r--r--  tools/auto_bisect/bisect_perf_regression_test.py  93
-rw-r--r--  tools/auto_bisect/bisect_results.py               19
-rw-r--r--  tools/auto_bisect/bisect_results_test.py           18
4 files changed, 178 insertions, 27 deletions
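
The heart of the change is the caller-side flattening check described in the commit message: results may arrive either as a flat list of numbers or as a list of single-element per-run lists, and only the latter is flattened before being handed to ConfidenceScore. A minimal standalone sketch of that idiom (the helper name _flatten_if_needed is illustrative, not part of the patch):

def _flatten_if_needed(results):
  # Flatten a list of per-run result lists, e.g. [[1.0], [2.0]] -> [1.0, 2.0];
  # an already-flat list of numbers is passed through unchanged.
  if isinstance(results, list) and all(isinstance(x, list) for x in results):
    return sum(results, [])
  return results

assert _flatten_if_needed([[16.886], [16.909], [16.99]]) == [16.886, 16.909, 16.99]
assert _flatten_if_needed([16.886, 16.909, 16.99]) == [16.886, 16.909, 16.99]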
diff --git a/tools/auto_bisect/bisect_perf_regression.py b/tools/auto_bisect/bisect_perf_regression.py
index f90074f..0fd8534 100755
--- a/tools/auto_bisect/bisect_perf_regression.py
+++ b/tools/auto_bisect/bisect_perf_regression.py
@@ -75,6 +75,10 @@ MAX_MAC_BUILD_TIME = 14400
MAX_WIN_BUILD_TIME = 14400
MAX_LINUX_BUILD_TIME = 14400
+# The confidence percentage we require to consider the initial range a
+# regression based on the test results of the initial good and bad revisions.
+REGRESSION_CONFIDENCE = 95
+
# Patch template to add a new file, DEPS.sha, under the src folder.
# This file contains the SHA1 value of the DEPS changes made while bisecting
# dependency repositories. The patch is sent along with the DEPS patch to the try server.
@@ -89,6 +93,23 @@ new file mode 100644
+%(deps_sha)s
"""
+REGRESSION_CONFIDENCE_ERROR_TEMPLATE = """
+We could not reproduce the regression with enough confidence for this
+test/metric/platform combination.
+
+Here are the results for the initial revision range:
+'Good' revision: {good_rev}
+\tmean: {good_mean}
+\tstd.err.: {good_std_err}
+\tsample size: {good_sample_size}
+'Bad' revision: {bad_rev}
+\tmean: {bad_mean}
+\tstd.err.: {bad_std_err}
+\tsample size: {bad_sample_size}
+
+NOTE: There's still a chance that this is actually a regression, but you may
+ need to bisect a different platform."""
+
# Git branch name used to run bisect try jobs.
BISECT_TRYJOB_BRANCH = 'bisect-tryjob'
# Git master branch name.
@@ -589,6 +610,46 @@ def _GenerateProfileIfNecessary(command_args):
return True
+def _CheckRegressionConfidenceError(
+ good_revision,
+ bad_revision,
+ known_good_value,
+ known_bad_value):
+ """Checks whether we can be confident beyond a certain degree that the given
+ metrics represent a regression.
+
+ Args:
+ good_revision: string representing the commit considered 'good'
+ bad_revision: Same as above for 'bad'.
+ known_good_value: A dict with at least: 'values', 'mean' and 'std_err'
+ known_bad_value: Same as above.
+
+ Returns:
+ False if there is no error (i.e. we can be confident there's a regressioni),
+ a string containing the details of the lack of confidence otherwise.
+ """
+ error = False
+ # Add the good and bad values to a parameter list, flattening nested
+ # result lists if needed.
+ confidence_params = []
+ for values in [known_bad_value['values'], known_good_value['values']]:
+ if isinstance(values, list) and all(isinstance(x, list) for x in values):
+ confidence_params.append(sum(values, []))
+ else:
+ confidence_params.append(values)
+ regression_confidence = BisectResults.ConfidenceScore(*confidence_params)
+ if regression_confidence < REGRESSION_CONFIDENCE:
+ error = REGRESSION_CONFIDENCE_ERROR_TEMPLATE.format(
+ good_rev=good_revision,
+ good_mean=known_good_value['mean'],
+ good_std_err=known_good_value['std_err'],
+ good_sample_size=len(known_good_value['values']),
+ bad_rev=bad_revision,
+ bad_mean=known_bad_value['mean'],
+ bad_std_err=known_bad_value['std_err'],
+ bad_sample_size=len(known_bad_value['values']))
+ return error
+
class DepotDirectoryRegistry(object):
def __init__(self, src_cwd):
@@ -2217,6 +2278,15 @@ class BisectPerformanceMetrics(object):
min_revision = 0
max_revision = len(revision_states) - 1
+ # Check how likely it is that the good and bad results are different
+ # beyond chance-induced variation.
+ if not self.opts.debug_ignore_regression_confidence:
+ error = _CheckRegressionConfidenceError(good_revision,
+ bad_revision,
+ known_good_value,
+ known_bad_value)
+ if error:
+ return BisectResults(error=error)
# Can just mark the good and bad revisions explicitly here since we
# already know the results.
@@ -2425,6 +2495,7 @@ class BisectOptions(object):
self.debug_ignore_build = None
self.debug_ignore_sync = None
self.debug_ignore_perf_test = None
+ self.debug_ignore_regression_confidence = None
self.debug_fake_first_test_mean = 0
self.gs_bucket = None
self.target_arch = 'ia32'
@@ -2593,6 +2664,10 @@ class BisectOptions(object):
group.add_option('--debug_ignore_perf_test',
action='store_true',
help='DEBUG: Don\'t perform performance tests.')
+ group.add_option('--debug_ignore_regression_confidence',
+ action='store_true',
+ help='DEBUG: Don\'t score the confidence of the initial '
+ 'good and bad revisions\' test results.')
group.add_option('--debug_fake_first_test_mean',
type='int',
default='0',
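
Taken together, the additions above gate the bisect on an initial confidence check before any bisecting starts. A rough usage sketch of the new check on its own, assuming tools/auto_bisect is on the Python path; the revision strings and result dicts are made-up sample values, not taken from the patch:

import bisect_perf_regression

# Hypothetical aggregated results for the initial 'good' and 'bad' revisions;
# each dict carries the keys the check expects: 'values', 'mean', 'std_err'.
known_good_value = {'values': [10.1, 10.2, 10.3], 'mean': 10.2, 'std_err': 0.06}
known_bad_value = {'values': [10.2, 10.1, 10.3], 'mean': 10.2, 'std_err': 0.06}

error = bisect_perf_regression._CheckRegressionConfidenceError(
    'abc123', 'def456', known_good_value, known_bad_value)
# With nearly identical samples the confidence score should fall below
# REGRESSION_CONFIDENCE (95), so error holds the formatted message and the
# bisect would return BisectResults(error=error) instead of bisecting.
print(error)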
diff --git a/tools/auto_bisect/bisect_perf_regression_test.py b/tools/auto_bisect/bisect_perf_regression_test.py
index 8b741cd..8df897b 100644
--- a/tools/auto_bisect/bisect_perf_regression_test.py
+++ b/tools/auto_bisect/bisect_perf_regression_test.py
@@ -18,17 +18,75 @@ import mock
import source_control
+# Regression confidence: 0%
+CLEAR_NON_REGRESSION = [
+ # Mean: 30.223 Std. Dev.: 11.383
+ [[16.886], [16.909], [16.99], [17.723], [17.952], [18.118], [19.028],
+ [19.552], [21.954], [38.573], [38.839], [38.965], [40.007], [40.572],
+ [41.491], [42.002], [42.33], [43.109], [43.238]],
+ # Mean: 34.76 Std. Dev.: 11.516
+ [[16.426], [17.347], [20.593], [21.177], [22.791], [27.843], [28.383],
+ [28.46], [29.143], [40.058], [40.303], [40.558], [41.918], [42.44],
+ [45.223], [46.494], [50.002], [50.625], [50.839]]
+]
+# Regression confidence: ~ 90%
+ALMOST_REGRESSION = [
+ # Mean: 30.042 Std. Dev.: 2.002
+ [[26.146], [28.04], [28.053], [28.074], [28.168], [28.209], [28.471],
+ [28.652], [28.664], [30.862], [30.973], [31.002], [31.897], [31.929],
+ [31.99], [32.214], [32.323], [32.452], [32.696]],
+ # Mean: 33.008 Std. Dev.: 4.265
+ [[34.963], [30.741], [39.677], [39.512], [34.314], [31.39], [34.361],
+ [25.2], [30.489], [29.434]]
+]
+# Regression confidence: ~ 98%
+BARELY_REGRESSION = [
+ # Mean: 28.828 Std. Dev.: 1.993
+ [[26.96], [27.605], [27.768], [27.829], [28.006], [28.206], [28.393],
+ [28.911], [28.933], [30.38], [30.462], [30.808], [31.74], [31.805],
+ [31.899], [32.077], [32.454], [32.597], [33.155]],
+ # Mean: 31.156 Std. Dev.: 1.980
+ [[28.729], [29.112], [29.258], [29.454], [29.789], [30.036], [30.098],
+ [30.174], [30.534], [32.285], [32.295], [32.552], [32.572], [32.967],
+ [33.165], [33.403], [33.588], [33.744], [34.147], [35.84]]
+]
+# Regression confidence: 99.5%
+CLEAR_REGRESSION = [
+ # Mean: 30.254 Std. Dev.: 2.987
+ [[26.494], [26.621], [26.701], [26.997], [26.997], [27.05], [27.37],
+ [27.488], [27.556], [31.846], [32.192], [32.21], [32.586], [32.596],
+ [32.618], [32.95], [32.979], [33.421], [33.457], [34.97]],
+ # Mean: 33.190 Std. Dev.: 2.972
+ [[29.547], [29.713], [29.835], [30.132], [30.132], [30.33], [30.406],
+ [30.592], [30.72], [34.486], [35.247], [35.253], [35.335], [35.378],
+ [35.934], [36.233], [36.41], [36.947], [37.982]]
+]
# Default options for the dry run
DEFAULT_OPTIONS = {
'debug_ignore_build': True,
'debug_ignore_sync': True,
'debug_ignore_perf_test': True,
+ 'debug_ignore_regression_confidence': True,
'command': 'fake_command',
'metric': 'fake/metric',
'good_revision': 280000,
'bad_revision': 280005,
}
+# This global is a placeholder for a generator to be defined by the test cases
+# that use _MockRunTests.
+_MockResultsGenerator = (x for x in [])
+
+def _FakeTestResult(values):
+ result_dict = {'mean': 0.0, 'std_err': 0.0, 'std_dev': 0.0, 'values': values}
+ success_code = 0
+ return (result_dict, success_code)
+
+
+def _MockRunTests(*args, **kwargs):
+ _, _ = args, kwargs
+ return _FakeTestResult(_MockResultsGenerator.next())
+
def _GetBisectPerformanceMetricsInstance(options_dict):
"""Returns an instance of the BisectPerformanceMetrics class."""
@@ -36,12 +94,13 @@ def _GetBisectPerformanceMetricsInstance(options_dict):
return bisect_perf_regression.BisectPerformanceMetrics(opts)
-def _GetExtendedOptions(d, f):
+def _GetExtendedOptions(improvement_dir, fake_first, ignore_confidence=True):
"""Returns the a copy of the default options dict plus some options."""
result = dict(DEFAULT_OPTIONS)
result.update({
- 'improvement_direction': d,
- 'debug_fake_first_test_mean': f})
+ 'improvement_direction': improvement_dir,
+ 'debug_fake_first_test_mean': fake_first,
+ 'debug_ignore_regression_confidence': ignore_confidence})
return result
@@ -233,11 +292,11 @@ class BisectPerfRegressionTest(unittest.TestCase):
def testBisectImprovementDirectionFails(self):
"""Dry run of a bisect with an improvement instead of regression."""
-
# Test result goes from 0 to 100 where higher is better
results = _GenericDryRun(_GetExtendedOptions(1, 100))
self.assertIsNotNone(results.error)
self.assertIn('not a regression', results.error)
+
# Test result goes from 0 to -100 where lower is better
results = _GenericDryRun(_GetExtendedOptions(-1, -100))
self.assertIsNotNone(results.error)
@@ -252,6 +311,32 @@ class BisectPerfRegressionTest(unittest.TestCase):
results = _GenericDryRun(_GetExtendedOptions(1, -100))
self.assertIsNone(results.error)
+ @mock.patch('bisect_perf_regression.BisectPerformanceMetrics.'
+ 'RunPerformanceTestAndParseResults', _MockRunTests)
+ def testBisectStopsOnDoubtfulRegression(self):
+ global _MockResultsGenerator
+ _MockResultsGenerator = (rs for rs in CLEAR_NON_REGRESSION)
+ results = _GenericDryRun(_GetExtendedOptions(0, 0, False))
+ self.assertIsNotNone(results.error)
+ self.assertIn('could not reproduce the regression', results.error)
+
+ _MockResultsGenerator = (rs for rs in ALMOST_REGRESSION)
+ results = _GenericDryRun(_GetExtendedOptions(0, 0, False))
+ self.assertIsNotNone(results.error)
+ self.assertIn('could not reproduce the regression', results.error)
+
+ @mock.patch('bisect_perf_regression.BisectPerformanceMetrics.'
+ 'RunPerformanceTestAndParseResults', _MockRunTests)
+ def testBisectContinuesOnClearRegression(self):
+ global _MockResultsGenerator
+ _MockResultsGenerator = (rs for rs in CLEAR_REGRESSION)
+ with self.assertRaises(StopIteration):
+ _GenericDryRun(_GetExtendedOptions(0, 0, False))
+
+ _MockResultsGenerator = (rs for rs in BARELY_REGRESSION)
+ with self.assertRaises(StopIteration):
+ _GenericDryRun(_GetExtendedOptions(0, 0, False))
+
def testGetCommitPosition(self):
cp_git_rev = '7017a81991de983e12ab50dfc071c70e06979531'
self.assertEqual(291765, source_control.GetCommitPosition(cp_git_rev))
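
The new tests feed canned results to the bisect through a module-level generator that the mocked runner consumes one call at a time; exhausting it raises StopIteration, which testBisectContinuesOnClearRegression uses as a signal that the bisect got past the initial confidence check. A stripped-down sketch of that pattern, independent of the bisect code (all names here are illustrative):

# Module-level generator, rebound by each test before the fake runner is used.
_mock_results = (r for r in [])

def _fake_run_tests(*args, **kwargs):
  # Mirrors _MockRunTests above: every call consumes the next canned result.
  # (The real tests use the Python 2 spelling _MockResultsGenerator.next().)
  return {'values': next(_mock_results)}, 0

_mock_results = (r for r in [[1.0, 1.1], [2.0, 2.2]])
print(_fake_run_tests())  # ({'values': [1.0, 1.1]}, 0)
print(_fake_run_tests())  # ({'values': [2.0, 2.2]}, 0)
# A third call would raise StopIteration, just as a dry-run bisect does once
# the canned samples for the initial revisions have been used up.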
diff --git a/tools/auto_bisect/bisect_results.py b/tools/auto_bisect/bisect_results.py
index 9bcaeb6..281afe6 100644
--- a/tools/auto_bisect/bisect_results.py
+++ b/tools/auto_bisect/bisect_results.py
@@ -109,7 +109,7 @@ class BisectResults(object):
return warnings
@staticmethod
- def ConfidenceScore(good_results_lists, bad_results_lists,
+ def ConfidenceScore(sample1, sample2,
accept_single_bad_or_good=False):
"""Calculates a confidence score.
@@ -119,8 +119,8 @@ class BisectResults(object):
Args:
- good_results_lists: A list of lists of "good" result numbers.
- bad_results_lists: A list of lists of "bad" result numbers.
+ sample1: A flat list of "good" result numbers.
+ sample2: A flat list of "bad" result numbers.
accept_single_bad_or_good: If True, computes confidence even if there is
just one bad or good revision, otherwise single good or bad revision
always returns 0.0 confidence. This flag will probably go away when
@@ -134,13 +134,9 @@ class BisectResults(object):
# classified good or bad; this isn't good enough evidence to make a
# decision. If an empty list was passed, that also implies zero confidence.
if not accept_single_bad_or_good:
- if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
+ if len(sample1) <= 1 or len(sample2) <= 1:
return 0.0
- # Flatten the lists of results lists.
- sample1 = sum(good_results_lists, [])
- sample2 = sum(bad_results_lists, [])
-
# If there were only empty lists in either of the lists (this is unexpected
# and normally shouldn't happen), then we also want to return 0.
if not sample1 or not sample2:
@@ -171,7 +167,9 @@ class BisectResults(object):
if revision_state.value:
current_values = revision_state.value['values']
if previous_values:
- confidence = cls.ConfidenceScore(previous_values, [current_values],
+ confidence_params = (sum(previous_values, []),
+ sum([current_values], []))
+ confidence = cls.ConfidenceScore(*confidence_params,
accept_single_bad_or_good=True)
mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
mean_of_current_runs = math_utils.Mean(current_values)
@@ -253,7 +251,8 @@ class BisectResults(object):
# Give a "confidence" in the bisect. At the moment we use how distinct the
# values are before and after the last broken revision, and how noisy the
# overall graph is.
- confidence = cls.ConfidenceScore(working_means, broken_means)
+ confidence_params = (sum(working_means, []), sum(broken_means, []))
+ confidence = cls.ConfidenceScore(*confidence_params)
bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
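
With the new signature, ConfidenceScore expects flat samples and callers do the flattening, as in the two call sites above. A short sketch of the resulting contract, assuming bisect_results is importable (e.g. when run from tools/auto_bisect); the numbers are arbitrary:

from bisect_results import BisectResults

good_runs = [[16.886], [16.909], [16.99]]   # nested per-run results
bad_runs = [[29.547], [29.713], [29.835]]

# Callers flatten before scoring; ConfidenceScore no longer does it.
score = BisectResults.ConfidenceScore(sum(good_runs, []), sum(bad_runs, []))
print(score)

# A sample with zero or one measurement still scores 0.0 confidence unless
# accept_single_bad_or_good is set, matching the updated tests below.
assert BisectResults.ConfidenceScore([1.0], [1.0, 2.0]) == 0.0
assert BisectResults.ConfidenceScore([], [1.0, 2.0]) == 0.0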
diff --git a/tools/auto_bisect/bisect_results_test.py b/tools/auto_bisect/bisect_results_test.py
index 4b77806..25359c5 100644
--- a/tools/auto_bisect/bisect_results_test.py
+++ b/tools/auto_bisect/bisect_results_test.py
@@ -87,22 +87,14 @@ class BisectResultsTest(unittest.TestCase):
bad_values: First list of numbers.
good_values: Second list of numbers.
"""
- # ConfidenceScore takes a list of lists but these lists are flattened
- # inside the function.
- confidence = BisectResults.ConfidenceScore(
- [[v] for v in bad_values],
- [[v] for v in good_values])
+ confidence = BisectResults.ConfidenceScore(bad_values, good_values)
self.assertEqual(score, confidence)
def testConfidenceScoreIsZeroOnTooFewLists(self):
- self._AssertConfidence(0.0, [], [[1], [2]])
- self._AssertConfidence(0.0, [[1], [2]], [])
- self._AssertConfidence(0.0, [[1]], [[1], [2]])
- self._AssertConfidence(0.0, [[1], [2]], [[1]])
-
- def testConfidenceScoreIsZeroOnEmptyLists(self):
- self.assertEqual(BisectResults.ConfidenceScore([[], []], [[1], [2]]), 0.0)
- self.assertEqual(BisectResults.ConfidenceScore([[1], [2]], [[], []]), 0.0)
+ self._AssertConfidence(0.0, [], [1, 2])
+ self._AssertConfidence(0.0, [1, 2], [])
+ self._AssertConfidence(0.0, [1], [1, 2])
+ self._AssertConfidence(0.0, [1, 2], [1])
def testConfidenceScore_ZeroConfidence(self):
# The good and bad sets contain the same values, so the confidence that