author     robertocn <robertocn@chromium.org>      2014-10-23 14:33:26 -0700
committer  Commit bot <commit-bot@chromium.org>    2014-10-23 21:33:40 +0000
commit     d0efba359b185fe140635b6269442a81e49e4c69 (patch)
tree       94a44adf51aad9ae621068f25694ca7909ae81fb /tools
parent     63b390d81aa08c4c490522cc7aefbd282fb33bc7 (diff)
Re-applying reverted changes plus fix. ConfidenceScore now takes flat
lists and the previously offending code now checks whether the lists need
to be flattened before being passed to the function.

BUG=422727

Review URL: https://codereview.chromium.org/644323002
Cr-Commit-Position: refs/heads/master@{#300196}

Review URL: https://codereview.chromium.org/665893003
Cr-Commit-Position: refs/heads/master@{#300953}
Diffstat (limited to 'tools')
-rwxr-xr-x  tools/auto_bisect/bisect_perf_regression.py       75
-rw-r--r--  tools/auto_bisect/bisect_perf_regression_test.py  93
-rw-r--r--  tools/auto_bisect/bisect_results.py               19
-rw-r--r--  tools/auto_bisect/bisect_results_test.py           18
4 files changed, 178 insertions, 27 deletions
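
The heart of the change is the caller-side flattening check described in the commit message: results may arrive either as a flat list of numbers or as a list of single-element per-run lists, and only the latter is flattened before being handed to ConfidenceScore. A minimal standalone sketch of that idiom (the helper name _flatten_if_needed is illustrative, not part of the patch):

def _flatten_if_needed(results):
  # Flatten a list of per-run result lists, e.g. [[1.0], [2.0]] -> [1.0, 2.0];
  # an already-flat list of numbers is passed through unchanged.
  if isinstance(results, list) and all(isinstance(x, list) for x in results):
    return sum(results, [])
  return results

assert _flatten_if_needed([[16.886], [16.909], [16.99]]) == [16.886, 16.909, 16.99]
assert _flatten_if_needed([16.886, 16.909, 16.99]) == [16.886, 16.909, 16.99]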
diff --git a/tools/auto_bisect/bisect_perf_regression.py b/tools/auto_bisect/bisect_perf_regression.py
index f90074f..0fd8534 100755
--- a/tools/auto_bisect/bisect_perf_regression.py
+++ b/tools/auto_bisect/bisect_perf_regression.py
@@ -75,6 +75,10 @@ MAX_MAC_BUILD_TIME = 14400
MAX_WIN_BUILD_TIME = 14400
MAX_LINUX_BUILD_TIME = 14400
+# The confidence percentage we require to consider the initial range a
+# regression based on the test results of the initial good and bad revisions.
+REGRESSION_CONFIDENCE = 95
+
# Patch template to add a new file, DEPS.sha, under the src folder.
# This file contains the SHA1 value of the DEPS changes made while bisecting
# dependency repositories. The patch is sent along with the DEPS patch to the try server.
@@ -89,6 +93,23 @@ new file mode 100644
+%(deps_sha)s
"""
+REGRESSION_CONFIDENCE_ERROR_TEMPLATE = """
+We could not reproduce the regression with enough confidence for this
+test/metric/platform combination.
+
+Here are the results for the initial revision range:
+'Good' revision: {good_rev}
+\tmean: {good_mean}
+\tstd.err.: {good_std_err}
+\tsample size: {good_sample_size}
+'Bad' revision: {bad_rev}
+\tmean: {bad_mean}
+\tstd.err.: {bad_std_err}
+\tsample size: {bad_sample_size}
+
+NOTE: There's still a chance that this is actually a regression, but you may
+ need to bisect a different platform."""
+
# Git branch name used to run bisect try jobs.
BISECT_TRYJOB_BRANCH = 'bisect-tryjob'
# Git master branch name.
@@ -589,6 +610,46 @@ def _GenerateProfileIfNecessary(command_args):
return True
+def _CheckRegressionConfidenceError(
+ good_revision,
+ bad_revision,
+ known_good_value,
+ known_bad_value):
+ """Checks whether we can be confident beyond a certain degree that the given
+ metrics represent a regression.
+
+ Args:
+ good_revision: string representing the commit considered 'good'
+ bad_revision: Same as above for 'bad'.
+ known_good_value: A dict with at least: 'values', 'mean' and 'std_err'
+ known_bad_value: Same as above.
+
+ Returns:
+ False if there is no error (i.e. we can be confident there's a regressioni),
+ a string containing the details of the lack of confidence otherwise.
+ """
+ error = False
+ # Add the good and bad values to a parameter list, flattening nested
+ # result lists if needed.
+ confidence_params = []
+ for values in [known_bad_value['values'], known_good_value['values']]:
+ if isinstance(values, list) and all(isinstance(x, list) for x in values):
+ confidence_params.append(sum(values, []))
+ else:
+ confidence_params.append(values)
+ regression_confidence = BisectResults.ConfidenceScore(*confidence_params)
+ if regression_confidence < REGRESSION_CONFIDENCE:
+ error = REGRESSION_CONFIDENCE_ERROR_TEMPLATE.format(
+ good_rev=good_revision,
+ good_mean=known_good_value['mean'],
+ good_std_err=known_good_value['std_err'],
+ good_sample_size=len(known_good_value['values']),
+ bad_rev=bad_revision,
+ bad_mean=known_bad_value['mean'],
+ bad_std_err=known_bad_value['std_err'],
+ bad_sample_size=len(known_bad_value['values']))
+ return error
+
class DepotDirectoryRegistry(object):
def __init__(self, src_cwd):
@@ -2217,6 +2278,15 @@ class BisectPerformanceMetrics(object):
min_revision = 0
max_revision = len(revision_states) - 1
+ # Check how likely it is that the good and bad results are different
+ # beyond chance-induced variation.
+ if not self.opts.debug_ignore_regression_confidence:
+ error = _CheckRegressionConfidenceError(good_revision,
+ bad_revision,
+ known_good_value,
+ known_bad_value)
+ if error:
+ return BisectResults(error=error)
# Can just mark the good and bad revisions explicitly here since we
# already know the results.
@@ -2425,6 +2495,7 @@ class BisectOptions(object):
self.debug_ignore_build = None
self.debug_ignore_sync = None
self.debug_ignore_perf_test = None
+ self.debug_ignore_regression_confidence = None
self.debug_fake_first_test_mean = 0
self.gs_bucket = None
self.target_arch = 'ia32'
@@ -2593,6 +2664,10 @@ class BisectOptions(object):
group.add_option('--debug_ignore_perf_test',
action='store_true',
help='DEBUG: Don\'t perform performance tests.')
+ group.add_option('--debug_ignore_regression_confidence',
+ action='store_true',
+ help='DEBUG: Don\'t score the confidence of the initial '
+ 'good and bad revisions\' test results.')
group.add_option('--debug_fake_first_test_mean',
type='int',
default='0',
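
Taken together, the additions above gate the bisect on an initial confidence check before any bisecting starts. A rough usage sketch of the new check on its own, assuming tools/auto_bisect is on the Python path; the revision strings and result dicts are made-up sample values, not taken from the patch:

import bisect_perf_regression

# Hypothetical aggregated results for the initial 'good' and 'bad' revisions;
# each dict carries the keys the check expects: 'values', 'mean', 'std_err'.
known_good_value = {'values': [10.1, 10.2, 10.3], 'mean': 10.2, 'std_err': 0.06}
known_bad_value = {'values': [10.2, 10.1, 10.3], 'mean': 10.2, 'std_err': 0.06}

error = bisect_perf_regression._CheckRegressionConfidenceError(
    'abc123', 'def456', known_good_value, known_bad_value)
# With nearly identical samples the confidence score should fall below
# REGRESSION_CONFIDENCE (95), so error holds the formatted message and the
# bisect would return BisectResults(error=error) instead of bisecting.
print(error)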
diff --git a/tools/auto_bisect/bisect_perf_regression_test.py b/tools/auto_bisect/bisect_perf_regression_test.py
index 8b741cd..8df897b 100644
--- a/tools/auto_bisect/bisect_perf_regression_test.py
+++ b/tools/auto_bisect/bisect_perf_regression_test.py
@@ -18,17 +18,75 @@ import mock
import source_control
+# Regression confidence: 0%
+CLEAR_NON_REGRESSION = [
+ # Mean: 30.223 Std. Dev.: 11.383
+ [[16.886], [16.909], [16.99], [17.723], [17.952], [18.118], [19.028],
+ [19.552], [21.954], [38.573], [38.839], [38.965], [40.007], [40.572],
+ [41.491], [42.002], [42.33], [43.109], [43.238]],
+ # Mean: 34.76 Std. Dev.: 11.516
+ [[16.426], [17.347], [20.593], [21.177], [22.791], [27.843], [28.383],
+ [28.46], [29.143], [40.058], [40.303], [40.558], [41.918], [42.44],
+ [45.223], [46.494], [50.002], [50.625], [50.839]]
+]
+# Regression confidence: ~ 90%
+ALMOST_REGRESSION = [
+ # Mean: 30.042 Std. Dev.: 2.002
+ [[26.146], [28.04], [28.053], [28.074], [28.168], [28.209], [28.471],
+ [28.652], [28.664], [30.862], [30.973], [31.002], [31.897], [31.929],
+ [31.99], [32.214], [32.323], [32.452], [32.696]],
+ # Mean: 33.008 Std. Dev.: 4.265
+ [[34.963], [30.741], [39.677], [39.512], [34.314], [31.39], [34.361],
+ [25.2], [30.489], [29.434]]
+]
+# Regression confidence: ~ 98%
+BARELY_REGRESSION = [
+ # Mean: 28.828 Std. Dev.: 1.993
+ [[26.96], [27.605], [27.768], [27.829], [28.006], [28.206], [28.393],
+ [28.911], [28.933], [30.38], [30.462], [30.808], [31.74], [31.805],
+ [31.899], [32.077], [32.454], [32.597], [33.155]],
+ # Mean: 31.156 Std. Dev.: 1.980
+ [[28.729], [29.112], [29.258], [29.454], [29.789], [30.036], [30.098],
+ [30.174], [30.534], [32.285], [32.295], [32.552], [32.572], [32.967],
+ [33.165], [33.403], [33.588], [33.744], [34.147], [35.84]]
+]
+# Regression confidence: 99.5%
+CLEAR_REGRESSION = [
+ # Mean: 30.254 Std. Dev.: 2.987
+ [[26.494], [26.621], [26.701], [26.997], [26.997], [27.05], [27.37],
+ [27.488], [27.556], [31.846], [32.192], [32.21], [32.586], [32.596],
+ [32.618], [32.95], [32.979], [33.421], [33.457], [34.97]],
+ # Mean: 33.190 Std. Dev.: 2.972
+ [[29.547], [29.713], [29.835], [30.132], [30.132], [30.33], [30.406],
+ [30.592], [30.72], [34.486], [35.247], [35.253], [35.335], [35.378],
+ [35.934], [36.233], [36.41], [36.947], [37.982]]
+]
# Default options for the dry run
DEFAULT_OPTIONS = {
'debug_ignore_build': True,
'debug_ignore_sync': True,
'debug_ignore_perf_test': True,
+ 'debug_ignore_regression_confidence': True,
'command': 'fake_command',
'metric': 'fake/metric',
'good_revision': 280000,
'bad_revision': 280005,
}
+# This global is a placeholder for a generator to be defined by the test cases
+# that use _MockRunTests.
+_MockResultsGenerator = (x for x in [])
+
+def _FakeTestResult(values):
+ result_dict = {'mean': 0.0, 'std_err': 0.0, 'std_dev': 0.0, 'values': values}
+ success_code = 0
+ return (result_dict, success_code)
+
+
+def _MockRunTests(*args, **kwargs):
+ _, _ = args, kwargs
+ return _FakeTestResult(_MockResultsGenerator.next())
+
def _GetBisectPerformanceMetricsInstance(options_dict):
"""Returns an instance of the BisectPerformanceMetrics class."""
@@ -36,12 +94,13 @@ def _GetBisectPerformanceMetricsInstance(options_dict):
return bisect_perf_regression.BisectPerformanceMetrics(opts)
-def _GetExtendedOptions(d, f):
+def _GetExtendedOptions(improvement_dir, fake_first, ignore_confidence=True):
"""Returns the a copy of the default options dict plus some options."""
result = dict(DEFAULT_OPTIONS)
result.update({
- 'improvement_direction': d,
- 'debug_fake_first_test_mean': f})
+ 'improvement_direction': improvement_dir,
+ 'debug_fake_first_test_mean': fake_first,
+ 'debug_ignore_regression_confidence': ignore_confidence})
return result
@@ -233,11 +292,11 @@ class BisectPerfRegressionTest(unittest.TestCase):
def testBisectImprovementDirectionFails(self):
"""Dry run of a bisect with an improvement instead of regression."""
-
# Test result goes from 0 to 100 where higher is better
results = _GenericDryRun(_GetExtendedOptions(1, 100))
self.assertIsNotNone(results.error)
self.assertIn('not a regression', results.error)
+
# Test result goes from 0 to -100 where lower is better
results = _GenericDryRun(_GetExtendedOptions(-1, -100))
self.assertIsNotNone(results.error)
@@ -252,6 +311,32 @@ class BisectPerfRegressionTest(unittest.TestCase):
results = _GenericDryRun(_GetExtendedOptions(1, -100))
self.assertIsNone(results.error)
+ @mock.patch('bisect_perf_regression.BisectPerformanceMetrics.'
+ 'RunPerformanceTestAndParseResults', _MockRunTests)
+ def testBisectStopsOnDoubtfulRegression(self):
+ global _MockResultsGenerator
+ _MockResultsGenerator = (rs for rs in CLEAR_NON_REGRESSION)
+ results = _GenericDryRun(_GetExtendedOptions(0, 0, False))
+ self.assertIsNotNone(results.error)
+ self.assertIn('could not reproduce the regression', results.error)
+
+ _MockResultsGenerator = (rs for rs in ALMOST_REGRESSION)
+ results = _GenericDryRun(_GetExtendedOptions(0, 0, False))
+ self.assertIsNotNone(results.error)
+ self.assertIn('could not reproduce the regression', results.error)
+
+ @mock.patch('bisect_perf_regression.BisectPerformanceMetrics.'
+ 'RunPerformanceTestAndParseResults', _MockRunTests)
+ def testBisectContinuesOnClearRegression(self):
+ global _MockResultsGenerator
+ _MockResultsGenerator = (rs for rs in CLEAR_REGRESSION)
+ with self.assertRaises(StopIteration):
+ _GenericDryRun(_GetExtendedOptions(0, 0, False))
+
+ _MockResultsGenerator = (rs for rs in BARELY_REGRESSION)
+ with self.assertRaises(StopIteration):
+ _GenericDryRun(_GetExtendedOptions(0, 0, False))
+
def testGetCommitPosition(self):
cp_git_rev = '7017a81991de983e12ab50dfc071c70e06979531'
self.assertEqual(291765, source_control.GetCommitPosition(cp_git_rev))
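
The new tests feed canned results to the bisect through a module-level generator that the mocked runner consumes one call at a time; exhausting it raises StopIteration, which testBisectContinuesOnClearRegression uses as a signal that the bisect got past the initial confidence check. A stripped-down sketch of that pattern, independent of the bisect code (all names here are illustrative):

# Module-level generator, rebound by each test before the fake runner is used.
_mock_results = (r for r in [])

def _fake_run_tests(*args, **kwargs):
  # Mirrors _MockRunTests above: every call consumes the next canned result.
  # (The real tests use the Python 2 spelling _MockResultsGenerator.next().)
  return {'values': next(_mock_results)}, 0

_mock_results = (r for r in [[1.0, 1.1], [2.0, 2.2]])
print(_fake_run_tests())  # ({'values': [1.0, 1.1]}, 0)
print(_fake_run_tests())  # ({'values': [2.0, 2.2]}, 0)
# A third call would raise StopIteration, just as a dry-run bisect does once
# the canned samples for the initial revisions have been used up.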
diff --git a/tools/auto_bisect/bisect_results.py b/tools/auto_bisect/bisect_results.py
index 9bcaeb6..281afe6 100644
--- a/tools/auto_bisect/bisect_results.py
+++ b/tools/auto_bisect/bisect_results.py
@@ -109,7 +109,7 @@ class BisectResults(object):
return warnings
@staticmethod
- def ConfidenceScore(good_results_lists, bad_results_lists,
+ def ConfidenceScore(sample1, sample2,
accept_single_bad_or_good=False):
"""Calculates a confidence score.
@@ -119,8 +119,8 @@ class BisectResults(object):
Args:
- good_results_lists: A list of lists of "good" result numbers.
- bad_results_lists: A list of lists of "bad" result numbers.
+ sample1: A flat list of "good" result numbers.
+ sample2: A flat list of "bad" result numbers.
accept_single_bad_or_good: If True, computes confidence even if there is
just one bad or good revision, otherwise single good or bad revision
always returns 0.0 confidence. This flag will probably go away when
@@ -134,13 +134,9 @@ class BisectResults(object):
# classified good or bad; this isn't good enough evidence to make a
# decision. If an empty list was passed, that also implies zero confidence.
if not accept_single_bad_or_good:
- if len(good_results_lists) <= 1 or len(bad_results_lists) <= 1:
+ if len(sample1) <= 1 or len(sample2) <= 1:
return 0.0
- # Flatten the lists of results lists.
- sample1 = sum(good_results_lists, [])
- sample2 = sum(bad_results_lists, [])
-
# If there were only empty lists in either of the lists (this is unexpected
# and normally shouldn't happen), then we also want to return 0.
if not sample1 or not sample2:
@@ -171,7 +167,9 @@ class BisectResults(object):
if revision_state.value:
current_values = revision_state.value['values']
if previous_values:
- confidence = cls.ConfidenceScore(previous_values, [current_values],
+ confidence_params = (sum(previous_values, []),
+ sum([current_values], []))
+ confidence = cls.ConfidenceScore(*confidence_params,
accept_single_bad_or_good=True)
mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
mean_of_current_runs = math_utils.Mean(current_values)
@@ -253,7 +251,8 @@ class BisectResults(object):
# Give a "confidence" in the bisect. At the moment we use how distinct the
# values are before and after the last broken revision, and how noisy the
# overall graph is.
- confidence = cls.ConfidenceScore(working_means, broken_means)
+ confidence_params = (sum(working_means, []), sum(broken_means, []))
+ confidence = cls.ConfidenceScore(*confidence_params)
bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
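
With the new signature, ConfidenceScore expects flat samples and callers do the flattening, as in the two call sites above. A short sketch of the resulting contract, assuming bisect_results is importable (e.g. when run from tools/auto_bisect); the numbers are arbitrary:

from bisect_results import BisectResults

good_runs = [[16.886], [16.909], [16.99]]   # nested per-run results
bad_runs = [[29.547], [29.713], [29.835]]

# Callers flatten before scoring; ConfidenceScore no longer does it.
score = BisectResults.ConfidenceScore(sum(good_runs, []), sum(bad_runs, []))
print(score)

# A sample with zero or one measurement still scores 0.0 confidence unless
# accept_single_bad_or_good is set, matching the updated tests below.
assert BisectResults.ConfidenceScore([1.0], [1.0, 2.0]) == 0.0
assert BisectResults.ConfidenceScore([], [1.0, 2.0]) == 0.0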
diff --git a/tools/auto_bisect/bisect_results_test.py b/tools/auto_bisect/bisect_results_test.py
index 4b77806..25359c5 100644
--- a/tools/auto_bisect/bisect_results_test.py
+++ b/tools/auto_bisect/bisect_results_test.py
@@ -87,22 +87,14 @@ class BisectResultsTest(unittest.TestCase):
bad_values: First list of numbers.
good_values: Second list of numbers.
"""
- # ConfidenceScore takes a list of lists but these lists are flattened
- # inside the function.
- confidence = BisectResults.ConfidenceScore(
- [[v] for v in bad_values],
- [[v] for v in good_values])
+ confidence = BisectResults.ConfidenceScore(bad_values, good_values)
self.assertEqual(score, confidence)
def testConfidenceScoreIsZeroOnTooFewLists(self):
- self._AssertConfidence(0.0, [], [[1], [2]])
- self._AssertConfidence(0.0, [[1], [2]], [])
- self._AssertConfidence(0.0, [[1]], [[1], [2]])
- self._AssertConfidence(0.0, [[1], [2]], [[1]])
-
- def testConfidenceScoreIsZeroOnEmptyLists(self):
- self.assertEqual(BisectResults.ConfidenceScore([[], []], [[1], [2]]), 0.0)
- self.assertEqual(BisectResults.ConfidenceScore([[1], [2]], [[], []]), 0.0)
+ self._AssertConfidence(0.0, [], [1, 2])
+ self._AssertConfidence(0.0, [1, 2], [])
+ self._AssertConfidence(0.0, [1], [1, 2])
+ self._AssertConfidence(0.0, [1, 2], [1])
def testConfidenceScore_ZeroConfidence(self):
# The good and bad sets contain the same values, so the confidence that