author | qyearsley <qyearsley@chromium.org> | 2015-06-23 16:46:06 -0700 |
---|---|---|
committer | Commit bot <commit-bot@chromium.org> | 2015-06-23 23:46:42 +0000 |
commit | 21269c2020e156dbc190b7c09ead0d459131bab2 (patch) | |
tree | 00646a6bc4aa66c0b8397bb2dea0e1bfca4a4386 | |
parent | fd9b02e0d80adcd45fbbd891a221e20eff5d7bd6 (diff) | |
Refactor _CheckRegressionConfidenceError in bisect_perf_regression.py.
Context: I was starting to change all references to "confidence" to instead refer to p-values (http://crbug.com/489351), and wanted to refactor this method first to make that change easier.
Review URL: https://codereview.chromium.org/1190863004
Cr-Commit-Position: refs/heads/master@{#335809}
-rwxr-xr-x | tools/auto_bisect/bisect_perf_regression.py | 91 |
-rw-r--r-- | tools/auto_bisect/bisect_perf_regression_test.py | 5 |
2 files changed, 44 insertions, 52 deletions
diff --git a/tools/auto_bisect/bisect_perf_regression.py b/tools/auto_bisect/bisect_perf_regression.py
index 41db6ae..9539b68 100755
--- a/tools/auto_bisect/bisect_perf_regression.py
+++ b/tools/auto_bisect/bisect_perf_regression.py
@@ -98,11 +98,11 @@ new file mode 100644
 +%(deps_sha)s
 """
 
-REGRESSION_CONFIDENCE_ERROR_TEMPLATE = """
-We could not reproduce the regression with this test/metric/platform combination
-with enough confidence.
+REGRESSION_NOT_REPRODUCED_MESSAGE_TEMPLATE = """
+Bisect did not clearly reproduce a regression between the given "good"
+and "bad" revisions.
 
-Here are the results for the given "good" and "bad" revisions:
+Results:
 "Good" revision: {good_rev}
 \tMean: {good_mean}
 \tStandard error: {good_std_err}
@@ -113,8 +113,8 @@ Here are the results for the given "good" and "bad" revisions:
 \tStandard error: {bad_std_err}
 \tSample size: {bad_sample_size}
 
-NOTE: There's still a chance that this is actually a regression, but you may
-      need to bisect a different platform."""
+You may want to try bisecting on a different platform or metric.
+"""
 
 # Git branch name used to run bisect try jobs.
 BISECT_TRYJOB_BRANCH = 'bisect-tryjob'
@@ -475,47 +475,42 @@ def _GenerateProfileIfNecessary(command_args):
   return True
 
 
-def _CheckRegressionConfidenceError(
-    good_revision,
-    bad_revision,
-    known_good_value,
-    known_bad_value):
-  """Checks whether we can be confident beyond a certain degree that the given
-  metrics represent a regression.
+def _IsRegressionReproduced(known_good_result, known_bad_result):
+  """Checks whether the regression was reproduced based on the initial values.
 
   Args:
-    good_revision: string representing the commit considered 'good'
-    bad_revision: Same as above for 'bad'.
-    known_good_value: A dict with at least: 'values', 'mean' and 'std_err'
-    known_bad_value: Same as above.
+    known_good_result: A dict with the keys "values", "mean" and "std_err".
+    known_bad_result: Same as above.
 
   Returns:
-    False if there is no error (i.e. we can be confident there's a regression),
-    a string containing the details of the lack of confidence otherwise.
+    True if there is a clear change between the result values for the given
+    good and bad revisions, False otherwise.
   """
-  error = False
-  # Adding good and bad values to a parameter list.
-  confidence_params = []
-  for l in [known_bad_value['values'], known_good_value['values']]:
-    # Flatten if needed, by averaging the values in each nested list
-    if isinstance(l, list) and all([isinstance(x, list) for x in l]):
-      averages = map(math_utils.Mean, l)
-      confidence_params.append(averages)
-    else:
-      confidence_params.append(l)
-  regression_confidence = BisectResults.ConfidenceScore(
-      *confidence_params, accept_single_bad_or_good=True)
-  if regression_confidence < REGRESSION_CONFIDENCE:
-    error = REGRESSION_CONFIDENCE_ERROR_TEMPLATE.format(
-        good_rev=good_revision,
-        good_mean=known_good_value['mean'],
-        good_std_err=known_good_value['std_err'],
-        good_sample_size=len(known_good_value['values']),
-        bad_rev=bad_revision,
-        bad_mean=known_bad_value['mean'],
-        bad_std_err=known_bad_value['std_err'],
-        bad_sample_size=len(known_bad_value['values']))
-  return error
+  def PossiblyFlatten(values):
+    """Flattens if needed, by averaging the values in each nested list."""
+    if isinstance(values, list) and all(isinstance(x, list) for x in values):
+      return map(math_utils.Mean, values)
+    return values
+
+  p_value = BisectResults.ConfidenceScore(
+      PossiblyFlatten(known_bad_result['values']),
+      PossiblyFlatten(known_good_result['values']),
+      accept_single_bad_or_good=True)
+
+  return p_value > REGRESSION_CONFIDENCE
+
+
+def _RegressionNotReproducedWarningMessage(
+    good_revision, bad_revision, known_good_value, known_bad_value):
+  return REGRESSION_NOT_REPRODUCED_MESSAGE_TEMPLATE.format(
+      good_rev=good_revision,
+      good_mean=known_good_value['mean'],
+      good_std_err=known_good_value['std_err'],
+      good_sample_size=len(known_good_value['values']),
+      bad_rev=bad_revision,
+      bad_mean=known_bad_value['mean'],
+      bad_std_err=known_bad_value['std_err'],
+      bad_sample_size=len(known_bad_value['values']))
 
 
 class DepotDirectoryRegistry(object):
@@ -2366,19 +2361,15 @@ class BisectPerformanceMetrics(object):
 
     # Check how likely it is that the good and bad results are different
     # beyond chance-induced variation.
-    confidence_error = False
     if not (self.opts.debug_ignore_regression_confidence or
-        self._IsBisectModeReturnCode()):
-      confidence_error = _CheckRegressionConfidenceError(good_revision,
-                                                         bad_revision,
-                                                         known_good_value,
-                                                         known_bad_value)
-      if confidence_error:
+            self._IsBisectModeReturnCode()):
+      if not _IsRegressionReproduced(known_good_value, known_bad_value):
         # If there is no significant difference between "good" and "bad"
         # revision results, then the "bad revision" is considered "good".
         # TODO(qyearsley): Remove this if it is not necessary.
         bad_revision_state.passed = True
-        self.warnings.append(confidence_error)
+        self.warnings.append(_RegressionNotReproducedWarningMessage(
+            good_revision, bad_revision, known_good_value, known_bad_value))
 
     return BisectResults(bisect_state, self.depot_registry, self.opts,
                          self.warnings)
diff --git a/tools/auto_bisect/bisect_perf_regression_test.py b/tools/auto_bisect/bisect_perf_regression_test.py
index 4a7b37f..6d96d81 100644
--- a/tools/auto_bisect/bisect_perf_regression_test.py
+++ b/tools/auto_bisect/bisect_perf_regression_test.py
@@ -393,8 +393,9 @@ class BisectPerfRegressionTest(unittest.TestCase):
     bisect_class.RunPerformanceTestAndParseResults = original_run_tests
 
     # If the job was aborted, there should be a warning about it.
-    assert [w for w in dry_run_results.warnings
-            if 'could not reproduce the regression' in w]
+    self.assertTrue(
+        any('did not clearly reproduce a regression' in w
+            for w in dry_run_results.warnings))
     return True
 
   def testBisectAbortedOnClearNonRegression(self):
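To make the refactored control flow easier to follow outside the diff, here is a small self-contained sketch of the same idea: flatten any nested sample lists by averaging each inner list, score how clearly the "good" and "bad" samples differ, and treat the regression as reproduced only when that score clears a threshold. This is illustrative only; `crude_confidence_score` and the `REGRESSION_CONFIDENCE` value below are stand-ins invented for the example, not the real `BisectResults.ConfidenceScore` or the constant defined in bisect_perf_regression.py.

```python
# Illustrative sketch, not the Chromium code: crude_confidence_score and
# REGRESSION_CONFIDENCE are invented stand-ins for the real scoring function
# and threshold used by the bisect script.
from statistics import mean

REGRESSION_CONFIDENCE = 95  # Assumed threshold (a 0-100 score).


def crude_confidence_score(bad_values, good_values):
  """Stand-in score: how far apart the two sample means are, relative to
  the combined spread of the samples. The real code uses a statistical test."""
  separation = abs(mean(bad_values) - mean(good_values))
  spread = (max(bad_values) - min(bad_values) +
            max(good_values) - min(good_values)) or 1e-9
  return min(100.0, 100.0 * separation / spread)


def possibly_flatten(values):
  """Flattens nested lists by averaging each inner list, as in the CL."""
  if isinstance(values, list) and all(isinstance(x, list) for x in values):
    return [mean(x) for x in values]
  return values


def is_regression_reproduced(known_good_result, known_bad_result):
  """Returns True when the good and bad sample values are clearly different."""
  score = crude_confidence_score(
      possibly_flatten(known_bad_result['values']),
      possibly_flatten(known_good_result['values']))
  return score > REGRESSION_CONFIDENCE


if __name__ == '__main__':
  good = {'values': [[10.1, 10.2], [10.0, 10.3]], 'mean': 10.15, 'std_err': 0.05}
  bad = {'values': [[25.0, 25.4], [24.8, 25.1]], 'mean': 25.1, 'std_err': 0.1}
  print(is_regression_reproduced(good, bad))  # Clear difference -> True
```

The structural point of the CL is the same split shown in the sketch: the statistical check (`_IsRegressionReproduced`) no longer builds the warning text, and message formatting lives in a separate helper (`_RegressionNotReproducedWarningMessage`) that is called only when the check fails.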