author     qyearsley@chromium.org <qyearsley@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2014-07-30 08:00:42 +0000
committer  qyearsley@chromium.org <qyearsley@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2014-07-30 08:00:42 +0000
commit     5dcad27e9ad91ad1c76f6d8afe066f13d77fbcb6 (patch)
tree       e8b2ee28ba0c7afaf76a83c582948280bdbfc1a4 /tools
parent     f1d228e18ed98c9eab8d1e10a99c4f266819cd78 (diff)
Use Welch's t-test to calculate confidence scores in the bisect script.
BUG=383864
Review URL: https://codereview.chromium.org/413393002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@286438 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'tools')
-rw-r--r--   tools/auto_bisect/math_utils.py         12
-rw-r--r--   tools/auto_bisect/ttest.py             209
-rw-r--r--   tools/auto_bisect/ttest_test.py        122
-rwxr-xr-x   tools/bisect-perf-regression.py         46
-rw-r--r--   tools/bisect-perf-regression_test.py    77
-rwxr-xr-x   tools/run-bisect-perf-regression.py      7
6 files changed, 411 insertions, 62 deletions
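
A quick sketch of how the new confidence calculation is exercised (illustrative only, not part of the patch; assumes it is run from tools/auto_bisect so that ttest.py is importable):

  import ttest

  good = [2, 3, 2, 3, 2, 3]  # measurements from "good" revisions
  bad = [4, 5, 4, 5, 4, 5]   # measurements from "bad" revisions

  # WelchsTTest returns (t-statistic, degrees of freedom, p-value).
  t, df, p = ttest.WelchsTTest(good, bad)

  # The bisect script's new ConfidenceScore is essentially (1 - p) as a
  # percentage; for these samples p = 0.001, so confidence is 99.9.
  confidence = 100.0 * (1.0 - p)
  print('t=%f df=%f p=%f confidence=%.1f%%' % (t, df, p, confidence))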
diff --git a/tools/auto_bisect/math_utils.py b/tools/auto_bisect/math_utils.py
index fe94f53..c225bdd 100644
--- a/tools/auto_bisect/math_utils.py
+++ b/tools/auto_bisect/math_utils.py
@@ -57,18 +57,20 @@ def Mean(values):
   return TruncatedMean(values, 0.0)
 
 
-def StandardDeviation(values):
-  """Calculates the sample standard deviation of the given list of values."""
+def Variance(values):
+  """Calculates the sample variance."""
   if len(values) == 1:
     return 0.0
-
   mean = Mean(values)
   differences_from_mean = [float(x) - mean for x in values]
   squared_differences = [float(x * x) for x in differences_from_mean]
   variance = sum(squared_differences) / (len(values) - 1)
-  std_dev = math.sqrt(variance)
+  return variance
 
-  return std_dev
+
+def StandardDeviation(values):
+  """Calculates the sample standard deviation of the given list of values."""
+  return math.sqrt(Variance(values))
 
 
 def RelativeChange(before, after):
diff --git a/tools/auto_bisect/ttest.py b/tools/auto_bisect/ttest.py
new file mode 100644
index 0000000..fcb3a97
--- /dev/null
+++ b/tools/auto_bisect/ttest.py
@@ -0,0 +1,209 @@
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Functions for doing independent two-sample t-tests and looking up p-values.
+
+Note: This module was copied from the Performance Dashboard code, and changed
+to use definitions of mean and variance from math_utils instead of numpy.
+
+> A t-test is any statistical hypothesis test in which the test statistic
+> follows a Student's t distribution if the null hypothesis is supported.
+> It can be used to determine if two sets of data are significantly different
+> from each other.
+
+There are several conditions that the data under test should meet in order
+for a t-test to be completely applicable:
+ - The data should be roughly normal in distribution.
+ - The two samples that are compared should be roughly similar in size.
+
+References:
+  http://en.wikipedia.org/wiki/Student%27s_t-test
+  http://en.wikipedia.org/wiki/Welch%27s_t-test
+  https://github.com/scipy/scipy/blob/master/scipy/stats/stats.py#L3244
+"""
+
+import math
+
+import math_utils
+
+
+def WelchsTTest(sample1, sample2):
+  """Performs Welch's t-test on the two samples.
+
+  Welch's t-test is an adaptation of Student's t-test which is used when the
+  two samples may have unequal variances. It is also an independent two-sample
+  t-test.
+
+  Args:
+    sample1: A collection of numbers.
+    sample2: Another collection of numbers.
+
+  Returns:
+    A 3-tuple (t-statistic, degrees of freedom, p-value).
+  """
+  mean1 = math_utils.Mean(sample1)
+  mean2 = math_utils.Mean(sample2)
+  v1 = math_utils.Variance(sample1)
+  v2 = math_utils.Variance(sample2)
+  n1 = len(sample1)
+  n2 = len(sample2)
+  t = _TValue(mean1, mean2, v1, v2, n1, n2)
+  df = _DegreesOfFreedom(v1, v2, n1, n2)
+  p = _LookupPValue(t, df)
+  return t, df, p
+
+
+def _TValue(mean1, mean2, v1, v2, n1, n2):
+  """Calculates a t-statistic value using the formula for Welch's t-test.
+
+  The t value can be thought of as a signal-to-noise ratio; a higher t-value
+  tells you that the groups are more different.
+
+  Args:
+    mean1: Mean of sample 1.
+    mean2: Mean of sample 2.
+    v1: Variance of sample 1.
+    v2: Variance of sample 2.
+    n1: Sample size of sample 1.
+    n2: Sample size of sample 2.
+
+  Returns:
+    A t value, which may be negative or positive.
+  """
+  # If variance of both segments is zero, return some large t-value.
+  if v1 == 0 and v2 == 0:
+    return 1000.0
+  return (mean1 - mean2) / (math.sqrt(v1 / n1 + v2 / n2))
+
+
+def _DegreesOfFreedom(v1, v2, n1, n2):
+  """Calculates degrees of freedom using the Welch-Satterthwaite formula.
+
+  Degrees of freedom is a measure of sample size. For other types of tests,
+  degrees of freedom is sometimes N - 1, where N is the sample size. However,
+  for Welch's t-test the Welch-Satterthwaite equation is used to compute an
+  estimate of the degrees of freedom, which may not be an integer.
+
+  Args:
+    v1: Variance of sample 1.
+    v2: Variance of sample 2.
+    n1: Size of sample 1.
+    n2: Size of sample 2.
+
+  Returns:
+    An estimate of degrees of freedom. Must be at least 1.0.
+  """
+  # When there's no variance in either sample, return 1.
+  if v1 == 0 and v2 == 0:
+    return 1
+  # If the sample size is too small, also return the minimum (1).
+  if n1 <= 1 or n2 <= 1:
+    return 1
+  df = (((v1 / n1 + v2 / n2) ** 2) /
+        ((v1 ** 2) / ((n1 ** 2) * (n1 - 1)) +
+         (v2 ** 2) / ((n2 ** 2) * (n2 - 1))))
+  return max(1, df)
+
+
+# Below is a hard-coded table for looking up p-values.
+#
+# Normally, p-values are calculated based on the t-distribution formula.
+# Looking up pre-calculated values is a less accurate but less complicated
+# alternative.
+#
+# Reference: http://www.sjsu.edu/faculty/gerstman/StatPrimer/t-table.pdf
+
+# A list of p-values for a two-tailed test. The entries correspond to
+# entries in the rows of the table below.
+TWO_TAIL = [1, 0.20, 0.10, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001]
+
+# A map of degrees of freedom to lists of t-values. The index of the t-value
+# can be used to look up the corresponding p-value.
+TABLE = {
+    1: [0, 3.078, 6.314, 12.706, 31.820, 63.657, 127.321, 318.309, 636.619],
+    2: [0, 1.886, 2.920, 4.303, 6.965, 9.925, 14.089, 22.327, 31.599],
+    3: [0, 1.638, 2.353, 3.182, 4.541, 5.841, 7.453, 10.215, 12.924],
+    4: [0, 1.533, 2.132, 2.776, 3.747, 4.604, 5.598, 7.173, 8.610],
+    5: [0, 1.476, 2.015, 2.571, 3.365, 4.032, 4.773, 5.893, 6.869],
+    6: [0, 1.440, 1.943, 2.447, 3.143, 3.707, 4.317, 5.208, 5.959],
+    7: [0, 1.415, 1.895, 2.365, 2.998, 3.499, 4.029, 4.785, 5.408],
+    8: [0, 1.397, 1.860, 2.306, 2.897, 3.355, 3.833, 4.501, 5.041],
+    9: [0, 1.383, 1.833, 2.262, 2.821, 3.250, 3.690, 4.297, 4.781],
+    10: [0, 1.372, 1.812, 2.228, 2.764, 3.169, 3.581, 4.144, 4.587],
+    11: [0, 1.363, 1.796, 2.201, 2.718, 3.106, 3.497, 4.025, 4.437],
+    12: [0, 1.356, 1.782, 2.179, 2.681, 3.055, 3.428, 3.930, 4.318],
+    13: [0, 1.350, 1.771, 2.160, 2.650, 3.012, 3.372, 3.852, 4.221],
+    14: [0, 1.345, 1.761, 2.145, 2.625, 2.977, 3.326, 3.787, 4.140],
+    15: [0, 1.341, 1.753, 2.131, 2.602, 2.947, 3.286, 3.733, 4.073],
+    16: [0, 1.337, 1.746, 2.120, 2.584, 2.921, 3.252, 3.686, 4.015],
+    17: [0, 1.333, 1.740, 2.110, 2.567, 2.898, 3.222, 3.646, 3.965],
+    18: [0, 1.330, 1.734, 2.101, 2.552, 2.878, 3.197, 3.610, 3.922],
+    19: [0, 1.328, 1.729, 2.093, 2.539, 2.861, 3.174, 3.579, 3.883],
+    20: [0, 1.325, 1.725, 2.086, 2.528, 2.845, 3.153, 3.552, 3.850],
+    21: [0, 1.323, 1.721, 2.080, 2.518, 2.831, 3.135, 3.527, 3.819],
+    22: [0, 1.321, 1.717, 2.074, 2.508, 2.819, 3.119, 3.505, 3.792],
+    23: [0, 1.319, 1.714, 2.069, 2.500, 2.807, 3.104, 3.485, 3.768],
+    24: [0, 1.318, 1.711, 2.064, 2.492, 2.797, 3.090, 3.467, 3.745],
+    25: [0, 1.316, 1.708, 2.060, 2.485, 2.787, 3.078, 3.450, 3.725],
+    26: [0, 1.315, 1.706, 2.056, 2.479, 2.779, 3.067, 3.435, 3.707],
+    27: [0, 1.314, 1.703, 2.052, 2.473, 2.771, 3.057, 3.421, 3.690],
+    28: [0, 1.313, 1.701, 2.048, 2.467, 2.763, 3.047, 3.408, 3.674],
+    29: [0, 1.311, 1.699, 2.045, 2.462, 2.756, 3.038, 3.396, 3.659],
+    30: [0, 1.310, 1.697, 2.042, 2.457, 2.750, 3.030, 3.385, 3.646],
+    31: [0, 1.309, 1.695, 2.040, 2.453, 2.744, 3.022, 3.375, 3.633],
+    32: [0, 1.309, 1.694, 2.037, 2.449, 2.738, 3.015, 3.365, 3.622],
+    33: [0, 1.308, 1.692, 2.035, 2.445, 2.733, 3.008, 3.356, 3.611],
+    34: [0, 1.307, 1.691, 2.032, 2.441, 2.728, 3.002, 3.348, 3.601],
+    35: [0, 1.306, 1.690, 2.030, 2.438, 2.724, 2.996, 3.340, 3.591],
+    36: [0, 1.306, 1.688, 2.028, 2.434, 2.719, 2.991, 3.333, 3.582],
+    37: [0, 1.305, 1.687, 2.026, 2.431, 2.715, 2.985, 3.326, 3.574],
+    38: [0, 1.304, 1.686, 2.024, 2.429, 2.712, 2.980, 3.319, 3.566],
+    39: [0, 1.304, 1.685, 2.023, 2.426, 2.708, 2.976, 3.313, 3.558],
+    40: [0, 1.303, 1.684, 2.021, 2.423, 2.704, 2.971, 3.307, 3.551],
+    42: [0, 1.302, 1.682, 2.018, 2.418, 2.698, 2.963, 3.296, 3.538],
+    44: [0, 1.301, 1.680, 2.015, 2.414, 2.692, 2.956, 3.286, 3.526],
+    46: [0, 1.300, 1.679, 2.013, 2.410, 2.687, 2.949, 3.277, 3.515],
+    48: [0, 1.299, 1.677, 2.011, 2.407, 2.682, 2.943, 3.269, 3.505],
+    50: [0, 1.299, 1.676, 2.009, 2.403, 2.678, 2.937, 3.261, 3.496],
+    60: [0, 1.296, 1.671, 2.000, 2.390, 2.660, 2.915, 3.232, 3.460],
+    70: [0, 1.294, 1.667, 1.994, 2.381, 2.648, 2.899, 3.211, 3.435],
+    80: [0, 1.292, 1.664, 1.990, 2.374, 2.639, 2.887, 3.195, 3.416],
+    90: [0, 1.291, 1.662, 1.987, 2.369, 2.632, 2.878, 3.183, 3.402],
+    100: [0, 1.290, 1.660, 1.984, 2.364, 2.626, 2.871, 3.174, 3.391],
+    120: [0, 1.289, 1.658, 1.980, 2.358, 2.617, 2.860, 3.160, 3.373],
+    150: [0, 1.287, 1.655, 1.976, 2.351, 2.609, 2.849, 3.145, 3.357],
+    200: [0, 1.286, 1.652, 1.972, 2.345, 2.601, 2.839, 3.131, 3.340],
+    300: [0, 1.284, 1.650, 1.968, 2.339, 2.592, 2.828, 3.118, 3.323],
+    500: [0, 1.283, 1.648, 1.965, 2.334, 2.586, 2.820, 3.107, 3.310],
+}
+
+
+def _LookupPValue(t, df):
+  """Looks up a p-value in a t-distribution table.
+
+  Args:
+    t: A t statistic value; the result of a t-test.
+    df: Number of degrees of freedom.
+
+  Returns:
+    A p-value, which represents the likelihood of obtaining a result at least
+    as extreme as the one observed just by chance, assuming the null
+    hypothesis is true.
+  """
+  assert df >= 1, 'Degrees of freedom must be positive'
+
+  # We ignore the negative sign on the t-value because our null hypothesis
+  # is that the two samples are the same; our alternative hypothesis is that
+  # the second sample is lesser OR greater than the first.
+  t = abs(t)
+
+  def GreatestSmaller(nums, target):
+    """Returns the largest number that is <= the target number."""
+    lesser_equal = [n for n in nums if n <= target]
+    assert lesser_equal, 'No number in number list <= target.'
+    return max(lesser_equal)
+
+  df_key = GreatestSmaller(TABLE.keys(), df)
+  t_table_row = TABLE[df_key]
+  approximate_t_value = GreatestSmaller(t_table_row, t)
+  t_value_index = t_table_row.index(approximate_t_value)
+
+  return TWO_TAIL[t_value_index]
diff --git a/tools/auto_bisect/ttest_test.py b/tools/auto_bisect/ttest_test.py
new file mode 100644
index 0000000..744a383
--- /dev/null
+++ b/tools/auto_bisect/ttest_test.py
@@ -0,0 +1,122 @@
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Unit tests for ttest module."""
+
+import unittest
+
+import ttest
+
+
+class TTestTest(unittest.TestCase):
+  """Tests for the t-test functions."""
+
+  def testWelchsFormula(self):
+    """Tests calculation of the t value."""
+    # Results can be verified by directly plugging variables into Welch's
+    # equation (e.g. using a calculator or the Python interpreter).
+    self.assertEqual(
+        -0.2796823595120407,
+        ttest._TValue(0.299, 0.307, 0.05, 0.08, 150, 165))
+
+    # Note that a negative t value is obtained when the first sample has a
+    # smaller mean than the second, otherwise a positive value is returned.
+    self.assertEqual(
+        0.2796823595120407,
+        ttest._TValue(0.307, 0.299, 0.08, 0.05, 165, 150))
+
+  def testWelchSatterthwaiteFormula(self):
+    """Tests calculation of estimated degrees of freedom."""
+    # Note that since the Welch-Satterthwaite equation gives an estimate of
+    # degrees of freedom, the result may not be an integer.
+    self.assertEqual(
+        307.1987997516727,
+        ttest._DegreesOfFreedom(0.05, 0.08, 150, 165))
+
+  def testWelchsTTest(self):
+    """Tests the t value and degrees of freedom output of Welch's t-test."""
+    # The t-value can be checked with scipy.stats.ttest_ind(equal_var=False).
+    t, df, _ = ttest.WelchsTTest([2, 3, 2, 3, 2, 3], [4, 5, 4, 5, 4, 5])
+    self.assertAlmostEqual(10.0, df)
+
+    # The t-value produced by scipy.stats.ttest_ind is -6.32455532034.
+    # Our function produces slightly different results, possibly due to
+    # differences in rounding error.
+    self.assertAlmostEqual(-6.325, t, delta=1.0)
+
+  def testTTestEqualSamples(self):
+    """Checks that t = 0 and p = 1 when the samples are the same."""
+    t, _, p = ttest.WelchsTTest([1, 2, 3], [1, 2, 3])
+    self.assertEqual(0, t)
+    self.assertEqual(1, p)
+
+    t, _, p = ttest.WelchsTTest([1, 2], [1, 2])
+    self.assertEqual(0, t)
+    self.assertEqual(1, p)
+
+  def testTTestVeryDifferentSamples(self):
+    """Checks that p is very low when the samples are clearly different."""
+    t, _, p = ttest.WelchsTTest(
+        [100, 101, 100, 101, 100], [1, 2, 1, 2, 1, 2, 1, 2])
+    self.assertGreaterEqual(t, 250)
+    self.assertLessEqual(p, 0.01)
+
+  def testTTestVariance(self):
+    """Verifies that higher variance -> higher p value."""
+    _, _, p_low_var = ttest.WelchsTTest([2, 3, 2, 3], [4, 5, 4, 5])
+    _, _, p_high_var = ttest.WelchsTTest([1, 4, 1, 4], [3, 6, 3, 6])
+    self.assertLess(p_low_var, p_high_var)
+
+  def testTTestSampleSize(self):
+    """Verifies that smaller sample size -> higher p value."""
+    _, _, p_larger_sample = ttest.WelchsTTest([2, 3, 2, 3], [4, 5, 4, 5])
+    _, _, p_smaller_sample = ttest.WelchsTTest([2, 3, 2, 3], [4, 5])
+    self.assertLess(p_larger_sample, p_smaller_sample)
+
+  def testTTestMeanDifference(self):
+    """Verifies that smaller difference between means -> higher p value."""
+    _, _, p_far_means = ttest.WelchsTTest([2, 3, 2, 3], [5, 6, 5, 6])
+    _, _, p_near_means = ttest.WelchsTTest([2, 3, 2, 3], [3, 4, 3, 4])
+    self.assertLess(p_far_means, p_near_means)
+
+
+class LookupTableTest(unittest.TestCase):
+  """Tests for functionality related to lookup of p-values in a table."""
+
+  def setUp(self):
+    ttest.TWO_TAIL = [1, 0.2, 0.1, 0.05, 0.02, 0.01]
+    ttest.TABLE = {
+        1: [0, 6.314, 12.71, 31.82, 63.66, 318.31],
+        2: [0, 2.920, 4.303, 6.965, 9.925, 22.327],
+        3: [0, 2.353, 3.182, 4.541, 5.841, 10.215],
+        4: [0, 2.132, 2.776, 3.747, 4.604, 7.173],
+    }
+
+  def testLookupExactMatch(self):
+    """Tests a lookup when there is an exact match."""
+    self.assertEqual(0.1, ttest._LookupPValue(3.182, 3))
+    self.assertEqual(0.1, ttest._LookupPValue(-3.182, 3))
+
+  def testLookupAbove(self):
+    """Tests a lookup when the given value is above an entry in the table."""
+    self.assertEqual(0.2, ttest._LookupPValue(3.1, 2))
+    self.assertEqual(0.2, ttest._LookupPValue(-3.1, 2))
+
+  def testLookupLargeTValue(self):
+    """Tests a lookup when the given t-value is very large."""
+    self.assertEqual(0.01, ttest._LookupPValue(500.0, 1))
+    self.assertEqual(0.01, ttest._LookupPValue(-500.0, 1))
+
+  def testLookupZeroTValue(self):
+    """Tests a lookup when the given t-value is zero."""
+    self.assertEqual(1, ttest._LookupPValue(0.0, 1))
+    self.assertEqual(1, ttest._LookupPValue(0.0, 2))
+
+  def testLookupLargeDF(self):
+    """Tests a lookup when the given degrees of freedom is large."""
+    self.assertEqual(0.02, ttest._LookupPValue(5.0, 50))
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/tools/bisect-perf-regression.py b/tools/bisect-perf-regression.py
index 8368e63..6fd0289 100755
--- a/tools/bisect-perf-regression.py
+++ b/tools/bisect-perf-regression.py
@@ -55,6 +55,7 @@ from auto_bisect import bisect_utils
 from auto_bisect import math_utils
 from auto_bisect import post_perf_builder_job as bisect_builder
 from auto_bisect import source_control as source_control_module
+from auto_bisect import ttest
 from telemetry.util import cloud_storage
 
 # The additional repositories that might need to be bisected.
@@ -260,44 +261,31 @@ def _AddAdditionalDepotInfo(depot_info):
 
 
 def ConfidenceScore(good_results_lists, bad_results_lists):
-  """Calculates a confidence percentage.
+  """Calculates a confidence score.
 
-  This is calculated based on how distinct the "good" and "bad" values are,
-  and how noisy the results are. More precisely, the confidence is the quotient
-  of the difference between the closest values across the good and bad groups
-  and the sum of the standard deviations of the good and bad groups.
+  This score is a percentage which represents our degree of confidence in the
+  proposition that the good results and bad results are distinct groups, and
+  their differences aren't due to chance alone.
 
-  TODO(qyearsley): Replace this confidence function with a function that
-  uses a Student's t-test. The confidence would be (1 - p-value), where
-  p-value is the probability of obtaining the given set of good and bad
-  values just by chance.
-
   Args:
     good_results_lists: A list of lists of "good" result numbers.
     bad_results_lists: A list of lists of "bad" result numbers.
 
   Returns:
-    A number between in the range [0, 100].
+    A number in the range [0, 100].
   """
-  # Get the distance between the two groups.
-  means_good = map(math_utils.Mean, good_results_lists)
-  means_bad = map(math_utils.Mean, bad_results_lists)
-  bounds_good = (min(means_good), max(means_good))
-  bounds_bad = (min(means_bad), max(means_bad))
-  dist_between_groups = min(
-      math.fabs(bounds_bad[1] - bounds_good[0]),
-      math.fabs(bounds_bad[0] - bounds_good[1]))
-
-  # Get the sum of the standard deviations of the two groups.
-  good_results_flattened = sum(good_results_lists, [])
-  bad_results_flattened = sum(bad_results_lists, [])
-  stddev_good = math_utils.StandardDeviation(good_results_flattened)
-  stddev_bad = math_utils.StandardDeviation(bad_results_flattened)
-  stddev_sum = stddev_good + stddev_bad
-
-  confidence = dist_between_groups / (max(0.0001, stddev_sum))
-  confidence = int(min(1.0, max(confidence, 0.0)) * 100.0)
-  return confidence
+  if not good_results_lists or not bad_results_lists:
+    return 0.0
+
+  # Flatten the lists of results lists.
+  sample1 = sum(good_results_lists, [])
+  sample2 = sum(bad_results_lists, [])
+
+  # The p-value is approximately the probability of obtaining the given set
+  # of good and bad values just by chance.
+  _, _, p_value = ttest.WelchsTTest(sample1, sample2)
+  return 100.0 * (1.0 - p_value)
 
 
 def GetSHA1HexDigest(contents):
diff --git a/tools/bisect-perf-regression_test.py b/tools/bisect-perf-regression_test.py
index 913a851..d4e88d2 100644
--- a/tools/bisect-perf-regression_test.py
+++ b/tools/bisect-perf-regression_test.py
@@ -31,36 +31,65 @@ class BisectPerfRegressionTest(unittest.TestCase):
     """Cleans up the test environment after each test method."""
     pass
 
-  def testConfidenceScore(self):
+  def testConfidenceScoreHigh(self):
     """Tests the confidence calculation."""
-    bad_values = [[0, 1], [1, 2]]
-    good_values = [[6, 7], [7, 8]]
-    # Closest means are mean(1, 2) and mean(6, 7).
-    distance = 6.5 - 1.5
-    # Standard deviation of [n-1, n, n, n+1] is 0.8165.
-    stddev_sum = 0.8165 + 0.8165
-    # Expected confidence is an int in the range [0, 100].
-    expected_confidence = min(100, int(100 * distance / float(stddev_sum)))
-    self.assertEqual(
-        expected_confidence,
-        bisect_perf_module.ConfidenceScore(bad_values, good_values))
+    bad_values = [[0, 1, 1], [1, 2, 2]]
+    good_values = [[1, 2, 2], [3, 3, 4]]
+    confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+    self.assertEqual(95.0, confidence)
 
-  def testConfidenceScoreZeroConfidence(self):
+  def testConfidenceScoreNotSoHigh(self):
+    """Tests the confidence calculation."""
+    bad_values = [[0, 1, 1], [1, 2, 2]]
+    good_values = [[1, 1, 1], [3, 3, 4]]
+    # The good and bad groups are closer together than in the above test,
+    # so the confidence that they're different is a little lower.
+    confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+    self.assertEqual(80.0, confidence)
+
+  def testConfidenceScoreZero(self):
     """Tests the confidence calculation when it's expected to be 0."""
-    bad_values = [[0, 1], [1, 2], [4, 5], [0, 2]]
-    good_values = [[4, 5], [6, 7], [7, 8]]
-    # Both groups have value lists with means of 4.5, which means distance
-    # between groups is zero, and thus confidence is zero.
-    self.assertEqual(
-        0, bisect_perf_module.ConfidenceScore(bad_values, good_values))
+    bad_values = [[4, 5], [7, 6], [8, 7]]
+    good_values = [[8, 7], [6, 7], [5, 4]]
+    # The good and bad sets contain the same values, so the confidence that
+    # they're different should be zero.
+    confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+    self.assertEqual(0.0, confidence)
 
-  def testConfidenceScoreMaxConfidence(self):
-    """Tests the confidence calculation when it's expected to be 100."""
+  def testConfidenceScoreVeryHigh(self):
+    """Tests the confidence calculation when it's expected to be high."""
     bad_values = [[1, 1], [1, 1]]
     good_values = [[1.2, 1.2], [1.2, 1.2]]
-    # Standard deviation in both groups is zero, so confidence is 100.
-    self.assertEqual(
-        100, bisect_perf_module.ConfidenceScore(bad_values, good_values))
+    confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+    self.assertEqual(99.9, confidence)
+
+  def testConfidenceScoreImbalance(self):
+    """Tests the confidence calculation when one set of numbers is small."""
+    bad_values = [[1.1, 1.2], [1.1, 1.2], [1.0, 1.3], [1.2, 1.3]]
+    good_values = [[1.4]]
+    confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+    self.assertEqual(80.0, confidence)
+
+  def testConfidenceScoreEmptySample(self):
+    """Tests the confidence calculation when one set of numbers is empty."""
+    bad_values = [[1.1, 1.2], [1.1, 1.2], [1.0, 1.3], [1.2, 1.3]]
+    good_values = []
+    confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+    self.assertEqual(0.0, confidence)
+
+  def testConfidenceScoreFunctionalTestResultsInconsistent(self):
+    """Tests the confidence calculation when 0/1 results are mixed."""
+    bad_values = [[1], [1], [0], [1], [1], [1], [0], [1]]
+    good_values = [[0], [0], [1], [0], [1], [0]]
+    confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+    self.assertEqual(80.0, confidence)
+
+  def testConfidenceScoreFunctionalTestResultsConsistent(self):
+    """Tests the confidence calculation when 0/1 results are consistent."""
+    bad_values = [[1], [1], [1], [1], [1], [1], [1], [1]]
+    good_values = [[0], [0], [0], [0], [0], [0]]
+    confidence = bisect_perf_module.ConfidenceScore(bad_values, good_values)
+    self.assertEqual(99.9, confidence)
 
   def testParseDEPSStringManually(self):
     """Tests DEPS parsing."""
diff --git a/tools/run-bisect-perf-regression.py b/tools/run-bisect-perf-regression.py
index b5076bb..e5621cb 100755
--- a/tools/run-bisect-perf-regression.py
+++ b/tools/run-bisect-perf-regression.py
@@ -91,10 +91,9 @@ class Goma(object):
     subprocess.call([self._abs_path_to_goma_file, 'stop'])
 
 
-def _LoadConfigFile(path_to_file):
-  """Loads the given file as a python module and returns the config dictionary.
-
-  The config file is loaded as a Python module.
+def _LoadConfigFile(config_file_path):
+  """Attempts to load the specified config file as a module
+  and grab the global config dict.
 
   Args:
     config_file_path: Path to the config file.
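
As a sanity check on the formulas in _TValue and _DegreesOfFreedom above, the expected values in testWelchsFormula and testWelchSatterthwaiteFormula can be reproduced by hand (illustrative sketch, not part of the patch):

  import math

  mean1, mean2 = 0.299, 0.307
  v1, v2 = 0.05, 0.08
  n1, n2 = 150, 165

  # Welch's t-statistic: difference in means over the pooled standard error.
  t = (mean1 - mean2) / math.sqrt(v1 / n1 + v2 / n2)

  # Welch-Satterthwaite estimate of the degrees of freedom.
  df = ((v1 / n1 + v2 / n2) ** 2) / (
      (v1 ** 2) / ((n1 ** 2) * (n1 - 1)) +
      (v2 ** 2) / ((n2 ** 2) * (n2 - 1)))

  print(t)   # -0.2796823595120407, as in testWelchsFormula
  print(df)  # 307.1987997516727, as in testWelchSatterthwaiteFormula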