# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
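
"""Computes and represents the final results of a completed bisect run."""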

import math
import os

import bisect_utils
import math_utils
import source_control
import ttest

from bisect_state import RevisionState


class BisectResults(object):
  """Contains results of the completed bisect.

  Properties:
    error: Error message if the bisect failed.
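    abort_reason: Reason the bisect was aborted early, if it was aborted.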

  If both error and abort_reason are None, the following properties are
  present:
    warnings: List of warnings from the bisect run.
    state: BisectState object from which these results were generated.
    first_working_revision: First good revision.
    last_broken_revision: Last bad revision.

  If both of the above revisions are not None, the following properties are
  present:
    culprit_revisions: A list of revisions that contain the change which
        introduced the failure.
    regression_size: For performance bisects, this is a relative change of
        the mean metric value. For other bisects this field always contains
        'zero-to-nonzero'.
    regression_std_err: For performance bisects, it is a pooled standard error
        for groups of good and bad runs. Not used for other bisects.
    confidence: For performance bisects, it is a confidence that the good and
        bad runs are distinct groups. Not used for non-performance bisects.
  """

  def __init__(self, bisect_state=None, depot_registry=None, opts=None,
               runtime_warnings=None, error=None, abort_reason=None):
    """Computes final bisect results after a bisect run is complete.

    This constructor should be called in one of the following ways:
      BisectResults(state, depot_registry, opts, runtime_warnings)
      BisectResults(error=error)

    The first form creates an object representing successful bisect results,
    while the second creates an error result.

    Args:
      bisect_state: BisectState object representing latest bisect state.
      depot_registry: DepotDirectoryRegistry object with information on each
          repository in the bisect_state.
      opts: Options passed to the bisect run.
      runtime_warnings: A list of warnings from the bisect run.
      error: Error message. When error is not None, other arguments are ignored.
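      abort_reason: If not None, the reason the bisect was aborted; like
          error, it prevents computation of the other results.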
    """
    self.error = error
    self.abort_reason = abort_reason
    if error is not None or abort_reason is not None:
      return

    assert (bisect_state is not None and depot_registry is not None and
            opts is not None and runtime_warnings is not None), (
                'Incorrect use of the BisectResults constructor. '
                'When error is None, all other arguments are required.')

    self.state = bisect_state

    rev_states = bisect_state.GetRevisionStates()
    first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
    self.first_working_revision = first_working_rev
    self.last_broken_revision = last_broken_rev

    self.warnings = runtime_warnings

    self.retest_results_tot = None
    self.retest_results_reverted = None

    if first_working_rev is not None and last_broken_rev is not None:
      statistics = self._ComputeRegressionStatistics(
          rev_states, first_working_rev, last_broken_rev)

      self.regression_size = statistics['regression_size']
      self.regression_std_err = statistics['regression_std_err']
      self.confidence = statistics['confidence']

      self.culprit_revisions = self._FindCulpritRevisions(
          rev_states, depot_registry, first_working_rev, last_broken_rev)

      self.warnings += self._GetResultBasedWarnings(
          self.culprit_revisions, opts, self.confidence)
    elif first_working_rev is not None:
      # Set these attributes so that the bisect printer does not break when
      # the regression cannot be reproduced (no broken revision was found).
      self.regression_size = 0
      self.regression_std_err = 0
      self.confidence = 0
      self.culprit_revisions = []

  def AddRetestResults(self, results_tot, results_reverted):
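    """Records the results of re-testing the reverted culprit CL against ToT.

    Appends a warning if either result set is missing, or if the confidence
    of the comparison between the two is not high.

    Args:
      results_tot: Re-test results for a tip-of-tree (ToT) build; the first
          entry's 'values' are used for the comparison.
      results_reverted: Re-test results for a build with the culprit CL
          reverted.
    """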
    if not results_tot or not results_reverted:
      self.warnings.append(
          'Failed to re-test reverted culprit CL against ToT.')
      return

    confidence = BisectResults.ConfidenceScore(
        results_reverted[0]['values'],
        results_tot[0]['values'])

    self.retest_results_tot = RevisionState('ToT', 'n/a', 0)
    self.retest_results_tot.value = results_tot[0]

    self.retest_results_reverted = RevisionState('Reverted', 'n/a', 0)
    self.retest_results_reverted.value = results_reverted[0]

    if confidence <= bisect_utils.HIGH_CONFIDENCE:
      self.warnings.append(
          'Confidence of re-test with reverted CL is not high.'
          ' Check that the regression hasn\'t already recovered.'
          ' There\'s still a chance this is a regression, as performance of'
          ' local builds may not match official builds.')

  @staticmethod
  def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
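    """Returns a list of warnings derived from the bisect results.

    Warns when the culprit range spans multiple commits, when the tests were
    only run once, and when the confidence score is low or zero.
    """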
    warnings = []
    if len(culprit_revisions) > 1:
      warnings.append('Due to build errors, regression range could '
                      'not be narrowed down to a single commit.')
    if opts.repeat_test_count == 1:
      warnings.append('Tests were only set to run once. This may '
                      'be insufficient to get meaningful results.')
    if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
      warnings.append('Confidence is not high. Try bisecting again '
                      'with increased repeat_count, larger range, or '
                      'on another metric.')
    if not confidence:
      warnings.append('Confidence score is 0%. Try bisecting again on '
                      'another platform or another metric.')
    return warnings

  @staticmethod
  def ConfidenceScore(sample1, sample2, accept_single_bad_or_good=False):
    """Calculates a confidence score.

    This score is based on a statistical hypothesis test. The null
    hypothesis is that the two groups of results have no difference,
    i.e. there is no performance regression. The alternative hypothesis
    is that there is some difference between the groups that's unlikely
    to occur by chance.

    The score returned by this function represents our confidence in the
    alternative hypothesis.

    Note that if there's only one item in either sample, this means only
    one revision was classified good or bad, so there's not much evidence
    to make a decision.

    Args:
      sample1: A flat list of "good" result numbers.
      sample2: A flat list of "bad" result numbers.
      accept_single_bad_or_good: If True, compute a value even if
          there is only one bad or good revision.

    Returns:
      A float between 0 and 100; 0 if the samples aren't large enough.
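
    Example (illustrative values; the score is 100 * (1 - p_value) from
    Welch's t-test):
      BisectResults.ConfidenceScore([2.0, 2.1, 2.2], [5.0, 5.1, 4.9])
      # Well-separated samples give a p-value near 0, so the score is close
      # to 100; heavily overlapping samples drive the score toward 0.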
    """
    if ((len(sample1) <= 1 or len(sample2) <= 1) and
        not accept_single_bad_or_good):
      return 0.0
    if not sample1 or not sample2:
      return 0.0
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)

  @staticmethod
  def FindBreakingRevRange(revision_states):
    """Finds the last known good and first known bad revisions.

    Note that since revision_states is expected to be in reverse chronological
    order, the last known good revision is the first revision in the list that
    has the passed property set to 1, hence the name `first_working_revision`.
    The inverse applies to `last_broken_revision`.

    Args:
      revision_states: A list of RevisionState instances.

    Returns:
      A tuple containing the two revision states at the border. (Last
      known good and first known bad.)
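
    For example (illustrative), for states ordered newest-first with passed
    values [0, 0, 1, 1], the state at index 2 is returned as
    first_working_revision and the state at index 1 as last_broken_revision.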
    """
    first_working_revision = None
    last_broken_revision = None

    for revision_state in revision_states:
      if revision_state.passed == 1 and not first_working_revision:
        first_working_revision = revision_state

      if not revision_state.passed:
        last_broken_revision = revision_state

    return first_working_revision, last_broken_revision

  @staticmethod
  def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
                            last_broken_rev):
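    """Queries commit info for each revision in the culprit range.

    Returns:
      A list of (revision, commit info, depot) tuples for the revisions from
      last_broken_rev up to, but not including, first_working_rev (in list
      order).
    """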
    cwd = os.getcwd()

    culprit_revisions = []
    for i in xrange(last_broken_rev.index, first_working_rev.index):
      depot_registry.ChangeToDepotDir(revision_states[i].depot)
      info = source_control.QueryRevisionInfo(revision_states[i].revision)
      culprit_revisions.append((revision_states[i].revision, info,
                                revision_states[i].depot))

    os.chdir(cwd)
    return culprit_revisions

  @classmethod
  def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                   last_broken_rev):
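    """Computes the regression size, standard error and confidence.

    Returns:
      A dict with the keys 'regression_size', 'regression_std_err',
      'confidence' and 'bad_greater_than_good'.
    """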
    # TODO(sergiyb): We assume that value has "values" key, which may not be
    # the case for failure-bisects, where there is a single value only.
    broken_means = [state.value['values']
                    for state in rev_states[:last_broken_rev.index+1]
                    if state.value]

    working_means = [state.value['values']
                     for state in rev_states[first_working_rev.index:]
                     if state.value]

    # Flatten the lists to calculate mean of all values.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])

    # Calculate the approximate size of the regression as the relative change
    # (in percent) between the mean of good runs and the mean of bad runs.
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'

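    # Express the pooled standard error as a percentage of the smaller of the
    # two means, clamping the denominator to avoid division by zero.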
    regression_std_err = math.fabs(math_utils.PooledStandardError(
        [working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect culprit by seeing whether the results
    # of the culprit revision and the revision before that appear to be
    # statistically significantly different.
    confidence = cls.ConfidenceScore(
        sum([first_working_rev.value['values']], []),
        sum([last_broken_rev.value['values']], []))

    bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

    return {'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good}