summaryrefslogtreecommitdiffstats
path: root/tools/findit/findit_for_crash.py
blob: 689294987ca0b1ccb2f4efbb3889c0e5c6fc78b8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import os
from threading import Lock

import blame
from common import utils
import component_dictionary
import crash_utils
import git_repository_parser
import match_set
import svn_repository_parser


# Priority 1: the CL changes one of the lines the stack crashed on.
LINE_CHANGE_PRIORITY = 1
# Priority 2: the CL only touches a crashed file, not a crashed line.
FILE_CHANGE_PRIORITY = 2
_THIS_DIR = os.path.abspath(os.path.dirname(__file__))
# URL templates parsed once at import time from the config file that lives
# next to this module.
CONFIG = crash_utils.ParseURLsFromConfig(os.path.join(_THIS_DIR,
                                                      'config.ini'))


def GenerateMatchEntry(
    matches, revision_info, revision_number, file_path, function,
    component_path, component_name, crashed_line_numbers, stack_frame_indices,
    file_change_type, repository_parser):
  """Generates a match object and adds it to the match set.

  May return without recording anything when the CL is a revert (or is in
  the ignore set), or when no diff URL can be computed for the file.

  Args:
    matches: A matchset object, a map from CL to a match object.
    revision_info: The revision information, a map from fields (message,
                   changed files, etc) to its values.
    revision_number: The SVN revision number or git hash.
    file_path: The path of the file.
    function: The function that caused a crash.
    component_path: The path of the component this file is from.
    component_name: The name of the component the file is from.
    crashed_line_numbers: The list of the lines in the file that caused
                          the crash.
    stack_frame_indices: The list of positions of this file within a stack.
    file_change_type: Whether file is modified, added or deleted.
    repository_parser: The parser object to parse line diff.
  """
  # Check if this CL should be ignored.
  with matches.matches_lock:
    if revision_number in matches.cls_to_ignore:
      return

    # If this CL is not already identified as suspected, create a new entry.
    if revision_number not in matches.matches:
      match = match_set.Match(revision_info, component_name)
      message = revision_info['message']
      # TODO(jeun): Don't hold lock while issuing http request.
      match.ParseMessage(message, matches.codereview_api_url)

      # If this match is a revert, add to the set of CLs to be ignored.
      if match.is_revert:
        matches.cls_to_ignore.add(revision_number)

        # If this match has info on what CL it is reverted from, add that CL.
        if match.revert_of:
          matches.cls_to_ignore.add(match.revert_of)

        # A revert CL itself cannot be the culprit, so stop here.
        return

      matches.matches[revision_number] = match

    else:
      match = matches.matches[revision_number]

  # Parse the line diff for this file; done outside the matches_lock.
  # changed_line_contents is not used below.
  (diff_url, changed_line_numbers, changed_line_contents) = (
      repository_parser.ParseLineDiff(
          file_path, component_path, file_change_type, revision_number))

  # Ignore this match if the component is not supported for svn.
  if not diff_url:
    return

  # Find the intersection between the lines that this file crashed on and
  # the changed lines.
  (line_number_intersection, stack_frame_index_intersection, functions) = (
      crash_utils.Intersection(
          crashed_line_numbers, stack_frame_indices, changed_line_numbers,
          function))

  # Find the minimum distance between the changed lines and crashed lines.
  (min_distance, min_crashed_line, min_changed_line) = \
      crash_utils.FindMinLineDistance(crashed_line_numbers,
                                      changed_line_numbers)

  # Check whether this CL changes the crashed lines or not.
  if line_number_intersection:
    priority = LINE_CHANGE_PRIORITY
  else:
    priority = FILE_CHANGE_PRIORITY

  # Add the parsed information to the object, under the lock again since
  # other tasks may be appending to the same match concurrently.
  with matches.matches_lock:
    match.crashed_line_numbers.append(line_number_intersection)

    file_name = file_path.split('/')[-1]
    match.changed_files.append(file_name)

    # Update the min distance only if it is less than the current one.
    if min_distance < match.min_distance:
      match.min_distance = min_distance
      match.min_distance_info = (file_name, min_crashed_line, min_changed_line)

    # If this CL does not change the crashed line, all occurrence of this
    # file in the stack has the same priority.
    if not stack_frame_index_intersection:
      stack_frame_index_intersection = stack_frame_indices
      functions = function
    match.stack_frame_indices.append(stack_frame_index_intersection)
    match.changed_file_urls.append(diff_url)
    match.priorities.append(priority)
    match.function_list.append(functions)


def FindMatch(revisions_info_map, file_to_revision_info, file_to_crash_info,
              component_path, component_name, repository_parser,
              codereview_api_url):
  """Finds CLs that modify files in the stacktrace.

  Builds one GenerateMatchEntry task per (crashed file, CL) pair whose base
  file names and sub-paths plausibly match, runs all the tasks, and returns
  the resulting match set with reverted CLs removed.

  Args:
    revisions_info_map: A dictionary mapping revision number to the CL
                        information.
    file_to_revision_info: A dictionary mapping file to the revision that
                           modifies it.
    file_to_crash_info: A dictionary mapping file to its occurrence in
                       stacktrace.
    component_path: The path of the component to search for.
    component_name: The name of the component to search for.
    repository_parser: The parser object to parse the line diff.
    codereview_api_url: A code review url to retrieve data from.

  Returns:
    Matches, a set of match objects.
  """
  matches = match_set.MatchSet(codereview_api_url)

  tasks = []
  # Iterate through the crashed files in the stacktrace.
  for crashed_file_path in file_to_crash_info:
    # Ignore header file.
    if crashed_file_path.endswith('.h'):
      continue

    # If the file in the stacktrace is not changed in any commits, continue.
    for changed_file_path in file_to_revision_info:
      # Compare base file names case-insensitively first; cheap filter.
      changed_file_name = changed_file_path.split('/')[-1].lower()
      crashed_file_name = crashed_file_path.split('/')[-1].lower()
      if changed_file_name != crashed_file_name:
        continue

      # Same base name is not enough; the sub-paths must also look alike.
      if not crash_utils.GuessIfSameSubPath(
          changed_file_path.lower(), crashed_file_path.lower()):
        continue

      crashed_line_numbers = file_to_crash_info.GetCrashedLineNumbers(
          crashed_file_path)
      stack_frame_nums = file_to_crash_info.GetCrashStackFrameIndices(
          crashed_file_path)
      functions = file_to_crash_info.GetCrashFunctions(crashed_file_path)

      # Iterate through the CLs that this file path is changed.
      for (cl, file_change_type) in file_to_revision_info[changed_file_path]:
        # If the file change is delete, ignore this CL.
        if file_change_type == 'D':
          continue

        revision = revisions_info_map[cl]

        # Defer the actual matching work so all pairs can run as tasks.
        tasks.append({
            'function': GenerateMatchEntry,
            'args':[matches, revision, cl, changed_file_path, functions,
                    component_path, component_name, crashed_line_numbers,
                    stack_frame_nums, file_change_type,
                   repository_parser]})

  # Run all the tasks.
  crash_utils.RunTasks(tasks)

  # Drop CLs identified as reverts, and the CLs they reverted.
  matches.RemoveRevertedCLs()

  return matches


def FindMatchForComponent(component_path, file_to_crash_info, changelog,
                          callstack_priority, results, results_lock):
  """Parses a changelog and finds suspected CLs for a given component.

  Args:
    component_path: The path of component to look for the culprit CL.
    file_to_crash_info: A dictionary mapping file to its occurrence in
                        stackframe.
    changelog: The parsed changelog for this component, a 4-tuple of
               (repository_parser, component_name, revisions,
               file_to_revision_map).
    callstack_priority: The priority of this call stack, 0 if from crash stack,
                        1 if from freed, 2 if from previously allocated.
    results: A dictionary to store the result.
    results_lock: A lock that guards results.
  """
  repository_parser, component_name, revisions, file_to_revision_map = (
      changelog)

  # Search this component's changelog for CLs that touch crashed files.
  review_url = CONFIG['codereview']['review_url']
  component_result = FindMatch(
      revisions, file_to_revision_map, file_to_crash_info, component_path,
      component_name, repository_parser, review_url)

  # Flatten the match dictionary into the shared result list so it can be
  # sorted later, tagged with this callstack's priority.
  with results_lock:
    for cl, match in component_result.matches.items():
      results.append((callstack_priority, cl, match))


def FindMatchForCallstack(
    callstack, components, component_to_changelog_map, results,
    results_lock):
  """Finds culprit CLs for a single stack within a stacktrace.

  Builds a component dictionary from the callstack and runs the matching
  for each component that has a parsed changelog.

  Args:
    callstack: A callstack in a stacktrace to find the result for.
    components: A set of components to look for.
    component_to_changelog_map: A map from component to its parsed changelog.
    results: A list to aggregate results from all stacktraces.
    results_lock: A lock that guards results.
  """
  # Group the callstack's frames by component.
  component_dict = component_dictionary.ComponentDictionary(callstack,
                                                            components)
  stack_priority = callstack.priority

  for component_path in component_dict:
    changelog = component_to_changelog_map.get(component_path)
    # Components without a parsed changelog cannot be matched; skip them.
    if changelog is None:
      continue

    FindMatchForComponent(component_path,
                          component_dict.GetFileDict(component_path),
                          changelog, stack_priority, results, results_lock)


def FindMatchForStacktrace(stacktrace, components,
                           component_to_regression_dict):
  """Finds the culprit CL for stacktrace.

  The passed stacktrace is either from release build stacktrace
  or debug build stacktrace.

  Args:
    stacktrace: A list of parsed stacks within a stacktrace.
    components: A set of components to look for.
    component_to_regression_dict: A dictionary mapping component path to
                                  its regression.

  Returns:
    A list of match results from all stacks, as
    (callstack_priority, cl, match) triples.
  """
  # A list to aggregate results from all the callstacks in the stacktrace.
  results = []
  results_lock = Lock()

  # Setup parsers, one per repository type.
  svn_parser = svn_repository_parser.SVNParser(CONFIG['svn'])
  git_parser = git_repository_parser.GitParser(component_to_regression_dict,
                                               CONFIG['git'])

  # Create a cache of parsed revisions, shared by every callstack below so
  # each component's changelog is parsed only once.
  component_to_changelog_map = {}
  for component_path in components:
    component_object = component_to_regression_dict[component_path]
    range_start = component_object['old_revision']
    range_end = component_object['new_revision']

    # If range start is 0, the range is too large and the crash has been
    # introduced before the archived build, so ignore this case.
    if range_start == '0':
      continue

    component_name = component_to_regression_dict[component_path]['name']

    # Pick the git or svn parser depending on what the revision looks like.
    is_git = utils.IsGitHash(range_start)
    if is_git:
      repository_parser = git_parser
    else:
      repository_parser = svn_parser

    (revisions, file_to_revision_map) = repository_parser.ParseChangelog(
        component_path, range_start, range_end)

    # If the returned map from ParseChangelog is empty, we don't need to look
    # further because either the parsing failed or the changelog is empty.
    if not (revisions and file_to_revision_map):
      continue

    component_to_changelog_map[component_path] = (repository_parser,
                                                  component_name,
                                                  revisions,
                                                  file_to_revision_map)

  # Analyze each of the call stacks in the stacktrace.
  for callstack in stacktrace.stack_list:
    FindMatchForCallstack(callstack, components, component_to_changelog_map,
                          results, results_lock)

  return results


def SortMatchesFunction(match_with_stack_priority):
  """A key function used to sort the list of match triples.

  Matches are ordered by, in decreasing significance:
  1) The highest priority file change in the CL (changing crashed line is
  higher priority than just changing the file).
  2) The callstack this match is computed (crash stack, freed, allocation).
  3) The minimum distance between the lines that the CL changes and crashed
  lines.
  4) The minimum stack frame number of the changed file in the match.
  5) The number of files this CL changes (higher the better).

  Args:
    match_with_stack_priority: A match object, with the CL it is from and what
                               callstack it is from.

  Returns:
    A sort key tuple; smaller tuples sort first.
  """
  (stack_priority, _, match) = match_with_stack_priority

  # Lower priority values, stacks and distances rank higher; the file count
  # is negated so CLs touching more files sort first.
  return (min(match.priorities),
          stack_priority,
          match.min_distance,
          crash_utils.FindMinStackFrameNumber(match.stack_frame_indices,
                                              match.priorities),
          -len(match.changed_files))


def SortAndFilterMatches(matches, num_important_frames=5):
  """Filters the list of potential culprit CLs to remove noise.

  Sorts the matches with SortMatchesFunction, then keeps only the leading
  run of matches that are as significant as the best seen so far.

  Args:
    matches: A list containing match results as
             (stack_priority, cl, match) triples.
    num_important_frames: A number of frames on the top of the frame to check
                          for when filtering the results. A match with a file
                          that is in top num_important_frames of the stacktrace
                          is regarded more probable than others.

  Returns:
    Filtered match results.
  """
  new_matches = []
  line_changed = False
  is_important_frame = False
  highest_priority_stack = crash_utils.INFINITY
  matches.sort(key=SortMatchesFunction)
  # Iterate through the matches to find out what results are significant.
  for stack_priority, cl, match in matches:
    # Check if the current match changes crashed line.
    is_line_change = (min(match.priorities) == LINE_CHANGE_PRIORITY)

    # Check which stack this match is from, and finds the highest priority
    # callstack up to this point (a lower number means higher priority).
    current_stack = stack_priority
    if current_stack < highest_priority_stack:
      highest_priority_stack = current_stack

    # Check if current match changes a file that occurs in crash state.
    # NOTE(review): assumes the flattened list is non-empty, otherwise the
    # min() below raises ValueError -- confirm against GenerateMatchEntry.
    flattened_stack_frame_indices = [frame for frame_indices in
                                     match.stack_frame_indices
                                     for frame in frame_indices]
    current_is_important = (
        min(flattened_stack_frame_indices) < num_important_frames)

    # This match and anything lower than this should be ignored if:
    # - Current match does not change crashed lines but there are matches
    # that do so.
    # - Current match is not in crash state but there are matches in it.
    # - There are other matches that came from higher priority stack.
    # The list is sorted, so everything after this point is also noise.
    if (line_changed and not is_line_change) or (
        is_important_frame and not current_is_important) or (
            current_stack > highest_priority_stack):
      break

    # Update the variables.
    if is_line_change:
      line_changed = True
    if current_is_important:
      is_important_frame = True

    # Add current match to the filtered result.
    new_matches.append((stack_priority, cl, match))

  return new_matches


def GenerateReasonForMatches(matches):
  """Generates a reason that a match (CL) is a culprit cl.

  The generated reason is stored on each match as `match.reason`.

  Args:
    matches: A list of (stack_priority, cl, match) triples.
  """
  # Iterate through the matches in the list.  The stack priority and CL
  # are not needed here, only the match object is mutated.
  for _, _, match in matches:
    reason = []

    # Zip the files in the match by the reason they are suspected
    # (how the file is modified).  list() keeps this working on Python 3,
    # where zip returns an iterator.
    match_by_priority = list(zip(
        match.priorities, match.crashed_line_numbers, match.changed_files,
        match.stack_frame_indices, match.function_list))

    # Sort the zipped changed files in the match by their priority so that
    # the changed lines come first in the reason.  (Indexing the tuple
    # instead of a tuple-parameter lambda keeps Python 3 compatibility;
    # tuple parameter unpacking was removed by PEP 3113.)
    match_by_priority.sort(key=lambda file_info: file_info[0])

    # Iterate through the sorted match.
    for index, file_info in enumerate(match_by_priority):
      (priority, crashed_line_numbers, file_name, stack_frame_indices,
       function_list) = file_info

      # If the file in the match is a line change, append an explanation.
      if priority == LINE_CHANGE_PRIORITY:
        # Flatten the per-frame lists of crashed line numbers.
        crashed_line_numbers = [crashed_line_number
                                for lines in crashed_line_numbers
                                for crashed_line_number in lines]
        reason.append(
            'Lines %s of file %s which potentially caused crash '
            'are changed in this cl (%s).\n' %
            (utils.JoinLineNumbers(crashed_line_numbers, accepted_gap=4),
             file_name,
             crash_utils.PrettifyFrameInfo(stack_frame_indices, function_list)))

      else:
        # Get all the files that are not line change.  Since the list is
        # sorted by priority, everything from here on is a file change.
        rest_of_the_files = match_by_priority[index:]

        if len(rest_of_the_files) == 1:
          file_string = 'File %s is changed in this cl '
        else:
          file_string = 'Files %s are changed in this cl '

        # Create a list of file names, and prettify the list.
        file_names = [
            file_name for (_, _, file_name, _, _) in rest_of_the_files]
        pretty_file_names = crash_utils.PrettifyList(file_names)

        # Add the reason, break because we took care of the rest of the files.
        file_string += ('(and is part of stack %s)' %
            crash_utils.PrettifyFrameInfo(stack_frame_indices, function_list))
        reason.append(file_string % pretty_file_names)
        break

    # Set the reason as string.
    match.reason = '\n'.join(reason)


def CombineMatches(matches):
  """Merges entries in matches that refer to the same CL.

  When a CL appears more than once, the first (highest-ranked) occurrence
  is kept and later duplicates fold their reason and minimum-distance data
  into it.  A minimum-distance summary line is then appended to each
  surviving match's reason.

  Args:
    matches: A list of (stack_priority, cl, match) triples, ordered by
             significance.

  Returns:
    A combined list of (stack_priority, cl, match) triples.
  """
  combined_matches = []

  for stack_index, cl, match in matches:
    # Look for an earlier (higher-ranked) entry for the same CL.
    existing = next(
        (prev_match for _, prev_cl, prev_match in combined_matches
         if prev_cl == cl),
        None)

    # First time we see this CL: keep it as-is.
    if existing is None:
      combined_matches.append((stack_index, cl, match))
      continue

    # Duplicate CL: merge its reason and minimum-distance data into the
    # entry that is already in the list.
    existing.reason += '\n' + match.reason
    if match.min_distance < existing.min_distance:
      existing.min_distance = match.min_distance
      existing.min_distance_info = match.min_distance_info

  # Append the minimum-distance summary to each surviving match.
  for _, _, match in combined_matches:
    if match.min_distance_info:
      file_name, min_crashed_line, min_changed_line = match.min_distance_info
      match.reason = match.reason.strip()
      match.reason += (
          '\nMinimum distance from crash line to modified line: %d. '
          '(file: %s, crashed on: %d, modified: %d).' %
          (match.min_distance, file_name, min_crashed_line, min_changed_line))

  return combined_matches


def FilterAndGenerateReasonForMatches(result):
  """A wrapper function.

  Sorts and filters the raw match list, attaches a human-readable reason to
  each surviving match, merges duplicate CLs, and converts everything to
  result objects.

  Args:
    result: A list of match objects.

  Returns:
    A string representation of filtered results.
  """
  filtered = SortAndFilterMatches(result)
  GenerateReasonForMatches(filtered)
  return crash_utils.MatchListToResultList(CombineMatches(filtered))


def ParseCrashComponents(main_stack):
  """Parses the crashing components.

  A crashing component is a component that one of the frames of the crash
  stack is from.

  Args:
    main_stack: Main stack from the stacktrace.

  Returns:
    A set of component paths.
  """
  # Collect the distinct component path of every frame in the stack.
  return set(frame.component_path for frame in main_stack.frame_list)


def GenerateAndFilterBlameList(callstack, component_to_crash_revision_dict,
                               component_to_regression_dict):
  """A wrapper function.

  Finds blame information for a callstack and returns its string
  representation.

  Args:
    callstack: A callstack to find the blame information.
    component_to_crash_revision_dict: A dictionary mapping component to its
                                      crash revision.
    component_to_regression_dict: A dictionary mapping component to its
                                  regression.

  Returns:
    A list of blame results.
  """
  # Prefer regression data when available; fall back to crash revisions.
  parsed_deps = (component_to_regression_dict or
                 component_to_crash_revision_dict)

  # One parser per repository type, used to resolve blame data.
  parsers = {
      'svn': svn_repository_parser.SVNParser(CONFIG['svn']),
      'git': git_repository_parser.GitParser(parsed_deps, CONFIG['git']),
  }

  # Create and generate the blame objects from the callstack.
  blame_list = blame.BlameList()
  blame_list.FindBlame(callstack, component_to_crash_revision_dict,
                       component_to_regression_dict,
                       parsers)

  blame_list.FilterAndSortBlameList()
  return crash_utils.BlameListToResultList(blame_list)


def FindItForCrash(stacktrace_list,
                   callstack,
                   component_to_regression_dict,
                   component_to_crash_revision_dict):
  """Finds the culprit CL from the list of stacktraces.

  Args:
    stacktrace_list: A list of stacktraces to look through, in the order of
                     decreasing significance.
    callstack: A callstack object to show blame information for, if there are
               no results for all stacktraces in the stacktrace_list.
    component_to_regression_dict: A parsed regression information as a
                                  result of parsing DEPS file.
    component_to_crash_revision_dict: A parsed crash revision information.

  Returns:
    A (message, results) pair: the result objects plus a message explaining
    how the result was created.
  """
  no_cl_message = 'Findit could not find any suspected CLs.'

  # Without regression information all we can return is blame information.
  if not component_to_regression_dict:
    blame_result = GenerateAndFilterBlameList(callstack,
                                              component_to_crash_revision_dict,
                                              component_to_regression_dict)
    if blame_result:
      message = ('Regression information is not available. The result is '
                 'the blame information.')
    else:
      message = no_cl_message
    return (message, blame_result)

  for stacktrace in stacktrace_list:
    # Check the next stacktrace if current one is empty.
    if not stacktrace.stack_list:
      continue

    # Extract the crashing components from this stacktrace's crash stack.
    components = ParseCrashComponents(stacktrace.GetCrashStack())

    # Match CLs against this stacktrace, then filter the noise out.
    filtered_result = FilterAndGenerateReasonForMatches(
        FindMatchForStacktrace(stacktrace, components,
                               component_to_regression_dict))

    # The first stacktrace that yields any suspects wins.
    if filtered_result:
      return ('The result is a list of CLs that change the crashed files.',
              filtered_result)

  # No suspects found in any stacktrace: fall back to the blame
  # information for the input callstack.
  blame_result = GenerateAndFilterBlameList(
      callstack, component_to_crash_revision_dict,
      component_to_regression_dict)

  if blame_result:
    message = ('No CL in the regression range changes the crashed files. '
               'The result is the blame information.')
  else:
    message = no_cl_message
  return (message, blame_result)