tools/mac/symbolicate_crash.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504

#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
This script can take an Apple-style CrashReporter log and symbolicate it. This
is useful for when a user's reports aren't being uploaded, for example.

Only versions 6, 7, 8, and 9 reports are supported. For more information on the
file format, reference this document:
  TN2123 <http://developer.apple.com/library/mac/#technotes/tn2004/tn2123.html>

Information on symbolication was gleaned from:
  <http://developer.apple.com/tools/xcode/symbolizingcrashdumps.html>
"""

import optparse
import os.path
import re
import subprocess
import sys

# Bundle identifier of each binary image that gets symbolicated, mapped to the
# name of its binary in the symbol archive (without the '.dSYM' suffix). Images
# that are not listed here are never looked up.
SYMBOL_IMAGE_MAP = {
  'com.google.Chrome.helper': 'Google Chrome Helper.app',
  'com.google.Chrome.framework': 'Google Chrome Framework.framework',
  'com.google.Chrome': 'Google Chrome.app',
}

class CrashReport(object):
  """A parsed representation of an Apple CrashReport text file.

  Attributes:
    report_info: Dictionary of the 'Key: value' lines found in the header.
    report_version: Integer value of the 'Report Version' header field.
    threads: List of CrashThread objects, in the order they were parsed.
  """

  def __init__(self, file_name):
    """Opens and parses the crash report stored at |file_name|.

    Raises:
      KeyError: The header has no 'Report Version' field.
      Exception: The report version is not one of 6, 7, 8 or 9.
    """
    super(CrashReport, self).__init__()
    self.report_info = {}
    self.threads = []
    self._binary_images = {}

    # The context manager closes the file even when parsing raises.
    with open(file_name, 'r') as fd:
      self._ParseHeader(fd)

      # Try and get the report version. If it's not a version we handle, abort.
      self.report_version = int(self.report_info['Report Version'])
      # Version 6: 10.5 and 10.6 crash report
      # Version 7: 10.6 spindump report
      # Version 8: 10.7 spindump report
      # Version 9: 10.7 crash report
      valid_versions = (6, 7, 8, 9)
      if self.report_version not in valid_versions:
        raise Exception("Only crash reports of versions %s are accepted." %
            str(valid_versions))

      # If this is a spindump (version 7 or 8 report), use a special parser.
      # The format is undocumented, but is similar to version 6. However, the
      # spindump report contains user and kernel stacks for every process on
      # the system.
      if self.report_version in (7, 8):
        self._ParseSpindumpStack(fd)
      else:
        self._ParseStack(fd)

      self._ParseBinaryImages(fd)

  def Symbolicate(self, symbol_path):
    """Symbolicates a crash report stack trace.

    |symbol_path| is the directory holding the dSYM bundles of the images
    named in SYMBOL_IMAGE_MAP. Frames are updated in place; images whose
    symbols cannot be loaded are reported on stdout and skipped.
    """
    # In order to be efficient, collect all the offsets that will be passed to
    # atos by the image name.
    offsets_by_image = self._CollectAddressesForImages(SYMBOL_IMAGE_MAP.keys())

    # For each image, run atos with the list of addresses.
    for image_name, addresses in offsets_by_image.items():
      # If this image was not loaded or is in no stacks, skip.
      if image_name not in self._binary_images or not addresses:
        continue

      # Combine the |image_name| and |symbol_path| into the path of the dSYM.
      dsym_file = self._GetDSymPath(symbol_path, image_name)

      # From the list of 2-Tuples of (frame, address), create a list of just
      # addresses.
      address_list = [address for _, address in addresses]

      # Look up the load address of the image.
      binary_base = self._binary_images[image_name][0]

      # This returns a list of just symbols. The indices will match up with the
      # list of |addresses|.
      symbol_names = self._RunAtos(binary_base, dsym_file, address_list)
      if not symbol_names:
        print('Error loading symbols for ' + image_name)
        continue

      # Attaches a list of symbol names to stack frames. This assumes that the
      # order of |addresses| has stayed the same as |symbol_names|.
      self._AddSymbolsToFrames(symbol_names, addresses)

  def _ParseHeader(self, fd):
    """Parses the header section of a crash report, which contains the OS and
    application version information.

    Key/value pairs are stored in |self.report_info|. On return |fd| is
    positioned at the start of the first thread stack heading, or at the end
    of the file if no such heading exists."""
    # The header is made up of different sections, depending on the type of
    # report and the report version. Almost all have a format of a key and
    # value separated by a colon. Accumulate all of these artifacts into a
    # dictionary until the first thread stack is reached.
    thread_re = re.compile(r'^[ \t]*Thread ([a-f0-9]+)')
    while True:
      # Remember where the line starts so the thread heading can be re-read by
      # the stack parser. An absolute tell()/seek() pair is used because
      # relative seeks are not permitted on Python 3 text files.
      line_start = fd.tell()
      line = fd.readline()
      if not line:
        # End of file without any thread stack; stop rather than spin forever.
        return
      if thread_re.match(line):
        fd.seek(line_start)
        return

      # Skip blank lines. There are typically three or four sections separated
      # by newlines in the header.
      line = line.strip()
      if not line:
        continue
      parts = line.split(':', 1)
      # Certain lines in different report versions don't follow the key-value
      # format, so skip them.
      if len(parts) == 2:
        # There's a varying amount of space padding after the ':' to align all
        # the values; strip that.
        self.report_info[parts[0]] = parts[1].lstrip()

  def _ParseStack(self, fd):
    """Parses the stack dump of a crash report and creates a list of threads
    and their stack traces, appending each CrashThread to |self.threads|."""
    # Compile a regex that matches the start of a thread stack. Note that this
    # must be specific to not include the thread state section, which comes
    # right after all the stack traces.
    line_re = re.compile(r'^Thread ([0-9]+)( Crashed)?:(.*)')

    # On entry into this function, the fd has been walked up to the "Thread 0"
    # line.
    line = fd.readline().rstrip()
    in_stack = False
    thread = None
    while True:
      # Check for start of the thread stack.
      matches = line_re.match(line)
      if not matches and not in_stack:
        # Neither a thread heading nor inside a stack: the section is over.
        break

      if not line.strip():
        # A blank line (or end of file) indicates a break in the thread stack.
        in_stack = False
      elif matches:
        # If this is the start of a thread stack, create the CrashThread.
        in_stack = True
        thread = CrashThread(matches.group(1))
        thread.name = matches.group(3)
        thread.did_crash = matches.group(2) is not None
        self.threads.append(thread)
      else:
        # All other lines are stack frames.
        thread.stack.append(self._ParseStackFrame(line))
      # Read the next line.
      line = fd.readline()

  def _ParseStackFrame(self, line):
    """Takes in a single line of text and transforms it into a StackFrame.

    If the line does not look like a stack frame, the returned frame only
    carries the raw text in its |line| attribute."""
    # Drop the trailing newline so that a frame which cannot be parsed does not
    # print an extra blank line after itself.
    line = line.rstrip()
    frame = StackFrame(line)

    # A stack frame is in the format of:
    # |<frame-number> <binary-image> 0x<address> <symbol> <offset>|.
    regex = r'^([0-9]+) +(.+)[ \t]+(0x[0-9a-f]+) (.*) \+ ([0-9]+)$'
    matches = re.match(regex, line)
    if matches is None:
      return frame

    # Create a stack frame with the information extracted from the regex.
    frame.frame_id = matches.group(1)
    frame.image = matches.group(2)
    frame.address = int(matches.group(3), 0)  # Convert HEX to an int.
    frame.original_symbol = matches.group(4)
    frame.offset = matches.group(5)
    # Clearing |line| makes the frame print its parsed fields.
    frame.line = None
    return frame

  def _ParseSpindumpStack(self, fd):
    """Parses a spindump stack report. In this format, each thread stack has
    both a user and kernel trace. Only the user traces are symbolicated.

    On return |fd| is positioned at the start of the first line that does not
    belong to a thread stack."""

    # The stack trace begins with the thread header, which is identified by a
    # HEX number. The thread names appear to be incorrect in spindumps.
    user_thread_re = re.compile(r'^  Thread ([0-9a-fx]{4})')

    # When this method is called, the fd has been walked right up to the first
    # line.
    line_start = fd.tell()
    line = fd.readline()
    in_user_stack = False
    in_kernel_stack = False
    thread = None
    # An empty |line| means end of file; stop there instead of looping forever
    # on a truncated report.
    while line and (user_thread_re.match(line) or in_user_stack or
                    in_kernel_stack):
      # Check for the start of a thread.
      matches = user_thread_re.match(line)

      if not line.strip():
        # A blank line indicates the start of a new thread. The blank line comes
        # after the kernel stack before a new thread header.
        in_kernel_stack = False
      elif matches:
        # This is the start of a thread header. The next line is the heading for
        # the user stack, followed by the actual trace.
        thread = CrashThread(matches.group(1))
        self.threads.append(thread)
        in_user_stack = True
        fd.readline()  # Read past the 'User stack:' header.
      elif line.startswith('  Kernel stack:'):
        # The kernel stack header comes immediately after the last frame (really
        # the top frame) in the user stack, without a blank line.
        in_user_stack = False
        in_kernel_stack = True
      elif in_user_stack:
        # If this is a line while in the user stack, parse it as a stack frame.
        thread.stack.append(self._ParseSpindumpStackFrame(line))
      # Loop with the next line, remembering where it begins.
      line_start = fd.tell()
      line = fd.readline()

    # When the loop exits, one line too many has been consumed. Rewind to the
    # start of that line so that _ParseBinaryImages() sees it. An absolute
    # seek is used because relative seeks are not permitted on Python 3 text
    # files.
    fd.seek(line_start)

  def _ParseSpindumpStackFrame(self, line):
    """Parses a spindump-style stackframe and returns it as a StackFrame."""
    frame = StackFrame(line)

    # The format of the frame is either:
    # A: |<space><steps> <symbol> + <offset> (in <image-name>) [<address>]|
    # B: |<space><steps> ??? (in <image-name> + <offset>) [<address>]|
    regex_a = r'^([ ]+[0-9]+) (.*) \+ ([0-9]+) \(in (.*)\) \[(0x[0-9a-f]+)\]'
    regex_b = (r'^([ ]+[0-9]+) \?\?\?( \(in (.*) \+ ([0-9]+)\))? '
               r'\[(0x[0-9a-f]+)\]')

    # Create the stack frame with the information extracted from the regex.
    matches = re.match(regex_a, line)
    if matches:
      frame.frame_id = matches.group(1)[4:]  # Remove some leading spaces.
      frame.original_symbol = matches.group(2)
      frame.offset = matches.group(3)
      frame.image = matches.group(4)
      frame.address = int(matches.group(5), 0)
      frame.line = None
      return frame

    # If pattern A didn't match (which it will most of the time), try B.
    matches = re.match(regex_b, line)
    if matches:
      frame.frame_id = matches.group(1)[4:]  # Remove some leading spaces.
      frame.image = matches.group(3)
      frame.offset = matches.group(4)
      frame.address = int(matches.group(5), 0)
      frame.line = None
      return frame

    # Otherwise, this frame could not be matched and just use the raw input.
    frame.line = frame.line.strip()
    return frame

  def _ParseBinaryImages(self, fd):
    """Parses out the binary images section in order to get the load offset.

    Fills |self._binary_images| with image name -> (start, end) address
    tuples. Nothing is recorded if the file has no "Binary Images:" header."""
    # The parser skips some sections, so advance until the "Binary Images"
    # header is reached.
    while True:
      line = fd.readline()
      if not line:
        # End of file without the header; give up instead of looping forever.
        return
      if line.lstrip().startswith("Binary Images:"):
        break

    # Create a regex to match the lines of format:
    # |0x<start> - 0x<end> <binary-image> <version> (<version>) <<UUID>> <path>|
    image_re = re.compile(
        r'[ ]*(0x[0-9a-f]+) -[ \t]+(0x[0-9a-f]+) [+ ]([a-zA-Z0-9._\-]+)')

    # This section is in this format:
    # |<start address> - <end address> <image name>|.
    while True:
      line = fd.readline()
      if not line.strip():
        # End when a blank line (or the end of the file) is hit.
        return
      # Match the line to the regex.
      match = image_re.match(line)
      if match:
        # Store the offsets by image name so it can be referenced during
        # symbolication. These are hex numbers with leading '0x', so int() can
        # convert them to decimal if base=0.
        address_range = (int(match.group(1), 0), int(match.group(2), 0))
        self._binary_images[match.group(3)] = address_range

  def _CollectAddressesForImages(self, images):
    """Iterates all the threads and stack frames and all the stack frames that
    are in a list of binary |images|. The result is a dictionary, keyed by the
    image name that maps to a list of tuples. Each is a 2-Tuple of
    (stack_frame, address)"""
    # Create the collection and initialize it with empty lists for each image.
    collection = dict((image, []) for image in images)

    # Perform the iteration.
    for thread in self.threads:
      for frame in thread.stack:
        image_name = self._ImageForAddress(frame.address)
        if image_name in images:
          # Replace the image name in the frame in case it was elided.
          frame.image = image_name
          collection[image_name].append((frame, frame.address))

    # Return the result.
    return collection

  def _ImageForAddress(self, address):
    """Given a PC address, returns the bundle identifier of the image in which
    the address resides, or None if no loaded image contains it."""
    for image_name, (start, end) in self._binary_images.items():
      # Both ends of the recorded range are treated as inclusive.
      if start <= address <= end:
        return image_name
    return None

  def _GetDSymPath(self, base_path, image_name):
    """Takes a base path for the symbols and an image name. It looks the name up
    in SYMBOL_IMAGE_MAP and creates a full path to the dSYM in the bundle."""
    image_file = SYMBOL_IMAGE_MAP[image_name]
    return os.path.join(base_path, image_file + '.dSYM', 'Contents',
        'Resources', 'DWARF',
        os.path.splitext(image_file)[0])  # Chop off the extension.

  def _RunAtos(self, load_address, dsym_file, addresses):
    """Runs the atos with the provided arguments. |addresses| is used as stdin.
    Returns a list of symbol information in the same order as |addresses|, or
    None if atos could not be started or exited with an error."""
    # Pass the load address with an explicit '0x' prefix so it cannot be
    # misread as a different base. '%x' is used instead of hex() because hex()
    # appends an 'L' suffix to Python 2 long integers.
    args = ['atos', '-l', '0x%x' % load_address, '-o', dsym_file]

    # Get the arch type. This is of the format |X86 (Native)|.
    if 'Code Type' in self.report_info:
      arch = self.report_info['Code Type'].lower().split(' ')
      if len(arch) == 2:
        # The crash report spells the architectures differently than atos
        # does, so translate the names that are known to differ.
        atos_arch_names = {'x86': 'i386', 'x86-64': 'x86_64'}
        arch = atos_arch_names.get(arch[0], arch[0])
        args.extend(['-arch', arch])

    try:
      # universal_newlines makes the pipes text-mode, so the same str input and
      # output handling works on both Python 2 and Python 3.
      proc = subprocess.Popen(args, stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE, universal_newlines=True)
    except OSError:
      # atos is not installed or not executable; treat it like a failed run.
      return None
    stdout, _ = proc.communicate(
        ' '.join('0x%x' % address for address in addresses))
    if proc.returncode:
      return None
    return stdout.rstrip().split('\n')

  def _AddSymbolsToFrames(self, symbols, address_tuples):
    """Takes a single value (the list) from _CollectAddressesForImages and does
    a smart-zip with the data returned by atos in |symbols|. Note that the
    indices must match for this to succeed. If the two lists differ in length
    a warning is printed and only the overlapping entries are processed."""
    if len(symbols) != len(address_tuples):
      print('symbols do not match')

    # Each line of output from atos is in this format:
    # |<symbol> (in <image>) (<file>:<line>)|.
    line_regex = re.compile(r'(.+) \(in (.+)\) (\((.+):([0-9]+)\))?')

    # Zip the two data sets together. zip() stops at the shorter list, so a
    # length mismatch cannot index past the end of |address_tuples|.
    for symbol, (frame, _) in zip(symbols, address_tuples):
      symbol_parts = line_regex.match(symbol)
      if not symbol_parts:
        continue  # Error.
      frame.symbol = symbol_parts.group(1)
      frame.image = symbol_parts.group(2)
      frame.file_name = symbol_parts.group(4)
      frame.line_number = symbol_parts.group(5)


class CrashThread(object):
  """The stack trace of one thread taken from a crash report."""

  def __init__(self, thread_id):
    super(CrashThread, self).__init__()
    # Frames of this thread, in the order they are appended.
    self.stack = []
    # Whether this is the thread that crashed.
    self.did_crash = False
    # Optional descriptive name; omitted from the heading when empty.
    self.name = None
    # Identifier string used in the printed heading.
    self.thread_id = thread_id

  def __repr__(self):
    """Returns the thread heading followed by one line per stack frame."""
    heading = 'Thread ' + self.thread_id
    if self.name:
      heading = heading + ': ' + self.name
    frame_lines = [str(frame) for frame in self.stack]
    return heading + '\n' + '\n'.join(frame_lines)


class StackFrame(object):
  """One frame of a thread's stack; instances live in CrashThread.stack."""

  def __init__(self, line):
    super(StackFrame, self).__init__()
    # The raw text of the frame. While this is set (non-empty), the frame
    # prints as this text instead of its individual fields.
    self.line = line

    # Fields filled in when the raw text is parsed.
    self.frame_id = 0
    self.image = None
    self.address = 0x0
    self.original_symbol = None
    self.offset = 0x0

    # Fields filled in by symbolication.
    self.symbol = None
    self.file_name = None
    self.line_number = 0

  def __repr__(self):
    """Returns the raw line if present, else a tab-separated summary."""
    if self.line:
      # No parsed fields are shown; fall back to the original text.
      return '  %s' % self.line

    # Prefer the symbolicated name over the one found in the report.
    symbol = self.symbol or self.original_symbol

    # Prefer the source location over the raw offset when it is known.
    if not self.file_name:
      location = ' + %s' % self.offset
    else:
      location = ' - %s:%s' % (self.file_name, self.line_number)

    fields = (self.frame_id, self.address, self.image, location, symbol)
    return '  %s\t0x%x\t[%s\t%s]\t%s' % fields


def PrettyPrintReport(report):
  """Takes a crash report and prints it like the crash server would.

  Writes the header fields of |report| to stdout, followed by every thread and
  its stack. The 'Process', 'Version', 'Date/Time' and 'OS Version' header
  fields must be present in |report.report_info|; 'Exception Type' and
  'Exception Codes' must be present if any thread is marked as crashed.
  """
  info = report.report_info
  print('Process    : ' + info['Process'])
  print('Version    : ' + info['Version'])
  print('Date       : ' + info['Date/Time'])
  print('OS Version : ' + info['OS Version'])
  print('')
  if 'Crashed Thread' in info:
    print('Crashed Thread : ' + info['Crashed Thread'])
    print('')
  if 'Event' in info:
    print('Event      : ' + info['Event'])
    print('')

  # Versions 7 and 8 are both spindump reports (see CrashReport.__init__), so
  # both need the spindump alignment below.
  is_spindump = report.report_version in (7, 8)

  for thread in report.threads:
    print('')
    if thread.did_crash:
      exc_type = info['Exception Type'].split(' ')[0]
      exc_code = info['Exception Codes'].replace('at', '@')
      print('*CRASHED* ( ' + exc_type + ' / ' + exc_code + ' )')
    # Spindump reports have a stepped stack trace, so remove the first tab of
    # every line to get better alignment.
    if is_spindump:
      for line in repr(thread).split('\n'):
        print(line.replace('\t', '  ', 1))
    else:
      print(thread)


def Main(args):
  """Program main.

  Args:
    args: The complete argument vector, including the program name at index 0.

  Returns:
    The process exit status: 0 on success, 1 if the wrong number of positional
    arguments was given, 2 if the symbol path is not a directory.
  """
  parser = optparse.OptionParser(
      usage='%prog [options] symbol_path crash_report',
      description='This will parse and symbolicate an Apple CrashReporter v6-9 '
          'file.')
  parser.add_option('-s', '--std-path', action='store_true', dest='std_path',
                    help='With this flag, the symbol_path is a containing '
                    'directory, in which a dSYM files are stored in a '
                    'directory named by the version. Example: '
                    '[symbolicate_crash.py -s ./symbols/ report.crash] will '
                    'look for dSYMs in ./symbols/15.0.666.0/ if the report is '
                    'from that verison.')
  (options, args) = parser.parse_args(args[1:])

  # Check that we have something to symbolicate.
  if len(args) != 2:
    parser.print_usage()
    return 1

  report = CrashReport(args[1])

  if options.std_path:
    # Use the report version to locate symbols in a directory. The version is
    # in the format of |M.N.B.P (B.P)|. Get just the part before the space.
    chrome_version = report.report_info['Version'].split(' ')[0]
    symbol_path = os.path.join(args[0], chrome_version)
  else:
    # Not using the standard layout: this is a full path to the symbols.
    symbol_path = args[0]

  # Check that the symbols exist. Diagnostics go to stderr through write() so
  # that the same code runs on both Python 2 and Python 3.
  if not os.path.isdir(symbol_path):
    sys.stderr.write('Symbol path %s is not a directory\n' % symbol_path)
    return 2

  sys.stderr.write('Using symbols from ' + symbol_path + '\n')
  sys.stderr.write('=' * 80 + '\n')

  report.Symbolicate(symbol_path)
  PrettyPrintReport(report)
  return 0


# When run as a script, pass the full argument vector to Main() and use its
# return value as the process exit status.
if __name__ == '__main__':
  sys.exit(Main(sys.argv))