Move tools/mac/symbolicate_crash.py from src-internal to the public repo.

BUG=none TEST=none Review URL: http://codereview.chromium.org/7825019 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@99356 0039d316-1c4b-4281-b951-d872f2087c98
author: rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-09-02 13:24:30 +0000
committer: rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-09-02 13:24:30 +0000
commit: 4e5d2bbf9770f19ecdbb3ac0f669f83849cf5091 (patch)
tree: 01fde9d12587b3e34ccb2c2666c177991ef4edb6 /tools/mac
parent: dfba8766fbe2b0d0ac7e538f18aaf6b92aa68c1e (diff)
download: chromium_src-4e5d2bbf9770f19ecdbb3ac0f669f83849cf5091.zip
chromium_src-4e5d2bbf9770f19ecdbb3ac0f669f83849cf5091.tar.gz
chromium_src-4e5d2bbf9770f19ecdbb3ac0f669f83849cf5091.tar.bz2
1 files changed, 504 insertions, 0 deletions
diff --git a/tools/mac/symbolicate_crash.py b/tools/mac/symbolicate_crash.py
new file mode 100755
index 0000000..4e84125
--- /dev/null
+++ b/tools/mac/symbolicate_crash.py
@@ -0,0 +1,504 @@
+#!/usr/bin/env python2.6
+# Copyright (c) 2011 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+This script can take an Apple-style CrashReporter log and symbolicate it. This
+is useful for when a user's reports aren't being uploaded, for example.
+
+Only report versions 6, 7, 8, and 9 are supported. For more information on the
+file format, reference this document:
+  TN2123 <http://developer.apple.com/library/mac/#technotes/tn2004/tn2123.html>
+
+Information on symbolication was gleaned from:
+  <http://developer.apple.com/tools/xcode/symbolizingcrashdumps.html>
+"""
+
+import optparse
+import os.path
+import re
+import subprocess
+import sys
+
+# Maps binary image identifiers to binary names (minus the .dSYM portion) found
+# in the archive. These are the only objects that will be looked up.
+SYMBOL_IMAGE_MAP = {
+  'com.google.Chrome': 'Google Chrome.app',
+  'com.google.Chrome.framework': 'Google Chrome Framework.framework',
+  'com.google.Chrome.helper': 'Google Chrome Helper.app'
+}
+
+class CrashReport(object):
+  """A parsed representation of an Apple CrashReport text file."""
+  def __init__(self, file_name):
+    super(CrashReport, self).__init__()
+    self.report_info = {}
+    self.threads = []
+    self._binary_images = {}
+
+    fd = open(file_name, 'r')
+    self._ParseHeader(fd)
+
+    # Try and get the report version. If it's not a version we handle, abort.
+    self.report_version = int(self.report_info['Report Version'])
+    # Version 6: 10.5 and 10.6 crash report
+    # Version 7: 10.6 spindump report
+    # Version 8: 10.7 spindump report
+    # Version 9: 10.7 crash report
+    valid_versions = (6, 7, 8, 9)
+    if self.report_version not in valid_versions:
+      raise Exception("Only crash reports of versions %s are accepted." %
+          str(valid_versions))
+
+    # If this is a spindump (version 7 or 8 report), use a special parser. The
+    # format is undocumented, but is similar to version 6. However, the spindump
+    # report contains user and kernel stacks for every process on the system.
+    if self.report_version == 7 or self.report_version == 8:
+      self._ParseSpindumpStack(fd)
+    else:
+      self._ParseStack(fd)
+
+    self._ParseBinaryImages(fd)
+    fd.close()
+
+  def Symbolicate(self, symbol_path):
+    """Symbolicates a crash report stack trace."""
+    # In order to be efficient, collect all the offsets that will be passed to
+    # atos by the image name.
+    offsets_by_image = self._CollectAddressesForImages(SYMBOL_IMAGE_MAP.keys())
+
+    # For each image, run atos with the list of addresses.
+    for image_name, addresses in offsets_by_image.items():
+      # If this image was not loaded or is in no stacks, skip.
+      if image_name not in self._binary_images or not len(addresses):
+        continue
+
+      # Combine the |image_name| and |symbol_path| into the path of the dSYM.
+      dsym_file = self._GetDSymPath(symbol_path, image_name)
+
+      # From the list of 2-Tuples of (frame, address), create a list of just
+      # addresses.
+      address_list = map(lambda x: x[1], addresses)
+
+      # Look up the load address of the image.
+      binary_base = self._binary_images[image_name][0]
+
+      # This returns a list of just symbols. The indices will match up with the
+      # list of |addresses|.
+      symbol_names = self._RunAtos(binary_base, dsym_file, address_list)
+      if not symbol_names:
+        print 'Error loading symbols for ' + image_name
+        continue
+
+      # Attaches a list of symbol names to stack frames. This assumes that the
+      # order of |addresses| has stayed the same as |symbol_names|.
+      self._AddSymbolsToFrames(symbol_names, addresses)
+
+  def _ParseHeader(self, fd):
+    """Parses the header section of a crash report, which contains the OS and
+    application version information."""
+    # The header is made up of different sections, depending on the type of
+    # report and the report version. Almost all have a format of a key and
+    # value separated by a colon. Accumulate all of these artifacts into a
+    # dictionary until the first thread stack is reached.
+    thread_re = re.compile('^[ \t]*Thread ([a-f0-9]+)')
+    line = ''
+    while not thread_re.match(line):
+      # Skip blank lines. There are typically three or four sections separated
+      # by newlines in the header.
+      line = line.strip()
+      if line:
+        parts = line.split(':', 1)
+        # Certain lines in different report versions don't follow the key-value
+        # format, so skip them.
+        if len(parts) == 2:
+          # There's a varying amount of space padding after the ':' to align all
+          # the values; strip that.
+          self.report_info[parts[0]] = parts[1].lstrip()
+      line = fd.readline()
+
+    # When this loop exits, the header has been read in full. However, the first
+    # thread stack heading has been read past. Seek backwards from the current
+    # position by the length of the line so that it is re-read when
+    # _ParseStack() is entered.
+    fd.seek(-len(line), os.SEEK_CUR)
+
+  def _ParseStack(self, fd):
+    """Parses the stack dump of a crash report and creates a list of threads
+    and their stack traces."""
+    # Compile a regex that matches the start of a thread stack. Note that this
+    # must be specific to not include the thread state section, which comes
+    # right after all the stack traces.
+    line_re = re.compile('^Thread ([0-9]+)( Crashed)?:(.*)')
+
+    # On entry into this function, the fd has been walked up to the "Thread 0"
+    # line.
+    line = fd.readline().rstrip()
+    in_stack = False
+    thread = None
+    while line_re.match(line) or in_stack:
+      # Check for start of the thread stack.
+      matches = line_re.match(line)
+
+      if not line.strip():
+        # A blank line indicates a break in the thread stack.
+        in_stack = False
+      elif matches:
+        # If this is the start of a thread stack, create the CrashThread.
+        in_stack = True
+        thread = CrashThread(matches.group(1))
+        thread.name = matches.group(3)
+        thread.did_crash = matches.group(2) != None
+        self.threads.append(thread)
+      else:
+        # All other lines are stack frames.
+        thread.stack.append(self._ParseStackFrame(line))
+      # Read the next line.
+      line = fd.readline()
+
+  def _ParseStackFrame(self, line):
+    """Takes in a single line of text and transforms it into a StackFrame."""
+    frame = StackFrame(line)
+
+    # A stack frame is in the format of:
+    # |<frame-number> <binary-image> 0x<address> <symbol> + <offset>|.
+    regex = '^([0-9]+) +(.+)[ \t]+(0x[0-9a-f]+) (.*) \+ ([0-9]+)$'
+    matches = re.match(regex, line)
+    if matches is None:
+      return frame
+
+    # Create a stack frame with the information extracted from the regex.
+    frame.frame_id = matches.group(1)
+    frame.image = matches.group(2)
+    frame.address = int(matches.group(3), 0)  # Convert HEX to an int.
+    frame.original_symbol = matches.group(4)
+    frame.offset = matches.group(5)
+    frame.line = None
+    return frame
+
+  def _ParseSpindumpStack(self, fd):
+    """Parses a spindump stack report. In this format, each thread stack has
+    both a user and kernel trace. Only the user traces are symbolicated."""
+
+    # The stack trace begins with the thread header, which is identified by a
+    # HEX number. The thread names appear to be incorrect in spindumps.
+    user_thread_re = re.compile('^  Thread ([0-9a-fx]{4})')
+
+    # When this method is called, the fd has been walked right up to the first
+    # line.
+    line = fd.readline()
+    in_user_stack = False
+    in_kernel_stack = False
+    thread = None
+    frame_id = 0
+    while user_thread_re.match(line) or in_user_stack or in_kernel_stack:
+      # Check for the start of a thread.
+      matches = user_thread_re.match(line)
+
+      if not line.strip():
+        # A blank line indicates the start of a new thread. The blank line comes
+        # after the kernel stack before a new thread header.
+        in_kernel_stack = False
+      elif matches:
+        # This is the start of a thread header. The next line is the heading for
+        # the user stack, followed by the actual trace.
+        thread = CrashThread(matches.group(1))
+        frame_id = 0
+        self.threads.append(thread)
+        in_user_stack = True
+        line = fd.readline()  # Read past the 'User stack:' header.
+      elif line.startswith('  Kernel stack:'):
+        # The kernel stack header comes immediately after the last frame (really
+        # the top frame) in the user stack, without a blank line.
+        in_user_stack = False
+        in_kernel_stack = True
+      elif in_user_stack:
+        # If this is a line while in the user stack, parse it as a stack frame.
+        thread.stack.append(self._ParseSpindumpStackFrame(line))
+      # Loop with the next line.
+      line = fd.readline()
+
+    # When the loop exits, the file has been read through the 'Binary Images:'
+    # header. Seek backwards so that _ParseBinaryImages() does the right thing.
+    fd.seek(-len(line), os.SEEK_CUR)
+
+  def _ParseSpindumpStackFrame(self, line):
+    """Parses a spindump-style stackframe."""
+    frame = StackFrame(line)
+
+    # The format of the frame is either:
+    # A: |<space><steps> <symbol> + <offset> (in <image-name>) [<address>]|
+    # B: |<space><steps> ??? (in <image-name> + <offset>) [<address>]|
+    regex_a = '^([ ]+[0-9]+) (.*) \+ ([0-9]+) \(in (.*)\) \[(0x[0-9a-f]+)\]'
+    regex_b = '^([ ]+[0-9]+) \?\?\?( \(in (.*) \+ ([0-9]+)\))? \[(0x[0-9a-f]+)\]'
+
+    # Create the stack frame with the information extracted from the regex.
+    matches = re.match(regex_a, line)
+    if matches:
+      frame.frame_id = matches.group(1)[4:]  # Remove some leading spaces.
+      frame.original_symbol = matches.group(2)
+      frame.offset = matches.group(3)
+      frame.image = matches.group(4)
+      frame.address = int(matches.group(5), 0)
+      frame.line = None
+      return frame
+
+    # If pattern A didn't match (it matches most of the time), try B.
+    matches = re.match(regex_b, line)
+    if matches:
+      frame.frame_id = matches.group(1)[4:]  # Remove some leading spaces.
+      frame.image = matches.group(3)
+      frame.offset = matches.group(4)
+      frame.address = int(matches.group(5), 0)
+      frame.line = None
+      return frame
+
+    # Otherwise, this frame could not be matched, so just use the raw input.
+    frame.line = frame.line.strip()
+    return frame
+
+  def _ParseBinaryImages(self, fd):
+    """Parses out the binary images section in order to get the load offset."""
+    # The parser skips some sections, so advance until the "Binary Images"
+    # header is reached.
+    while not fd.readline().lstrip().startswith("Binary Images:"): pass
+
+    # Create a regex to match the lines of format:
+    # |0x<start> - 0x<end> <binary-image> <version> (<version>) <<UUID>> <path>|
+    image_re = re.compile(
+        '[ ]*(0x[0-9a-f]+) -[ \t]+(0x[0-9a-f]+) [+ ]([a-zA-Z0-9._\-]+)')
+
+    # This section is in this format:
+    # |<start address> - <end address> <image name>|.
+    while True:
+      line = fd.readline()
+      if not line.strip():
+        # End when a blank line is hit.
+        return
+      # Match the line to the regex.
+      match = image_re.match(line)
+      if match:
+        # Store the offsets by image name so it can be referenced during
+        # symbolication. These are hex numbers with leading '0x', so int() can
+        # convert them to decimal if base=0.
+        address_range = (int(match.group(1), 0), int(match.group(2), 0))
+        self._binary_images[match.group(3)] = address_range
+
+  def _CollectAddressesForImages(self, images):
+    """Iterates all the threads and collects the stack frames that are in a
+    list of binary |images|. The result is a dictionary, keyed by the image
+    name, that maps to a list of tuples. Each is a 2-Tuple of
+    (stack_frame, address)."""
+    # Create the collection and initialize it with empty lists for each image.
+    collection = {}
+    for image in images:
+      collection[image] = []
+
+    # Perform the iteration.
+    for thread in self.threads:
+      for frame in thread.stack:
+        image_name = self._ImageForAddress(frame.address)
+        if image_name in images:
+          # Replace the image name in the frame in case it was elided.
+          frame.image = image_name
+          collection[frame.image].append((frame, frame.address))
+
+    # Return the result.
+    return collection
+
+  def _ImageForAddress(self, address):
+    """Given a PC address, returns the bundle identifier of the image in which
+    the address resides."""
+    for image_name, address_range in self._binary_images.items():
+      if address >= address_range[0] and address <= address_range[1]:
+        return image_name
+    return None
+
+  def _GetDSymPath(self, base_path, image_name):
+    """Takes a base path for the symbols and an image name. It looks the name up
+    in SYMBOL_IMAGE_MAP and creates a full path to the dSYM in the bundle."""
+    image_file = SYMBOL_IMAGE_MAP[image_name]
+    return os.path.join(base_path, image_file + '.dSYM', 'Contents',
+        'Resources', 'DWARF',
+        os.path.splitext(image_file)[0])  # Chop off the extension.
+
+  def _RunAtos(self, load_address, dsym_file, addresses):
+    """Runs atos with the provided arguments. |addresses| is used as stdin.
+    Returns a list of symbol information in the same order as |addresses|."""
+    args = ['atos', '-l', str(load_address), '-o', dsym_file]
+
+    # Get the arch type. This is of the format |X86 (Native)|.
+    if 'Code Type' in self.report_info:
+      arch = self.report_info['Code Type'].lower().split(' ')
+      if len(arch) == 2:
+        arch = arch[0]
+        if arch == 'x86':
+          # The crash report refers to i386 as x86, but atos doesn't know what
+          # that is.
+          arch = 'i386'
+        args.extend(['-arch', arch])
+
+    proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    addresses = map(hex, addresses)
+    (stdout, stderr) = proc.communicate(' '.join(addresses))
+    if proc.returncode:
+      return None
+    return stdout.rstrip().split('\n')
+
+  def _AddSymbolsToFrames(self, symbols, address_tuples):
+    """Takes a single value (the list) from _CollectAddressesForImages and does
+    a smart-zip with the data returned by atos in |symbols|. Note that the
+    indices must match for this to succeed."""
+    if len(symbols) != len(address_tuples):
+      print 'symbols do not match'
+
+    # Each line of output from atos is in this format:
+    # |<symbol> (in <image>) (<file>:<line>)|.
+    line_regex = re.compile('(.+) \(in (.+)\) (\((.+):([0-9]+)\))?')
+
+    # Zip the two data sets together.
+    for i in range(len(symbols)):
+      symbol_parts = line_regex.match(symbols[i])
+      if not symbol_parts:
+        continue  # Error.
+      frame = address_tuples[i][0]
+      frame.symbol = symbol_parts.group(1)
+      frame.image = symbol_parts.group(2)
+      frame.file_name = symbol_parts.group(4)
+      frame.line_number = symbol_parts.group(5)
+
+
+class CrashThread(object):
+  """A CrashThread represents a stacktrace of a single thread."""
+  def __init__(self, thread_id):
+    super(CrashThread, self).__init__()
+    self.thread_id = thread_id
+    self.name = None
+    self.did_crash = False
+    self.stack = []
+
+  def __repr__(self):
+    name = ''
+    if self.name:
+      name = ': ' + self.name
+    return 'Thread ' + self.thread_id + name + '\n' + \
+        '\n'.join(map(str, self.stack))
+
+
+class StackFrame(object):
+  """A StackFrame is owned by a CrashThread."""
+  def __init__(self, line):
+    super(StackFrame, self).__init__()
+    # The original line. This will be set to None if the line was parsed
+    # successfully.
+    self.line = line
+
+    self.frame_id = 0
+    self.image = None
+    self.address = 0x0
+    self.original_symbol = None
+    self.offset = 0x0
+    # The following members are set after symbolication.
+    self.symbol = None
+    self.file_name = None
+    self.line_number = 0
+
+  def __repr__(self):
+    # If the line could not be parsed, just use the original line.
+    if self.line:
+      return '  %s' % self.line
+
+    # Use different location information depending on symbolicated data.
+    location = None
+    if self.file_name:
+      location = ' - %s:%s' % (self.file_name, self.line_number)
+    else:
+      location = ' + %s' % self.offset
+
+    # Same with the symbol information.
+    symbol = self.original_symbol
+    if self.symbol:
+      symbol = self.symbol
+
+    return '  %s\t0x%x\t[%s\t%s]\t%s' % (self.frame_id, self.address,
+        self.image, location, symbol)
+
+
+def PrettyPrintReport(report):
+  """Takes a crash report and prints it like the crash server would."""
+  print 'Process    : ' + report.report_info['Process']
+  print 'Version    : ' + report.report_info['Version']
+  print 'Date       : ' + report.report_info['Date/Time']
+  print 'OS Version : ' + report.report_info['OS Version']
+  print
+  if 'Crashed Thread' in report.report_info:
+    print 'Crashed Thread : ' + report.report_info['Crashed Thread']
+    print
+  if 'Event' in report.report_info:
+    print 'Event      : ' + report.report_info['Event']
+    print
+
+  for thread in report.threads:
+    print
+    if thread.did_crash:
+      exc_type = report.report_info['Exception Type'].split(' ')[0]
+      exc_code = report.report_info['Exception Codes'].replace('at', '@')
+      print '*CRASHED* ( ' + exc_type + ' / ' + exc_code + ' )'
+    # Version 7 reports have spindump-style output (with a stepped stack trace),
+    # so remove the first tab to get better alignment.
+    if report.report_version == 7:
+      for line in repr(thread).split('\n'):
+        print line.replace('\t', '  ', 1)
+    else:
+      print thread
+
+
+def Main(args):
+  """Program main."""
+  parser = optparse.OptionParser(
+      usage='%prog [options] symbol_path crash_report',
+      description='This will parse and symbolicate an Apple CrashReporter v6-9 '
+          'file.')
+  parser.add_option('-s', '--std-path', action='store_true', dest='std_path',
+                    help='With this flag, the symbol_path is a containing '
+                    'directory, in which a dSYM files are stored in a '
+                    'directory named by the version. Example: '
+                    '[symbolicate_crash.py -s ./symbols/ report.crash] will '
+                    'look for dSYMs in ./symbols/15.0.666.0/ if the report is '
+                    'from that verison.')
+  (options, args) = parser.parse_args(args[1:])
+
+  # Check that we have something to symbolicate.
+  if len(args) != 2:
+    parser.print_usage()
+    return 1
+
+  report = CrashReport(args[1])
+  symbol_path = None
+
+  # If not using the standard layout, this is a full path to the symbols.
+  if not options.std_path:
+    symbol_path = args[0]
+  # Otherwise, use the report version to locate symbols in a directory.
+  else:
+    # This is in the format of |M.N.B.P (B.P)|. Get just the part before the
+    # space.
+    chrome_version = report.report_info['Version'].split(' ')[0]
+    symbol_path = os.path.join(args[0], chrome_version)
+
+  # Check that the symbols exist.
+  if not os.path.isdir(symbol_path):
+    print >>sys.stderr, 'Symbol path %s is not a directory' % symbol_path
+    return 2
+
+  print >>sys.stderr, 'Using symbols from ' + symbol_path
+  print >>sys.stderr, '=' * 80
+
+  report.Symbolicate(symbol_path)
+  PrettyPrintReport(report)
+  return 0
+
+
+if __name__ == '__main__':
+  sys.exit(Main(sys.argv))
author	rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-09-02 13:24:30 +0000
committer	rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-09-02 13:24:30 +0000
commit	4e5d2bbf9770f19ecdbb3ac0f669f83849cf5091 (patch)
tree	01fde9d12587b3e34ccb2c2666c177991ef4edb6 /tools/mac
parent	dfba8766fbe2b0d0ac7e538f18aaf6b92aa68c1e (diff)
download	chromium_src-4e5d2bbf9770f19ecdbb3ac0f669f83849cf5091.zip chromium_src-4e5d2bbf9770f19ecdbb3ac0f669f83849cf5091.tar.gz chromium_src-4e5d2bbf9770f19ecdbb3ac0f669f83849cf5091.tar.bz2