#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
This script can take an Apple-style CrashReporter log and symbolicate it. This
is useful for when a user's reports aren't being uploaded, for example.

Only versions 6, 7, 8, and 9 reports are supported. For more information on the
file format, reference Apple Technical Note TN2123.

Symbols are resolved by running the atos tool against the dSYM bundles found
under the symbol path given on the command line.
"""

import optparse
import os.path
import re
import subprocess
import sys

# Maps binary image identifiers to binary names (minus the .dSYM portion) found
# in the archive. These are the only objects that will be looked up.
SYMBOL_IMAGE_MAP = {
    'com.google.Chrome': 'Google Chrome.app',
    'com.google.Chrome.framework': 'Google Chrome Framework.framework',
    'com.google.Chrome.helper': 'Google Chrome Helper.app'
}


class CrashReport(object):
    """A parsed representation of an Apple CrashReport text file."""

    def __init__(self, file_name):
        """Parses the crash report stored at |file_name|.

        Raises ValueError if the report version is not one of 6, 7, 8 or 9
        (or if the 'Report Version' value is not an integer), and KeyError if
        the header has no 'Report Version' entry.
        """
        super(CrashReport, self).__init__()
        self.report_info = {}
        self.threads = []
        self._binary_images = {}

        # The context manager guarantees the file is closed even when one of
        # the parsers (or the version check) raises.
        with open(file_name, 'r') as fd:
            self._ParseHeader(fd)

            # Try and get the report version. If it's not a version we handle,
            # abort.
            self.report_version = int(self.report_info['Report Version'])
            # Version 6: 10.5 and 10.6 crash report
            # Version 7: 10.6 spindump report
            # Version 8: 10.7 spindump report
            # Version 9: 10.7 crash report
            valid_versions = (6, 7, 8, 9)
            if self.report_version not in valid_versions:
                raise ValueError(
                    "Only crash reports of versions %s are accepted." %
                    str(valid_versions))

            # If this is a spindump (version 7 or 8 report), use a special
            # parser. The format is undocumented, but is similar to version 6.
            # However, the spindump report contains user and kernel stacks for
            # every process on the system.
            if self.report_version == 7 or self.report_version == 8:
                self._ParseSpindumpStack(fd)
            else:
                self._ParseStack(fd)

            self._ParseBinaryImages(fd)

    def Symbolicate(self, symbol_path):
        """Symbolicates a crash report stack trace.

        |symbol_path| is the directory that contains the dSYM bundles for the
        images listed in SYMBOL_IMAGE_MAP. Frames are updated in place.
        """
        # In order to be efficient, collect all the offsets that will be passed
        # to atos by the image name.
        offsets_by_image = self._CollectAddressesForImages(
            SYMBOL_IMAGE_MAP.keys())

        # For each image, run atos with the list of addresses.
        for image_name, addresses in offsets_by_image.items():
            # If this image was not loaded or is in no stacks, skip.
            if image_name not in self._binary_images or not addresses:
                continue

            # Combine the |image_name| and |symbol_path| into the path of the
            # dSYM.
            dsym_file = self._GetDSymPath(symbol_path, image_name)

            # From the list of 2-Tuples of (frame, address), create a list of
            # just addresses.
            address_list = [address for _, address in addresses]

            # Look up the load address of the image.
            binary_base = self._binary_images[image_name][0]

            # This returns a list of just symbols. The indices will match up
            # with the list of |addresses|.
            symbol_names = self._RunAtos(binary_base, dsym_file, address_list)
            if not symbol_names:
                print('Error loading symbols for ' + image_name)
                continue

            # Attaches a list of symbol names to stack frames. This assumes
            # that the order of |addresses| has stayed the same as
            # |symbol_names|.
            self._AddSymbolsToFrames(symbol_names, addresses)

    def _ParseHeader(self, fd):
        """Parses the header section of a crash report, which contains the OS
        and application version information.

        On return |fd| is positioned at the start of the first thread stack
        heading, or at EOF if the file contains no such heading.
        """
        # The header is made up of different sections, depending on the type of
        # report and the report version. Almost all have a format of a key and
        # value separated by a colon. Accumulate all of these artifacts into a
        # dictionary until the first thread stack is reached.
        thread_re = re.compile(r'^[ \t]*Thread ([a-f0-9]+)')

        while True:
            # Remember where this line starts so that the thread heading can be
            # re-read by the stack parser. An absolute seek to a tell() value
            # works for text files on both Python 2 and Python 3.
            line_start = fd.tell()
            line = fd.readline()
            if not line:
                # EOF without any thread heading; stop instead of spinning on
                # the empty string that readline() keeps returning.
                return
            if thread_re.match(line):
                fd.seek(line_start)
                return

            # Skip blank lines. There are typically three or four sections
            # separated by newlines in the header.
            line = line.strip()
            if not line:
                continue

            # Certain lines in different report versions don't follow the
            # key-value format, so skip them.
            parts = line.split(':', 1)
            if len(parts) == 2:
                # There's a varying amount of space padding after the ':' to
                # align all the values; strip that.
                self.report_info[parts[0]] = parts[1].lstrip()

    def _ParseStack(self, fd):
        """Parses the stack dump of a crash report and creates a list of
        threads and their stack traces.

        On entry |fd| must be positioned at the first "Thread N" heading.
        """
        # Compile a regex that matches the start of a thread stack. Note that
        # this must be specific to not include the thread state section, which
        # comes right after all the stack traces.
        line_re = re.compile(r'^Thread ([0-9]+)( Crashed)?:(.*)')

        # Every line is right-stripped so that frames that cannot be parsed do
        # not carry a trailing newline into the printed report.
        line = fd.readline().rstrip()

        in_stack = False
        thread = None
        while True:
            # Check for start of the thread stack.
            matches = line_re.match(line)
            if not (matches or in_stack):
                break

            if not line:
                # A blank line indicates a break in the thread stack. EOF also
                # arrives here as an empty string and ends the loop on the
                # next pass.
                in_stack = False
            elif matches:
                # If this is the start of a thread stack, create the
                # CrashThread.
                in_stack = True
                thread = CrashThread(matches.group(1))
                thread.name = matches.group(3)
                thread.did_crash = matches.group(2) is not None
                self.threads.append(thread)
            else:
                # All other lines are stack frames.
                thread.stack.append(self._ParseStackFrame(line))

            # Read the next line.
            line = fd.readline().rstrip()

    def _ParseStackFrame(self, line):
        """Takes in a single line of text and transforms it into a StackFrame.

        If the line does not look like a stack frame, the returned StackFrame
        only carries the raw |line|.
        """
        frame = StackFrame(line)

        # A stack frame is in the format of:
        # |frame-number  binary-image  0x-address  symbol + offset|.
        regex = r'^([0-9]+) +(.+)[ \t]+(0x[0-9a-f]+) (.*) \+ ([0-9]+)$'
        matches = re.match(regex, line)
        if matches is None:
            return frame

        # Create a stack frame with the information extracted from the regex.
        frame.frame_id = matches.group(1)
        frame.image = matches.group(2)
        frame.address = int(matches.group(3), 0)  # Convert HEX to an int.
        frame.original_symbol = matches.group(4)
        frame.offset = matches.group(5)
        frame.line = None
        return frame

    def _ParseSpindumpStack(self, fd):
        """Parses a spindump stack report. In this format, each thread stack
        has both a user and kernel trace. Only the user traces are
        symbolicated.

        On return |fd| is positioned at the start of the first line that is not
        part of a thread stack, or at EOF.
        """
        # The stack trace begins with the thread header, which is identified by
        # a HEX number. The thread names appear to be incorrect in spindumps.
        user_thread_re = re.compile(r'^ Thread ([0-9a-fx]+)')

        # When this method is called, the fd has been walked right up to the
        # first line.
        line_start = fd.tell()
        line = fd.readline()

        in_user_stack = False
        in_kernel_stack = False
        thread = None
        # An empty string from readline() means EOF; stop there rather than
        # looping forever while still inside a stack.
        while line:
            # Check for the start of a thread.
            matches = user_thread_re.match(line)
            if not (matches or in_user_stack or in_kernel_stack):
                break

            if not line.strip():
                # A blank line indicates the start of a new thread. The blank
                # line comes after the kernel stack before a new thread header.
                in_kernel_stack = False
            elif matches:
                # This is the start of a thread header. The next line is the
                # heading for the user stack, followed by the actual trace.
                thread = CrashThread(matches.group(1))
                self.threads.append(thread)
                in_user_stack = True
                fd.readline()  # Read past the 'User stack:' header.
            elif line.startswith(' Kernel stack:'):
                # The kernel stack header comes immediately after the last
                # frame (really the top frame) in the user stack, without a
                # blank line.
                in_user_stack = False
                in_kernel_stack = True
            elif in_user_stack:
                # If this is a line while in the user stack, parse it as a
                # stack frame.
                thread.stack.append(self._ParseSpindumpStackFrame(line))
            # Lines inside the kernel stack are intentionally ignored.

            # Loop with the next line.
            line_start = fd.tell()
            line = fd.readline()

        # When the loop exits, one line past the stacks has been read. Seek
        # back to its start so that _ParseBinaryImages() sees it again.
        fd.seek(line_start)

    def _ParseSpindumpStackFrame(self, line):
        """Parses a spindump-style stackframe.

        If neither known pattern matches, the returned StackFrame only carries
        the stripped raw |line|.
        """
        frame = StackFrame(line)

        # The format of the frame is either:
        # A: |  steps symbol + offset (in image-name) [0x-address]|
        # B: |  steps ??? (in image-name + offset) [0x-address]|
        # In B the parenthesized image part is optional.
        regex_a = (r'^([ ]+[0-9]+) (.*) \+ ([0-9]+) \(in (.*)\) '
                   r'\[(0x[0-9a-f]+)\]')
        regex_b = (r'^([ ]+[0-9]+) \?\?\?( \(in (.*) \+ ([0-9]+)\))? '
                   r'\[(0x[0-9a-f]+)\]')

        # Create the stack frame with the information extracted from the regex.
        matches = re.match(regex_a, line)
        if matches:
            # Remove some leading spaces.
            frame.frame_id = matches.group(1)[4:]
            frame.original_symbol = matches.group(2)
            frame.offset = matches.group(3)
            frame.image = matches.group(4)
            frame.address = int(matches.group(5), 0)
            frame.line = None
            return frame

        # If pattern A didn't match (which it will most of the time), try B.
        matches = re.match(regex_b, line)
        if matches:
            # Remove some leading spaces.
            frame.frame_id = matches.group(1)[4:]
            frame.image = matches.group(3)
            frame.offset = matches.group(4)
            frame.address = int(matches.group(5), 0)
            frame.line = None
            return frame

        # Otherwise, this frame could not be matched and just use the raw
        # input.
        frame.line = frame.line.strip()
        return frame

    def _ParseBinaryImages(self, fd):
        """Parses out the binary images section in order to get the load
        offset.

        Fills |self._binary_images| with image-name -> (start, end) address
        tuples. If the section is missing, nothing is recorded.
        """
        # The parser skips some sections, so advance until the "Binary Images"
        # header is reached.
        while True:
            line = fd.readline()
            if not line:
                # EOF before the header was found.
                return
            if line.lstrip().startswith("Binary Images:"):
                break

        # Create a regex to match the lines of format:
        # |0x-start - 0x-end  [+]image-name ...|
        image_re = re.compile(
            r'[ ]*(0x[0-9a-f]+) -[ \t]+(0x[0-9a-f]+) [+ ]([a-zA-Z0-9._\-]+)')

        while True:
            line = fd.readline()
            if not line.strip():
                # End when a blank line (or EOF) is hit.
                return
            # Match the line to the regex.
            match = image_re.match(line)
            if match:
                # Store the offsets by image name so it can be referenced
                # during symbolication. These are hex numbers with leading
                # '0x', so int() can convert them to decimal if base=0.
                address_range = (int(match.group(1), 0),
                                 int(match.group(2), 0))
                self._binary_images[match.group(3)] = address_range

    def _CollectAddressesForImages(self, images):
        """Iterates all the threads and stack frames and collects all the stack
        frames that are in a list of binary |images|. The result is a
        dictionary, keyed by the image name that maps to a list of tuples. Each
        is a 2-Tuple of (stack_frame, address)."""
        # Create the collection and initialize it with empty lists for each
        # image.
        collection = dict((image, []) for image in images)

        # Perform the iteration.
        for thread in self.threads:
            for frame in thread.stack:
                image_name = self._ImageForAddress(frame.address)
                if image_name in images:
                    # Replace the image name in the frame in case it was
                    # elided.
                    frame.image = image_name
                    collection[frame.image].append((frame, frame.address))

        # Return the result.
        return collection

    def _ImageForAddress(self, address):
        """Given a PC address, returns the bundle identifier of the image in
        which the address resides, or None if no loaded image contains it."""
        for image_name, address_range in self._binary_images.items():
            if address_range[0] <= address <= address_range[1]:
                return image_name
        return None

    def _GetDSymPath(self, base_path, image_name):
        """Takes a base path for the symbols and an image name. It looks the
        name up in SYMBOL_IMAGE_MAP and creates a full path to the dSYM in the
        bundle."""
        image_file = SYMBOL_IMAGE_MAP[image_name]
        return os.path.join(
            base_path, image_file + '.dSYM', 'Contents', 'Resources', 'DWARF',
            os.path.splitext(image_file)[0])  # Chop off the extension.

    def _RunAtos(self, load_address, dsym_file, addresses):
        """Runs the atos with the provided arguments. |addresses| is used as
        stdin. Returns a list of symbol information in the same order as
        |addresses|, or None if atos exited with a non-zero status."""
        args = ['atos', '-l', str(load_address), '-o', dsym_file]

        # Get the arch type. This is of the format |X86 (Native)|.
        if 'Code Type' in self.report_info:
            arch = self.report_info['Code Type'].lower().split(' ')
            if len(arch) == 2:
                arch = arch[0]
                if arch == 'x86':
                    # The crash report refers to i386 as x86, but atos doesn't
                    # know what that is.
                    arch = 'i386'
                args.extend(['-arch', arch])

        # universal_newlines makes the pipes text-mode, so the same str input
        # and output work on Python 2 and Python 3.
        proc = subprocess.Popen(args, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
        # Format explicitly instead of using hex(), which appends an 'L' to
        # long integers on Python 2.
        hex_addresses = ['0x%x' % address for address in addresses]
        stdout, _ = proc.communicate(' '.join(hex_addresses))
        if proc.returncode:
            return None
        return stdout.rstrip().split('\n')

    def _AddSymbolsToFrames(self, symbols, address_tuples):
        """Takes a single value (the list) from _CollectAddressesForImages and
        does a smart-zip with the data returned by atos in |symbols|. Note that
        the indices must match for this to succeed; if the lengths differ a
        warning is printed and only the common prefix is processed."""
        if len(symbols) != len(address_tuples):
            print('symbols do not match')

        # Each line of output from atos is in this format:
        # |symbol (in image) (file:line)|, where the file part is optional.
        line_regex = re.compile(r'(.+) \(in (.+)\) (\((.+):([0-9]+)\))?')

        # Zip the two data sets together. zip() stops at the shorter input, so
        # a length mismatch can never index past the end of either list.
        for symbol, (frame, _) in zip(symbols, address_tuples):
            symbol_parts = line_regex.match(symbol)
            if not symbol_parts:
                continue  # Error.
            frame.symbol = symbol_parts.group(1)
            frame.image = symbol_parts.group(2)
            frame.file_name = symbol_parts.group(4)
            frame.line_number = symbol_parts.group(5)


class CrashThread(object):
    """A CrashThread represents a stacktrace of a single thread."""

    def __init__(self, thread_id):
        super(CrashThread, self).__init__()
        self.thread_id = thread_id
        self.name = None
        self.did_crash = False
        self.stack = []

    def __repr__(self):
        name = ''
        if self.name:
            name = ': ' + self.name
        return 'Thread ' + self.thread_id + name + '\n' + \
            '\n'.join(map(str, self.stack))


class StackFrame(object):
    """A StackFrame is owned by a CrashThread."""

    def __init__(self, line):
        super(StackFrame, self).__init__()
        # The original line. The frame parsers set this to None once the line
        # has been split into the fields below; it is kept when the line could
        # not be parsed so that __repr__ can fall back to it.
        self.line = line
        self.frame_id = 0
        self.image = None
        self.address = 0x0
        self.original_symbol = None
        self.offset = 0x0
        # The following members are set after symbolication.
        self.symbol = None
        self.file_name = None
        self.line_number = 0

    def __repr__(self):
        # If parsing failed, just use the original line.
        if self.line:
            return ' %s' % self.line

        # Use different location information depending on symbolicated data.
        if self.file_name:
            location = ' - %s:%s' % (self.file_name, self.line_number)
        else:
            location = ' + %s' % self.offset

        # Same with the symbol information.
        symbol = self.original_symbol
        if self.symbol:
            symbol = self.symbol

        return ' %s\t0x%x\t[%s\t%s]\t%s' % (self.frame_id, self.address,
                                            self.image, location, symbol)


def PrettyPrintReport(report):
    """Takes a crash report and prints it like the crash server would."""
    # Single-argument print(...) calls behave identically as a Python 2 print
    # statement and as the Python 3 print function.
    print('Process : ' + report.report_info['Process'])
    print('Version : ' + report.report_info['Version'])
    print('Date : ' + report.report_info['Date/Time'])
    print('OS Version : ' + report.report_info['OS Version'])
    print('')
    if 'Crashed Thread' in report.report_info:
        print('Crashed Thread : ' + report.report_info['Crashed Thread'])
        print('')
    if 'Event' in report.report_info:
        print('Event : ' + report.report_info['Event'])
        print('')

    for thread in report.threads:
        print('')
        if thread.did_crash:
            exc_type = report.report_info['Exception Type'].split(' ')[0]
            exc_code = report.report_info['Exception Codes'].replace('at', '@')
            print('*CRASHED* ( ' + exc_type + ' / ' + exc_code + ' )')
        # Version 7 reports have spindump-style output (with a stepped stack
        # trace), so remove the first tab to get better alignment.
        # NOTE(review): version 8 reports are also parsed as spindumps in
        # CrashReport.__init__; confirm whether they need the same treatment.
        if report.report_version == 7:
            for line in repr(thread).split('\n'):
                print(line.replace('\t', ' ', 1))
        else:
            print(thread)


def Main(args):
    """Program main.

    |args| is the full argument vector (including the program name). Returns
    the process exit code: 0 on success, 1 on a usage error and 2 when the
    symbol directory does not exist.
    """
    parser = optparse.OptionParser(
        usage='%prog [options] symbol_path crash_report',
        description='This will parse and symbolicate an Apple CrashReporter '
                    'v6-9 file.')
    parser.add_option('-s', '--std-path', action='store_true', dest='std_path',
                      help='With this flag, the symbol_path is a containing '
                           'directory, in which a dSYM files are stored in a '
                           'directory named by the version. Example: '
                           '[symbolicate_crash.py -s ./symbols/ report.crash] '
                           'will look for dSYMs in ./symbols/15.0.666.0/ if '
                           'the report is from that verison.')
    (options, args) = parser.parse_args(args[1:])

    # Check that we have something to symbolicate.
    if len(args) != 2:
        parser.print_usage()
        return 1

    report = CrashReport(args[1])

    if not options.std_path:
        # If not using the standard layout, this is a full path to the symbols.
        symbol_path = args[0]
    else:
        # Otherwise, use the report version to locate symbols in a directory.
        # This is in the format of |M.N.B.P (B.P)|. Get just the part before
        # the space.
        chrome_version = report.report_info['Version'].split(' ')[0]
        symbol_path = os.path.join(args[0], chrome_version)

    # Check that the symbols exist.
    if not os.path.isdir(symbol_path):
        sys.stderr.write('Symbol path %s is not a directory\n' % symbol_path)
        return 2

    sys.stderr.write('Using symbols from ' + symbol_path + '\n')
    sys.stderr.write('=' * 80 + '\n')

    report.Symbolicate(symbol_path)
    PrettyPrintReport(report)
    return 0


if __name__ == '__main__':
    sys.exit(Main(sys.argv))