#!/usr/bin/env python # Copyright (c) 2012 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. """The deep heap profiler script for Chrome.""" from datetime import datetime import json import os import re import subprocess import sys import tempfile BUCKET_ID = 5 VIRTUAL = 0 COMMITTED = 1 ALLOC_COUNT = 2 FREE_COUNT = 3 NULL_REGEX = re.compile('') # If an executable pprof script is in the directory of deep_memory_profiler, # use it. Otherwise, use tcmalloc's pprof. # # A pprof script in deep_memory_profiler/ is prioritized to allow packaging # deep_memory_profiler files with a pprof script. The packaged # deep_memory_profiler is downloaded with pprof by using download.sh. PPROF_PATH = os.path.join(os.path.dirname(__file__), 'pprof') if not (os.path.isfile(PPROF_PATH) and os.access(PPROF_PATH, os.X_OK)): PPROF_PATH = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, 'third_party', 'tcmalloc', 'chromium', 'src', 'pprof') # Heap Profile Dump versions # DUMP_DEEP_1 DOES NOT distinct mmap regions and malloc chunks. # Their stacktraces DO contain mmap* or tc-* at their tops. # They should be processed by POLICY_DEEP_1. DUMP_DEEP_1 = 'DUMP_DEEP_1' # DUMP_DEEP_2 DOES distinct mmap regions and malloc chunks. # Their stacktraces still DO contain mmap* or tc-*. # They should be processed by POLICY_DEEP_1. DUMP_DEEP_2 = 'DUMP_DEEP_2' # DUMP_DEEP_3 DOES distinct mmap regions and malloc chunks. # Their stacktraces DO NOT contain mmap* or tc-*. # They should be processed by POLICY_DEEP_2. DUMP_DEEP_3 = 'DUMP_DEEP_3' # DUMP_DEEP_4 adds some features to DUMP_DEEP_3: # 1. Support comments starting with '#' # 2. Support additional global stats: e.g. nonprofiled-*. DUMP_DEEP_4 = 'DUMP_DEEP_4' # Heap Profile Policy versions # POLICY_DEEP_1 DOES NOT include allocation_type columns. # mmap regions are distincted w/ mmap frames in the pattern column. POLICY_DEEP_1 = 'POLICY_DEEP_1' # POLICY_DEEP_2 DOES include allocation_type columns. # mmap regions are distincted w/ the allocation_type column. POLICY_DEEP_2 = 'POLICY_DEEP_2' # TODO(dmikurube): Avoid global variables. address_symbol_dict = {} appeared_addresses = set() components = [] class ParsingException(Exception): def __init__(self, value): self.value = value def __str__(self): return repr(self.value) class Policy(object): def __init__(self, name, mmap, pattern): self.name = name self.mmap = mmap self.condition = re.compile(pattern + r'\Z') def get_component(policy_list, bucket, mmap): """Returns a component name which a given bucket belongs to. Args: policy_list: A list containing Policy objects. (Parsed policy data by parse_policy.) bucket: A Bucket object to be searched for. mmap: True if searching for a mmap region. Returns: A string representing a component name. """ if not bucket: return 'no-bucket' if bucket.component: return bucket.component stacktrace = ''.join( address_symbol_dict[a] + ' ' for a in bucket.stacktrace).strip() for policy in policy_list: if mmap == policy.mmap and policy.condition.match(stacktrace): bucket.component = policy.name return policy.name assert False class Bucket(object): def __init__(self, stacktrace): self.stacktrace = stacktrace self.component = '' class Log(object): """A class representing one dumped log data.""" def __init__(self, log_path): self.log_path = log_path self.log_lines = [ l for l in open(self.log_path, 'r') if l and not l.startswith('#')] self.log_version = '' sys.stderr.write('Loading a dump: %s\n' % log_path) self.mmap_stacktrace_lines = [] self.malloc_stacktrace_lines = [] self.counters = {} self.log_time = os.stat(self.log_path).st_mtime @staticmethod def dump_stacktrace_lines(stacktrace_lines, buckets): """Prints a given stacktrace. Args: stacktrace_lines: A list of strings which are valid as stacktraces. buckets: A dict mapping bucket ids and their corresponding Bucket objects. """ for l in stacktrace_lines: words = l.split() bucket = buckets.get(int(words[BUCKET_ID])) if not bucket: continue for i in range(0, BUCKET_ID - 1): sys.stdout.write(words[i] + ' ') for address in bucket.stacktrace: sys.stdout.write((address_symbol_dict.get(address) or address) + ' ') sys.stdout.write('\n') def dump_stacktrace(self, buckets): """Prints stacktraces contained in the log. Args: buckets: A dict mapping bucket ids and their corresponding Bucket objects. """ self.dump_stacktrace_lines(self.mmap_stacktrace_lines, buckets) self.dump_stacktrace_lines(self.malloc_stacktrace_lines, buckets) @staticmethod def accumulate_size_for_pprof(stacktrace_lines, policy_list, buckets, component_name, mmap): """Accumulates size of committed chunks and the number of allocated chunks. Args: stacktrace_lines: A list of strings which are valid as stacktraces. policy_list: A list containing Policy objects. (Parsed policy data by parse_policy.) buckets: A dict mapping bucket ids and their corresponding Bucket objects. component_name: A name of component for filtering. mmap: True if searching for a mmap region. Returns: Two integers which are the accumulated size of committed regions and the number of allocated chunks, respectively. """ com_committed = 0 com_allocs = 0 for l in stacktrace_lines: words = l.split() bucket = buckets.get(int(words[BUCKET_ID])) if (not bucket or (component_name and component_name != get_component(policy_list, bucket, mmap))): continue com_committed += int(words[COMMITTED]) com_allocs += int(words[ALLOC_COUNT]) - int(words[FREE_COUNT]) return com_committed, com_allocs @staticmethod def dump_stacktrace_lines_for_pprof(stacktrace_lines, policy_list, buckets, component_name, mmap): """Prints information of stacktrace lines for pprof. Args: stacktrace_lines: A list of strings which are valid as stacktraces. policy_list: A list containing Policy objects. (Parsed policy data by parse_policy.) buckets: A dict mapping bucket ids and their corresponding Bucket objects. component_name: A name of component for filtering. mmap: True if searching for a mmap region. """ for l in stacktrace_lines: words = l.split() bucket = buckets.get(int(words[BUCKET_ID])) if (not bucket or (component_name and component_name != get_component(policy_list, bucket, mmap))): continue sys.stdout.write('%6d: %8s [%6d: %8s] @' % ( int(words[ALLOC_COUNT]) - int(words[FREE_COUNT]), words[COMMITTED], int(words[ALLOC_COUNT]) - int(words[FREE_COUNT]), words[COMMITTED])) for address in bucket.stacktrace: sys.stdout.write(' ' + address) sys.stdout.write('\n') def dump_for_pprof(self, policy_list, buckets, mapping_lines, component_name): """Converts the log file so it can be processed by pprof. Args: policy_list: A list containing Policy objects. (Parsed policy data by parse_policy.) buckets: A dict mapping bucket ids and their corresponding Bucket objects. mapping_lines: A list of strings containing /proc/.../maps. component_name: A name of component for filtering. """ sys.stdout.write('heap profile: ') com_committed, com_allocs = self.accumulate_size_for_pprof( self.mmap_stacktrace_lines, policy_list, buckets, component_name, True) add_committed, add_allocs = self.accumulate_size_for_pprof( self.malloc_stacktrace_lines, policy_list, buckets, component_name, False) com_committed += add_committed com_allocs += add_allocs sys.stdout.write('%6d: %8s [%6d: %8s] @ heapprofile\n' % ( com_allocs, com_committed, com_allocs, com_committed)) self.dump_stacktrace_lines_for_pprof( self.mmap_stacktrace_lines, policy_list, buckets, component_name, True) self.dump_stacktrace_lines_for_pprof( self.malloc_stacktrace_lines, policy_list, buckets, component_name, False) sys.stdout.write('MAPPED_LIBRARIES:\n') for l in mapping_lines: sys.stdout.write(l) @staticmethod def check_stacktrace_line(stacktrace_line, buckets): """Checks if a given stacktrace_line is valid as stacktrace. Args: stacktrace_line: A string to be checked. buckets: A dict mapping bucket ids and their corresponding Bucket objects. Returns: True if the given stacktrace_line is valid. """ words = stacktrace_line.split() if len(words) < BUCKET_ID + 1: return False if words[BUCKET_ID - 1] != '@': return False bucket = buckets.get(int(words[BUCKET_ID])) if bucket: for address in bucket.stacktrace: appeared_addresses.add(address) return True @staticmethod def skip_lines_while(line_number, max_line_number, skipping_condition): """Increments line_number until skipping_condition(line_number) is false. Returns: A pair of an integer indicating a line number after skipped, and a boolean value which is True if found a line which skipping_condition is False for. """ while skipping_condition(line_number): line_number += 1 if line_number >= max_line_number: return line_number, False return line_number, True def parse_stacktraces_while_valid(self, buckets, log_lines, line_number): """Parses stacktrace lines while the lines are valid. Args: buckets: A dict mapping bucket ids and their corresponding Bucket objects. log_lines: A list of lines to be parsed. line_number: An integer representing the starting line number in log_lines. Returns: A pair of a list of valid lines and an integer representing the last line number in log_lines. """ (line_number, _) = self.skip_lines_while( line_number, len(log_lines), lambda n: not log_lines[n].split()[0].isdigit()) stacktrace_lines_start = line_number (line_number, _) = self.skip_lines_while( line_number, len(log_lines), lambda n: self.check_stacktrace_line(log_lines[n], buckets)) return (log_lines[stacktrace_lines_start:line_number], line_number) def parse_stacktraces(self, buckets, line_number): """Parses lines in self.log_lines as stacktrace. Valid stacktrace lines are stored into self.mmap_stacktrace_lines and self.malloc_stacktrace_lines. Args: buckets: A dict mapping bucket ids and their corresponding Bucket objects. line_number: An integer representing the starting line number in log_lines. Raises: ParsingException for invalid dump versions. """ sys.stderr.write(' Version: %s\n' % self.log_version) if self.log_version in (DUMP_DEEP_3, DUMP_DEEP_4): (self.mmap_stacktrace_lines, line_number) = ( self.parse_stacktraces_while_valid( buckets, self.log_lines, line_number)) (line_number, _) = self.skip_lines_while( line_number, len(self.log_lines), lambda n: self.log_lines[n] != 'MALLOC_STACKTRACES:\n') (self.malloc_stacktrace_lines, line_number) = ( self.parse_stacktraces_while_valid( buckets, self.log_lines, line_number)) elif self.log_version == DUMP_DEEP_2: (self.mmap_stacktrace_lines, line_number) = ( self.parse_stacktraces_while_valid( buckets, self.log_lines, line_number)) (line_number, _) = self.skip_lines_while( line_number, len(self.log_lines), lambda n: self.log_lines[n] != 'MALLOC_STACKTRACES:\n') (self.malloc_stacktrace_lines, line_number) = ( self.parse_stacktraces_while_valid( buckets, self.log_lines, line_number)) self.malloc_stacktrace_lines.extend(self.mmap_stacktrace_lines) self.mmap_stacktrace_lines = [] elif self.log_version == DUMP_DEEP_1: (self.malloc_stacktrace_lines, line_number) = ( self.parse_stacktraces_while_valid( buckets, self.log_lines, line_number)) else: raise ParsingException('invalid heap profile dump version: %s' % ( self.log_version)) def parse_global_stats(self): """Parses lines in self.log_lines as global stats.""" (ln, _) = self.skip_lines_while( 0, len(self.log_lines), lambda n: self.log_lines[n] != 'GLOBAL_STATS:\n') if self.log_version == DUMP_DEEP_4: global_stat_names = [ 'total', 'file-exec', 'file-nonexec', 'anonymous', 'stack', 'other', 'nonprofiled-absent', 'nonprofiled-anonymous', 'nonprofiled-file-exec', 'nonprofiled-file-nonexec', 'nonprofiled-stack', 'nonprofiled-other', 'profiled-mmap', 'profiled-malloc'] else: global_stat_names = [ 'total', 'file', 'anonymous', 'other', 'mmap', 'tcmalloc'] for prefix in global_stat_names: (ln, _) = self.skip_lines_while( ln, len(self.log_lines), lambda n: self.log_lines[n].split()[0] != prefix) words = self.log_lines[ln].split() self.counters[prefix + '_virtual'] = int(words[-2]) self.counters[prefix + '_committed'] = int(words[-1]) def parse_version(self): """Parses a version string in self.log_lines. Returns: A pair of (a string representing a version of the stacktrace dump, and an integer indicating a line number next to the version string). Raises: ParsingException for invalid dump versions. """ version = '' # Skip until an identifiable line. headers = ('STACKTRACES:\n', 'MMAP_STACKTRACES:\n', 'heap profile: ') if not self.log_lines: raise ParsingException('Empty heap dump file.') (ln, found) = self.skip_lines_while( 0, len(self.log_lines), lambda n: not self.log_lines[n].startswith(headers)) if not found: raise ParsingException('Invalid heap dump file (no version header).') # Identify a version. if self.log_lines[ln].startswith('heap profile: '): version = self.log_lines[ln][13:].strip() if (version == DUMP_DEEP_2 or version == DUMP_DEEP_3 or version == DUMP_DEEP_4): (ln, _) = self.skip_lines_while( ln, len(self.log_lines), lambda n: self.log_lines[n] != 'MMAP_STACKTRACES:\n') else: raise ParsingException('invalid heap profile dump version: %s' % version) elif self.log_lines[ln] == 'STACKTRACES:\n': version = DUMP_DEEP_1 elif self.log_lines[ln] == 'MMAP_STACKTRACES:\n': version = DUMP_DEEP_2 return (version, ln) def parse_log(self, buckets): self.log_version, ln = self.parse_version() self.parse_global_stats() self.parse_stacktraces(buckets, ln) @staticmethod def accumulate_size_for_policy(stacktrace_lines, policy_list, buckets, sizes, mmap): for l in stacktrace_lines: words = l.split() bucket = buckets.get(int(words[BUCKET_ID])) component_match = get_component(policy_list, bucket, mmap) sizes[component_match] += int(words[COMMITTED]) if component_match.startswith('tc-'): sizes['tc-total-log'] += int(words[COMMITTED]) elif component_match.startswith('mmap-'): sizes['mmap-total-log'] += int(words[COMMITTED]) else: sizes['other-total-log'] += int(words[COMMITTED]) def apply_policy(self, policy_list, buckets, first_log_time): """Aggregates the total memory size of each component. Iterate through all stacktraces and attribute them to one of the components based on the policy. It is important to apply policy in right order. Args: policy_list: A list containing Policy objects. (Parsed policy data by parse_policy.) buckets: A dict mapping bucket ids and their corresponding Bucket objects. first_log_time: An integer representing time when the first log is dumped. Returns: A dict mapping components and their corresponding sizes. """ sys.stderr.write('apply policy:%s\n' % (self.log_path)) sizes = dict((c, 0) for c in components) self.accumulate_size_for_policy(self.mmap_stacktrace_lines, policy_list, buckets, sizes, True) self.accumulate_size_for_policy(self.malloc_stacktrace_lines, policy_list, buckets, sizes, False) if self.log_version == DUMP_DEEP_4: mmap_prefix = 'profiled-mmap' malloc_prefix = 'profiled-malloc' else: mmap_prefix = 'mmap' malloc_prefix = 'tcmalloc' sizes['mmap-no-log'] = ( self.counters['%s_committed' % mmap_prefix] - sizes['mmap-total-log']) sizes['mmap-total-record'] = self.counters['%s_committed' % mmap_prefix] sizes['mmap-total-record-vm'] = self.counters['%s_virtual' % mmap_prefix] sizes['tc-no-log'] = ( self.counters['%s_committed' % malloc_prefix] - sizes['tc-total-log']) sizes['tc-total-record'] = self.counters['%s_committed' % malloc_prefix] sizes['tc-unused'] = ( sizes['mmap-tcmalloc'] - self.counters['%s_committed' % malloc_prefix]) sizes['tc-total'] = sizes['mmap-tcmalloc'] for key, value in { 'total': 'total_committed', 'filemapped': 'file_committed', 'file-exec': 'file-exec_committed', 'file-nonexec': 'file-nonexec_committed', 'anonymous': 'anonymous_committed', 'stack': 'stack_committed', 'other': 'other_committed', 'nonprofiled-absent': 'nonprofiled-absent_committed', 'nonprofiled-anonymous': 'nonprofiled-anonymous_committed', 'nonprofiled-file-exec': 'nonprofiled-file-exec_committed', 'nonprofiled-file-nonexec': 'nonprofiled-file-nonexec_committed', 'nonprofiled-stack': 'nonprofiled-stack_committed', 'nonprofiled-other': 'nonprofiled-other_committed', 'total-vm': 'total_virtual', 'filemapped-vm': 'file_virtual', 'anonymous-vm': 'anonymous_virtual', 'other-vm': 'other_virtual' }.iteritems(): if key in sizes: sizes[key] = self.counters[value] if 'mustbezero' in sizes: removed = ( '%s_committed' % mmap_prefix, 'nonprofiled-absent_committed', 'nonprofiled-anonymous_committed', 'nonprofiled-file-exec_committed', 'nonprofiled-file-nonexec_committed', 'nonprofiled-stack_committed', 'nonprofiled-other_committed') sizes['mustbezero'] = ( self.counters['total_committed'] - sum(self.counters[i] for i in removed)) if 'total-exclude-profiler' in sizes: sizes['total-exclude-profiler'] = ( self.counters['total_committed'] - sizes['mmap-profiler']) if 'hour' in sizes: sizes['hour'] = (self.log_time - first_log_time) / 60.0 / 60.0 if 'minute' in sizes: sizes['minute'] = (self.log_time - first_log_time) / 60.0 if 'second' in sizes: sizes['second'] = self.log_time - first_log_time return sizes @staticmethod def accumulate_size_for_expand(stacktrace_lines, policy_list, buckets, component_name, depth, sizes, mmap): for line in stacktrace_lines: words = line.split() bucket = buckets.get(int(words[BUCKET_ID])) component_match = get_component(policy_list, bucket, mmap) if component_match == component_name: stacktrace_sequence = '' for address in bucket.stacktrace[0 : min(len(bucket.stacktrace), 1 + depth)]: stacktrace_sequence += address_symbol_dict[address] + ' ' if not stacktrace_sequence in sizes: sizes[stacktrace_sequence] = 0 sizes[stacktrace_sequence] += int(words[COMMITTED]) def expand(self, policy_list, buckets, component_name, depth): """Prints all stacktraces in a given component of given depth. Args: policy_list: A list containing Policy objects. (Parsed policy data by parse_policy.) buckets: A dict mapping bucket ids and their corresponding Bucket objects. component_name: A name of component for filtering. depth: An integer representing depth to be printed. """ sizes = {} self.accumulate_size_for_expand( self.mmap_stacktrace_lines, policy_list, buckets, component_name, depth, sizes, True) self.accumulate_size_for_expand( self.malloc_stacktrace_lines, policy_list, buckets, component_name, depth, sizes, False) sorted_sizes_list = sorted( sizes.iteritems(), key=(lambda x: x[1]), reverse=True) total = 0 for size_pair in sorted_sizes_list: sys.stdout.write('%10d %s\n' % (size_pair[1], size_pair[0])) total += size_pair[1] sys.stderr.write('total: %d\n' % (total)) def update_symbols(symbol_path, mapping_lines, chrome_path): """Updates address/symbol mapping on memory and in a .symbol cache file. It reads cached address/symbol mapping from a .symbol file if it exists. Then, it resolves unresolved addresses from a Chrome binary with pprof. Both mappings on memory and in a .symbol cache file are updated. Symbol files are formatted as follows: