#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""The deep heap profiler script for Chrome."""

from datetime import datetime
import json
import optparse
import os
import re
import shutil
import subprocess
import sys
import tempfile

FIND_RUNTIME_SYMBOLS_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    os.pardir,
    'find_runtime_symbols')
sys.path.append(FIND_RUNTIME_SYMBOLS_PATH)

from find_runtime_symbols import find_runtime_symbols_list
from find_runtime_symbols import find_runtime_typeinfo_symbols_list
from find_runtime_symbols import RuntimeSymbolsInProcess
from prepare_symbol_info import prepare_symbol_info

BUCKET_ID = 5
VIRTUAL = 0
COMMITTED = 1
ALLOC_COUNT = 2
FREE_COUNT = 3
NULL_REGEX = re.compile('')

POLICIES_JSON_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'policies.json')

# Heap Profile Dump versions

# DUMP_DEEP_1 is OBSOLETE.
# DUMP_DEEP_1 DOES NOT distinct mmap regions and malloc chunks.
# Their stacktraces DO contain mmap* or tc-* at their tops.
# They should be processed by POLICY_DEEP_1.
DUMP_DEEP_1 = 'DUMP_DEEP_1'

# DUMP_DEEP_2 is OBSOLETE.
# DUMP_DEEP_2 DOES distinct mmap regions and malloc chunks.
# Their stacktraces still DO contain mmap* or tc-*.
# They should be processed by POLICY_DEEP_1.
DUMP_DEEP_2 = 'DUMP_DEEP_2'

# DUMP_DEEP_3 is OBSOLETE.
# DUMP_DEEP_3 DOES distinct mmap regions and malloc chunks.
# Their stacktraces DO NOT contain mmap* or tc-*.
# They should be processed by POLICY_DEEP_2.
DUMP_DEEP_3 = 'DUMP_DEEP_3'

# DUMP_DEEP_4 is OBSOLETE.
# DUMP_DEEP_4 adds some features to DUMP_DEEP_3:
# 1. Support comments starting with '#'
# 2. Support additional global stats: e.g. nonprofiled-*.
DUMP_DEEP_4 = 'DUMP_DEEP_4'

# DUMP_DEEP_5 doesn't separate sections for malloc and mmap.
# malloc and mmap are identified in bucket files.
DUMP_DEEP_5 = 'DUMP_DEEP_5'

DUMP_DEEP_OBSOLETE = (DUMP_DEEP_1, DUMP_DEEP_2, DUMP_DEEP_3, DUMP_DEEP_4)

# Heap Profile Policy versions

# POLICY_DEEP_1 DOES NOT include allocation_type columns.
# mmap regions are distincted w/ mmap frames in the pattern column.
POLICY_DEEP_1 = 'POLICY_DEEP_1'

# POLICY_DEEP_2 DOES include allocation_type columns.
# mmap regions are distincted w/ the allocation_type column.
POLICY_DEEP_2 = 'POLICY_DEEP_2'

# POLICY_DEEP_3 is in JSON format.
POLICY_DEEP_3 = 'POLICY_DEEP_3'

# POLICY_DEEP_3 contains typeinfo.
POLICY_DEEP_4 = 'POLICY_DEEP_4'


class EmptyDumpException(Exception):
  def __init__(self, value):
    self.value = value
  def __str__(self):
    return repr(self.value)


class ParsingException(Exception):
  def __init__(self, value):
    self.value = value
  def __str__(self):
    return repr(self.value)


class InvalidDumpException(ParsingException):
  def __init__(self, value):
    self.value = value
  def __str__(self):
    return "invalid heap profile dump: %s" % repr(self.value)


class ObsoleteDumpVersionException(ParsingException):
  def __init__(self, value):
    self.value = value
  def __str__(self):
    return "obsolete heap profile dump version: %s" % repr(self.value)


class DelayedStaticSymbols(object):
  """Represents static symbol information loaded lazily."""

  def __init__(self, prefix, keep=False):
    self.maps_path = prefix + '.maps'
    self.keep = keep
    if keep:
      self.prepared_data_dir = prefix + '.pre'
    self.loaded_static_symbols = None
    self.loaded_symbols_in_process = None

  def get(self):
    if not self.loaded_symbols_in_process:
      if not self.keep:
        self.prepared_data_dir = tempfile.mkdtemp()
      try:
        prepare_symbol_info(self.maps_path, self.prepared_data_dir)
        self.loaded_symbols_in_process = RuntimeSymbolsInProcess.load(
            self.prepared_data_dir)
      finally:
        if not self.keep:
          shutil.rmtree(self.prepared_data_dir)
    return self.loaded_symbols_in_process


class Rule(object):
  """Represents one matching rule in a policy file."""

  def __init__(self, name, mmap, stacktrace_pattern, typeinfo_pattern=None):
    self.name = name
    self.mmap = mmap
    self.stacktrace_pattern = re.compile(stacktrace_pattern + r'\Z')
    if typeinfo_pattern:
      self.typeinfo_pattern = re.compile(typeinfo_pattern + r'\Z')
    else:
      self.typeinfo_pattern = None


class Policy(object):
  """Represents a policy, a content of a policy file."""

  def __init__(self, rules, version, components):
    self.rules = rules
    self.version = version
    self.components = components

  def append_rule(self, rule):
    self.rules.append(rule)


def get_component(rule_list, bucket, symbols):
  """Returns a component name which a given bucket belongs to.

  Args:
      rule_list: A list of Rule objects.
      bucket: A Bucket object to be searched for.
      symbols: A dict mapping runtime addresses to symbol names.

  Returns:
      A string representing a component name.
  """
  if not bucket:
    return 'no-bucket'
  if bucket.component_cache:
    return bucket.component_cache

  stacktrace = ''.join(symbols[a] + ' ' for a in bucket.stacktrace).strip()
  typeinfo = bucket.typeinfo_symbol
  if typeinfo.startswith('0x'):
    typeinfo = bucket.typename

  for rule in rule_list:
    if (bucket.mmap == rule.mmap and
        rule.stacktrace_pattern.match(stacktrace) and
        (not rule.typeinfo_pattern or rule.typeinfo_pattern.match(typeinfo))):
      bucket.component_cache = rule.name
      return rule.name

  assert False


class Bucket(object):
  """Represents a bucket, which is a unit of memory classification."""

  def __init__(self, stacktrace, mmap, typeinfo, typename):
    self.stacktrace = stacktrace
    self.mmap = mmap
    self.typeinfo = typeinfo
    self.typeinfo_symbol = typename
    self.typename = typename
    self.component_cache = ''

  def clear_component_cache(self):
    self.component_cache = ''


class Dump(object):
  """Represents one heap profile dump."""

  def __init__(self, dump_path):
    self.dump_path = dump_path
    self.dump_lines = [
        l for l in open(self.dump_path, 'r') if l and not l.startswith('#')]
    self.dump_version = ''
    self.stacktrace_lines = []
    self.counters = {}
    self.dump_time = os.stat(self.dump_path).st_mtime

  def print_stacktrace(self, buckets, symbols):
    """Prints a given stacktrace.

    Args:
        buckets: A dict mapping bucket ids to Bucket objects.
        symbols: A dict mapping runtime addresses to symbol names.
    """
    for line in self.stacktrace_lines:
      words = line.split()
      bucket = buckets.get(int(words[BUCKET_ID]))
      if not bucket:
        continue
      for i in range(0, BUCKET_ID - 1):
        sys.stdout.write(words[i] + ' ')
      for address in bucket.stacktrace:
        sys.stdout.write((symbols.get(address) or ('0x%016x' % address)) + ' ')
      sys.stdout.write('\n')

  @staticmethod
  def accumulate_size_for_pprof(stacktrace_lines, rule_list, buckets,
                                component_name, symbols):
    """Accumulates size of committed chunks and the number of allocated chunks.

    Args:
        stacktrace_lines: A list of strings which are valid as stacktraces.
        rule_list: A list of Rule objects.
        buckets: A dict mapping bucket ids to Bucket objects.
        component_name: A name of component for filtering.
        symbols: A dict mapping runtime addresses to symbol names.

    Returns:
        Two integers which are the accumulated size of committed regions and the
        number of allocated chunks, respectively.
    """
    com_committed = 0
    com_allocs = 0
    for line in stacktrace_lines:
      words = line.split()
      bucket = buckets.get(int(words[BUCKET_ID]))
      if (not bucket or
          (component_name and
           component_name != get_component(rule_list, bucket, symbols))):
        continue

      com_committed += int(words[COMMITTED])
      com_allocs += int(words[ALLOC_COUNT]) - int(words[FREE_COUNT])

    return com_committed, com_allocs

  @staticmethod
  def print_stacktrace_lines_for_pprof(stacktrace_lines, rule_list,
                                      buckets, component_name, symbols):
    """Prints information of stacktrace lines for pprof.

    Args:
        stacktrace_lines: A list of strings which are valid as stacktraces.
        rule_list: A list of Rule objects.
        buckets: A dict mapping bucket ids to Bucket objects.
        component_name: A name of component for filtering.
        symbols: A dict mapping runtime addresses to symbol names.
    """
    for line in stacktrace_lines:
      words = line.split()
      bucket = buckets.get(int(words[BUCKET_ID]))
      if (not bucket or
          (component_name and
           component_name != get_component(rule_list, bucket, symbols))):
        continue

      sys.stdout.write('%6d: %8s [%6d: %8s] @' % (
          int(words[ALLOC_COUNT]) - int(words[FREE_COUNT]),
          words[COMMITTED],
          int(words[ALLOC_COUNT]) - int(words[FREE_COUNT]),
          words[COMMITTED]))
      for address in bucket.stacktrace:
        sys.stdout.write(' 0x%016x' % address)
      sys.stdout.write('\n')

  def print_for_pprof(
      self, rule_list, buckets, maps_lines, component_name, symbols):
    """Converts the heap profile dump so it can be processed by pprof.

    Args:
        rule_list: A list of Rule objects.
        buckets: A dict mapping bucket ids to Bucket objects.
        maps_lines: A list of strings containing /proc/.../maps.
        component_name: A name of component for filtering.
        symbols: A dict mapping runtime addresses to symbol names.
    """
    sys.stdout.write('heap profile: ')
    com_committed, com_allocs = self.accumulate_size_for_pprof(
        self.stacktrace_lines, rule_list, buckets, component_name, symbols)

    sys.stdout.write('%6d: %8s [%6d: %8s] @ heapprofile\n' % (
        com_allocs, com_committed, com_allocs, com_committed))

    self.print_stacktrace_lines_for_pprof(
        self.stacktrace_lines, rule_list, buckets, component_name, symbols)

    sys.stdout.write('MAPPED_LIBRARIES:\n')
    for line in maps_lines:
      sys.stdout.write(line)

  @staticmethod
  def check_stacktrace_line(stacktrace_line, buckets, appeared_addresses):
    """Checks if a given stacktrace_line is valid as stacktrace.

    Args:
        stacktrace_line: A string to be checked.
        buckets: A dict mapping bucket ids to Bucket objects.
        appeared_addresses: A list where appeared addresses will be stored.

    Returns:
        True if the given stacktrace_line is valid.
    """
    words = stacktrace_line.split()
    if len(words) < BUCKET_ID + 1:
      return False
    if words[BUCKET_ID - 1] != '@':
      return False
    bucket = buckets.get(int(words[BUCKET_ID]))
    if bucket:
      for address in bucket.stacktrace:
        appeared_addresses.add(address)
    return True

  @staticmethod
  def skip_lines_while(line_number, max_line_number, skipping_condition):
    """Increments line_number until skipping_condition(line_number) is false.

    Returns:
        A pair of an integer indicating a line number after skipped, and a
        boolean value which is True if found a line which skipping_condition
        is False for.
    """
    while skipping_condition(line_number):
      line_number += 1
      if line_number >= max_line_number:
        return line_number, False
    return line_number, True

  def parse_stacktraces_while_valid(
      self, buckets, dump_lines, line_number, appeared_addresses):
    """Parses stacktrace lines while the lines are valid.

    Args:
        buckets: A dict mapping bucket ids to Bucket objects.
        dump_lines: A list of lines to be parsed.
        line_number: A line number to start parsing in dump_lines.
        appeared_addresses: A list where appeared addresses will be stored.

    Returns:
        A pair of a list of valid lines and an integer representing the last
        line number in dump_lines.
    """
    (line_number, _) = self.skip_lines_while(
        line_number, len(dump_lines),
        lambda n: not dump_lines[n].split()[0].isdigit())
    stacktrace_lines_start = line_number
    (line_number, _) = self.skip_lines_while(
        line_number, len(dump_lines),
        lambda n: self.check_stacktrace_line(
            dump_lines[n], buckets, appeared_addresses))
    return (dump_lines[stacktrace_lines_start:line_number], line_number)

  def parse_stacktraces(self, buckets, line_number, appeared_addresses):
    """Parses lines in self.dump_lines as stacktrace.

    Valid stacktrace lines are stored into self.stacktrace_lines.

    Args:
        buckets: A dict mapping bucket ids to Bucket objects.
        line_number: A line number to start parsing in dump_lines.
        appeared_addresses: A list where appeared addresses will be stored.

    Raises:
        ParsingException for invalid dump versions.
    """
    if self.dump_version == DUMP_DEEP_5:
      (self.stacktrace_lines, line_number) = (
          self.parse_stacktraces_while_valid(
              buckets, self.dump_lines, line_number, appeared_addresses))

    elif self.dump_version in DUMP_DEEP_OBSOLETE:
      raise ObsoleteDumpVersionException(self.dump_version)

    else:
      raise InvalidDumpException('Invalid version: %s' % self.dump_version)

  def parse_global_stats(self):
    """Parses lines in self.dump_lines as global stats."""
    (ln, _) = self.skip_lines_while(
        0, len(self.dump_lines),
        lambda n: self.dump_lines[n] != 'GLOBAL_STATS:\n')

    global_stat_names = [
        'total', 'file-exec', 'file-nonexec', 'anonymous', 'stack', 'other',
        'nonprofiled-absent', 'nonprofiled-anonymous',
        'nonprofiled-file-exec', 'nonprofiled-file-nonexec',
        'nonprofiled-stack', 'nonprofiled-other',
        'profiled-mmap', 'profiled-malloc']

    for prefix in global_stat_names:
      (ln, _) = self.skip_lines_while(
          ln, len(self.dump_lines),
          lambda n: self.dump_lines[n].split()[0] != prefix)
      words = self.dump_lines[ln].split()
      self.counters[prefix + '_virtual'] = int(words[-2])
      self.counters[prefix + '_committed'] = int(words[-1])

  def parse_version(self):
    """Parses a version string in self.dump_lines.

    Returns:
        A pair of (a string representing a version of the stacktrace dump,
        and an integer indicating a line number next to the version string).

    Raises:
        ParsingException for invalid dump versions.
    """
    version = ''

    # Skip until an identifiable line.
    headers = ('STACKTRACES:\n', 'MMAP_STACKTRACES:\n', 'heap profile: ')
    if not self.dump_lines:
      raise EmptyDumpException('Empty heap dump file.')
    (ln, found) = self.skip_lines_while(
        0, len(self.dump_lines),
        lambda n: not self.dump_lines[n].startswith(headers))
    if not found:
      raise InvalidDumpException('No version header.')

    # Identify a version.
    if self.dump_lines[ln].startswith('heap profile: '):
      version = self.dump_lines[ln][13:].strip()
      if version == DUMP_DEEP_5:
        (ln, _) = self.skip_lines_while(
            ln, len(self.dump_lines),
            lambda n: self.dump_lines[n] != 'STACKTRACES:\n')
      elif version in DUMP_DEEP_OBSOLETE:
        raise ObsoleteDumpVersionException(version)
      else:
        raise InvalidDumpException('Invalid version: %s' % version)
    elif self.dump_lines[ln] == 'STACKTRACES:\n':
      raise ObsoleteDumpVersionException(DUMP_DEEP_1)
    elif self.dump_lines[ln] == 'MMAP_STACKTRACES:\n':
      raise ObsoleteDumpVersionException(DUMP_DEEP_2)

    return (version, ln)

  def parse_dump(self, buckets, appeared_addresses):
    self.dump_version, ln = self.parse_version()
    self.parse_global_stats()
    self.parse_stacktraces(buckets, ln, appeared_addresses)

  @staticmethod
  def accumulate_size_for_policy(stacktrace_lines,
                                 rule_list, buckets, sizes, symbols):
    for line in stacktrace_lines:
      words = line.split()
      bucket = buckets.get(int(words[BUCKET_ID]))
      component_match = get_component(rule_list, bucket, symbols)
      sizes[component_match] += int(words[COMMITTED])

      if component_match.startswith('tc-'):
        sizes['tc-total-log'] += int(words[COMMITTED])
      elif component_match.startswith('mmap-'):
        sizes['mmap-total-log'] += int(words[COMMITTED])
      else:
        sizes['other-total-log'] += int(words[COMMITTED])

  def apply_policy(
      self, rule_list, buckets, first_dump_time, components, symbols):
    """Aggregates the total memory size of each component.

    Iterate through all stacktraces and attribute them to one of the components
    based on the policy.  It is important to apply policy in right order.

    Args:
        rule_list: A list of Rule objects.
        buckets: A dict mapping bucket ids to Bucket objects.
        first_dump_time: An integer representing time when the first dump is
            dumped.
        components: A list of strings of component names.
        symbols: A dict mapping runtime addresses to symbol names.

    Returns:
        A dict mapping components and their corresponding sizes.
    """

    sys.stderr.write('Applying policy: "%s".\n' % self.dump_path)
    sizes = dict((c, 0) for c in components)

    self.accumulate_size_for_policy(self.stacktrace_lines,
                                    rule_list, buckets, sizes, symbols)

    mmap_prefix = 'profiled-mmap'
    malloc_prefix = 'profiled-malloc'

    sizes['mmap-no-log'] = (
        self.counters['%s_committed' % mmap_prefix] - sizes['mmap-total-log'])
    sizes['mmap-total-record'] = self.counters['%s_committed' % mmap_prefix]
    sizes['mmap-total-record-vm'] = self.counters['%s_virtual' % mmap_prefix]

    sizes['tc-no-log'] = (
        self.counters['%s_committed' % malloc_prefix] - sizes['tc-total-log'])
    sizes['tc-total-record'] = self.counters['%s_committed' % malloc_prefix]
    sizes['tc-unused'] = (
        sizes['mmap-tcmalloc'] - self.counters['%s_committed' % malloc_prefix])
    sizes['tc-total'] = sizes['mmap-tcmalloc']

    for key, value in {
        'total': 'total_committed',
        'filemapped': 'file_committed',
        'file-exec': 'file-exec_committed',
        'file-nonexec': 'file-nonexec_committed',
        'anonymous': 'anonymous_committed',
        'stack': 'stack_committed',
        'other': 'other_committed',
        'nonprofiled-absent': 'nonprofiled-absent_committed',
        'nonprofiled-anonymous': 'nonprofiled-anonymous_committed',
        'nonprofiled-file-exec': 'nonprofiled-file-exec_committed',
        'nonprofiled-file-nonexec': 'nonprofiled-file-nonexec_committed',
        'nonprofiled-stack': 'nonprofiled-stack_committed',
        'nonprofiled-other': 'nonprofiled-other_committed',
        'total-vm': 'total_virtual',
        'filemapped-vm': 'file_virtual',
        'anonymous-vm': 'anonymous_virtual',
        'other-vm': 'other_virtual' }.iteritems():
      if key in sizes:
        sizes[key] = self.counters[value]

    if 'mustbezero' in sizes:
      removed = (
          '%s_committed' % mmap_prefix,
          'nonprofiled-absent_committed',
          'nonprofiled-anonymous_committed',
          'nonprofiled-file-exec_committed',
          'nonprofiled-file-nonexec_committed',
          'nonprofiled-stack_committed',
          'nonprofiled-other_committed')
      sizes['mustbezero'] = (
          self.counters['total_committed'] -
          sum(self.counters[i] for i in removed))
    if 'total-exclude-profiler' in sizes:
      sizes['total-exclude-profiler'] = (
          self.counters['total_committed'] -
          (sizes['mmap-profiler'] + sizes['mmap-type-profiler']))
    if 'hour' in sizes:
      sizes['hour'] = (self.dump_time - first_dump_time) / 60.0 / 60.0
    if 'minute' in sizes:
      sizes['minute'] = (self.dump_time - first_dump_time) / 60.0
    if 'second' in sizes:
      sizes['second'] = self.dump_time - first_dump_time

    return sizes

  @staticmethod
  def accumulate_size_for_expand(stacktrace_lines, rule_list, buckets,
                                 component_name, depth, sizes, symbols,
                                 typeinfo_symbols):
    for line in stacktrace_lines:
      words = line.split()
      bucket = buckets.get(int(words[BUCKET_ID]))
      component_match = get_component(rule_list, bucket, symbols)
      if component_match == component_name:
        stacktrace_sequence = ''
        if bucket.typeinfo:
          stacktrace_sequence += '(type=%s)' % typeinfo_symbols[bucket.typeinfo]
          stacktrace_sequence += ' (type.name=%s) ' % bucket.typename
        for address in bucket.stacktrace[0 : min(len(bucket.stacktrace),
                                                 1 + depth)]:
          stacktrace_sequence += symbols[address] + ' '
        if not stacktrace_sequence in sizes:
          sizes[stacktrace_sequence] = 0
        sizes[stacktrace_sequence] += int(words[COMMITTED])

  def expand(
      self, rule_list, buckets, component_name, depth, symbols,
      typeinfo_symbols):
    """Prints all stacktraces in a given component of given depth.

    Args:
        rule_list: A list of Rule objects.
        buckets: A dict mapping bucket ids to Bucket objects.
        component_name: A name of component for filtering.
        depth: An integer representing depth to be printed.
        symbols: A dict mapping runtime addresses to symbol names.
    """
    sizes = {}

    self.accumulate_size_for_expand(
        self.stacktrace_lines, rule_list, buckets, component_name,
        depth, sizes, symbols, typeinfo_symbols)

    sorted_sizes_list = sorted(
        sizes.iteritems(), key=(lambda x: x[1]), reverse=True)
    total = 0
    for size_pair in sorted_sizes_list:
      sys.stdout.write('%10d %s\n' % (size_pair[1], size_pair[0]))
      total += size_pair[1]
    sys.stderr.write('total: %d\n' % (total))


def update_symbols(
    symbol_path, delayed_static_symbols, appeared_addresses,
    parameter_find_runtime_symbols_list, symbols):
  """Updates address/symbol mapping on memory and in a .symbol cache file.

  It reads cached address/symbol mapping from a .symbol file if it exists.
  Then, it resolves unresolved addresses from a Chrome binary with pprof.
  Both mappings on memory and in a .symbol cache file are updated.

  Symbol files are formatted as follows:
    <Address> <Symbol>
    <Address> <Symbol>
    <Address> <Symbol>
    ...

  Args:
      symbol_path: A string representing a path for a .symbol file.
      delayed_static_symbols: A DelayedStaticSymbols object.
      appeared_addresses: A list of known addresses.
      parameter_find_runtime_symbols_list: A function to find symbols.
      symbols: A dict mapping runtime addresses to symbol names.
  """
  with open(symbol_path, mode='a+') as symbol_f:
    symbol_lines = symbol_f.readlines()
    if symbol_lines:
      for line in symbol_lines:
        items = line.split(None, 1)
        if len(items) == 1:
          items.append('??')
        symbols[int(items[0], 16)] = items[1].rstrip()
    if symbols:
      sys.stderr.write('  Found %d symbols in cache.\n' % len(symbols))
    else:
      sys.stderr.write('  No symbols found in cache.\n')

    unresolved_addresses = sorted(
        a for a in appeared_addresses if a not in symbols)

    if not unresolved_addresses:
      sys.stderr.write('  No need to resolve any more addresses.\n')
    else:
      sys.stderr.write('  %d addresses unresolved.\n' %
                       len(unresolved_addresses))

      sys.stderr.write('  Loading symbols\n')
      symbols_in_process = delayed_static_symbols.get()
      symbol_list = parameter_find_runtime_symbols_list(
          symbols_in_process, unresolved_addresses)
      sys.stderr.write('  Loaded\n')

      for address, symbol in zip(unresolved_addresses, symbol_list):
        if not symbol:
          symbol = '??'
        stripped_symbol = symbol.strip()
        symbols[address] = stripped_symbol
        symbol_f.write('%x %s\n' % (address, stripped_symbol))

    sys.stderr.write('  All symbols resolved.\n')


def parse_policy_text(policy_path):
  """Parses policy file in text format.

  A policy file contains component's names and their
  stacktrace pattern written in regular expression.
  Those patterns are matched against each symbols of
  each stacktraces in the order written in the policy file

  TODO(dmikurube): Deprecate this function after a while.

  Args:
       policy_path: A path for a policy file.
  Returns:
       A loaded policy object.
  """
  with open(policy_path, mode='r') as policy_f:
    policy_lines = policy_f.readlines()

  policy_version = POLICY_DEEP_1
  if policy_lines[0].startswith('heap profile policy: '):
    policy_version = policy_lines[0][21:].strip()
    policy_lines.pop(0)
  rule_list = []
  components = []

  if policy_version == POLICY_DEEP_2 or policy_version == POLICY_DEEP_1:
    for line in policy_lines:
      if line[0] == '#':
        continue

      if policy_version == POLICY_DEEP_2:
        (name, allocation_type, pattern) = line.strip().split(None, 2)
        mmap = False
        if allocation_type == 'mmap':
          mmap = True
      elif policy_version == POLICY_DEEP_1:
        name = line.split()[0]
        pattern = line[len(name) : len(line)].strip()
        mmap = False

      if pattern != 'default':
        rule_list.append(Rule(name, mmap, pattern))
      if components.count(name) == 0:
        components.append(name)

  else:
    sys.stderr.write('  invalid heap profile policy version: %s\n' % (
        policy_version))

  return Policy(rule_list, policy_version, components)


def parse_policy_json(policy_path):
  """Parses policy file in json format.

  A policy file contains component's names and their
  stacktrace pattern written in regular expression.
  Those patterns are matched against each symbols of
  each stacktraces in the order written in the policy file

  Args:
       policy_path: A path for a policy file.
  Returns:
       A loaded policy object.
  """
  with open(policy_path, mode='r') as f:
    policy = json.load(f)

  rules = []
  for rule in policy['rules']:
    if 'typeinfo' in rule:
      rules.append(Rule(
          rule['name'], rule['allocator'] == 'mmap', rule['stacktrace'],
          rule['typeinfo']))
    else:
      rules.append(Rule(
          rule['name'], rule['allocator'] == 'mmap', rule['stacktrace']))
  return Policy(rules, policy['version'], policy['components'])


def find_prefix(path):
  return re.sub('\.[0-9][0-9][0-9][0-9]\.heap', '', path)


def load_buckets(prefix):
  # Reading buckets
  sys.stderr.write('Loading bucket files.\n')
  appeared_typeinfo_addresses = set()
  buckets = {}
  bucket_count = 0
  n = 0
  while True:
    buckets_path = '%s.%04d.buckets' % (prefix, n)
    if not os.path.exists(buckets_path):
      if n > 10:
        break
      n += 1
      continue
    sys.stderr.write('  %s\n' % buckets_path)
    with open(buckets_path, 'r') as buckets_f:
      for line in buckets_f:
        words = line.split()
        typeinfo = None
        typename = ''
        stacktrace_begin = 2
        for index, word in enumerate(words):
          if index < 2:
            continue
          if word[0] == 't':
            typeinfo = int(word[1:], 16)
            appeared_typeinfo_addresses.add(typeinfo)
          elif word[0] == 'n':
            typename = word[1:]
          else:
            stacktrace_begin = index
            break
        stacktrace = [int(address, 16) for address in words[stacktrace_begin:]]
        buckets[int(words[0])] = Bucket(
            stacktrace, words[1] == 'mmap', typeinfo, typename)
    n += 1

  return buckets, appeared_typeinfo_addresses


def determine_dump_path_list(dump_path, prefix):
  dump_path_list = [dump_path]

  # search for the sequence of files
  n = int(dump_path[len(dump_path) - 9 : len(dump_path) - 5])
  n += 1  # skip current file
  while True:
    p = '%s.%04d.heap' % (prefix, n)
    if os.path.exists(p):
      dump_path_list.append(p)
    else:
      break
    n += 1

  return dump_path_list


def load_single_dump(dump_path, buckets, appeared_addresses):
  new_dump = Dump(dump_path)
  try:
    new_dump.parse_dump(buckets, appeared_addresses)
  except EmptyDumpException:
    sys.stderr.write('... ignored an empty dump')
  except ParsingException, e:
    sys.stderr.write('... error in parsing: %s' % e)
    sys.exit(1)
  else:
    sys.stderr.write(' (version: %s)' % new_dump.dump_version)

  return new_dump


def load_dump(dump_path, buckets):
  sys.stderr.write('Loading a heap dump file: "%s"' % dump_path)
  appeared_addresses = set()
  dump = load_single_dump(dump_path, buckets, appeared_addresses)
  sys.stderr.write('.\n')
  return dump, appeared_addresses


def load_dumps(dump_path_list, buckets):
  sys.stderr.write('Loading heap dump files.\n')
  appeared_addresses = set()
  dumps = []
  for path in dump_path_list:
    sys.stderr.write('  %s' % path)
    dumps.append(load_single_dump(path, buckets, appeared_addresses))
    sys.stderr.write('\n')
  return dumps, appeared_addresses


def load_and_update_symbol_cache(
    prefix, appeared_addresses, appeared_typeinfo_addresses,
    delayed_static_symbols):
  symbol_path = prefix + '.symbols'
  sys.stderr.write('Loading and updating symbol cache: "%s".\n' % symbol_path)
  symbols = {}
  update_symbols(
      symbol_path, delayed_static_symbols, appeared_addresses,
      find_runtime_symbols_list, symbols)

  typeinfo_symbol_path = prefix + '.tsymbols'
  sys.stderr.write('Loading and updating typeinfo symbol cache: "%s".\n' %
                   typeinfo_symbol_path)
  typeinfo_symbols = {}
  update_symbols(
      typeinfo_symbol_path, delayed_static_symbols, appeared_typeinfo_addresses,
      find_runtime_typeinfo_symbols_list, typeinfo_symbols)

  return symbols, typeinfo_symbols


def load_default_policies():
  with open(POLICIES_JSON_PATH, mode='r') as policies_f:
    default_policies = json.load(policies_f)
  return default_policies


def load_policy(policies_dict, policy_label):
  policy_file = policies_dict[policy_label]['file']
  policy_format = policies_dict[policy_label]['format']
  policy_path = os.path.join(os.path.dirname(__file__), policy_file)
  policy = None
  if policy_format == 'json':
    policy = parse_policy_json(policy_path)
  elif policy_format == 'text':
    policy = parse_policy_text(policy_path)
  else:
    return None
  sys.stderr.write('  %s: %s (version: %s)\n' %
                   (policy_label, policy_path, policy.version))
  return policy


def load_policies_dict(policies_dict):
  sys.stderr.write('Loading policy files.\n')
  policies = {}
  for policy_label in policies_dict:
    loaded_policy = load_policy(policies_dict, policy_label)
    if loaded_policy:
      policies[policy_label] = loaded_policy
  return policies


def load_policies(options_policy):
  default_policies = load_default_policies()
  if options_policy:
    policy_labels = options_policy.split(',')
    specified_policies = {}
    for specified_policy in policy_labels:
      if specified_policy in default_policies:
        specified_policies[specified_policy] = (
            default_policies[specified_policy])
    policies = load_policies_dict(specified_policies)
  else:
    policies = load_policies_dict(default_policies)
  return policies


def load_basic_files_with_multiple_dumps(dump_path, keep):
  prefix = find_prefix(dump_path)
  buckets, appeared_typeinfo_addresses = load_buckets(prefix)
  dumps, appeared_addresses = load_dumps(
      determine_dump_path_list(dump_path, prefix), buckets)
  delayed_static_symbols = DelayedStaticSymbols(prefix, keep)
  symbols, typeinfo_symbols = load_and_update_symbol_cache(
      prefix, appeared_addresses, appeared_typeinfo_addresses,
      delayed_static_symbols)
  for bucket in buckets:
    if buckets[bucket].typeinfo != None:
      buckets[bucket].typeinfo_symbol = typeinfo_symbols[
          buckets[bucket].typeinfo]

  return (buckets, dumps, appeared_addresses, appeared_typeinfo_addresses,
          delayed_static_symbols, symbols, typeinfo_symbols)


def load_basic_files_with_single_dump(dump_path, keep):
  prefix = find_prefix(dump_path)
  buckets, appeared_typeinfo_addresses = load_buckets(prefix)
  dump, appeared_addresses = load_dump(dump_path, buckets)
  delayed_static_symbols = DelayedStaticSymbols(prefix, keep)
  symbols, typeinfo_symbols = load_and_update_symbol_cache(
      prefix, appeared_addresses, appeared_typeinfo_addresses,
      delayed_static_symbols)
  for bucket in buckets:
    if buckets[bucket].typeinfo != None:
      buckets[bucket].typeinfo_symbol = typeinfo_symbols[
          buckets[bucket].typeinfo]

  return (buckets, dump, appeared_addresses, appeared_typeinfo_addresses,
          delayed_static_symbols, symbols, typeinfo_symbols)


def do_stacktrace(sys_argv):
  parser = optparse.OptionParser(
      'Usage: %prog stacktrace [--keep] <dump>')
  parser.add_option('--keep', dest='keep', action='store_true')
  options, args = parser.parse_args(sys_argv)

  if len(args) != 2:
    parser.error('needs 1 argument.')
    return 1

  dump_path = args[1]

  (buckets, dump, appeared_addresses, appeared_typeinfo_addresses,
   delayed_static_symbols, symbols, typeinfo_symbols) = (
       load_basic_files_with_single_dump(dump_path, options.keep))

  dump.print_stacktrace(buckets, symbols)

  return 0


def do_csv(sys_argv):
  parser = optparse.OptionParser(
      'Usage: %prog csv [-p POLICY] [--keep] <first-dump>')
  parser.add_option('-p', '--policy', type='string', dest='policy',
                    help='profile with POLICY', metavar='POLICY')
  parser.add_option('--keep', dest='keep', action='store_true')
  options, args = parser.parse_args(sys_argv)

  if len(args) != 2:
    parser.error('needs 1 argument.')
    return 1

  dump_path = args[1]

  (buckets, dumps, appeared_addresses, appeared_typeinfo_addresses,
   delayed_static_symbols, symbols, typeinfo_symbols) = (
       load_basic_files_with_multiple_dumps(dump_path, options.keep))

  policies = load_policies(options.policy)

  max_components = 0
  for policy in policies:
    max_components = max(max_components, len(policies[policy].components))

  for policy in sorted(policies):
    rule_list = policies[policy].rules
    components = policies[policy].components

    if len(policies) > 1:
      sys.stdout.write('%s%s\n' % (policy, ',' * (max_components - 1)))
    sys.stdout.write('%s%s\n' % (
        ','.join(components), ',' * (max_components - len(components))))

    for dump in dumps:
      component_sizes = dump.apply_policy(
          rule_list, buckets, dumps[0].dump_time, components, symbols)
      s = []
      for c in components:
        if c in ('hour', 'minute', 'second'):
          s.append('%05.5f' % (component_sizes[c]))
        else:
          s.append('%05.5f' % (component_sizes[c] / 1024.0 / 1024.0))
      sys.stdout.write('%s%s\n' % (
            ','.join(s), ',' * (max_components - len(components))))

    for bucket in buckets.itervalues():
      bucket.clear_component_cache()

  return 0


def do_json(sys_argv):
  parser = optparse.OptionParser(
      'Usage: %prog json [-p POLICY] [--keep] <first-dump>')
  parser.add_option('-p', '--policy', type='string', dest='policy',
                    help='profile with POLICY', metavar='POLICY')
  parser.add_option('--keep', dest='keep', action='store_true')
  options, args = parser.parse_args(sys_argv)

  if len(args) != 2:
    parser.error('needs 1 argument.')
    return 1

  dump_path = args[1]

  (buckets, dumps, appeared_addresses, appeared_typeinfo_addresses,
   delayed_static_symbols, symbols, typeinfo_symbols) = (
       load_basic_files_with_multiple_dumps(dump_path, options.keep))
  policies = load_policies(options.policy)

  json_base = {
    'version': 'JSON_DEEP_2',
    'policies': {},
  }

  for policy in sorted(policies):
    rule_list = policies[policy].rules
    components = policies[policy].components

    json_base['policies'][policy] = {
      'legends': components,
      'snapshots': [],
    }

    for dump in dumps:
      component_sizes = dump.apply_policy(
          rule_list, buckets, dumps[0].dump_time, components, symbols)
      component_sizes['dump_path'] = dump.dump_path
      component_sizes['dump_time'] = datetime.fromtimestamp(
          dump.dump_time).strftime('%Y-%m-%d %H:%M:%S')
      json_base['policies'][policy]['snapshots'].append(component_sizes)

    for bucket in buckets.itervalues():
      bucket.clear_component_cache()

  json.dump(json_base, sys.stdout, indent=2, sort_keys=True)

  return 0


def do_list(sys_argv):
  parser = optparse.OptionParser(
      'Usage: %prog [-p POLICY] [--keep] list <first-dump>')
  parser.add_option('-p', '--policy', type='string', dest='policy',
                    help='profile with POLICY', metavar='POLICY')
  parser.add_option('--keep', dest='keep', action='store_true')
  options, args = parser.parse_args(sys_argv)

  if len(args) != 2:
    parser.error('needs 1 argument.')
    return 1

  dump_path = args[1]

  (buckets, dumps, appeared_addresses, appeared_typeinfo_addresses,
   delayed_static_symbols, symbols, typeinfo_symbols) = (
       load_basic_files_with_multiple_dumps(dump_path, options.keep))
  policies = load_policies(options.policy)

  for policy in sorted(policies):
    rule_list = policies[policy].rules
    components = policies[policy].components

    component_sizes = dumps[0].apply_policy(
        rule_list, buckets, dumps[0].dump_time, components, symbols)
    sys.stdout.write('%s:\n' % policy)
    for c in components:
      if c in ['hour', 'minute', 'second']:
        sys.stdout.write('%30s %10.3f\n' % (c, component_sizes[c]))
      else:
        sys.stdout.write('%30s %10.3f\n' % (
            c, component_sizes[c] / 1024.0 / 1024.0))

    for bucket in buckets.itervalues():
      bucket.clear_component_cache()

  return 0


def do_expand(sys_argv):
  parser = optparse.OptionParser(
      'Usage: %prog expand [--keep] <dump> <policy> <component> <depth>')
  parser.add_option('--keep', dest='keep', action='store_true')
  options, args = parser.parse_args(sys_argv)

  if len(args) != 5:
    parser.error('needs 4 arguments.')
    return 1

  dump_path = args[1]
  target_policy = args[2]
  component_name = args[3]
  depth = args[4]

  (buckets, dump, appeared_addresses, appeared_typeinfo_addresses,
   delayed_static_symbols, symbols, typeinfo_symbols) = (
       load_basic_files_with_single_dump(dump_path, options.keep))
  policies = load_policies(target_policy)

  rule_list = policies[target_policy].rules

  dump.expand(rule_list, buckets, component_name, int(depth), symbols,
              typeinfo_symbols)

  return 0


def do_pprof(sys_argv):
  parser = optparse.OptionParser(
      'Usage: %prog pprof [-c COMPONENT] [--keep] <dump> <policy>')
  parser.add_option('-c', '--component', type='string', dest='component',
                    help='restrict to COMPONENT', metavar='COMPONENT')
  parser.add_option('--keep', dest='keep', action='store_true')
  options, args = parser.parse_args(sys_argv)

  if len(args) != 3:
    parser.error('needs 2 arguments.')
    return 1

  dump_path = args[1]
  target_policy = args[2]
  component = options.component

  (buckets, dump, appeared_addresses, appeared_typeinfo_addresses,
   delayed_static_symbols, symbols, typeinfo_symbols) = (
       load_basic_files_with_single_dump(dump_path, options.keep))
  policies = load_policies(target_policy)

  rule_list = policies[target_policy].rules

  with open(find_prefix(dump_path) + '.maps', 'r') as maps_f:
    maps_lines = maps_f.readlines()
  dump.print_for_pprof(rule_list, buckets, maps_lines, component, symbols)

  return 0


def main():
  COMMANDS = {
    'csv': do_csv,
    'expand': do_expand,
    'json': do_json,
    'list': do_list,
    'pprof': do_pprof,
    'stacktrace': do_stacktrace,
  }

  # TODO(dmikurube): Remove this message after a while.
  if len(sys.argv) >= 2 and sys.argv[1].startswith('--'):
    sys.stderr.write("""
**************** NOTICE!! ****************
   The command line format has changed.
   Please look at the description below.
******************************************

""")

  if len(sys.argv) < 2 or (not sys.argv[1] in COMMANDS):
    sys.stderr.write("""Usage: %s <command> [options] [<args>]

Commands:
   csv          Classify memory usage in CSV
   expand       Show all stacktraces contained in the specified component
   json         Classify memory usage in JSON
   list         Classify memory usage in simple listing format
   pprof        Format the profile dump so that it can be processed by pprof
   stacktrace   Convert runtime addresses to symbol names

Quick Reference:
   dmprof csv [-p POLICY] [--keep] <first-dump>
   dmprof expand [--keep] <dump> <policy> <component> <depth>
   dmprof json [-p POLICY] [--keep] <first-dump>
   dmprof list [-p POLICY] [--keep] <first-dump>
   dmprof pprof [-c COMPONENT] [--keep] <dump> <policy>
   dmprof stacktrace [--keep] <dump>
""" % (sys.argv[0]))
    sys.exit(1)
  action = sys.argv.pop(1)

  return COMMANDS[action](sys.argv)


if __name__ == '__main__':
  sys.exit(main())