#!/usr/bin/python
# Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# memcheck_analyze.py

''' Given a valgrind XML file, parses errors and uniques them.'''

import gdb_helper

import logging
import optparse
import os
import subprocess
import sys
import time
from xml.dom.minidom import parse
from xml.parsers.expat import ExpatError

# Global symbol table (yuck)
TheAddressTable = None


# These are functions (using C++ mangled names) that we look for in stack
# traces. We don't show stack frames while pretty printing when they are below
# any of the following:
_TOP_OF_STACK_POINTS = [
  # Don't show our testing framework.
  "testing::Test::Run()",
  # Also don't show the internals of libc/pthread.
  "start_thread"
]

def getTextOf(top_node, name):
  ''' Returns all text in all DOM nodes with a certain |name| that are children
  of |top_node|.
  '''

  text = ""
  for nodes_named in top_node.getElementsByTagName(name):
    text += "".join([node.data for node in nodes_named.childNodes
                     if node.nodeType == node.TEXT_NODE])
  return text

def getCDATAOf(top_node, name):
  ''' Returns all CDATA in all DOM nodes with a certain |name| that are children
  of |top_node|.
  '''

  text = ""
  for nodes_named in top_node.getElementsByTagName(name):
    text += "".join([node.data for node in nodes_named.childNodes
                     if node.nodeType == node.CDATA_SECTION_NODE])
  if (text == ""):
    return None
  return text

def removeCommonRoot(source_dir, directory):
  '''Returns a string with the string prefix |source_dir| removed from
  |directory|.'''
  if source_dir:
    # Do this for safety, just in case directory is an absolute path outside of
    # source_dir.
    prefix = os.path.commonprefix([source_dir, directory])
    return directory[len(prefix) + 1:]

  return directory

# Constants that give real names to the abbreviations in valgrind XML output.
INSTRUCTION_POINTER = "ip"
OBJECT_FILE = "obj"
FUNCTION_NAME = "fn"
SRC_FILE_DIR = "dir"
SRC_FILE_NAME = "file"
SRC_LINE = "line"

def gatherFrames(node, source_dir):
  frames = []
  for frame in node.getElementsByTagName("frame"):
    frame_dict = {
      INSTRUCTION_POINTER : getTextOf(frame, INSTRUCTION_POINTER),
      OBJECT_FILE         : getTextOf(frame, OBJECT_FILE),
      FUNCTION_NAME       : getTextOf(frame, FUNCTION_NAME),
      SRC_FILE_DIR        : removeCommonRoot(
          source_dir, getTextOf(frame, SRC_FILE_DIR)),
      SRC_FILE_NAME       : getTextOf(frame, SRC_FILE_NAME),
      SRC_LINE            : getTextOf(frame, SRC_LINE)
    }
    frames += [frame_dict]
    if frame_dict[FUNCTION_NAME] in _TOP_OF_STACK_POINTS:
      break
    global TheAddressTable
    if TheAddressTable != None and frame_dict[SRC_LINE] == "":
      # Try using gdb
      TheAddressTable.Add(frame_dict[OBJECT_FILE], frame_dict[INSTRUCTION_POINTER])
  return frames

class ValgrindError:
  ''' Takes a <DOM Element: error> node and reads all the data from it. A
  ValgrindError is immutable and is hashed on its pretty printed output.
  '''

  def __init__(self, source_dir, error_node, commandline):
    ''' Copies all the relevant information out of the DOM and into object
    properties.

    Args:
      error_node: The <error></error> DOM node we're extracting from.
      source_dir: Prefix that should be stripped from the <dir> node.
      commandline: The command that was run under valgrind
    '''

    # Valgrind errors contain one <what><stack> pair, plus an optional
    # <auxwhat><stack> pair, plus an optional <origin><what><stack></origin>,
    # plus (since 3.5.0) a <suppression></suppression> pair.
    # (Origin is nicely enclosed; too bad the other two aren't.)
    # The most common way to see all three in one report is
    # a syscall with a parameter that points to uninitialized memory, e.g.
    # Format:
    # <error>
    #   <unique>0x6d</unique>
    #   <tid>1</tid>
    #   <kind>SyscallParam</kind>
    #   <what>Syscall param write(buf) points to uninitialised byte(s)</what>
    #   <stack>
    #     <frame>
    #     ...
    #     </frame>
    #   </stack>
    #   <auxwhat>Address 0x5c9af4f is 7 bytes inside a block of ...</auxwhat>
    #   <stack>
    #     <frame>
    #     ...
    #     </frame>
    #   </stack>
    #   <origin>
    #   <what>Uninitialised value was created by a heap allocation</what>
    #   <stack>
    #     <frame>
    #     ...
    #     </frame>
    #   </stack>
    #   </origin>
    #   <suppression>
    #     <sname>insert_a_suppression_name_here</sname>
    #     <skind>Memcheck:Param</skind>
    #     <skaux>write(buf)</skaux>
    #     <sframe> <fun>__write_nocancel</fun> </sframe>
    #     ...
    #     <sframe> <fun>main</fun> </sframe>
    #     <rawtext>
    # <![CDATA[
    # {
    #    <insert_a_suppression_name_here>
    #    Memcheck:Param
    #    write(buf)
    #    fun:__write_nocancel
    #    ...
    #    fun:main
    # }
    # ]]>
    #     </rawtext>
    #   </suppression>
    # </error>
    #
    # Each frame looks like this:
    #  <frame>
    #    <ip>0x83751BC</ip>
    #    <obj>/data/dkegel/chrome-build/src/out/Release/base_unittests</obj>
    #    <fn>_ZN7testing8internal12TestInfoImpl7RunTestEPNS_8TestInfoE</fn>
    #    <dir>/data/dkegel/chrome-build/src/testing/gtest/src</dir>
    #    <file>gtest-internal-inl.h</file>
    #    <line>655</line>
    #  </frame>
    # although the dir, file, and line elements are missing if there is
    # no debug info.

    self._kind = getTextOf(error_node, "kind")
    self._backtraces = []
    self._suppression = None
    self._commandline = commandline

    # Iterate through the nodes, parsing <what|auxwhat><stack> pairs.
    description = None
    for node in error_node.childNodes:
      if node.localName == "what" or node.localName == "auxwhat":
        description = "".join([n.data for n in node.childNodes
                              if n.nodeType == n.TEXT_NODE])
      elif node.localName == "xwhat":
        description = getTextOf(node, "text")
      elif node.localName == "stack":
        self._backtraces.append([description, gatherFrames(node, source_dir)])
        description = None
      elif node.localName == "origin":
        description = getTextOf(node, "what")
        stack = node.getElementsByTagName("stack")[0]
        frames = gatherFrames(stack, source_dir)
        self._backtraces.append([description, frames])
        description = None
        stack = None
        frames = None
      elif node.localName == "suppression":
        self._suppression = getCDATAOf(node, "rawtext");

  def __str__(self):
    ''' Pretty print the type and backtrace(s) of this specific error,
        including suppression (which is just a mangled backtrace).'''
    output = self._kind + "\n"
    if (self._commandline):
      output += self._commandline + "\n"

    for backtrace in self._backtraces:
      output += backtrace[0] + "\n"
      filter = subprocess.Popen("c++filt -n", stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                shell=True,
                                close_fds=True)
      buf = ""
      for frame in backtrace[1]:
        buf +=  (frame[FUNCTION_NAME] or frame[INSTRUCTION_POINTER]) + "\n"
      (stdoutbuf, stderrbuf) = filter.communicate(buf.encode('latin-1'))
      demangled_names = stdoutbuf.split("\n")

      i = 0
      for frame in backtrace[1]:
        output += ("  " + demangled_names[i])
        i = i + 1

        global TheAddressTable
        if TheAddressTable != None and frame[SRC_FILE_DIR] == "":
           # Try using gdb
           foo = TheAddressTable.GetFileLine(frame[OBJECT_FILE],
                                             frame[INSTRUCTION_POINTER])
           if foo[0] != None:
             output += (" (" + foo[0] + ":" + foo[1] + ")")
        elif frame[SRC_FILE_DIR] != "":
          output += (" (" + frame[SRC_FILE_DIR] + "/" + frame[SRC_FILE_NAME] +
                     ":" + frame[SRC_LINE] + ")")
        else:
          output += " (" + frame[OBJECT_FILE] + ")"
        output += "\n"

      # TODO(dank): stop synthesizing suppressions once everyone has
      # valgrind-3.5 and we can rely on xml
      if (self._suppression == None):
        output += "Suppression:\n"
        for frame in backtrace[1]:
          output += "  fun:" + (frame[FUNCTION_NAME] or "*") + "\n"

    if (self._suppression != None):
      output += "Suppression:"
      # Widen suppression slightly to make portable between mac and linux
      supp = self._suppression;
      supp = supp.replace("fun:_Znwj", "fun:_Znw*")
      supp = supp.replace("fun:_Znwm", "fun:_Znw*")
      # Split into lines so we can enforce length limits
      supplines = supp.split("\n")

      # Truncate at line 26 (VG_MAX_SUPP_CALLERS plus 2 for name and type)
      # or at the first 'boring' caller.
      # (https://bugs.kde.org/show_bug.cgi?id=199468 proposes raising
      # VG_MAX_SUPP_CALLERS, but we're probably fine with it as is.)
      # TODO(dkegel): add more boring callers
      newlen = 26;
      try:
        newlen = min(newlen, supplines.index("   fun:_ZN11MessageLoop3RunEv"))
      except ValueError:
        pass
      if (len(supplines) > newlen):
        supplines = supplines[0:newlen]
        supplines.append("}")

      output += "\n".join(supplines) + "\n"

    return output

  def UniqueString(self):
    ''' String to use for object identity. Don't print this, use str(obj)
    instead.'''
    rep = self._kind + " "
    for backtrace in self._backtraces:
      for frame in backtrace[1]:
        rep += frame[FUNCTION_NAME]

        if frame[SRC_FILE_DIR] != "":
          rep += frame[SRC_FILE_DIR] + "/" + frame[SRC_FILE_NAME]
        else:
          rep += frame[OBJECT_FILE]

    return rep

  def __hash__(self):
    return hash(self.UniqueString())
  def __eq__(self, rhs):
    return self.UniqueString() == rhs

def find_and_truncate(f):
  f.seek(0)
  while True:
    line = f.readline()
    if line == "":
      return False
    if '</valgrindoutput>' in line:
      # valgrind often has garbage after </valgrindoutput> upon crash
      f.truncate()
      return True

class MemcheckAnalyze:
  ''' Given a set of Valgrind XML files, parse all the errors out of them,
  unique them and output the results.'''

  SANITY_TEST_SUPPRESSION = "Memcheck sanity test"
  def __init__(self, source_dir, files, show_all_leaks=False, use_gdb=False):
    '''Reads in a set of files.

    Args:
      source_dir: Path to top of source tree for this build
      files: A list of filenames.
      show_all_leaks: whether to show even less important leaks
    '''

    # Beyond the detailed errors parsed by ValgrindError above,
    # the xml file contain records describing suppressions that were used:
    # <suppcounts>
    #  <pair>
    #    <count>28</count>
    #    <name>pango_font_leak_todo</name>
    #  </pair>
    #  <pair>
    #    <count>378</count>
    #    <name>bug_13243</name>
    #  </pair>
    # </suppcounts
    # Collect these and print them at the end.
    #
    # With our patch for https://bugs.kde.org/show_bug.cgi?id=205000 in,
    # the file also includes records of the form
    # <load_obj><obj>/usr/lib/libgcc_s.1.dylib</obj><ip>0x27000</ip></load_obj>
    # giving the filename and load address of each binary that was mapped
    # into the process.

    global TheAddressTable
    if use_gdb:
      TheAddressTable = gdb_helper.AddressTable()
    self._errors = set()
    self._suppcounts = {}
    badfiles = set()
    start = time.time()
    self._parse_failed = False
    for file in files:
      # Wait up to three minutes for valgrind to finish writing all files,
      # but after that, just skip incomplete files and warn.
      f = open(file, "r+")
      found = False
      firstrun = True
      origsize = os.path.getsize(file)
      while (not found and (firstrun or ((time.time() - start) < 180.0))):
        firstrun = False
        f.seek(0)
        found = find_and_truncate(f)
        if not found:
          time.sleep(1)
      f.close()
      if not found:
        badfiles.add(file)
      else:
        newsize = os.path.getsize(file)
        if origsize > newsize+1:
          logging.warn(str(origsize - newsize) + " bytes of junk were after </valgrindoutput> in %s!" % file)
        try:
          parsed_file = parse(file);
        except ExpatError, e:
          self._parse_failed = True
          logging.warn("could not parse %s: %s" % (file, e))
          lineno = e.lineno - 1
          context_lines = 5
          context_start = max(0, lineno - context_lines)
          context_end = lineno + context_lines + 1
          context_file = open(file, "r")
          for i in range(0, context_start):
            context_file.readline()
          for i in range(context_start, context_end):
            context_data = context_file.readline().rstrip()
            if i != lineno:
              logging.warn("  %s" % context_data)
            else:
              logging.warn("> %s" % context_data)
          context_file.close()
          continue
        if TheAddressTable != None:
          load_objs = parsed_file.getElementsByTagName("load_obj")
          for load_obj in load_objs:
            obj = getTextOf(load_obj, "obj")
            ip = getTextOf(load_obj, "ip")
            TheAddressTable.AddBinaryAt(obj, ip)

        commandline = None
        preamble = parsed_file.getElementsByTagName("preamble")[0];
        for node in preamble.getElementsByTagName("line"):
          if node.localName == "line":
            for x in node.childNodes:
              if x.nodeType == node.TEXT_NODE and "Command" in x.data:
                commandline = x.data
                break

        raw_errors = parsed_file.getElementsByTagName("error")
        for raw_error in raw_errors:
          # Ignore "possible" leaks for now by default.
          if (show_all_leaks or
              getTextOf(raw_error, "kind") != "Leak_PossiblyLost"):
            error = ValgrindError(source_dir, raw_error, commandline)
            self._errors.add(error)

        suppcountlist = parsed_file.getElementsByTagName("suppcounts")
        if len(suppcountlist) > 0:
          suppcountlist = suppcountlist[0]
          for node in suppcountlist.getElementsByTagName("pair"):
            count = getTextOf(node, "count");
            name = getTextOf(node, "name");
            if name in self._suppcounts:
              self._suppcounts[name] += int(count)
            else:
              self._suppcounts[name] = int(count)

    if len(badfiles) > 0:
      logging.warn("valgrind didn't finish writing %d files?!" % len(badfiles))
      for file in badfiles:
        logging.warn("Last 20 lines of %s :" % file)
        os.system("tail -n 20 '%s' 1>&2" % file)

  def Report(self, check_sanity=False):
    if self._parse_failed:
      logging.error("FAIL! Couldn't parse Valgrind output file")
      return -2

    is_sane = False
    print "-----------------------------------------------------"
    print "Suppressions used:"
    print "  count name"
    for item in sorted(self._suppcounts.items(), key=lambda (k,v): (v,k)):
      print "%7s %s" % (item[1], item[0])
      if item[0].startswith(MemcheckAnalyze.SANITY_TEST_SUPPRESSION):
        is_sane = True
    print "-----------------------------------------------------"
    sys.stdout.flush()

    retcode = 0
    if self._errors:
      logging.error("FAIL! There were %s errors: " % len(self._errors))

      global TheAddressTable
      if TheAddressTable != None:
        TheAddressTable.ResolveAll()

      for error in self._errors:
        logging.error(error)

      retcode = -1

    # Report tool's insanity even if there were errors.
    if check_sanity and not is_sane:
      logging.error("FAIL! Sanity check failed!")
      retcode = -3

    if retcode != 0:
      return retcode

    logging.info("PASS! No errors found!")
    return 0

def _main():
  '''For testing only. The MemcheckAnalyze class should be imported instead.'''
  retcode = 0
  parser = optparse.OptionParser("usage: %prog [options] <files to analyze>")
  parser.add_option("", "--source_dir",
                    help="path to top of source tree for this build"
                    "(used to normalize source paths in baseline)")

  (options, args) = parser.parse_args()
  if not len(args) >= 1:
    parser.error("no filename specified")
  filenames = args

  analyzer = MemcheckAnalyze(options.source_dir, filenames, use_gdb=True)
  retcode = analyzer.Report()

  sys.exit(retcode)

if __name__ == "__main__":
  _main()