1 files changed, 634 insertions, 0 deletions
diff --git a/tools/purify/purify_message.py b/tools/purify/purify_message.py
new file mode 100644
index 0000000..d093461
--- /dev/null
+++ b/tools/purify/purify_message.py
@@ -0,0 +1,634 @@
+#!/bin/env python
+# Copyright 2008, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#    * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#    * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# purify_message.py
+
+''' Utility objects and functions to parse and unique Purify messages '''
+
+import cStringIO
+import logging
+import re
+import sys
+
+import google.logging_utils
+
+# placeholder 'file' value that stands for a run of one or more elided frames
+ELIDE = "..."
+# placeholder 'file' value marking stack truncation at a known entry point
+TRUNCATE = "^^^"
+# stand-in name for any file that's outside of our source directory
+EXTERNAL_FILE = "EXTERNAL_FILE"
+
+# maps each Purify message type code to its human-readable description
+message_type = {
+  "ABR": "Array Bounds Read",
+  "ABW": "Array Bounds Write",
+  "ABWL": "Array Bounds Write (late detect)",
+  "BSR": "Beyond Stack Read",
+  "BSW": "Beyond Stack Write",
+  "COM": "COM API/Interface Failure",
+  "EXC": "Continued Exception",
+  "EXH": "Handled Exception",
+  "EXI": "Ignored Exception",
+  "EXU": "Unhandled Exception",
+  "FFM": "Freeing Freed Memory",
+  "FIM": "Freeing Invalid Memory",
+  "FMM": "Freeing Mismatched Memory",
+  "FMR": "Free Memory Read",
+  "FMW": "Free Memory Write",
+  "FMWL": "Free Memory Write (late detect)",
+  "HAN": "Invalid Handle",
+  "HIU": "Handle In Use",
+  "ILK": "COM Interface Leak",
+  "IPR": "Invalid Pointer Read",
+  "IPW": "Invalid Pointer Write",
+  "MAF": "Memory Allocation Failure",
+  "MIU": "Memory In Use",
+  "MLK": "Memory Leak",
+  "MPK": "Potential Memory Leak",
+  "NPR": "Null Pointer Read",
+  "NPW": "Null Pointer Write",
+  "PAR": "Bad Parameter",
+  "UMC": "Uninitialized Memory Copy",
+  "UMR": "Uninitialized Memory Read",
+}
+
+# a magic message type which is not enumerated with the normal message type dict
+FATAL = "FATAL"
+
+def GetMessageType(key):
+  ''' Returns the description for a Purify message type code.  FATAL maps to
+  itself; any other unrecognized code is logged and gives "UNKNOWN". '''
+  if key in message_type or key == FATAL:
+    return message_type.get(key, key)
+  logging.warning("unknown message type %s" % key)
+  return "UNKNOWN"
+
+# Purify severity codes; currently unused, but here for documentation purposes
+message_severity = {
+  "I": "Informational",
+  "E": "Error",
+  "W": "Warning",
+  "O": "Internal Purify Error",
+}
+
+
+class Stack:
+  ''' A normalized Purify stack.  The stack is constructed by adding one line
+  at a time from a stack in a Purify text file via AddLine.
+  Supports cmp and hash so that stacks which normalize the same can be sorted
+  and uniqued.
+  The original stack contents are preserved (see __str__) so that it's
+  possible to drill down into the full details if necessary. '''
+
+  # The top of the source tree.  This is stripped from the filename as part
+  # of normalization.
+  source_dir = ""
+
+  @classmethod
+  def SetSourceDir(cls, dir):
+    ''' Set the source tree root; it is stored lower case, using "/". '''
+    cls.source_dir = dir.replace("\\", "/").lower()
+    logging.debug("Stack.source_dir = %s" % cls.source_dir)
+
+  # a line in a stack trace: "function [file:line extra]".  The groups are the
+  # function, an optional "x:" prefix, the path, ":line" and trailing text.
+  pat_stack_line = re.compile(r'(.*)\[(\w:)?([^\:\s]*)(:\d+)?(\s+.*)?]')
+
+  # Known stack entry points that allow us to truncate the rest of the stack
+  # below that point.
+  pat_known_entries = (
+     re.compile(r'RunnableMethod::Run\(void\)'),
+     re.compile(r'ChromeMain'),
+     re.compile(r'BrowserMain'),
+     re.compile(r'wWinMain'),
+     re.compile(r'TimerManager::ProcessPendingTimer\(void\)'),
+     # NOTE(review): "RunableMethod" (single n) looks like a typo - confirm
+     re.compile(r'RunnableMethod::RunableMethod\(.*\)'),
+     re.compile(r'RenderViewHost::OnMessageReceived\(Message::IPC const&\)'),
+     re.compile(r'testing::Test::Run\(void\)'),
+     re.compile(r'testing::TestInfoImpl::Run\(void\)'),
+     re.compile(r'Thread::ThreadFunc\(void \*\)'),
+     re.compile(r'TimerTask::Run\(void\)'),
+     re.compile(r'MessageLoop::RunTask\(Task \*\)'),
+     re.compile(r'.DispatchToMethod\@.*'),
+     )
+
+  # if functions match the following, elide them from the stack
+  pat_func_elide = (re.compile(r'^std::'), re.compile(r'^new\('))
+  # if files match the following, elide them from the stack
+  pat_file_elide = (re.compile(r'.*platformsdk_vista.*'),
+                    re.compile(r'.*\.(dll|DLL)$'),
+                    # bug 1069902
+                    re.compile(r'webkit/pending/wtf/fastmalloc\.h'),
+                    # When we leak sqlite stuff, we leak a lot, and the stacks
+                    # are all over the place.  For now, let's assume that
+                    # sqlite itself is leak free and focus on our calling code.
+                    re.compile(r'chrome/third_party/sqlite/.*'),
+                    )
+
+  # functions named <Case>_<Name>_Test::... set the stack's group (see AddLine)
+  pat_unit_test = re.compile(r'^([a-zA-Z0-9]+)_(\w+)_Test::.*')
+
+  def __init__(self, title):
+    ''' title is the heading text for this stack; it is stored left-stripped. '''
+    self._title = title.lstrip()
+    self._stack = []  # normalized lines, plus ELIDE / TRUNCATE marker entries
+    self._orig = ""   # raw text of every line passed to AddLine
+    # are we currently in an eliding block
+    self._eliding = False
+    # have we truncated the stack?
+    self._truncated = False
+    # is the stack made up completely of external code? (i.e. elided)
+    self._all_external = True
+    # a logical group that this stack belongs to
+    self._group = None
+    # top stack line (preserved even if elided)
+    self._top_stack_line = None
+
+  def GetLines(self):
+    ''' Returns the list of normalized stack lines (including markers). '''
+    return self._stack
+
+  def GetTopStackLine(self):
+    ''' Returns the first parsed stack line (even if elided), or None. '''
+    return self._top_stack_line
+
+  def GetTopVisibleStackLine(self):
+    ''' Returns the first kept line that has a function name, or {}. '''
+    for line in self._stack:
+      if line['function']:
+        return line
+    return {}
+
+  def GetGroup(self):
+    '''A logical grouping for this stack (None if unknown), allowing related
+    stacks to be grouped together.  Subgroups within a group are separated
+    by "." (e.g. group.subgroup.subsubgroup). '''
+    return self._group
+
+  def _ComputeStackLine(self, line):
+    ''' Parse one raw stack line into a dict with 'function', 'file' and
+    'line_number' keys (line_number is 0 if absent).  Returns None if the line
+    doesn't match pat_stack_line. '''
+    m = Stack.pat_stack_line.match(line.lstrip())
+    if not m:
+      return None
+    func = self._DemangleSymbol(m.group(1).rstrip())
+    func = self._DetemplatizeSymbol(func)
+    # group 2 is the optional "x:" prefix (e.g. a drive), group 3 the path
+    file = (m.group(2) or "") + m.group(3)
+    # paths are normalized to use / and be lower case
+    file = file.replace("\\", "/").lower()
+    if not file.startswith(Stack.source_dir):
+      file = EXTERNAL_FILE
+    else:
+      file = file[len(Stack.source_dir):]
+      # trim leading / if present (the remainder may be empty, so don't index)
+      if file.startswith("/"):
+        file = file[1:]
+    loc = 0
+    if m.group(4):
+      loc = int(m.group(4)[1:])
+    return {'function': func, 'file': file, 'line_number': loc}
+
+  def _ShouldElide(self, stack_line):
+    ''' Returns True if the given parsed stack line should be elided. '''
+    func = stack_line['function']
+    file = stack_line['file']
+    # elide certain common functions from the stack such as the STL
+    for pat in Stack.pat_func_elide:
+      if pat.match(func):
+        logging.debug("eliding due to func pat match: %s" % func)
+        return True
+    if file == EXTERNAL_FILE:
+      # if it's not in our source tree, then elide
+      logging.debug("eliding due to external file: %s" % file)
+      return True
+    # elide certain common file sources from the stack, usually this
+    # involves system libraries
+    for pat in Stack.pat_file_elide:
+      if pat.match(file):
+        logging.debug("eliding due to file pat match: %s" % file)
+        return True
+    return False
+
+  def AddLine(self, line):
+    ''' Add one line from a stack in a Purify text file.  Lines must be
+    added in order (top down).  Lines are added to two internal structures:
+    an original string copy and an array of normalized lines, split into
+    (function, file, line number).
+    Stack normalization does several things:
+      * elides sections of the stack that are in external code
+      * truncates the stack at so called "known entry points"
+      * removes template type information from symbols
+    Returns False if the line was unparseable, elided, dropped after truncation
+    or triggered the truncation itself (that line is still kept), else True.
+    '''
+    self._orig += line + "\n"
+    stack_line = self._ComputeStackLine(line)
+    if not stack_line:
+      # skip these lines
+      logging.debug(">>>" + line)
+      return False
+    if not self._top_stack_line:
+      self._top_stack_line = stack_line
+    # Unit test entry points are good groupings.  Even if we already have a
+    # group set, a later unit-test stack line will override.
+    # Note that we also do this even if the stack has already been truncated
+    # since this is useful information.
+    # TODO(erikkay): Maybe in this case, the truncation should be overridden?
+    test_match = Stack.pat_unit_test.match(stack_line["function"])
+    if test_match:
+      self._group = test_match.group(1) + "." + test_match.group(2)
+
+    if self._truncated:
+      return False
+
+    if self._ShouldElide(stack_line):
+      # consecutive elided lines share a single ELIDE marker
+      if not self._eliding:
+        self._eliding = True
+        self._stack.append({'function': "", 'file': ELIDE, 'line_number': 0})
+      return False
+
+    self._stack.append(stack_line)
+    self._eliding = False
+    self._all_external = False
+    # when we reach one of the known common stack entry points, truncate
+    # the stack to avoid printing overly redundant information
+    if len(self._stack) > 1:
+      for f in Stack.pat_known_entries:
+        if f.match(stack_line["function"]):
+          if not self._group:
+            # we're at the end of the stack, so use the path to the file
+            # as the group if we don't already have one
+            # This won't be incredibly reliable, but might still be useful.
+            prev = self._stack[-2]
+            if prev['file']:
+              self._group = '.'.join(prev['file'].split('/')[:-1])
+          self._stack.append({'function': "", 'file': TRUNCATE,
+                             'line_number': 0})
+          self._truncated = True
+          return False
+    return True
+
+  def _DemangleSymbol(self, symbol):
+    ''' If symbol starts with "?A0x", drops everything through the first "::". '''
+    # TODO(erikkay) - I'm not sure why Purify prepends an address on the
+    # front of some of these as if it were a namespace (?A<addr>::).  From an
+    # analysis standpoint, it seems meaningless and can change from machine to
+    # machine, so it's best if it's thrown away
+    if symbol.startswith("?A0x"):
+      skipto = symbol.find("::")
+      if skipto >= 0:
+        symbol = symbol[(skipto+2):]
+      else:
+        logging.warning("unable to strip address off of symbol (%s)" % symbol)
+    # TODO(erikkay) there are more symbols not being properly demangled
+    # in Purify's output.  Some of these look like template-related issues.
+    return symbol
+
+  def _DetemplatizeSymbol(self, symbol):
+    ''' Strip every template argument list (the text between a '<' and its
+    matching '>', nesting included) from the symbol, normalizing it, making it
+    more readable, and less precise. '''
+    kept = []
+    depth = 0
+    for ch in symbol:
+      if ch == '<':
+        # an opening bracket is never kept and always deepens the nesting
+        depth += 1
+      elif depth == 0:
+        kept.append(ch)
+      elif ch == '>':
+        depth -= 1
+      # anything else inside a template argument list is discarded
+    return "".join(kept)
+
+  def __hash__(self):
+    ''' Hashes the non-verbose normalized text, so line numbers are ignored. '''
+    return hash(self.NormalizedStr())
+
+  def __cmp__(self, other):
+    ''' Orders stacks by comparing entries from the bottom up, by file and then
+    function (line numbers are ignored).  If one stack is a bottom-aligned
+    suffix of the other, the shorter sorts first.  Sorts after a false other. '''
+    if not other:
+      return 1
+    mine, theirs = self._stack, other._stack
+    # zip stops at the shorter stack, so only the shared bottom is compared
+    for a, b in zip(reversed(mine), reversed(theirs)):
+      ret = cmp((a['file'], a['function']), (b['file'], b['function']))
+      if ret:
+        return ret
+    return cmp(len(mine), len(theirs))
+
+  def NormalizedStr(self, verbose=False):
+    ''' String version of the normalized stack.  See AddLine for normalization
+    details.  There is one "   <file>  <function>" line per entry; if verbose
+    is set, line numbers greater than zero are appended to the file. '''
+    parts = []
+    for line in self._stack:
+      loc = ""
+      if verbose and line['line_number'] > 0:
+        loc = ":%d" % line['line_number']
+      parts.append("   %s%s  %s\n" % (line['file'], loc, line['function']))
+    return "".join(parts)
+
+  def __str__(self):
+    ''' Returns the original text: each added line, newline-terminated. '''
+    return self._orig
+
+
+class Message:
+  '''A normalized message from a Purify text file.  Messages all have a
+  severity, most have a type, and many have an error stack and/or an
+  allocation stack.
+  Supports cmp and hash so that messages which normalize the same can be
+  sorted and uniqued.'''
+
+  pat_count = re.compile(r'^(.*) \{(\d+) occurrences?\}')
+  pat_leak = re.compile(r'(Potential )?[Mm]emory leak of (\d+) bytes? '
+                        r'from (\d+) blocks? allocated in (.+)')
+  pat_miu = re.compile(r'Memory use of (\d+) bytes? '
+                       r'(\((\d+)% initialized\) )?from (\d+) blocks? '
+                       r'allocated .. (.+)')
+  # these are headings to different types of stack traces
+  pat_loc_error = re.compile(r'\s*(Exception|Error|Call) location')
+  pat_loc_alloc = re.compile(r'\s*Allocation location')
+  pat_loc_free = re.compile(r'\s*Free location')
+  pat_loc_free2 = re.compile(r'\s*Location of free attempt')
+
+  def __init__(self, severity, type, title):
+    ''' Splits an occurrence count or byte / block totals out of title.  An
+    "MIU" title that fails to parse is logged and the process exits. '''
+    self._severity = severity
+    self._type = type
+    self._program = None
+    self._head = ""
+    # stacks by kind, plus the one that AddLine is currently appending to
+    self._loc_alloc = self._loc_error = self._loc_free = self._stack = None
+    self._count = 1
+    self._bytes = 0
+    self._blocks = 0
+    self._title = title
+    m = Message.pat_count.match(title)
+    if m:
+      self._title = m.group(1)
+      self._count = int(m.group(2))
+      return
+    m = Message.pat_leak.match(title)
+    if m:
+      self._title = m.group(4)
+      self._bytes = int(m.group(2))
+      self._blocks = int(m.group(3))
+      return
+    m = Message.pat_miu.match(title)
+    if m:
+      self._title = m.group(5)
+      self._bytes = int(m.group(1))
+      self._blocks = int(m.group(4))
+    elif type == "MIU":
+      logging.error("%s didn't match" % title)
+      sys.exit(-1)
+
+  def GetAllocStack(self):
+    ''' Returns the allocation Stack, or None if the message has none. '''
+    return self._loc_alloc
+
+  def GetErrorStack(self):
+    ''' Returns the error Stack, or None if the message has none. '''
+    return self._loc_error
+
+  def GetGroup(self):
+    '''An attempted logical grouping for this Message: the first group found
+    on the alloc, error and free stacks (in that order), else "UNKNOWN".
+    '''
+    for stack in (self._loc_alloc, self._loc_error, self._loc_free):
+      group = stack and stack.GetGroup()
+      if group:
+        return group
+    return "UNKNOWN"
+
+  def AddLine(self, line):
+    '''Add a line one at a time (in order from the Purify text file) to
+    build up the message and its associated stacks. '''
+    # a location heading starts a new stack, which receives the lines after it
+    if Message.pat_loc_error.match(line):
+      self._stack = Stack(line)
+      self._loc_error = self._stack
+    elif Message.pat_loc_alloc.match(line):
+      self._stack = Stack(line)
+      self._loc_alloc = self._stack
+    elif Message.pat_loc_free.match(line) or Message.pat_loc_free2.match(line):
+      self._stack = Stack(line)
+      self._loc_free = self._stack
+    elif self._stack:
+      if not line.startswith("            "):
+        logging.debug("*** " + line)
+      self._stack.AddLine(line)
+    else:
+      # lines seen before the first location heading make up the head text
+      self._head += line.lstrip()
+
+  def Type(self):
+    ''' Returns the message type that was passed to the constructor. '''
+    return self._type
+
+  def Program(self):
+    ''' Returns the value stored by SetProgram (None until it is called). '''
+    return self._program
+
+  def SetProgram(self, program):
+    ''' Stores program, which Program() will return from now on. '''
+    self._program = program
+
+  def StacksAllExternal(self):
+    '''Returns True if the stacks it contains are made up completely of
+    external (elided) symbols'''
+    return ((not self._loc_error or self._loc_error._all_external) and
+            (not self._loc_alloc or self._loc_alloc._all_external) and
+            (not self._loc_free or self._loc_free._all_external))
+
+  def __hash__(self):
+    # NOTE: see also _MessageHashesFromFile.  If this method changes, then
+    # _MessageHashesFromFile must be updated to match.
+    s = ""
+    if self._loc_error:
+      s += "Error Location\n" + self._loc_error.NormalizedStr()
+    if self._loc_alloc:
+      s += "Alloc Location\n" + self._loc_alloc.NormalizedStr()
+    if self._loc_free:
+      s += "Free Location\n" + self._loc_free.NormalizedStr()
+    return hash(s)
+
+  def NormalizedStr(self, verbose=False):
+    '''String version of the normalized message: the title plus normalized
+    versions of the error, alloc and free stacks that are present.  verbose
+    adds the type, byte / block totals and line numbers.  Example:
+    Uninitialized Memory Read in Foo::Bar()
+    Error Location
+      foo/foo.cc  Foo::Bar(void)
+      foo/main.cc start(void)
+      foo/main.cc main(void)
+    Alloc Location
+      foo/foo.cc  Foo::Foo(void)
+      foo/main.cc start(void)
+      foo/main.cc main(void)
+    '''
+    ret = ""
+    # some of the message types are more verbose than others and we
+    # don't need to indicate their type
+    if verbose and self._type not in ["UMR", "IPR", "IPW"]:
+      ret += GetMessageType(self._type) + ": "
+    if verbose and self._bytes > 0:
+      ret += "(%d bytes, %d blocks) " % (self._bytes, self._blocks)
+    ret += "%s\n" % self._title
+    if self._loc_error:
+      ret += "Error Location\n" + self._loc_error.NormalizedStr(verbose)
+    if self._loc_alloc:
+      ret += "Alloc Location\n" + self._loc_alloc.NormalizedStr(verbose)
+    if self._loc_free:
+      ret += "Free Location\n" + self._loc_free.NormalizedStr(verbose)
+    return ret
+
+  def __str__(self):
+    ''' The title, head text and original (un-normalized) text of each stack. '''
+    ret = self._title + "\n" + self._head
+    if self._loc_error:
+      ret += "Error Location\n" + str(self._loc_error)
+    if self._loc_alloc:
+      ret += "Alloc Location\n" + str(self._loc_alloc)
+    if self._loc_free:
+      ret += "Free Location\n" + str(self._loc_free)
+    return ret
+
+  def __cmp__(self, other):
+    ''' Order by error, alloc, then free stack; title is the last tiebreak.
+    A missing stack sorts before a present one, keeping the order symmetric. '''
+    if not other:
+      return 1
+    for attr in ('_loc_error', '_loc_alloc', '_loc_free'):
+      mine, theirs = getattr(self, attr), getattr(other, attr)
+      if mine and theirs:
+        ret = cmp(mine, theirs)
+      else:
+        ret = cmp(bool(mine), bool(theirs))
+      if ret:
+        return ret
+    return cmp(self._title, other._title)
+
+
+class MessageList:
+  '''A collection of Message objects of a given message type.'''
+
+  def __init__(self, type):
+    self._type = type
+    self._messages = []
+    # sorted unique messages, built lazily by UniqueMessages()
+    self._unique_messages = None
+    self._sublists = None
+    # running total of the _bytes of the messages currently in _messages
+    self._bytes = 0
+
+  def GetType(self):
+    '''Returns the message type this list was created for.'''
+    return self._type
+
+  def BeginNewSublist(self):
+    '''Some message types are logically grouped into sets of messages which
+    should not be mixed in the same list.  Specifically, Memory In Use (MIU),
+    Memory Leak (MLK) and Potential Memory Leak (MPK) are generated in a set
+    all at once, but this generation can happen at multiple distinct times,
+    either via the Purify UI or through Purify API calls.  For example, if
+    Purify is told to dump a list of all memory leaks once, and then again a
+    few minutes later, the two lists will certainly overlap, so they should be
+    kept in separate lists.
+    In order to accommodate this, MessageList supports the notion of sublists.
+    When the caller determines that one list of messages of a type has ended
+    and a new list has begun, it calls BeginNewSublist() which takes the current
+    set of messages (if any), puts them into a new MessageList and puts that
+    into the sublists array.  Later, when the caller needs to get at these
+    messages, GetSublists() should be called.
+    '''
+    if self._messages:
+      sublist = MessageList(self._type)
+      sublist._messages = self._messages
+      # carry the byte total along so the sublist's own total stays accurate
+      sublist._bytes = self._bytes
+      if not self._sublists:
+        self._sublists = [sublist]
+      else:
+        self._sublists.append(sublist)
+      self._messages = []
+      logging.info("total size: %d" % self._bytes)
+      self._bytes = 0
+
+  def GetSublists(self):
+    '''Returns the current list of sublists (None if no sublist was ever
+    begun).  If there are currently sublists and there are any messages that
+    aren't in a sublist, BeginNewSublist() is called implicitly by this method
+    to force those ungrouped messages into their own sublist.
+    '''
+    if self._sublists and self._messages:
+      self.BeginNewSublist()
+    return self._sublists
+
+  def AddMessage(self, msg):
+    '''Adds a message to this MessageList.'''
+    # TODO(erikkay): assert if _unique_messages exists
+    self._messages.append(msg)
+    self._bytes += msg._bytes
+
+  def AllMessages(self):
+    '''Returns an array of all Message objects in this MessageList. '''
+    # TODO(erikkay): handle case with sublists
+    return self._messages
+
+  def UniqueMessages(self):
+    '''Returns a sorted array of the unique normalized Message objects in this
+    MessageList.
+    '''
+    # the list is lazily computed since we have to create a sorted list,
+    # which is only valid once all messages have been added
+    # TODO(erikkay): handle case with sublists
+    if not self._unique_messages:
+      self._unique_messages = sorted(set(self._messages))
+    return self._unique_messages
+
+  def UniqueMessageGroups(self):
+    '''Returns a dictionary mapping Message group names to arrays of uniqued
+    normalized Message objects in this MessageList.
+    '''
+    groups = {}
+    for msg in self.UniqueMessages():
+      groups.setdefault(msg.GetGroup(), []).append(msg)
+    return groups