summaryrefslogtreecommitdiffstats
path: root/tools/valgrind/asan/asan_symbolize.py
blob: 2cdae08b18ab370fd81a0c427d3a323a6b33afac (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#!/usr/bin/env python

# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from third_party import asan_symbolize

import argparse
import base64
import json
import os
import platform
import re
import subprocess
import sys

class LineBuffered(object):
  """Stream wrapper that flushes the wrapped stream after newline writes.

  Every attribute other than write() (and the `stream` attribute itself) is
  looked up on the wrapped stream.
  """

  def __init__(self, stream):
    # The underlying file-like object that receives all writes.
    self.stream = stream

  def __getattr__(self, attr):
    # Only invoked when normal lookup fails, i.e. for everything this wrapper
    # does not define itself; delegate to the wrapped stream.
    return getattr(self.stream, attr)

  def write(self, data):
    """Write |data| to the wrapped stream, flushing if it contains '\\n'."""
    target = self.stream
    target.write(data)
    if '\n' not in data:
      return
    target.flush()


def disable_buffering():
  """Make stdout flush on every newline, for this process and its children.

  Does nothing when the PYTHONUNBUFFERED environment variable is already set
  to a non-empty value. Otherwise wraps sys.stdout in LineBuffered and sets
  PYTHONUNBUFFERED in os.environ.
  """
  if os.environ.get('PYTHONUNBUFFERED'):
    return
  # The write attribute of sys.stdout cannot simply be reassigned
  # (sys.stdout.write = lambda...), so the whole object is replaced with a
  # wrapper instead.
  sys.stdout = LineBuffered(sys.stdout)
  os.environ['PYTHONUNBUFFERED'] = 'x'


def set_symbolizer_path():
  """Set the path to the llvm-symbolizer binary in the Chromium source tree.

  Does nothing when the LLVM_SYMBOLIZER_PATH environment variable is already
  set to a non-empty value. Otherwise LLVM_SYMBOLIZER_PATH is set to the
  absolute path of third_party/llvm-build/Release+Asserts/bin/llvm-symbolizer
  under the source root.

  Raises:
    AssertionError: if the llvm-symbolizer binary is not found at the expected
        location.
  """
  if os.environ.get('LLVM_SYMBOLIZER_PATH'):
    return
  script_dir = os.path.dirname(os.path.abspath(__file__))
  # Assume this script resides three levels below src/ (i.e.
  # src/tools/valgrind/asan/).
  src_root = os.path.join(script_dir, "..", "..", "..")
  symbolizer_path = os.path.join(src_root, 'third_party',
      'llvm-build', 'Release+Asserts', 'bin', 'llvm-symbolizer')
  absolute_path = os.path.abspath(symbolizer_path)
  # Raise explicitly instead of using an assert statement: assert statements
  # are removed under `python -O`, which would let a bogus path through, and
  # a bare assert gives no hint about which file is missing.
  if not os.path.isfile(symbolizer_path):
    raise AssertionError('llvm-symbolizer not found at %s' % absolute_path)
  os.environ['LLVM_SYMBOLIZER_PATH'] = absolute_path


def is_hash_name(name):
  """Return True if |name| looks like a hash: only digits and letters a-f.

  Uppercase hex digits are not accepted, and the empty string does not match.
  """
  return re.match('[0-9a-f]+$', name) is not None


def split_path(path):
  """Split |path| into a list of its components using os.path.split().

  The first element is whatever remains once os.path.split() can no longer
  shorten the path: the root (e.g. '/') for absolute paths and the empty
  string for relative paths. For example, '/a/b' yields ['/', 'a', 'b'] and
  'a/b' yields ['', 'a', 'b'].
  """
  ret = []
  while True:
    head, tail = os.path.split(path)
    if head == path:
      return [head] + ret
    ret, path = [tail] + ret, head


def chrome_product_dir_path(exe_path):
  """Return the product dir that contains the executable at |exe_path|.

  If |exe_path| goes through an .app bundle, the product dir is the directory
  containing the outermost bundle; otherwise it is the directory containing
  the executable itself.

  Returns:
    None if |exe_path| is None, otherwise a non-empty path. '.' is returned
    when the product dir is the current directory (e.g. for 'chrome' or
    'Chromium.app/Contents/MacOS/Chromium').
  """
  if exe_path is None:
    return None
  path_parts = split_path(exe_path)
  for index, part in enumerate(path_parts):
    if part.endswith('.app'):
      product_dir_parts = path_parts[:index]
      break
  else:
    # If the executable isn't an .app bundle, it's a commandline binary that
    # resides right in the product dir.
    product_dir_parts = path_parts[:-1]
  product_dir = os.path.join(*product_dir_parts) if product_dir_parts else ''
  # Make sure the product dir path isn't empty. split_path() puts a leading
  # '' in front of relative paths, so a bare 'chrome' or a bundle in the
  # current directory would otherwise produce '' here, and an empty product
  # dir disables the binary name filter.
  return product_dir or '.'


# Maps an inode number to the result of the `find` lookup done for it by
# find_inode_at_path() (None if nothing was found). Keyed by inode only, so
# the search path of the first lookup for a given inode wins.
inode_path_cache = {}


def find_inode_at_path(inode, path):
  """Return a path under |path| whose inode number is |inode|.

  Runs `find <path> -inum <inode>` and caches the outcome in
  inode_path_cache.

  Returns:
    The first path printed by `find`, or None if it printed nothing.

  Raises:
    subprocess.CalledProcessError: if `find` exits with a non-zero status.
  """
  if inode in inode_path_cache:
    return inode_path_cache[inode]
  cmd = ['find', path, '-inum', str(inode)]
  # universal_newlines=True makes check_output() return text rather than
  # bytes on Python 3; it is also accepted by Python 2.7.
  output = subprocess.check_output(cmd, universal_newlines=True)
  # `find` may give us several paths (e.g. 'Chromium Framework' in the
  # product dir and 'Chromium Framework' inside 'Chromium.app',
  # chrome_dsym_hints() will produce correct .dSYM path for any of them.
  first_line = output.split('\n', 1)[0]
  # Empty output means no match: report None, not an empty path.
  ret = first_line or None
  inode_path_cache[inode] = ret
  return ret


# Create a binary name filter that works around https://crbug.com/444835.
# When running tests on OSX swarming servers, ASan sometimes prints paths to
# files in cache (ending with SHA1 filenames) instead of paths to hardlinks to
# those files in the product dir.
# For a given |binary_path| chrome_osx_binary_name_filter() returns one of the
# hardlinks to the same inode in |product_dir_path|.
def make_chrome_osx_binary_name_filter(product_dir_path=''):
  """Build a filter that maps hash-named binaries to paths in the product dir.

  The returned function takes a binary path. If its basename looks like a
  hash (see is_hash_name()) and |product_dir_path| is non-empty, the binary's
  inode is looked up under |product_dir_path| and the path found there is
  returned. In every other case the binary path is returned unchanged.
  """
  def chrome_osx_binary_name_filter(binary_path):
    looks_like_hash = is_hash_name(os.path.basename(binary_path))
    if not looks_like_hash or not product_dir_path:
      return binary_path
    hardlink_path = find_inode_at_path(os.stat(binary_path).st_ino,
                                       product_dir_path)
    # Fall back to the original path when the inode lookup found nothing.
    return hardlink_path if hardlink_path else binary_path
  return chrome_osx_binary_name_filter


# Construct a path to the .dSYM bundle for the given binary.
# There are three possible cases for binary location in Chromium:
# 1. The binary is a standalone executable or dynamic library in the product
#    dir, the debug info is in "binary.dSYM" in the product dir.
# 2. The binary is a standalone framework or .app bundle, the debug info is in
#    "Framework.framework.dSYM" or "App.app.dSYM" in the product dir.
# 3. The binary is a framework or an .app bundle within another .app bundle
#    (e.g. Outer.app/Contents/Versions/1.2.3.4/Inner.app), and the debug info
#    is in Inner.app.dSYM in the product dir.
# The first case is handled by llvm-symbolizer, so we only need to construct
# .dSYM paths for .app bundles and frameworks.
# We assume that the binary path contains no more than two nested bundles.
# Only one of these bundles may be a framework, and frameworks cannot contain
# other bundles.
def chrome_dsym_hints(binary):
  """Return a list with the .dSYM bundle path to try for |binary|.

  The list is empty when |binary| is not inside an .app bundle or a
  framework. Otherwise it holds a single path: the innermost bundle's name
  with '.dSYM' appended, placed in the directory that contains the outermost
  bundle.

  Raises:
    AssertionError: if the path goes through more than two bundles, or
        through an .app bundle located inside a framework.
  """
  components = split_path(binary)
  apps = [i for i, name in enumerate(components) if name.endswith('.app')]
  frameworks = [i for i, name in enumerate(components)
                if name.endswith('.framework')]
  bundles = sorted(apps + frameworks)
  assert len(bundles) <= 2, \
      "The path contains more than two nested bundles: %s" % binary
  if not bundles:
    # Case 1: this is a standalone executable or dylib.
    return []
  app_inside_framework = (len(apps) == 1 and
                          len(frameworks) == 1 and
                          apps[0] > frameworks[0])
  assert not app_inside_framework, \
      "The path contains an app bundle inside a framework: %s" % binary
  # Cases 2 and 3. The outermost bundle sits directly in the product dir; in
  # case 2 the innermost bundle is that same bundle.
  outermost, innermost = bundles[0], bundles[-1]
  dsym_components = components[:outermost] + [components[innermost]]
  return ['%s.dSYM' % os.path.join(*dsym_components)]


# We want our output to match base::EscapeJSONString(), which produces
# doubly-escaped strings. The first escaping pass is handled by this class. The
# second pass happens when JSON data is dumped to file.
class StringEncoder(json.JSONEncoder):
  """JSON-escapes a single string, without the surrounding double quotes.

  Uses the default json.JSONEncoder settings and additionally escapes '<' as
  the six characters backslash-u003C.
  """

  def __init__(self):
    json.JSONEncoder.__init__(self)

  def encode(self, s):
    """Return |s| JSON-escaped, with the enclosing quotes stripped.

    Args:
      s: a byte string (decoded as UTF-8) or a text string.
    """
    if isinstance(s, bytes):
      # Don't die on invalid utf-8 sequences. Only byte strings are decoded:
      # calling decode() on a text string would make Python 2 implicitly
      # encode it as ASCII first and fail on non-ASCII characters.
      s = s.decode('utf-8', 'replace')
    # type(u'') is unicode on Python 2 and str on Python 3.
    assert isinstance(s, type(u''))
    encoded = json.JSONEncoder.encode(self, s)
    assert len(encoded) >= 2
    assert encoded[0] == '"'
    assert encoded[-1] == '"'
    encoded = encoded[1:-1]
    # Special case from base::EscapeJSONString(). The backslash is doubled so
    # that the replacement is the literal text \u003C on every Python
    # version; in a Python 3 string literal '\u003C' is just '<'.
    encoded = encoded.replace('<', '\\u003C')
    return encoded


class JSONTestRunSymbolizer(object):
  """Symbolizes the output snippet stored in a test run dictionary."""

  def __init__(self, symbolization_loop):
    # Object providing process_line(line); its result is appended to a list
    # of output lines, so presumably it returns a list of strings.
    self.symbolization_loop = symbolization_loop

  def symbolize_snippet(self, snippet):
    """Feed |snippet| line by line through the loop and re-join the output."""
    output_lines = []
    for input_line in snippet.split('\n'):
      output_lines.extend(self.symbolization_loop.process_line(input_line))
    return '\n'.join(output_lines)

  def symbolize(self, test_run):
    """Symbolize the snippet of |test_run| in place.

    If symbolization changes nothing, |test_run| is left untouched. Otherwise
    the original snippet fields are preserved under 'original_...' keys and
    the snippet fields are replaced with the symbolized text.
    """
    snippet_before = base64.b64decode(test_run['output_snippet_base64'])
    snippet_after = self.symbolize_snippet(snippet_before)
    if snippet_after == snippet_before:
      # No sanitizer reports in snippet.
      return

    # Keep the untouched data around under separate keys.
    test_run['original_output_snippet'] = test_run['output_snippet']
    test_run['original_output_snippet_base64'] = \
        test_run['output_snippet_base64']

    test_run['output_snippet'] = StringEncoder().encode(snippet_after)
    test_run['output_snippet_base64'] = base64.b64encode(snippet_after)
    test_run['snippet_processed_by'] = 'asan_symbolize.py'
    # "Lossless" originally meant that no Unicode data was lost while encoding
    # the string. Symbolization is another transformation of the snippet, so
    # the result is not reported as lossless.
    test_run['losless_snippet'] = False


def symbolize_snippets_in_json(filename, symbolization_loop):
  """Symbolize every test run snippet in the JSON file at |filename|.

  The file is loaded, each test run found under 'per_iteration_data' is
  passed to JSONTestRunSymbolizer.symbolize(), and the resulting data is
  written back to the same file.

  Args:
    filename: path to the JSON file; it is overwritten.
    symbolization_loop: passed to JSONTestRunSymbolizer.
  """
  with open(filename, 'r') as f:
    json_data = json.load(f)

  test_run_symbolizer = JSONTestRunSymbolizer(symbolization_loop)
  for iteration_data in json_data['per_iteration_data']:
    # Only the test runs are needed, not the test names they are keyed by.
    # values() exists on both Python 2 and 3, unlike iteritems().
    for test_runs in iteration_data.values():
      for test_run in test_runs:
        test_run_symbolizer.symbolize(test_run)

  with open(filename, 'w') as f:
    json.dump(json_data, f, indent=3, sort_keys=True)


def main():
  """Parse the command line and symbolize sanitizer reports.

  With --test-summary-json-file the snippets inside that JSON file are
  symbolized in place; otherwise standard input is processed.
  """
  parser = argparse.ArgumentParser(description='Symbolize sanitizer reports.')
  parser.add_argument('--test-summary-json-file',
      help='Path to a JSON file produced by the test launcher. The script will '
           'ignore standard input and instead symbolize the output snippets '
           'inside the JSON file. The result will be written back to the JSON '
           'file.')
  parser.add_argument('strip_path_prefix', nargs='*',
      help='When printing source file names, the longest prefix ending in one '
           'of these substrings will be stripped. E.g.: "Release/../../".')
  parser.add_argument('--executable-path',
      help='Path to program executable. Used on OSX swarming bots to locate '
           'dSYM bundles for associated frameworks and bundles.')
  args = parser.parse_args()

  disable_buffering()
  set_symbolizer_path()
  asan_symbolize.demangle = True
  asan_symbolize.fix_filename_patterns = args.strip_path_prefix
  # Most source paths for Chromium binaries start with
  # /path/to/src/out/Release/../../
  asan_symbolize.fix_filename_patterns.append('Release/../../')
  # The binary name filter is only installed on OSX (see
  # make_chrome_osx_binary_name_filter()).
  binary_name_filter = None
  if platform.uname()[0] == 'Darwin':
    binary_name_filter = make_chrome_osx_binary_name_filter(
        chrome_product_dir_path(args.executable_path))
  loop = asan_symbolize.SymbolizationLoop(
      binary_name_filter=binary_name_filter,
      dsym_hint_producer=chrome_dsym_hints)

  if args.test_summary_json_file:
    symbolize_snippets_in_json(args.test_summary_json_file, loop)
  else:
    # Process stdin.
    asan_symbolize.logfile = sys.stdin
    loop.process_logfile()

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()