summaryrefslogtreecommitdiffstats
path: root/tools/cygprofile/cyglog_to_orderfile.py
blob: bc382f60e660051618334423f096b5d4e21ed88c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Symbolizes a log file produced by cyprofile instrumentation.

Given a log file and the binary being profiled, creates an orderfile.
"""

import logging
import multiprocessing
import optparse
import os
import re
import string
import sys
import tempfile

import cygprofile_utils
import symbol_extractor


def _ParseLogLines(log_file_lines):
  """Parses a merged cyglog produced by mergetraces.py.

  Args:
    log_file_lines: array of lines in log file produced by profiled run

    Below is an example of a small log file:
    5086e000-52e92000 r-xp 00000000 b3:02 51276      libchromeview.so
    secs       usecs      pid:threadid    func
    START
    1314897086 795828     3587:1074648168 0x509e105c
    1314897086 795874     3587:1074648168 0x509e0eb4
    1314897086 796326     3587:1074648168 0x509e0e3c
    1314897086 796552     3587:1074648168 0x509e07bc
    END

  Returns:
    An ordered list of callee offsets.
  """
  call_lines = []
  vm_start = 0
  line = log_file_lines[0]
  assert 'r-xp' in line
  end_index = line.find('-')
  vm_start = int(line[:end_index], 16)
  for line in log_file_lines[3:]:
    fields = line.split()
    if len(fields) == 4:
      call_lines.append(fields)
    else:
      assert fields[0] == 'END'
  # Convert strings to int in fields.
  call_info = []
  for call_line in call_lines:
    addr = int(call_line[3], 16)
    if vm_start < addr:
      addr -= vm_start
      call_info.append(addr)
  return call_info


def _GroupLibrarySymbolInfosByOffset(lib_filename):
  """Builds the {offset: [SymbolInfo]} dict of a library.

  Args:
    lib_filename: library handed to symbol_extractor.SymbolInfosFromBinary

  Returns:
    The SymbolInfo extracted from the library, grouped by
    symbol_extractor.GroupSymbolInfosByOffset.
  """
  return symbol_extractor.GroupSymbolInfosByOffset(
      symbol_extractor.SymbolInfosFromBinary(lib_filename))


class SymbolNotFoundException(Exception):
  """Exception carrying the value that could not be matched to a symbol."""

  def __init__(self, value):
    Exception.__init__(self, value)
    # Kept as an attribute so that __str__ can render it with repr().
    self.value = value

  def __str__(self):
    return '%r' % (self.value,)


def _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset):
  """Finds all SymbolInfo at a given offset.

  Args:
    offset_to_symbol_infos: {offset: [SymbolInfo]}
    offset: offset to look the symbols at

  Returns:
    The list of SymbolInfo at the given offset

  Raises:
    SymbolNotFoundException if the offset doesn't match any symbol.
  """
  if offset in offset_to_symbol_infos:
    return offset_to_symbol_infos[offset]
  elif offset % 2 and (offset - 1) in offset_to_symbol_infos:
    # On ARM, odd addresses are used to signal thumb instruction. They are
    # generated by setting the LSB to 1 (see
    # http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0471e/Babfjhia.html).
    # TODO(lizeb): Make sure this hack doesn't propagate to other archs.
    return offset_to_symbol_infos[offset - 1]
  else:
    raise SymbolNotFoundException(offset)


def _GetObjectFileNames(obj_dir):
  """Returns the list of object files in a directory."""
  obj_files = []
  for (dirpath, _, filenames) in os.walk(obj_dir):
    for file_name in filenames:
      if file_name.endswith('.o'):
        obj_files.append(os.path.join(dirpath, file_name))
  return obj_files


def _AllSymbolInfos(object_filenames):
  """Returns a list of SymbolInfo from an iterable of filenames.

  Args:
    object_filenames: iterable of file names, each one passed to
        symbol_extractor.SymbolInfosFromBinary through a process pool

  Returns:
    A flat list concatenating the per-file results of
    symbol_extractor.SymbolInfosFromBinary.
  """
  pool = multiprocessing.Pool()
  try:
    # Hopefully the object files are in the page cache at this step, so IO
    # should not be a problem (hence no concurrency limit on the pool).
    symbol_infos_nested = pool.map(
        symbol_extractor.SymbolInfosFromBinary, object_filenames)
  finally:
    # Release the worker processes explicitly instead of leaving them around
    # until the pool object is garbage collected. map() has either returned
    # all results or raised, so nothing of interest is still pending.
    pool.terminate()
    pool.join()
  result = []
  for symbol_infos in symbol_infos_nested:
    result.extend(symbol_infos)
  return result


def _SameCtorOrDtorNames(symbol1, symbol2):
  """Returns True if two symbols refer to the same constructor or destructor.

  The Itanium C++ ABI specifies dual constructor and destructor
  emmission (section 5.1.4.3):
  https://refspecs.linuxbase.org/cxxabi-1.83.html#mangling-special
  To avoid fully parsing all mangled symbols, a heuristic is used with c++filt.

  Note: some compilers may name generated copies differently.  If this becomes
  an issue this heuristic will need to be updated.
  """
  # Check if this is the understood case of constructor/destructor
  # signatures. GCC emits up to three types of constructor/destructors:
  # complete, base, and allocating.  If they're all the same they'll
  # get folded together.
  return (re.search('(C[123]|D[012])E', symbol1) and
          symbol_extractor.DemangleSymbol(symbol1) ==
          symbol_extractor.DemangleSymbol(symbol2))


def GetSymbolToSectionsMapFromObjectFiles(obj_dir):
  """Scans object files to create a {symbol: linker section(s)} map.

  Symbols whose name starts with '.LTHUNK' are ignored. A symbol is only
  recorded once it has been seen in a section starting with '.text.';
  otherwise a warning is collected.

  Args:
    obj_dir: The root of the output object file directory, which will be
             scanned for .o files to form the mapping.

  Returns:
    A map {symbol_name: [section_name1, section_name2...]}
  """
  text_prefix = '.text.'
  object_files = _GetObjectFileNames(obj_dir)
  symbol_to_sections_map = {}
  symbol_warnings = cygprofile_utils.WarningCollector(300)
  symbol_infos = _AllSymbolInfos(object_files)
  for symbol_info in symbol_infos:
    symbol = symbol_info.name
    if symbol.startswith('.LTHUNK'):
      continue
    section = symbol_info.section
    if ((symbol in symbol_to_sections_map) and
        (section not in symbol_to_sections_map[symbol])):
      known_sections = symbol_to_sections_map[symbol]
      known_sections.append(section)

      # Recover the symbol name embedded in the first recorded section by
      # removing exactly the '.text.' prefix. str.lstrip('.text.') is not a
      # prefix removal: it strips any leading run of the characters '.', 't',
      # 'e' and 'x', which would also eat the start of a name like 'text_foo'.
      first_section_symbol = known_sections[0]
      if first_section_symbol.startswith(text_prefix):
        first_section_symbol = first_section_symbol[len(text_prefix):]

      if not _SameCtorOrDtorNames(symbol, first_section_symbol):
        symbol_warnings.Write('Symbol ' + symbol +
                              ' unexpectedly in more than one section: ' +
                              ', '.join(known_sections))
    elif not section.startswith(text_prefix):
      symbol_warnings.Write('Symbol ' + symbol +
                            ' in incorrect section ' + section)
    else:
      # In most cases we expect just one item in this list, and maybe 4 or so in
      # the worst case.
      # NOTE(review): this branch is also reached when the symbol is already
      # mapped and the section is already listed, in which case the list is
      # reset to this single section. Confirm that this is intended.
      symbol_to_sections_map[symbol] = [section]
  symbol_warnings.WriteEnd('bad sections')
  return symbol_to_sections_map


def _WarnAboutDuplicates(offsets):
  """Warns about duplicate offsets.

  Args:
    offsets: list of offsets to check for duplicates

  Returns:
    True if there are no duplicates, False otherwise.
  """
  seen_offsets = set()
  ok = True
  for offset in offsets:
    if offset not in seen_offsets:
      seen_offsets.add(offset)
    else:
      ok = False
      logging.warning('Duplicate offset: ' + hex(offset))
  return ok


def _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_sections_map,
                     output_file):
  """Writes the section names matching a sequence of offsets.

  Offsets are handled in order. Each section name is written at most once, on
  its own line, the first time one of its symbols is reached.

  Args:
    offsets: Iterable of offsets to match to section names
    offset_to_symbol_infos: {offset: [SymbolInfo]}
    symbol_to_sections_map: {name: [section1, section2]}
    output_file: file-like object to write the results to

  Returns:
    True if all symbols were found in the library.
  """
  missing_section_warnings = cygprofile_utils.WarningCollector(300)
  missing_symbol_errors = cygprofile_utils.WarningCollector(
      300, level=logging.ERROR)
  written_sections = set()
  all_found = True
  for offset in offsets:
    try:
      found_infos = _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset)
    except SymbolNotFoundException:
      missing_symbol_errors.Write(
          'Did not find function in binary. offset: ' + hex(offset))
      all_found = False
      continue
    for info in found_infos:
      if info.name not in symbol_to_sections_map:
        # A symbol without a known section does not make the run fail.
        missing_section_warnings.Write(
            'No known section for symbol ' + info.name)
        continue
      for section in symbol_to_sections_map[info.name]:
        if section in written_sections:
          continue
        output_file.write(section + '\n')
        written_sections.add(section)
  missing_section_warnings.WriteEnd('no known section for symbol.')
  missing_symbol_errors.WriteEnd('symbol not found in the binary.')
  return all_found


def main():
  """Command-line entry point: creates an orderfile from a merged cyglog.

  Expects three positional arguments: <merged_cyglog> <library>
  <output_filename>. The orderfile is written to a temporary file created in
  the directory of output_filename, which is then renamed to output_filename.

  Returns:
    0 if the orderfile was written and _OutputOrderfile reported that all
    symbols were found, 1 otherwise (including on a wrong argument count).
  """
  parser = optparse.OptionParser(usage=
      'usage: %prog [options] <merged_cyglog> <library> <output_filename>')
  parser.add_option('--target-arch', action='store', dest='arch',
                    choices=['arm', 'arm64', 'x86', 'x86_64', 'x64', 'mips'],
                    help='The target architecture for libchrome.so')
  options, argv = parser.parse_args(sys.argv)
  if not options.arch:
    options.arch = cygprofile_utils.DetectArchitecture()
  # sys.argv was parsed as is, so argv[0] is still the program name.
  if len(argv) != 4:
    parser.print_help()
    return 1
  (log_filename, lib_filename, output_filename) = argv[1:]
  symbol_extractor.SetArchitecture(options.arch)

  obj_dir = cygprofile_utils.GetObjDir(lib_filename)

  # Read through a context manager so that the log file is closed right away
  # instead of whenever the file object gets garbage collected.
  with open(log_filename) as log_file:
    log_file_lines = [line.rstrip() for line in log_file]
  offsets = _ParseLogLines(log_file_lines)
  _WarnAboutDuplicates(offsets)

  offset_to_symbol_infos = _GroupLibrarySymbolInfosByOffset(lib_filename)
  symbol_to_sections_map = GetSymbolToSectionsMapFromObjectFiles(obj_dir)

  success = False
  temp_filename = None
  output_file = None
  try:
    (fd, temp_filename) = tempfile.mkstemp(dir=os.path.dirname(output_filename))
    output_file = os.fdopen(fd, 'w')
    ok = _OutputOrderfile(
        offsets, offset_to_symbol_infos, symbol_to_sections_map, output_file)
    # Close before renaming so that everything is flushed to the file.
    output_file.close()
    os.rename(temp_filename, output_filename)
    # The temporary file no longer exists under that name: nothing to remove.
    temp_filename = None
    success = ok
  finally:
    if output_file:
      # No-op if the file was already closed above.
      output_file.close()
    if temp_filename:
      os.remove(temp_filename)

  return 0 if success else 1


if __name__ == '__main__':
  # Only when run as a script: log at INFO level and above, then use the
  # value returned by main() as the process exit status.
  logging.basicConfig(level=logging.INFO)
  exit_code = main()
  sys.exit(exit_code)