summaryrefslogtreecommitdiffstats
path: root/tools/cygprofile/symbolize.py
blob: 694ae54f9b0728a02b9865d61eaf157240bfaaf1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Symbolize log file produced by cygprofile instrumentation.

Given a log file and the binary being profiled (e.g. executable, shared
library), the script can produce three different outputs: 1) symbols for the
addresses, 2) function and line numbers for the addresses, or 3) an order file.
"""

import optparse
import os
import string
import subprocess
import sys


def ParseLogLines(log_file_lines):
  """Parse a log file produced by the profiled run of clank.

  The first line must be the executable ("r-xp") memory mapping of the
  profiled binary; its start address is subtracted from every logged address
  so that the returned addresses are offsets into the binary.  The second line
  (column header) is skipped, and any following line that does not consist of
  exactly four whitespace-separated fields (e.g. START / END) is ignored.

  Args:
    log_file_lines: array of lines in log file produced by profiled run

    Below is an example of a small log file:
    5086e000-52e92000 r-xp 00000000 b3:02 51276      libchromeview.so
    secs       msecs      pid:threadid    func
    START
    1314897086 795828     3587:1074648168 0x509e105c
    1314897086 795874     3587:1074648168 0x509e0eb4
    1314897086 796326     3587:1074648168 0x509e0e3c
    1314897086 796552     3587:1074648168 0x509e07bc
    END

  Returns:
    call_info list with list of tuples of the format (sec, msec, call id,
    function address called), where the address is relative to the start of
    the mapping given on the first line.

  Raises:
    AssertionError: if log_file_lines is empty or its first line is not an
    "r-xp" mapping line.
  """
  # Raised explicitly (rather than via an assert statement) so the check is
  # not stripped under "python -O", while keeping the same exception type.
  if not log_file_lines or 'r-xp' not in log_file_lines[0]:
    raise AssertionError(
        'first log line must be the r-xp mapping of the profiled binary')

  mapping_line = log_file_lines[0]
  vm_start = int(mapping_line[:mapping_line.find('-')], 16)

  call_info = []
  for line in log_file_lines[2:]:
    fields = line.split()
    if len(fields) != 4:
      continue
    sec_timestamp = int(fields[0])
    msec_timestamp = int(fields[1])
    callee_id = fields[2]
    addr = int(fields[3], 16)
    # Addresses at or below the mapping start are dropped.
    if vm_start < addr:
      call_info.append(
          (sec_timestamp, msec_timestamp, callee_id, addr - vm_start))

  return call_info


def ParseLibSymbols(lib_file):
  """Get output from running nm and keep only the text symbols.

  Runs "nm -S -n" on lib_file and keeps the output lines containing one of the
  symbol type markers ' t ', ' W ' or ' T '.

  Args:
    lib_file: the library or executable that contains the profiled code

  Returns:
    tuple (unique_addrs, address_map): unique_addrs is a list of
    (address, size) tuples in the order nm printed them, one per distinct
    address; address_map maps each of those addresses to the list of all
    symbol names found at that address.
  """
  cmd = ['nm', '-S', '-n', lib_file]
  # universal_newlines=True makes the pipe return text on both Python 2 and
  # Python 3, so the output can be split on '\n' below.
  nm_p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                          universal_newlines=True)
  output = nm_p.communicate()[0]

  # Split every kept line exactly once; each entry is the list of fields.
  symbol_fields = [nm_line.split() for nm_line in output.split('\n')
                   if any(kind in nm_line for kind in (' t ', ' W ', ' T '))]

  unique_addrs = []
  address_map = {}
  nm_index = 0
  while nm_index < len(symbol_fields):
    fields = symbol_fields[nm_index]

    # If the length of the split line is not 4, then it does not contain all
    # the information needed to symbolize (i.e. address, size and symbol name).
    if len(fields) != 4:
      nm_index += 1
      continue

    addr = int(fields[0], 16)
    size = int(fields[1], 16)

    # Multiple symbols may be at the same address.  This is due to aliasing
    # done by the compiler.  Since there is no way to be sure which one was
    # called in profiled run, we will symbolize to include all symbol names at
    # a particular address.  The size recorded is that of the first symbol
    # seen at the address.
    fnames = []
    while (nm_index < len(symbol_fields) and
           addr == int(symbol_fields[nm_index][0], 16)):
      if len(symbol_fields[nm_index]) == 4:
        fnames.append(symbol_fields[nm_index][3])
      nm_index += 1
    address_map[addr] = fnames
    unique_addrs.append((addr, size))

  return (unique_addrs, address_map)

class SymbolNotFoundException(Exception):
  """Raised when a logged address does not fall inside any known symbol.

  Attributes:
    value: the value passed to the constructor (the address that could not be
    symbolized, as raised by BinarySearchAddresses).
  """

  def __init__(self, value):
    # Initialize the base class so that self.args carries the value as well;
    # otherwise args is empty on Python 2.
    super(SymbolNotFoundException, self).__init__(value)
    self.value = value

  def __str__(self):
    return repr(self.value)

def BinarySearchAddresses(addr, start, end, arr):
  """Find starting address of a symbol at a particular address.

  The reason we can not directly use the address provided by the log file is
  that the log file may give an address after the start of the symbol.  The
  logged address is often one byte after the start.  By using this search
  function rather than just subtracting one from the logged address allows
  the logging instrumentation to log any address in a function.

  Args:
    addr: the address being searched for
    start: the starting index for the binary search
    end: the ending index (inclusive) for the binary search
    arr: the list being searched containing tuple of address and size; the
      search assumes it is sorted by address

  Returns:
    the starting address of the symbol at address addr

  Raises:
    SymbolNotFoundException: if addr is not inside any (address, size) range
    that the search examines, or if arr is empty.
  """
  # An empty symbol table cannot contain the address; report it the same way
  # as any other miss instead of failing with an IndexError below.
  if not arr:
    raise SymbolNotFoundException(addr)

  # Narrow the window until at most two candidates (start and end) remain.
  while start < end - 1:
    # Floor division keeps the index an integer on Python 3 as well.
    halfway = (start + end) // 2
    (nm_addr, size) = arr[halfway]
    if nm_addr <= addr < nm_addr + size:
      return nm_addr
    if addr < nm_addr:
      end = halfway - 1
    else:
      # Condition (addr >= nm_addr + size) must be true.
      start = halfway + 1

  # arr[i] is a tuple of address and size.  Check if addr is inside the range
  # of either remaining candidate, start first.
  for index in (start, end):
    (candidate_addr, candidate_size) = arr[index]
    if candidate_addr <= addr < candidate_addr + candidate_size:
      return candidate_addr

  raise SymbolNotFoundException(addr)


def FindFunctions(addr, unique_addrs, address_map):
  """Return the symbol names stored for the symbol that contains addr.

  Searches the whole of unique_addrs with BinarySearchAddresses for the symbol
  start address, then looks that address up in address_map.
  """
  last_index = len(unique_addrs) - 1
  symbol_start = BinarySearchAddresses(addr, 0, last_index, unique_addrs)
  return address_map[symbol_start]


def AddrToLine(addr, lib_file):
  """Use addr2line to determine line info of a particular address.

  Args:
    addr: integer address; passed to addr2line formatted with hex()
    lib_file: the binary handed to addr2line through its -e option

  Returns:
    the output of "addr2line -f" with its lines joined by ':'.  If the output
    ends with a newline, the returned string therefore ends with ':'.
  """
  cmd = ['addr2line', '-f', '-e', lib_file, hex(addr)]
  # universal_newlines=True makes the pipe return text on both Python 2 and
  # Python 3, so the output can be split on '\n'.
  p = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
  output = p.communicate()[0]
  return ':'.join(output.split('\n'))


def main():
  """Write output for profiled run to standard out.

  Expects two positional command line arguments, the log file and the profiled
  binary.  The format of the output depends on the output type given with the
  -t/--outputType option:
    symbolize (default): timestamp, call id and symbol name(s) for each call
    lineize: timestamp, call id and addr2line output for each call
    orderfile: '.text.<symbol>' lines for each call, followed by a blank line

  Addresses that cannot be matched to a symbol are reported on standard error
  and skipped.
  """
  parser = optparse.OptionParser('usage: %prog [options] log_file lib_file')
  parser.add_option('-t', '--outputType', dest='output_type',
                    default='symbolize', type='string',
                    help='lineize or symbolize or orderfile')

  # Option for output type.  The log file and lib file arguments are required
  # by the script and therefore are not options.
  (options, args) = parser.parse_args()
  if len(args) != 2:
    parser.error('expected 2 args: log_file lib_file')

  (log_file, lib_file) = args
  output_type = options.output_type

  # Read the log with a context manager so the file handle is always closed.
  with open(log_file) as log:
    log_file_lines = [line.rstrip() for line in log]
  call_info = ParseLogLines(log_file_lines)
  (unique_addrs, address_map) = ParseLibSymbols(lib_file)

  # Check for duplicate addresses in the log file, and print a warning if
  # duplicates are found. The instrumentation that produces the log file
  # should only print the first time a function is entered.
  seen_addrs = set()
  for call in call_info:
    addr = call[3]
    if addr in seen_addrs:
      print('WARNING: Address ' + hex(addr) + ' (line= ' +
            AddrToLine(addr, lib_file) + ') already profiled.')
    else:
      seen_addrs.add(addr)

  for (sec, msec, callee_id, addr) in call_info:
    call_prefix = str(sec) + ' ' + str(msec) + '\t' + str(callee_id) + '\t'

    if output_type == 'lineize':
      print(call_prefix + AddrToLine(addr, lib_file))
      continue

    try:
      symbols = FindFunctions(addr, unique_addrs, address_map)
    except SymbolNotFoundException:
      # Report the address of the call that actually failed to symbolize.
      sys.stderr.write('WARNING: Did not find function in binary. addr: '
                       + hex(addr) + '\n')
      continue

    if output_type == 'orderfile':
      for symbol in symbols:
        print('.text.' + symbol)
      print('')
    else:
      # First alias goes on the timestamp line, remaining aliases are listed
      # underneath, indented with tabs.
      print(call_prefix + symbols[0])
      for symbol in symbols[1:]:
        print('\t\t\t\t\t' + symbol)

# Run the command line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
  main()