summaryrefslogtreecommitdiffstats
path: root/tools/linux/dump-static-initializers.py
blob: 416bca2cdfc1b9ece2e7c6dce7b398c1e65b6c15 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Dump functions called by static initializers in a Linux Release binary.

Usage example:
  tools/linux/dump-static-initializers.py out/Release/chrome

A brief overview of static initialization:
1) the compiler writes out, per object file, a function that contains
   the static initializers for that file.
2) the compiler also writes out a pointer to that function in a special
   section.
3) at link time, the linker concatenates the function pointer sections
   into a single list of all initializers.
4) at run time, on startup the binary runs all function pointers.

The functions in (1) all have mangled names of the form
  _GLOBAL__I_foobar.cc
using objdump, we can disassemble those functions and dump all symbols that
they reference.
"""

import optparse
import re
import subprocess
import sys

# Extra commentary shown in brackets next to a referenced symbol in the
# report, keyed by the symbol's demangled name.
NOTES = {
  'std::__ioinit': '#includes <iostream>, use <ostream> instead',
  '__cxa_atexit@plt': 'registers a dtor to run at exit',
}

class Demangler(object):
  """A wrapper around c++filt to provide a function to demangle symbols.

  One c++filt child process is started per instance and reused for every
  Demangle() call: each call writes one line to the child and reads one line
  back.
  """

  def __init__(self):
    # universal_newlines=True gives text-mode pipes on Python 3 (on Python 2
    # it only affects newline translation when reading), so Demangle() can
    # exchange plain strings with the child on either interpreter.
    self.cppfilt = subprocess.Popen(['c++filt'],
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    universal_newlines=True)

  def Demangle(self, sym):
    """Given mangled symbol |sym|, return its demangled form.

    Args:
      sym: a single symbol name, without a trailing newline.

    Returns:
      The line the child process wrote in response, with surrounding
      whitespace stripped.
    """
    self.cppfilt.stdin.write(sym + '\n')
    # The pipe may be buffered; without an explicit flush the child might
    # never receive the line and the readline() below would block forever.
    self.cppfilt.stdin.flush()
    return self.cppfilt.stdout.readline().strip()

# Matches for example: "cert_logger.pb.cc", capturing "cert_logger".
protobuf_filename_re = re.compile(r'(.*)\.pb\.cc$')
def QualifyFilenameAsProto(filename):
  """Attempt to qualify a bare |filename| with a src-relative path, assuming it
  is a protoc-generated file.

  Asks `git ls-files` for tracked files matching "*/<stem>.proto", where
  <stem> is the part of |filename| before ".pb.cc".

  Args:
    filename: bare file name, e.g. "cert_logger.pb.cc".

  Returns:
    The path printed by git if exactly one file matches.  Otherwise (the name
    does not end in ".pb.cc", there is no hit, or there are several hits) the
    original |filename|.
  """
  match = protobuf_filename_re.match(filename)
  if not match:
    return filename
  # group(1) is the captured stem as a string; groups(0) would be a 1-tuple
  # that only happens to work as the right-hand side of '%'.
  basename = match.group(1)
  gitlsfiles = subprocess.Popen(
    ['git', 'ls-files', '--', '*/%s.proto' % basename],
    stdout=subprocess.PIPE, universal_newlines=True)
  # communicate() drains the pipe and waits for git, so the child is always
  # reaped and its stdout closed, whatever the number of hits.
  output, _ = gitlsfiles.communicate()
  hits = [line.strip() for line in output.splitlines()]
  if len(hits) != 1:
    return filename  # No hit, or multiple hits: can't help.
  return hits[0]

# Regex matching the substring of a symbol's demangled text representation most
# likely to appear in a source file.
# Example: "v8::internal::Builtins::InitBuiltinFunctionTable()" becomes
# "InitBuiltinFunctionTable", since the first (optional & non-capturing) group
# picks up any ::-qualification and the last fragment picks up a suffix that
# starts with an opener.
symbol_code_name_re = re.compile(r'^(?:[^(<[]*::)?([^:(<[]*).*?$')
def QualifyFilename(filename, symbol):
  """Given a bare filename and a symbol that occurs in it, attempt to qualify
  it with a src-relative path.

  Runs `git grep -l` for the unqualified name extracted from |symbol|,
  restricted to files matching "*/<filename>".

  Args:
    filename: bare file name, e.g. "safe_browsing_service.cc".
    symbol: demangled symbol text referenced from that file.

  Returns:
    The path printed by git if exactly one file matches.  If no file or more
    than one file matches, the original |filename|.
  """
  match = symbol_code_name_re.match(symbol)
  if not match:
    return filename
  symbol = match.group(1)
  gitgrep = subprocess.Popen(
    ['git', 'grep', '-l', symbol, '--', '*/%s' % filename],
    stdout=subprocess.PIPE, universal_newlines=True)
  # communicate() drains the pipe and waits for git, so the child is always
  # reaped and its stdout closed, whatever the number of hits.
  output, _ = gitgrep.communicate()
  hits = [line.strip() for line in output.splitlines()]
  if len(hits) != 1:
    return filename  # No candidate, or more than one; return bare filename.
  return hits[0]

# Regex matching nm output for the symbols we're interested in.
# Example lines (the "_ZN12" prefix and the "sub_" infix are both optional):
#   0000000001919920 0000000000000008 t _ZN12_GLOBAL__I_safe_browsing_service.cc
#   00000000026b9eb0 0000000000000024 t _GLOBAL__sub_I_extension_specifics.pb.cc
nm_re = re.compile(r'(\S+) (\S+) t (?:_ZN12)?_GLOBAL__(?:sub_)?I_(.*)')
def ParseNm(binary):
  """Given a binary, yield static initializers as (file, start, size) tuples.

  Runs `nm -S` on |binary| and scans its output for type "t" symbols whose
  name matches nm_re.

  Args:
    binary: path of the binary to inspect.

  Yields:
    (filename, address, size): the file name taken from the symbol name, and
    the symbol's address and size parsed from nm's hexadecimal columns.
  """
  nm = subprocess.Popen(['nm', '-S', binary], stdout=subprocess.PIPE,
                        universal_newlines=True)
  try:
    for line in nm.stdout:
      match = nm_re.match(line)
      if match:
        addr, size, filename = match.groups()
        yield filename, int(addr, 16), int(size, 16)
  finally:
    # Runs when the output is exhausted and also when the generator is closed
    # early, so the pipe is closed and the child reaped either way.
    nm.stdout.close()
    nm.wait()

# Regex matching objdump output for the symbols we're interested in.
# Example line:
#     12354ab:  (disassembly, including <FunctionReference>)
disassembly_re = re.compile(r'^\s+[0-9a-f]+:.*<(\S+)>')
def ExtractSymbolReferences(binary, start, end):
  """Given a span of addresses, returns symbol references from disassembly.

  Args:
    binary: path of the binary to disassemble.
    start: first address, passed to objdump as --start-address.
    end: address passed to objdump as --stop-address.

  Returns:
    A sorted list of the distinct symbol names found between "<" and ">" on
    disassembly lines, minus the uninformative ones filtered below.

  Raises:
    RuntimeError: the disassembly mentions
      __static_initialization_and_destruction, which suggests a Debug binary.
  """
  cmd = ['objdump', binary, '--disassemble',
         '--start-address=0x%x' % start, '--stop-address=0x%x' % end]
  objdump = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                             universal_newlines=True)

  refs = set()
  try:
    for line in objdump.stdout:
      if '__static_initialization_and_destruction' in line:
        raise RuntimeError('code mentions '
                           '__static_initialization_and_destruction; '
                           'did you accidentally run this on a Debug binary?')
      match = disassembly_re.search(line)
      if not match:
        continue
      ref = match.group(1)
      if ref.startswith(('.LC', '_DYNAMIC')):
        # Ignore these, they are uninformative.
        continue
      if ref.startswith(('_GLOBAL__I_', '_GLOBAL__sub_I_')):
        # Probably a relative jump within this function.
        continue
      refs.add(ref)
  finally:
    # Close the pipe and reap objdump on every exit path, including the
    # RuntimeError above.
    objdump.stdout.close()
    objdump.wait()

  return sorted(refs)

def main():
  """Report the symbols referenced by each static initializer of a binary.

  Parses the command line (one binary path, optional -d/--diffable), walks
  the initializers found by ParseNm(), and prints for each one the symbols
  returned by ExtractSymbolReferences(), demangled and annotated from NOTES.

  print is always called with a single parenthesized argument, which behaves
  the same as a Python 2 print statement and as a Python 3 function call.

  Returns:
    0 on completion.  A wrong argument count never returns: parser.error()
    prints the usage message and exits the process.
  """
  parser = optparse.OptionParser(usage='%prog [option] filename')
  parser.add_option('-d', '--diffable', dest='diffable',
                    action='store_true', default=False,
                    help='Prints the filename on each line, for more easily '
                         'diff-able output.')
  opts, args = parser.parse_args()
  if len(args) != 1:
    # Covers both too few and too many arguments.  parser.error() exits, so
    # no statement after it would ever run.
    parser.error('expected exactly one filename argument')
  binary = args[0]

  demangler = Demangler()
  file_count = 0
  # Counts symbol references, summed over all reported files.
  initializer_count = 0

  files = ParseNm(binary)
  if opts.diffable:
    # Sort so that the output order is stable between runs.
    files = sorted(files)
  for filename, addr, size in files:
    if size == 2:
      # gcc generates a two-byte 'repz retq' initializer when there is nothing
      # to do.  jyasskin tells me this is fixed in gcc 4.6.
      continue

    file_count += 1

    ref_output = []
    qualified_filename = QualifyFilenameAsProto(filename)
    for ref in ExtractSymbolReferences(binary, addr, addr+size):
      initializer_count += 1

      ref = demangler.Demangle(ref)
      # Keep trying to qualify the bare filename until one symbol succeeds.
      if qualified_filename == filename:
        qualified_filename = QualifyFilename(filename, ref)
      if ref in NOTES:
        ref_output.append('  %s [%s]' % (ref, NOTES[ref]))
      else:
        ref_output.append('  ' + ref)

    if opts.diffable:
      print('\n'.join(qualified_filename + r for r in ref_output))
    else:
      print('%s (initializer offset 0x%x size 0x%x)' % (qualified_filename,
                                                        addr, size))
      print('\n'.join(ref_output) + '\n')

  print('Found %d static initializers in %d files.' % (initializer_count,
                                                       file_count))

  return 0

# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
  sys.exit(main())