tools/binary_size/run_binary_size_analysis.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578

#!/usr/bin/python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Generate a spatial analysis against an arbitrary library.

To use, build the 'binary_size_tool' target. Then run this tool, passing
in the location of the library to be analyzed along with any other options
you desire.
"""

import collections
import fileinput
import json
import optparse
import os
import pprint
import re
import shutil
import subprocess
import sys
import tempfile


# TODO(andrewhayden): Only used for legacy reports. Delete.
def FormatBytes(bytes):
  """Render a byte count as a short human-readable string.

  Counts strictly greater than 1e6 are expressed in units of 1e6 with an 'm'
  suffix, and counts strictly greater than 1e3 in units of 1e3 with a 'k'
  suffix, both with one decimal place. Anything else is returned through
  str() unchanged.
  """
  # Ordered from the largest unit down so the first match wins.
  for unit, suffix in ((1.0e6, 'm'), (1.0e3, 'k')):
    if bytes > unit:
      return '%.1f%s' % (bytes / unit, suffix)
  return str(bytes)


# TODO(andrewhayden): Only used for legacy reports. Delete.
def SymbolTypeToHuman(type):
  """Convert a symbol type as printed by nm into a human-readable name.

  nm prints the same section letter in lowercase for local symbols and in
  uppercase for global (external) ones, so the lookup ignores case; for
  example both 't' and 'T' map to 'code'.

  Args:
    type: the one-character symbol type taken from nm output.

  Returns:
    A short description such as 'code' or 'read-only data'.

  Raises:
    KeyError: if the letter is not one of b, d, r, t, w or v (in any case).
  """
  return {'b': 'bss',
          'd': 'data',
          'r': 'read-only data',
          't': 'code',
          'w': 'weak symbol',
          'v': 'weak symbol'}[type.lower()]


def ParseNm(input):
  """Parse nm output.

  Argument: an iterable over lines of nm output.

  Yields: (symbol name, symbol type, symbol size, source file path).
  Path may be None if nm couldn't figure out the source file.

  Addresses and sizes may be 8 or more hex digits wide, so output for both
  32-bit and 64-bit binaries is accepted. BSS symbols, symbols without a
  size, and undefined ('U') or weak ('w') symbols without an address are
  skipped silently. Any other line is reported on stderr as 'unparsed'.
  """

  # Match lines with size, symbol, optional location, optional discriminator
  sym_re = re.compile(r'^[0-9a-f]{8,} '  # address (8 or more hex digits)
                      r'([0-9a-f]{8,}) '  # size (8 or more hex digits)
                      r'(.) '  # symbol type, one character
                      r'([^\t]+)'  # symbol name, separated from next by tab
                      r'(?:\t(.*):[\d\?]+)?.*$')  # location
  # Match lines with addr but no size.
  addr_re = re.compile(r'^[0-9a-f]{8,} (.) ([^\t]+)(?:\t.*)?$')
  # Match lines that don't have an address at all -- typically external symbols.
  noaddr_re = re.compile(r'^ {8,} (.) (.*)$')

  for line in input:
    line = line.rstrip()
    match = sym_re.match(line)
    if match:
      size, type, sym, path = match.groups()
      if type.lower() == 'b':
        continue  # skip all BSS for now
      yield sym, type, int(size, 16), path
      continue
    if addr_re.match(line):
      # No size == we don't care.
      continue
    match = noaddr_re.match(line)
    if match and match.group(1) in ('U', 'w'):
      # external or weak symbol
      continue

    # Emits exactly what "print >>sys.stderr, 'unparsed:', repr(line)" did,
    # but is valid on both Python 2 and Python 3.
    sys.stderr.write('unparsed: ' + repr(line) + '\n')


def _MkChild(node, name):
  child = None
  for test in node['children']:
    if test['n'] == name:
      child = test
      break
  if not child:
    child = {'n': name, 'children': []}
    node['children'].append(child)
  return child


def MakeCompactTree(symbols):
  """Build the nested dict used by the modern report from parsed symbols.

  Argument: an iterable of (symbol name, symbol type, symbol size, file path)
  tuples.

  Every node carries 'n' (its name) and 'k' (its kind): 'p' for a path
  element, 'b' for a symbol-type bucket, 's' for a symbol. Path and bucket
  nodes hold a 'children' list; symbol nodes hold 'value' (the size) instead.
  Bucket and symbol nodes also carry 't' (the symbol type). The node for the
  last path element is flagged with 'lastPathElement'. The root node is
  named '/' and records the deepest nesting seen in 'maxDepth'.
  """
  root = {'n': '/', 'children': [], 'k': 'p', 'maxDepth': 0}
  for symbol_name, symbol_type, symbol_size, file_path in symbols:

    if 'vtable for ' in symbol_name:
      symbol_type = '@' # hack to categorize these separately

    # Take path like '/foo/bar/baz', convert to ['foo', 'bar', 'baz']
    file_path = os.path.normpath(file_path) if file_path else '(No Path)'
    if file_path.startswith('/'):
      file_path = file_path[1:]

    # Walk down from the root, reusing or creating one node per path element.
    node = root
    depth = 0
    for path_part in file_path.split('/'):
      if not path_part:
        continue
      depth += 1
      node = _MkChild(node, path_part)
      node['k'] = 'p' # p for path

    # 'node' is now the file node. Find the symbol-type bucket.
    node['lastPathElement'] = True
    bucket = _MkChild(node, symbol_type)
    bucket['t'] = symbol_type
    bucket['k'] = 'b' # b for bucket

    # Make the symbol entry inside the bucket. A freshly created entry comes
    # with an empty 'children' list, which a symbol node does not keep; an
    # entry reused for a repeated symbol has already lost it.
    leaf = _MkChild(bucket, symbol_name)
    leaf.pop('children', None)
    leaf['value'] = symbol_size
    leaf['t'] = symbol_type
    leaf['k'] = 's' # s for symbol

    # The bucket and the symbol each add one level below the file node.
    root['maxDepth'] = max(root['maxDepth'], depth + 2)

  return root


# TODO(andrewhayden): Only used for legacy reports. Delete.
def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information
  along the way.

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., c++ files)
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, etceteras) and whose values are the size, in bytes, of
              those categories;
       size:  the total size, in bytes, of all the entries in the sizes dict
  2. Non-leaf nodes, representing directories
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size:     the total size, in bytes, of all the leaf nodes that are
                 contained within the children dict (recursively expanded)

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, e.g. a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the size of all
  leaf nodes within the data structure.

  Symbols with no usable path are collected in a leaf node named
  'symbols without paths' directly under the root, bucketed by a prefix of
  the symbol name.

  If adding a symbol to the tree fails, the symbol name, its directory parts
  and its file name are written to stderr and the exception is re-raised.
  """
  # Lower-cased nm type letter -> category name within a file node.
  bucket_for_type = {
      'r': '[rodata]',
      'd': '[data]',
      'b': '[bss]',
      't': '[code]',  # 'text' in binary parlance means 'code'.
      'w': '[weak]',
  }

  dirs = {'children': {}, 'size': 0}
  for sym, type, size, path in symbols:
    dirs['size'] += size
    if path:
      path = os.path.normpath(path)
      if path.startswith('/'):
        path = path[1:]

    # A path that is empty once the leading slash is dropped has no parts.
    parts = path.split('/') if path else None

    if parts:
      file_key = parts.pop()
      tree = dirs
      try:
        # Traverse the tree to the parent of the file node, creating as needed
        for part in parts:
          assert part != ''
          if part not in tree['children']:
            tree['children'][part] = {'children': {}, 'size': 0}
          tree = tree['children'][part]
          tree['size'] += size

        # Get (creating if necessary) the node for the file
        # This node doesn't have a 'children' attribute
        if file_key not in tree['children']:
          tree['children'][file_key] = {'sizes': collections.defaultdict(int),
                                        'size': 0}
        tree = tree['children'][file_key]
        tree['size'] += size

        # Accumulate size into a bucket within the file
        if 'vtable for ' in sym:
          bucket = '[vtable]'
        else:
          bucket = bucket_for_type.get(type.lower(), '[other]')
        tree['sizes'][bucket] += size
      except Exception:
        # Report which symbol broke the tree before propagating. This used to
        # print 'key', which is not defined on this path, so a NameError hid
        # the real exception.
        sys.stderr.write('%s %s %s\n' % (sym, parts, file_key))
        raise
    else:
      key = 'symbols without paths'
      if key not in dirs['children']:
        dirs['children'][key] = {'sizes': collections.defaultdict(int),
                                 'size': 0}
      tree = dirs['children'][key]
      subkey = 'misc'
      if sym.endswith(('::__FUNCTION__', '::__PRETTY_FUNCTION__')):
        subkey = '__FUNCTION__'
      elif sym.startswith('CSWTCH.'):
        subkey = 'CSWTCH'
      elif '::' in sym:
        # Group by everything up to and including the first '::'.
        subkey = sym[0:sym.find('::') + 2]
      tree['sizes'][subkey] += size
      tree['size'] += size
  return dirs


# TODO(andrewhayden): Only used for legacy reports. Delete.
def JsonifyTree(tree, name):
  """Convert TreeifySymbols output to a JSON treemap.

  The format is very similar, with the notable exceptions being
  lists of children instead of maps and some different attribute names.

  Each returned node has a 'name' (the given name followed by its formatted
  size in parentheses), a 'data' dict holding the size under '$area', and a
  'children' list sorted from largest to smallest. Entries produced from a
  leaf's 'sizes' dict additionally get a '$symbol' CSS class when their
  category is a known one.
  """
  css_class_map = {
                  '[vtable]': 'vtable',
                  '[rodata]': 'read-only_data',
                  '[data]': 'data',
                  '[bss]': 'bss',
                  '[code]': 'code',
                  '[weak]': 'weak_symbol'
  }
  if 'children' in tree:
    # Non-leaf node. Recurse.
    children = [JsonifyTree(subtree, subtree_name)
                for subtree_name, subtree in tree['children'].items()]
  else:
    # Leaf node; dump per-file stats as entries in the treemap
    children = []
    for kind, size in tree['sizes'].items():
      label = kind + ' (' + FormatBytes(size) + ')'
      data = { '$area': size }
      css_class = css_class_map.get(kind)
      if css_class is not None:
        data['$symbol'] = css_class
      children.append({'name': label, 'data': data})

  # Sort children by size, largest to smallest.
  children.sort(key=lambda entry: -entry['data']['$area'])

  # For leaf nodes, the 'size' attribute is the size of the leaf;
  # Non-leaf nodes don't really have a size, but their 'size' attribute is
  # the sum of the sizes of all their children.
  total = tree['size']
  return {'name': name + ' (' + FormatBytes(total) + ')',
          'data': { '$area': total },
          'children': children }

def DumpCompactTree(symbols, outfile):
  """Write the compact symbol tree to outfile as a JavaScript assignment.

  The file receives 'var tree_data = ' followed by the JSON encoding of
  MakeCompactTree(symbols). Any existing file is overwritten.
  """
  with open(outfile, 'w') as out:
    tree_json = json.dumps(MakeCompactTree(symbols))
    out.write('var tree_data = ' + tree_json)


# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpTreemap(symbols, outfile):
  """Write the legacy treemap to outfile as a JavaScript assignment.

  The file receives 'var kTree = ' followed by the JSON encoding of the
  TreeifySymbols() result converted by JsonifyTree() with root name '/'.
  Any existing file is overwritten.
  """
  size_tree = TreeifySymbols(symbols)
  with open(outfile, 'w') as out:
    treemap_json = json.dumps(JsonifyTree(size_tree, '/'))
    out.write('var kTree = ' + treemap_json)


# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSymbols(symbols, outfile, n):
  """Write the largest symbols to outfile as a JavaScript array.

  The file receives 'var largestSymbols = [' followed by one JSON object per
  line (keys 'size', 'symbol', 'type', 'location') and a closing '];'.

  The symbol type is lower-cased before it is inspected, because nm prints
  the same section letter in uppercase for global symbols; this matches how
  TreeifySymbols classifies types. BSS ('b') and weak ('w') symbols are
  skipped.

  Args:
    symbols: iterable of (sym, type, size, path) tuples.
    outfile: path of the file to write; overwritten if it exists.
    n: stop after this many records. The limit is checked after each record
       is written, so one record is emitted even when n is less than 1.

  Raises:
    KeyError: propagated from SymbolTypeToHuman for an unknown symbol type.
  """
  # a list of (sym, type, size, path); sort by size.
  by_size = sorted(symbols, key=lambda x: -x[2])
  dumped = 0
  with open(outfile, 'w') as out:
    try:
      out.write('var largestSymbols = [\n')
      for sym, type, size, path in by_size:
        type = type.lower()
        if type in ('b', 'w'):
          continue  # skip bss and weak symbols
        if path is None:
          path = ''
        entry = {'size': FormatBytes(size),
                 'symbol': sym,
                 'type': SymbolTypeToHuman(type),
                 'location': path }
        out.write(json.dumps(entry))
        out.write(',\n')
        dumped += 1
        if dumped >= n:
          return
    finally:
      # Always terminate the array so the file stays valid JavaScript.
      out.write('];\n')


def MakeSourceMap(symbols):
  """Aggregate symbol sizes and counts per source file.

  Argument: an iterable of (symbol name, symbol type, size, path) tuples.

  Returns a dict keyed by the normalized path, or by '[no path]' for symbols
  without one. Each value is a record with 'path' (the path exactly as it
  appeared on the first symbol seen for that key), 'symbol_count' and 'size'.
  """
  sources = {}
  for _sym, _type, size, path in symbols:
    key = os.path.normpath(path) if path else '[no path]'
    record = sources.setdefault(
        key, {'path': path, 'symbol_count': 0, 'size': 0})
    record['size'] += size
    record['symbol_count'] += 1
  return sources


# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSources(symbols, outfile, n):
  """Write the largest source files to outfile as a JavaScript array.

  The file receives 'var largestSources = [' followed by one JSON object per
  line (keys 'size', 'symbol_count', 'location'), largest first, and a
  closing '];'. Writing stops once n records have been written; the limit is
  checked after each record, so one record is emitted even when n is less
  than 1.
  """
  by_size = sorted(MakeSourceMap(symbols).values(),
                   key=lambda rec: -rec['size'])
  with open(outfile, 'w') as out:
    try:
      out.write('var largestSources = [\n')
      for written, record in enumerate(by_size, 1):
        entry = {'size': FormatBytes(record['size']),
                 'symbol_count': str(record['symbol_count']),
                 'location': record['path']}
        out.write(json.dumps(entry))
        out.write(',\n')
        if written >= n:
          break
    finally:
      # Always terminate the array, even if a record failed to serialize.
      out.write('];\n')


# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestVTables(symbols, outfile, n):
  """Write the largest vtables to outfile as a JavaScript array.

  Only symbols whose name contains 'vtable for ' are considered. The file
  receives 'var largestVTables = [' followed by one JSON object per line
  (keys 'size', 'symbol', 'location'), largest first, and a closing '];'.
  Writing stops once n records have been written; the limit is checked after
  each record, so one record is emitted even when n is less than 1.
  """
  vtables = [{'symbol': symbol, 'path': path, 'size': size}
             for symbol, _type, size, path in symbols
             if 'vtable for ' in symbol]
  vtables.sort(key=lambda rec: -rec['size'])
  with open(outfile, 'w') as out:
    try:
      out.write('var largestVTables = [\n')
      for written, record in enumerate(vtables, 1):
        entry = {'size': FormatBytes(record['size']),
                 'symbol': record['symbol'],
                 'location': record['path']}
        out.write(json.dumps(entry))
        out.write(',\n')
        if written >= n:
          break
    finally:
      # Always terminate the array, even if a record failed to serialize.
      out.write('];\n')


# TODO(andrewhayden): Switch to Primiano's python-based version.
def RunParallelAddress2Line(outfile, library, arch, jobs, verbose):
  """Run a parallel addr2line processing engine to dump and resolve symbols.

  Launches the Java class org.chromium.tools.binary_size.ParallelAddress2Line
  from binary_size_java.jar, located under the directory named by the
  CHROMIUM_OUT_DIR (default 'out') and BUILDTYPE (default 'Release')
  environment variables. For the Android architectures the nm and addr2line
  binaries from the bundled NDK toolchain are passed explicitly; for any
  other arch the Java tool is left to use whatever is in PATH.

  Raises:
    RuntimeError: if the subprocess exits with a non-zero status.
  """
  # arch -> (NDK toolchain directory, prefix of the tool binaries)
  ndk_toolchains = {
      'android-arm': ('arm-linux-androideabi-4.8', 'arm-linux-androideabi-'),
      'android-mips': ('mipsel-linux-android-4.8', 'mipsel-linux-android-'),
      'android-x86': ('x86-4.8', 'i686-linux-android-'),
  }

  jar_path = os.path.join(os.getenv('CHROMIUM_OUT_DIR', 'out'),
                          os.getenv('BUILDTYPE', 'Release'),
                          'lib.java', 'binary_size_java.jar')
  cmd = ['java',
         '-classpath', jar_path,
         'org.chromium.tools.binary_size.ParallelAddress2Line',
         '--disambiguate',
         '--outfile', outfile,
         '--library', library,
         '--threads', jobs]
  if verbose is True:
    cmd.append('--verbose')

  if arch in ndk_toolchains:
    toolchain_dir, tool_prefix = ndk_toolchains[arch]
    tool_base = os.path.join('third_party', 'android_tools', 'ndk',
                             'toolchains', toolchain_dir, 'prebuilt',
                             'linux-x86_64', 'bin', tool_prefix)
    cmd.extend(['--nm', tool_base + 'nm',
                '--addr2line', tool_base + 'addr2line'])
  # else, use whatever is in PATH (don't pass --nm or --addr2line)

  if verbose:
    print(cmd)

  exit_status = subprocess.call(cmd)
  if exit_status:
    raise RuntimeError('Failed to run ParallelAddress2Line: returned ' +
                       str(exit_status))


def GetNmSymbols(infile, outfile, library, arch, jobs, verbose):
  """Return the parsed symbols, generating the nm dump first if needed.

  Args:
    infile: path of an existing nm dump to parse, or None to generate one
            from library with RunParallelAddress2Line.
    outfile: where to keep the generated dump when infile is None. If this
             is also None, a temporary file is used and removed again once
             it has been parsed (or generation has failed).
    library: path of the library to analyze; only used when infile is None.
    arch: target architecture; only used when infile is None.
    jobs: number of addr2line jobs; only used when infile is None.
    verbose: if true, print progress information.

  Returns:
    A list of the tuples yielded by ParseNm.
  """
  temp_path = None
  try:
    if infile is None:
      if outfile is None:
        # Only the name is needed: close the handle right away and remember
        # the path so the file can be removed when we are done with it.
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        temp_file.close()
        temp_path = temp_file.name
        infile = temp_path
      else:
        infile = outfile

      if verbose:
        print('Running parallel addr2line, dumping symbols to ' + infile)
      RunParallelAddress2Line(outfile=infile, library=library, arch=arch,
                              jobs=jobs, verbose=verbose)
    elif verbose:
      print('Using nm input from ' + infile)

    with open(infile, 'r') as nm_file:
      return list(ParseNm(nm_file))
  finally:
    if temp_path is not None:
      try:
        os.remove(temp_path)
      except OSError:
        pass  # Best-effort cleanup; never mask the real outcome.


def _BuildOptionParser():
  """Create the optparse parser describing this tool's command line."""
  usage="""%prog [options]

  Runs a spatial analysis on a given library, looking up the source locations
  of its symbols and calculating how much space each directory, source file,
  and so on is taking. The result is a report that can be used to pinpoint
  sources of large portions of the binary, etceteras.

  Under normal circumstances, you only need to pass two arguments, thusly:

      %prog --library /path/to/library --destdir /path/to/output

  In this mode, the program will dump the symbols from the specified library
  and map those symbols back to source locations, producing a web-based
  report in the specified output directory.

  Other options are available via '--help'.
  """
  parser = optparse.OptionParser(usage=usage)
  parser.add_option('--nm-in', metavar='PATH',
                    help='if specified, use nm input from <path> instead of '
                    'generating it. Note that source locations should be '
                    'present in the file; i.e., no addr2line symbol lookups '
                    'will be performed when this option is specified. '
                    'Mutually exclusive with --library.')
  parser.add_option('--destdir', metavar='PATH',
                    help='write output to the specified directory. An HTML '
                    'report is generated here along with supporting files; '
                    'any existing report will be overwritten.')
  parser.add_option('--library', metavar='PATH',
                    help='if specified, process symbols in the library at '
                    'the specified path. Mutually exclusive with --nm-in.')
  parser.add_option('--arch',
                    help='the architecture that the library is targeted to. '
                    'Determines which nm/addr2line binaries are used. When '
                    '\'host-native\' is chosen, the program will use whichever '
                    'nm/addr2line binaries are on the PATH. This is '
                    'appropriate when you are analyzing a binary by and for '
                    'your computer. '
                    'This argument is only valid when using --library. '
                    'Default is \'host-native\'.',
                    choices=['host-native', 'android-arm',
                             'android-mips', 'android-x86'],)
  parser.add_option('--jobs',
                    help='number of jobs to use for the parallel '
                    'addr2line processing pool; defaults to 1. More '
                    'jobs greatly improve throughput but eat RAM like '
                    'popcorn, and take several gigabytes each. Start low '
                    'and ramp this number up until your machine begins to '
                    'struggle with RAM. '
                    'This argument is only valid when using --library.')
  parser.add_option('-v', dest='verbose', action='store_true',
                    help='be verbose, printing lots of status information.')
  # The trailing space after 'terminates.' keeps the two sentences apart in
  # the --help output.
  parser.add_option('--nm-out', metavar='PATH',
                    help='keep the nm output file, and store it at the '
                    'specified path. This is useful if you want to see the '
                    'fully processed nm output after the symbols have been '
                    'mapped to source locations. By default, a tempfile is '
                    'used and is deleted when the program terminates. '
                    'This argument is only valid when using --library.')
  parser.add_option('--legacy', action='store_true',
                    help='emit legacy binary size report instead of modern')
  return parser


def _MakeDirsIfMissing(path):
  """Create directory path (mode 0755) unless it already exists."""
  if not os.path.exists(path):
    os.makedirs(path, 0o755)


def _WriteLegacyReport(symbols, destdir):
  """Emit the legacy, webtreemap-based report and its assets into destdir."""
  DumpTreemap(symbols, os.path.join(destdir, 'treemap-dump.js'))
  DumpLargestSymbols(symbols,
                     os.path.join(destdir, 'largest-symbols.js'), 100)
  DumpLargestSources(symbols,
                     os.path.join(destdir, 'largest-sources.js'), 100)
  DumpLargestVTables(symbols,
                     os.path.join(destdir, 'largest-vtables.js'), 100)
  treemap_out = os.path.join(destdir, 'webtreemap')
  _MakeDirsIfMissing(treemap_out)
  # Asset paths are relative to the current working directory.
  treemap_src = os.path.join('third_party', 'webtreemap', 'src')
  for asset in ('COPYING', 'webtreemap.js', 'webtreemap.css'):
    shutil.copy(os.path.join(treemap_src, asset), treemap_out)
  shutil.copy(os.path.join('tools', 'binary_size', 'legacy_template',
                           'index.html'), destdir)


def _WriteModernReport(symbols, destdir):
  """Emit the modern, d3-based report and its assets into destdir."""
  DumpCompactTree(symbols, os.path.join(destdir, 'data.js'))
  d3_out = os.path.join(destdir, 'd3')
  _MakeDirsIfMissing(d3_out)
  # Asset paths are relative to the current working directory.
  d3_src = os.path.join('third_party', 'd3', 'src')
  template_src = os.path.join('tools', 'binary_size',
                              'template')
  shutil.copy(os.path.join(d3_src, 'LICENSE'), d3_out)
  shutil.copy(os.path.join(d3_src, 'd3.js'), d3_out)
  shutil.copy(os.path.join(template_src, 'index.html'), destdir)
  shutil.copy(os.path.join(template_src, 'D3SymbolTreeMap.js'), destdir)


def main():
  """Parse the command line, gather symbols and write the requested report.

  Exactly one of --library or --nm-in must be given, and --destdir is
  required; otherwise the option parser reports an error. --jobs and --arch
  only produce a warning on stderr when combined with --nm-in. --jobs
  defaults to '1' and --arch to 'host-native'.
  """
  parser = _BuildOptionParser()
  opts, args = parser.parse_args()

  if ((not opts.library) and (not opts.nm_in)) or (opts.library and opts.nm_in):
    parser.error('exactly one of --library or --nm-in is required')
  if opts.nm_in:
    if opts.jobs:
      sys.stderr.write('WARNING: --jobs has no effect '
                       'when used with --nm-in\n')
    if opts.arch:
      sys.stderr.write('WARNING: --arch has no effect '
                       'when used with --nm-in\n')
  if not opts.destdir:
    parser.error('--destdir is required argument')
  if not opts.jobs:
    opts.jobs = '1'
  if not opts.arch:
    opts.arch = 'host-native'

  symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library, opts.arch,
                         opts.jobs, opts.verbose is True)
  _MakeDirsIfMissing(opts.destdir)

  if opts.legacy: # legacy report
    _WriteLegacyReport(symbols, opts.destdir)
  else: # modern report
    _WriteModernReport(symbols, opts.destdir)

  if opts.verbose:
    print('Report saved to ' + opts.destdir + '/index.html')


# Script entry point: run main() and hand its return value to sys.exit() as
# the process exit status.
if __name__ == '__main__':
  sys.exit(main())