run_binary_size_analysis.py revision 5d1f7b1de12d16ceb2c938c56701a3e8bfa558f7
#!/usr/bin/python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Generate a spatial analysis against an arbitrary library.

To use, build the 'binary_size_tool' target. Then run this tool, passing
in the location of the library to be analyzed along with any other options
you desire.
"""

import collections
import fileinput
import json
import optparse
import os
import pprint
import re
import shutil
import subprocess
import sys
import tempfile


def FormatBytes(bytes):
  """Pretty-print a number of bytes."""
  if bytes > 1e6:
    bytes = bytes / 1.0e6
    return '%.1fm' % bytes
  if bytes > 1e3:
    bytes = bytes / 1.0e3
    return '%.1fk' % bytes
  return str(bytes)


def SymbolTypeToHuman(type):
  """Convert a symbol type as printed by nm into a human-readable name."""
  return {'b': 'bss',
          'd': 'data',
          'r': 'read-only data',
          't': 'code',
          'w': 'weak symbol',
          'v': 'weak symbol'}[type]


def ParseNm(input):
  """Parse nm output.

  Argument: an iterable over lines of nm output.

  Yields: (symbol name, symbol type, symbol size, source file path).
  Path may be None if nm couldn't figure out the source file.
  """

  # Match lines with size, symbol, optional location, optional discriminator
  sym_re = re.compile(r'^[0-9a-f]{8} ' # address (8 hex digits)
                      r'([0-9a-f]{8}) ' # size (8 hex digits)
                      r'(.) ' # symbol type, one character
                      r'([^\t]+)' # symbol name, separated from next by tab
                      r'(?:\t(.*):[\d\?]+)?.*$') # location
  # Match lines with addr but no size.
  addr_re = re.compile(r'^[0-9a-f]{8} (.) ([^\t]+)(?:\t.*)?$')
  # Match lines that don't have an address at all -- typically external symbols.
  noaddr_re = re.compile(r'^ {8} (.) (.*)$')
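  # For reference, the three line shapes these regexes expect look roughly
  # like the following (symbol names and paths are made-up examples, and
  # <tab> stands for a literal tab character):
  #   0001f5a0 000000c4 t FooBar::Init()<tab>../../base/foo_bar.cc:42
  #   0002a100 T main
  #            U malloc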

  for line in input:
    line = line.rstrip()
    match = sym_re.match(line)
    if match:
      size, type, sym = match.groups()[0:3]
      size = int(size, 16)
      type = type.lower()
      if type == 'v':
        type = 'w'  # just call them all weak
      if type == 'b':
        continue  # skip all BSS for now
      path = match.group(4)
      yield sym, type, size, path
      continue
    match = addr_re.match(line)
    if match:
      type, sym = match.groups()[0:2]
      # No size == we don't care.
      continue
    match = noaddr_re.match(line)
    if match:
      type, sym = match.groups()
      if type in ('U', 'w'):
        # external or weak symbol
        continue

    print >>sys.stderr, 'unparsed:', repr(line)


def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information
  along the way.

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., C++ files).
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, and so on) and whose values are the size, in bytes, of
              those categories;
       size:  the total size, in bytes, of all the entries in the sizes dict.
  2. Non-leaf nodes, representing directories.
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size:     the total size, in bytes, of all the leaf nodes that are
                 contained within the children dict (recursively expanded).

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, i.e., a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the sizes of
  all leaf nodes within the data structure.
  """
  dirs = {'children': {}, 'size': 0}
  for sym, type, size, path in symbols:
    dirs['size'] += size
    if path:
      path = os.path.normpath(path)
      if path.startswith('/'):
        path = path[1:]

    parts = None
    if path:
      parts = path.split('/')

    if parts:
      assert path
      file_key = parts.pop()
      tree = dirs
      try:
        # Traverse the tree to the parent of the file node, creating as needed
        for part in parts:
          assert part != ''
          if part not in tree['children']:
            tree['children'][part] = {'children': {}, 'size': 0}
          tree = tree['children'][part]
          tree['size'] += size

        # Get (creating if necessary) the node for the file
        # This node doesn't have a 'children' attribute
        if file_key not in tree['children']:
          tree['children'][file_key] = {'sizes': collections.defaultdict(int),
                                        'size': 0}
        tree = tree['children'][file_key]
        tree['size'] += size

        # Accumulate size into a bucket within the file
        type = type.lower()
        if 'vtable for ' in sym:
          tree['sizes']['[vtable]'] += size
        elif 'r' == type:
          tree['sizes']['[rodata]'] += size
        elif 'd' == type:
          tree['sizes']['[data]'] += size
        elif 'b' == type:
          tree['sizes']['[bss]'] += size
        elif 't' == type:
          # 'text' in binary parlance means 'code'.
          tree['sizes']['[code]'] += size
        elif 'w' == type:
          tree['sizes']['[weak]'] += size
        else:
          tree['sizes']['[other]'] += size
      except:
        print >>sys.stderr, sym, parts, file_key
        raise
    else:
      key = 'symbols without paths'
      if key not in dirs['children']:
        dirs['children'][key] = {'sizes': collections.defaultdict(int),
                                 'size': 0}
      tree = dirs['children'][key]
      subkey = 'misc'
      if (sym.endswith('::__FUNCTION__') or
          sym.endswith('::__PRETTY_FUNCTION__')):
        subkey = '__FUNCTION__'
      elif sym.startswith('CSWTCH.'):
        subkey = 'CSWTCH'
      elif '::' in sym:
        subkey = sym[0:sym.find('::') + 2]
      tree['sizes'][subkey] = tree['sizes'].get(subkey, 0) + size
      tree['size'] += size
  return dirs


def JsonifyTree(tree, name):
  """Convert TreeifySymbols output to a JSON treemap.

  The format is very similar; the notable differences are that children are
  stored in lists rather than maps and that some attribute names differ.
  """
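  # For example (hypothetical), a 1200-byte '[code]' bucket in a leaf becomes
  #   {'name': '[code] (1.2k)', 'data': {'$area': 1200, '$symbol': 'code'}}.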
  children = []
  css_class_map = {
                  '[vtable]': 'vtable',
                  '[rodata]': 'read-only_data',
                  '[data]': 'data',
                  '[bss]': 'bss',
                  '[code]': 'code',
                  '[weak]': 'weak_symbol'
  }
  if 'children' in tree:
    # Non-leaf node. Recurse.
    for child_name, child in tree['children'].iteritems():
      children.append(JsonifyTree(child, child_name))
  else:
    # Leaf node; dump per-file stats as entries in the treemap.
    for kind, size in tree['sizes'].iteritems():
      child_json = {'name': kind + ' (' + FormatBytes(size) + ')',
                    'data': { '$area': size }}
      css_class = css_class_map.get(kind)
      if css_class is not None:
        child_json['data']['$symbol'] = css_class
      children.append(child_json)
  # Sort children by size, largest to smallest.
  children.sort(key=lambda child: -child['data']['$area'])

  # For leaf nodes, the 'size' attribute is the size of the leaf itself;
  # non-leaf nodes don't really have a size of their own, but their 'size'
  # attribute is the sum of the sizes of all their children.
  return {'name': name + ' (' + FormatBytes(tree['size']) + ')',
          'data': { '$area': tree['size'] },
          'children': children }


def DumpTreemap(symbols, outfile):
  """Write the treemap data to outfile as a JavaScript 'kTree' variable."""
  dirs = TreeifySymbols(symbols)
  out = open(outfile, 'w')
  try:
    out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))
  finally:
    out.flush()
    out.close()


def DumpLargestSymbols(symbols, outfile, n):
  """Write the n largest non-bss, non-weak symbols to outfile as JavaScript."""
  # symbols is a list of (sym, type, size, path); sort by size, descending.
  symbols = sorted(symbols, key=lambda x: -x[2])
  dumped = 0
  out = open(outfile, 'w')
  try:
    out.write('var largestSymbols = [\n')
    for sym, type, size, path in symbols:
      if type in ('b', 'w'):
        continue  # skip bss and weak symbols
      if path is None:
        path = ''
      entry = {'size': FormatBytes(size),
               'symbol': sym,
               'type': SymbolTypeToHuman(type),
               'location': path }
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        return
  finally:
    out.write('];\n')
    out.flush()
    out.close()


def MakeSourceMap(symbols):
  """Aggregate symbols by source path, summing sizes and counting symbols."""
  sources = {}
  for sym, type, size, path in symbols:
    key = None
    if path:
      key = os.path.normpath(path)
    else:
      key = '[no path]'
    if key not in sources:
      sources[key] = {'path': path, 'symbol_count': 0, 'size': 0}
    record = sources[key]
    record['size'] += size
    record['symbol_count'] += 1
  return sources


def DumpLargestSources(symbols, outfile, n):
  """Write the n largest source files, by total symbol size, to outfile."""
  source_map = MakeSourceMap(symbols)
  sources = sorted(source_map.values(), key=lambda x: -x['size'])
  dumped = 0
  out = open(outfile, 'w')
  try:
    out.write('var largestSources = [\n')
    for record in sources:
      entry = {'size': FormatBytes(record['size']),
               'symbol_count': str(record['symbol_count']),
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        return
  finally:
    out.write('];\n')
    out.flush()
    out.close()


def DumpLargestVTables(symbols, outfile, n):
  """Write the n largest vtables to outfile as JavaScript."""
  vtables = []
  for symbol, type, size, path in symbols:
    if 'vtable for ' in symbol:
      vtables.append({'symbol': symbol, 'path': path, 'size': size})
  vtables = sorted(vtables, key=lambda x: -x['size'])
  dumped = 0
  out = open(outfile, 'w')
  try:
    out.write('var largestVTables = [\n')
    for record in vtables:
      entry = {'size': FormatBytes(record['size']),
               'symbol': record['symbol'],
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        return
  finally:
    out.write('];\n')
    out.flush()
    out.close()


def RunParallelAddress2Line(outfile, library, arch, jobs, verbose):
  """Run a parallel addr2line processing engine to dump and resolve symbols."""
  out_dir = os.getenv('CHROMIUM_OUT_DIR', 'out')
  build_type = os.getenv('BUILDTYPE', 'Release')
  classpath = os.path.join(out_dir, build_type, 'lib.java',
                           'binary_size_java.jar')
  cmd = ['java',
         '-classpath', classpath,
         'org.chromium.tools.binary_size.ParallelAddress2Line',
         '--disambiguate',
         '--outfile', outfile,
         '--library', library,
         '--threads', jobs]
  if verbose:
    cmd.append('--verbose')
  prefix = os.path.join('third_party', 'android_tools', 'ndk', 'toolchains')
  if arch == 'android-arm':
    prefix = os.path.join(prefix, 'arm-linux-androideabi-4.7', 'prebuilt',
                          'linux-x86_64', 'bin', 'arm-linux-androideabi-')
    cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])
  elif arch == 'android-mips':
    prefix = os.path.join(prefix, 'mipsel-linux-android-4.7', 'prebuilt',
                          'linux-x86_64', 'bin', 'mipsel-linux-android-')
    cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])
  elif arch == 'android-x86':
    prefix = os.path.join(prefix, 'x86-4.7', 'prebuilt',
                          'linux-x86_64', 'bin', 'i686-linux-android-')
    cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])
  # else, use whatever is in PATH (don't pass --nm or --addr2line)

  if verbose:
    print cmd

  return_code = subprocess.call(cmd)
  if return_code:
    raise RuntimeError('Failed to run ParallelAddress2Line: returned ' +
                       str(return_code))


def GetNmSymbols(infile, outfile, library, arch, jobs, verbose):
  """Parse symbols from infile if given; otherwise run ParallelAddress2Line
  on the library and parse its output.
  """
  if infile is None:
    if outfile is None:
      infile = tempfile.NamedTemporaryFile(delete=False).name
    else:
      infile = outfile

    if verbose:
      print 'Running parallel addr2line, dumping symbols to ' + infile
    RunParallelAddress2Line(outfile=infile, library=library, arch=arch,
                            jobs=jobs, verbose=verbose)
  elif verbose:
    print 'Using nm input from ' + infile
  with open(infile, 'r') as nm_input:
    return list(ParseNm(nm_input))


def main():
  usage = """%prog [options]

  Runs a spatial analysis on a given library, looking up the source locations
  of its symbols and calculating how much space each directory, source file,
  and so on is taking. The result is a report that can be used to pinpoint
  the sources of large portions of the binary.

  Under normal circumstances, you only need to pass two arguments, like so:

      %prog --library /path/to/library --destdir /path/to/output

  In this mode, the program will dump the symbols from the specified library
  and map those symbols back to source locations, producing a web-based
  report in the specified output directory.

  Other options are available via '--help'.
  """
  parser = optparse.OptionParser(usage=usage)
  parser.add_option('--nm-in', metavar='PATH',
                    help='if specified, use nm input from <path> instead of '
                    'generating it. Note that source locations should be '
                    'present in the file; i.e., no addr2line symbol lookups '
                    'will be performed when this option is specified. '
                    'Mutually exclusive with --library.')
  parser.add_option('--destdir', metavar='PATH',
                    help='write output to the specified directory. An HTML '
                    'report is generated here along with supporting files; '
                    'any existing report will be overwritten.')
  parser.add_option('--library', metavar='PATH',
                    help='if specified, process symbols in the library at '
                    'the specified path. Mutually exclusive with --nm-in.')
  parser.add_option('--arch',
                    help='the architecture that the library is targeted to. '
                    'Determines which nm/addr2line binaries are used. When '
                    '\'host-native\' is chosen, the program will use whichever '
                    'nm/addr2line binaries are on the PATH. This is '
                    'appropriate when you are analyzing a binary built for '
                    'the machine you are running on. '
                    'This argument is only valid when using --library. '
                    'Default is \'host-native\'.',
                    choices=['host-native', 'android-arm',
                             'android-mips', 'android-x86'])
  parser.add_option('--jobs',
                    help='number of jobs to use for the parallel '
                    'addr2line processing pool; defaults to 1. More '
                    'jobs greatly improve throughput but eat RAM like '
                    'popcorn, and take several gigabytes each. Start low '
                    'and ramp this number up until your machine begins to '
                    'struggle with RAM. '
                    'This argument is only valid when using --library.')
  parser.add_option('-v', dest='verbose', action='store_true',
                    help='be verbose, printing lots of status information.')
  parser.add_option('--nm-out', metavar='PATH',
                    help='keep the nm output file, and store it at the '
                    'specified path. This is useful if you want to see the '
                    'fully processed nm output after the symbols have been '
                    'mapped to source locations. By default, a tempfile is '
                    'used and is deleted when the program terminates. '
                    'This argument is only valid when using --library.')
  opts, args = parser.parse_args()

  if ((not opts.library) and (not opts.nm_in)) or (opts.library and opts.nm_in):
    parser.error('exactly one of --library or --nm-in is required')
  if opts.nm_in:
    if opts.jobs:
      print >> sys.stderr, ('WARNING: --jobs has no effect '
                            'when used with --nm-in')
    if opts.arch:
      print >> sys.stderr, ('WARNING: --arch has no effect '
                            'when used with --nm-in')
  if not opts.destdir:
    parser.error('--destdir is a required argument')
  if not opts.jobs:
    opts.jobs = '1'
  if not opts.arch:
    opts.arch = 'host-native'

  symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library, opts.arch,
                         opts.jobs, opts.verbose is True)
  if not os.path.exists(opts.destdir):
    os.makedirs(opts.destdir, 0755)

  DumpTreemap(symbols, os.path.join(opts.destdir, 'treemap-dump.js'))
  DumpLargestSymbols(symbols,
                     os.path.join(opts.destdir, 'largest-symbols.js'), 100)
  DumpLargestSources(symbols,
                     os.path.join(opts.destdir, 'largest-sources.js'), 100)
  DumpLargestVTables(symbols,
                     os.path.join(opts.destdir, 'largest-vtables.js'), 100)

  # TODO(andrewhayden): Switch to D3 for greater flexibility
  treemap_out = os.path.join(opts.destdir, 'webtreemap')
  if not os.path.exists(treemap_out):
    os.makedirs(treemap_out, 0755)
  treemap_src = os.path.join('third_party', 'webtreemap', 'src')
  shutil.copy(os.path.join(treemap_src, 'COPYING'), treemap_out)
  shutil.copy(os.path.join(treemap_src, 'webtreemap.js'), treemap_out)
  shutil.copy(os.path.join(treemap_src, 'webtreemap.css'), treemap_out)
  shutil.copy(os.path.join('tools', 'binary_size', 'template', 'index.html'),
              opts.destdir)
  if opts.verbose:
    print 'Report saved to ' + opts.destdir + '/index.html'


if __name__ == '__main__':
  sys.exit(main())