# run_binary_size_analysis.py revision 5f1c94371a64b3196d4be9466099bb892df9b88e
1#!/usr/bin/env python
2# Copyright 2014 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Generate a spatial analysis against an arbitrary library.
7
8To use, build the 'binary_size_tool' target. Then run this tool, passing
9in the location of the library to be analyzed along with any other options
10you desire.
11"""
12
13import collections
14import json
15import logging
16import multiprocessing
17import optparse
18import os
19import re
20import shutil
21import struct
22import subprocess
23import sys
24import tempfile
25import time
26
27import binary_size_utils
28
# This path change is not beautiful. Temporary (I hope) measure until
30# the chromium project has figured out a proper way to organize the
31# library of python tools. http://crbug.com/375725
32elf_symbolizer_path = os.path.abspath(os.path.join(
33    os.path.dirname(__file__),
34    '..',
35    '..',
36    'build',
37    'android',
38    'pylib'))
39sys.path.append(elf_symbolizer_path)
40import symbols.elf_symbolizer as elf_symbolizer  # pylint: disable=F0401
41
42
43# Node dictionary keys. These are output in json read by the webapp so
44# keep them short to save file size.
45# Note: If these change, the webapp must also change.
NODE_TYPE_KEY = 'k'  # 'p' (path), 'b' (bucket) or 's' (symbol) - see below.
NODE_NAME_KEY = 'n'  # Display name of the node.
NODE_CHILDREN_KEY = 'children'  # Dict of children; converted to a list
                                # by MakeChildrenDictsIntoLists.
NODE_SYMBOL_TYPE_KEY = 't'  # nm symbol type character (e.g. 't', 'd').
NODE_SYMBOL_SIZE_KEY = 'value'  # Symbol size in bytes.
NODE_MAX_DEPTH_KEY = 'maxDepth'  # Deepest level of the tree (root node only).
NODE_LAST_PATH_ELEMENT_KEY = 'lastPathElement'  # Marks file (leaf-path) nodes.

# The display name of the bucket where we put symbols without path.
NAME_NO_PATH_BUCKET = '(No Path)'

# Try to keep data buckets smaller than this to avoid killing the
# graphing lib.
BIG_BUCKET_LIMIT = 3000
60
61
62# TODO(andrewhayden): Only used for legacy reports. Delete.
def FormatBytes(byte_count):
  """Pretty-print a number of bytes."""
  # Try the largest unit first; fall through to a plain number of bytes.
  for threshold, divisor, suffix in ((1e6, 1.0e6, 'm'), (1e3, 1.0e3, 'k')):
    if byte_count > threshold:
      return '%.1f%s' % (byte_count / divisor, suffix)
  return str(byte_count)
72
73
74# TODO(andrewhayden): Only used for legacy reports. Delete.
def SymbolTypeToHuman(symbol_type):
  """Convert a symbol type as printed by nm into a human-readable name.

  Raises KeyError for unrecognized types."""
  names = {
      'b': 'bss',
      'd': 'data',
      'r': 'read-only data',
      't': 'code',
      'w': 'weak symbol',
      'v': 'weak symbol',
  }
  return names[symbol_type]
83
84
def _MkChild(node, name):
  """Returns the child of |node| called |name|, creating it if absent."""
  children = node[NODE_CHILDREN_KEY]
  if name not in children:
    children[name] = {NODE_NAME_KEY: name,
                      NODE_CHILDREN_KEY: {}}
  return children[name]
92
93
94
def SplitNoPathBucket(node):
  """NAME_NO_PATH_BUCKET can be too large for the graphing lib to
  handle. Split it into sub-buckets in that case.

  |node| is the tree root produced by MakeCompactTree. Mutates the
  (No Path) subtree in place."""
  root_children = node[NODE_CHILDREN_KEY]
  if NAME_NO_PATH_BUCKET in root_children:
    no_path_bucket = root_children[NAME_NO_PATH_BUCKET]
    old_children = no_path_bucket[NODE_CHILDREN_KEY]
    count = 0
    # Count all symbols across the per-type buckets under (No Path).
    for symbol_type, symbol_bucket in old_children.iteritems():
      count += len(symbol_bucket[NODE_CHILDREN_KEY])
    if count > BIG_BUCKET_LIMIT:
      # Rebuild the subtree, redistributing symbols into
      # '(No Path) subgroup N' children of <= BIG_BUCKET_LIMIT each.
      new_children = {}
      no_path_bucket[NODE_CHILDREN_KEY] = new_children
      current_bucket = None
      index = 0
      for symbol_type, symbol_bucket in old_children.iteritems():
        for symbol_name, value in symbol_bucket[NODE_CHILDREN_KEY].iteritems():
          if index % BIG_BUCKET_LIMIT == 0:
            # Start a new subgroup. Note: Python 2 integer division.
            group_no = (index / BIG_BUCKET_LIMIT) + 1
            current_bucket = _MkChild(no_path_bucket,
                                      '%s subgroup %d' % (NAME_NO_PATH_BUCKET,
                                                          group_no))
            assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'p'
            node[NODE_TYPE_KEY] = 'p'  # p for path
          index += 1
          symbol_size = value[NODE_SYMBOL_SIZE_KEY]
          AddSymbolIntoFileNode(current_bucket, symbol_type,
                                symbol_name, symbol_size)
122                                symbol_name, symbol_size)
123
124
def MakeChildrenDictsIntoLists(node):
  """Recursively converts each NODE_CHILDREN_KEY dict into a list
  (the format the webapp expects).

  Returns the largest number of children seen on any single node."""
  if NODE_CHILDREN_KEY not in node:
    return 0  # Leaf node: nothing to convert.
  longest = len(node[NODE_CHILDREN_KEY])
  converted = []
  for child in node[NODE_CHILDREN_KEY].itervalues():
    longest = max(longest, MakeChildrenDictsIntoLists(child))
    converted.append(child)
  node[NODE_CHILDREN_KEY] = converted
  return longest
138
139
def AddSymbolIntoFileNode(node, symbol_type, symbol_name, symbol_size):
  """Puts symbol into the file path node |node|.
  Returns the number of added levels in tree. I.e. returns 2."""

  # 'node' is the file node and first step is to find its symbol-type bucket.
  node[NODE_LAST_PATH_ELEMENT_KEY] = True
  node = _MkChild(node, symbol_type)
  # All direct children of a file node must be buckets ('b').
  assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'b'
  node[NODE_SYMBOL_TYPE_KEY] = symbol_type
  node[NODE_TYPE_KEY] = 'b'  # b for bucket

  # 'node' is now the symbol-type bucket. Make the child entry.
  node = _MkChild(node, symbol_name)
  if NODE_CHILDREN_KEY in node:
    if node[NODE_CHILDREN_KEY]:
      # A name collision: this symbol name already exists as a container.
      logging.warning('A container node used as symbol for %s.' % symbol_name)
    # This is going to be used as a leaf so no use for child list.
    del node[NODE_CHILDREN_KEY]
  node[NODE_SYMBOL_SIZE_KEY] = symbol_size
  node[NODE_SYMBOL_TYPE_KEY] = symbol_type
  node[NODE_TYPE_KEY] = 's'  # s for symbol

  return 2  # Depth of the added subtree.
163
164
def MakeCompactTree(symbols, symbol_path_origin_dir):
  """Builds the compact tree consumed by the webapp.

  Args:
    symbols: iterable of (name, type, size, path) tuples.
    symbol_path_origin_dir: directory that relative symbol paths are
        resolved against.

  Returns:
    The root node dict; NODE_MAX_DEPTH_KEY on it holds the tree depth.
  """
  result = {NODE_NAME_KEY: '/',
            NODE_CHILDREN_KEY: {},
            NODE_TYPE_KEY: 'p',
            NODE_MAX_DEPTH_KEY: 0}
  seen_symbol_with_path = False
  cwd = os.path.abspath(os.getcwd())
  for symbol_name, symbol_type, symbol_size, file_path in symbols:

    if 'vtable for ' in symbol_name:
      symbol_type = '@'  # hack to categorize these separately
    # Take path like '/foo/bar/baz', convert to ['foo', 'bar', 'baz']
    if file_path and file_path != "??":
      file_path = os.path.abspath(os.path.join(symbol_path_origin_dir,
                                               file_path))
      # Let the output structure be relative to $CWD if inside $CWD,
      # otherwise relative to the disk root. This is to avoid
      # unnecessary click-through levels in the output.
      if file_path.startswith(cwd + os.sep):
        file_path = file_path[len(cwd):]
      if file_path.startswith('/'):
        file_path = file_path[1:]
      seen_symbol_with_path = True
    else:
      file_path = NAME_NO_PATH_BUCKET

    path_parts = file_path.split('/')

    # Find pre-existing node in tree, or update if it already exists
    node = result
    depth = 0
    while len(path_parts) > 0:
      path_part = path_parts.pop(0)
      if len(path_part) == 0:
        continue  # skip empty components from leading/double slashes
      depth += 1
      node = _MkChild(node, path_part)
      assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'p'
      node[NODE_TYPE_KEY] = 'p'  # p for path

    depth += AddSymbolIntoFileNode(node, symbol_type, symbol_name, symbol_size)
    result[NODE_MAX_DEPTH_KEY] = max(result[NODE_MAX_DEPTH_KEY], depth)

  if not seen_symbol_with_path:
    logging.warning('Symbols lack paths. Data will not be structured.')

  # The (no path) bucket can be extremely large if we failed to get
  # path information. Split it into subgroups if needed.
  SplitNoPathBucket(result)

  largest_list_len = MakeChildrenDictsIntoLists(result)

  if largest_list_len > BIG_BUCKET_LIMIT:
    logging.warning('There are sections with %d nodes. '
                    'Results might be unusable.' % largest_list_len)
  return result
221
222
223# TODO(andrewhayden): Only used for legacy reports. Delete.
def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information
  along the way.

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., c++ files)
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, etceteras) and whose values are the size, in bytes, of
              those categories;
       size:  the total size, in bytes, of all the entries in the sizes dict
  2. Non-leaf nodes, representing directories
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size:     the total size, in bytes, of all the leaf nodes that are
                 contained within the children dict (recursively expanded)

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, e.g. a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the size of all
  leaf nodes within the data structure.
  """
  dirs = {'children': {}, 'size': 0}
  for sym, symbol_type, size, path in symbols:
    dirs['size'] += size
    if path:
      path = os.path.normpath(path)
      if path.startswith('/'):
        path = path[1:]

    parts = None
    if path:
      parts = path.split('/')

    if parts:
      assert path
      file_key = parts.pop()
      tree = dirs
      try:
        # Traverse the tree to the parent of the file node, creating as needed
        for part in parts:
          assert part != ''
          if part not in tree['children']:
            tree['children'][part] = {'children': {}, 'size': 0}
          tree = tree['children'][part]
          tree['size'] += size

        # Get (creating if necessary) the node for the file
        # This node doesn't have a 'children' attribute
        if file_key not in tree['children']:
          tree['children'][file_key] = {'sizes': collections.defaultdict(int),
                                        'size': 0}
        tree = tree['children'][file_key]
        tree['size'] += size

        # Accumulate size into a bucket within the file
        symbol_type = symbol_type.lower()
        if 'vtable for ' in sym:
          tree['sizes']['[vtable]'] += size
        elif 'r' == symbol_type:
          tree['sizes']['[rodata]'] += size
        elif 'd' == symbol_type:
          tree['sizes']['[data]'] += size
        elif 'b' == symbol_type:
          tree['sizes']['[bss]'] += size
        elif 't' == symbol_type:
          # 'text' in binary parlance means 'code'.
          tree['sizes']['[code]'] += size
        elif 'w' == symbol_type:
          tree['sizes']['[weak]'] += size
        else:
          tree['sizes']['[other]'] += size
      except:
        # Deliberately broad: dump debugging context to stderr, then
        # re-raise the original exception unchanged.
        print >> sys.stderr, sym, parts, file_key
        raise
    else:
      # No path: bucket the symbol by a coarse name-derived category.
      key = 'symbols without paths'
      if key not in dirs['children']:
        dirs['children'][key] = {'sizes': collections.defaultdict(int),
                                 'size': 0}
      tree = dirs['children'][key]
      subkey = 'misc'
      if (sym.endswith('::__FUNCTION__') or
        sym.endswith('::__PRETTY_FUNCTION__')):
        subkey = '__FUNCTION__'
      elif sym.startswith('CSWTCH.'):
        subkey = 'CSWTCH'
      elif '::' in sym:
        # Group by the outermost namespace/class prefix (e.g. 'foo::').
        subkey = sym[0:sym.find('::') + 2]
      tree['sizes'][subkey] = tree['sizes'].get(subkey, 0) + size
      tree['size'] += size
  return dirs
317
318
319# TODO(andrewhayden): Only used for legacy reports. Delete.
def JsonifyTree(tree, name):
  """Convert TreeifySymbols output to a JSON treemap.

  The format is very similar, with the notable exceptions being
  lists of children instead of maps and some different attribute names."""
  children = []
  # Maps size-category keys to CSS class names used by the treemap page.
  css_class_map = {
                  '[vtable]': 'vtable',
                  '[rodata]': 'read-only_data',
                  '[data]': 'data',
                  '[bss]': 'bss',
                  '[code]': 'code',
                  '[weak]': 'weak_symbol'
  }
  if 'children' in tree:
    # Non-leaf node. Recurse.
    for child_name, child in tree['children'].iteritems():
      children.append(JsonifyTree(child, child_name))
  else:
    # Leaf node; dump per-file stats as entries in the treemap
    for kind, size in tree['sizes'].iteritems():
      child_json = {'name': kind + ' (' + FormatBytes(size) + ')',
                   'data': { '$area': size }}
      css_class = css_class_map.get(kind)
      if css_class is not None:
        child_json['data']['$symbol'] = css_class
      children.append(child_json)
  # Sort children by size, largest to smallest.
  children.sort(key=lambda child: -child['data']['$area'])

  # For leaf nodes, the 'size' attribute is the size of the leaf;
  # Non-leaf nodes don't really have a size, but their 'size' attribute is
  # the sum of the sizes of all their children.
  return {'name': name + ' (' + FormatBytes(tree['size']) + ')',
          'data': { '$area': tree['size'] },
          'children': children }
356
def DumpCompactTree(symbols, symbol_path_origin_dir, outfile):
  """Writes the compact tree for |symbols| to |outfile| as a JS variable
  assignment ('var tree_data=...')."""
  root = MakeCompactTree(symbols, symbol_path_origin_dir)
  with open(outfile, 'w') as stream:
    stream.write('var tree_data=')
    # Use separators without whitespace to get a smaller file.
    json.dump(root, stream, separators=(',', ':'))
  print('Writing %d bytes json' % os.path.getsize(outfile))
364
365
366# TODO(andrewhayden): Only used for legacy reports. Delete.
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpTreemap(symbols, outfile):
  """Writes legacy treemap JS data for |symbols| to |outfile|."""
  dirs = TreeifySymbols(symbols)
  # 'with' flushes and closes the file even on error, replacing the
  # manual try/finally/close dance.
  with open(outfile, 'w') as out:
    out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))
375
376
377# TODO(andrewhayden): Only used for legacy reports. Delete.
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSymbols(symbols, outfile, n):
  """Writes the |n| largest non-bss, non-weak symbols as legacy JS data."""
  # a list of (sym, symbol_type, size, path); sort by size, descending.
  symbols = sorted(symbols, key=lambda x: -x[2])
  dumped = 0
  # 'with' replaces manual try/finally/close; 'break' (rather than the
  # original return-through-finally) still lets the footer be written.
  with open(outfile, 'w') as out:
    out.write('var largestSymbols = [\n')
    for sym, symbol_type, size, path in symbols:
      if symbol_type in ('b', 'w'):
        continue  # skip bss and weak symbols
      if path is None:
        path = ''
      entry = {'size': FormatBytes(size),
               'symbol': sym,
               'type': SymbolTypeToHuman(symbol_type),
               'location': path }
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        break
    out.write('];\n')
403
404
def MakeSourceMap(symbols):
  """Aggregates per-source statistics for |symbols|.

  Returns a dict keyed by normalized path (or '[no path]'), whose values
  hold the original path plus accumulated 'size' and 'symbol_count'."""
  sources = {}
  for _sym, _symbol_type, size, path in symbols:
    key = os.path.normpath(path) if path else '[no path]'
    record = sources.setdefault(
        key, {'path': path, 'symbol_count': 0, 'size': 0})
    record['size'] += size
    record['symbol_count'] += 1
  return sources
419
420
421# TODO(andrewhayden): Only used for legacy reports. Delete.
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSources(symbols, outfile, n):
  """Writes the |n| largest sources (by aggregate size) as legacy JS data."""
  source_map = MakeSourceMap(symbols)
  sources = sorted(source_map.values(), key=lambda x: -x['size'])
  dumped = 0
  # 'with' replaces manual try/finally/close; 'break' (rather than the
  # original return-through-finally) still lets the footer be written.
  with open(outfile, 'w') as out:
    out.write('var largestSources = [\n')
    for record in sources:
      entry = {'size': FormatBytes(record['size']),
               'symbol_count': str(record['symbol_count']),
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        break
    out.write('];\n')
442
443
444# TODO(andrewhayden): Only used for legacy reports. Delete.
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestVTables(symbols, outfile, n):
  """Writes the |n| largest vtables as legacy JS data."""
  vtables = []
  for symbol, _type, size, path in symbols:
    if 'vtable for ' in symbol:
      vtables.append({'symbol': symbol, 'path': path, 'size': size})
  vtables = sorted(vtables, key=lambda x: -x['size'])
  dumped = 0
  # 'with' replaces manual try/finally/close; 'break' (rather than the
  # original return-through-finally) still lets the footer be written.
  with open(outfile, 'w') as out:
    out.write('var largestVTables = [\n')
    for record in vtables:
      entry = {'size': FormatBytes(record['size']),
               'symbol': record['symbol'],
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        break
    out.write('];\n')
468
469
# Regex for parsing "nm" output. A sample line looks like this:
# 0167b39c 00000018 t ACCESS_DESCRIPTION_free /path/file.c:95
#
# The fields are: address, size, type, name, source location
# Regular expression explained ( see also: https://xkcd.com/208 ):
# ([0-9a-f]{8,})    The address (8 or more hex digits)
# [\s]+             Whitespace separator
# ([0-9a-f]{8,})    The size (8 or more hex digits).
#                   From here on out it's all optional.
# [\s]*             Optional whitespace separator
# (\S?)             The symbol type, which is any non-whitespace char
# [\s*]             A single whitespace char or a literal '*'
#                   (NOTE(review): probably intended as [\s]*; kept as-is)
# ([^\t]*)          Symbol name, any non-tab character (spaces ok!)
# [\t]?             Tab separator
# (.*)              The location (filename[:linenum|?][ (discriminator n)]
sNmPattern = re.compile(
  r'([0-9a-f]{8,})[\s]+([0-9a-f]{8,})[\s]*(\S?)[\s*]([^\t]*)[\t]?(.*)')
486
class Progress():
  """Mutable counters shared by the symbolization callbacks in
  RunElfSymbolizer to report periodic progress on stdout."""
  def __init__(self):
    self.count = 0  # Symbols processed via the map_address_symbol callback.
    self.skip_count = 0  # nm output lines that needed no lookup.
    self.collisions = 0  # Addresses symbolized more than once.
    self.time_last_output = time.time()  # Wall time of last progress print.
    self.count_last_output = 0  # Value of |count| at last progress print.
    self.disambiguations = 0  # Symbols whose source path was disambiguated.
    self.was_ambiguous = 0  # Symbols that had an ambiguous source path.
496
497
def RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs,
                     disambiguate, src_path):
  """Runs nm on |library|, resolves source locations via addr2line, and
  writes the nm output annotated with 'path:line' to |outfile|.

  Args:
    outfile: path of the annotated nm output file to write.
    library: path of the binary/library to analyze.
    addr2line_binary: addr2line executable handed to ELFSymbolizer.
    nm_binary: nm executable used to list symbols.
    jobs: max number of concurrent addr2line subprocesses.
    disambiguate: if falsy, path disambiguation is disabled (see below).
    src_path: source root used by the symbolizer for disambiguation.
  """
  nm_output = RunNm(library, nm_binary)
  nm_output_lines = nm_output.splitlines()
  nm_output_lines_len = len(nm_output_lines)
  # Maps address -> resolved symbol; also acts as a lookup cache so an
  # address is only symbolized once.
  address_symbol = {}
  progress = Progress()
  def map_address_symbol(symbol, addr):
    # Callback invoked by the symbolizer for each resolved symbol.
    progress.count += 1
    if addr in address_symbol:
      # 'Collision between %s and %s.' % (str(symbol.name),
      #                                   str(address_symbol[addr].name))
      progress.collisions += 1
    else:
      if symbol.disambiguated:
        progress.disambiguations += 1
      if symbol.was_ambiguous:
        progress.was_ambiguous += 1

      address_symbol[addr] = symbol

    progress_output()

  def progress_output():
    # Prints a single-line progress update, throttled to every
    # |progress_chunk| symbols and at most once per second.
    progress_chunk = 100
    if progress.count % progress_chunk == 0:
      time_now = time.time()
      time_spent = time_now - progress.time_last_output
      if time_spent > 1.0:
        # Only output at most once per second.
        progress.time_last_output = time_now
        chunk_size = progress.count - progress.count_last_output
        progress.count_last_output = progress.count
        if time_spent > 0:
          speed = chunk_size / time_spent
        else:
          speed = 0
        progress_percent = (100.0 * (progress.count + progress.skip_count) /
                            nm_output_lines_len)
        disambiguation_percent = 0
        if progress.disambiguations != 0:
          disambiguation_percent = (100.0 * progress.disambiguations /
                                    progress.was_ambiguous)

        # '\r' rewrites the same console line on each update.
        sys.stdout.write('\r%.1f%%: Looked up %d symbols (%d collisions, '
              '%d disambiguations where %.1f%% succeeded)'
              '- %.1f lookups/s.' %
              (progress_percent, progress.count, progress.collisions,
               progress.disambiguations, disambiguation_percent, speed))

  # In case disambiguation was disabled, we remove the source path (which upon
  # being set signals the symbolizer to enable disambiguation)
  if not disambiguate:
    src_path = None
  symbolizer = elf_symbolizer.ELFSymbolizer(library, addr2line_binary,
                                            map_address_symbol,
                                            max_concurrent_jobs=jobs,
                                            source_root_path=src_path)
  user_interrupted = False
  try:
    for line in nm_output_lines:
      match = sNmPattern.match(line)
      if match:
        location = match.group(5)
        if not location:
          addr = int(match.group(1), 16)
          size = int(match.group(2), 16)
          if addr in address_symbol:  # Already looked up, shortcut
                                      # ELFSymbolizer.
            map_address_symbol(address_symbol[addr], addr)
            continue
          elif size == 0:
            # Save time by not looking up empty symbols (do they even exist?)
            print('Empty symbol: ' + line)
          else:
            symbolizer.SymbolizeAsync(addr, addr)
            continue

      # Line either already had a location, was empty-sized, or did not
      # parse; no symbolizer work queued for it.
      progress.skip_count += 1
  except KeyboardInterrupt:
    user_interrupted = True
    print('Interrupting - killing subprocesses. Please wait.')

  try:
    # Wait for the in-flight addr2line jobs to drain.
    symbolizer.Join()
  except KeyboardInterrupt:
    # Don't want to abort here since we will be finished in a few seconds.
    user_interrupted = True
    print('Patience you must have my young padawan.')

  print ''

  if user_interrupted:
    print('Skipping the rest of the file mapping. '
          'Output will not be fully classified.')

  symbol_path_origin_dir = os.path.dirname(os.path.abspath(library))

  # Second pass: re-emit every nm line, appending the resolved
  # '<path>:<line>' for addresses that were successfully symbolized.
  with open(outfile, 'w') as out:
    for line in nm_output_lines:
      match = sNmPattern.match(line)
      if match:
        location = match.group(5)
        if not location:
          addr = int(match.group(1), 16)
          symbol = address_symbol.get(addr)
          if symbol is not None:
            path = '??'
            if symbol.source_path is not None:
              path = os.path.abspath(os.path.join(symbol_path_origin_dir,
                                                  symbol.source_path))
            line_number = 0
            if symbol.source_line is not None:
              line_number = symbol.source_line
            out.write('%s\t%s:%d\n' % (line, path, line_number))
            continue

      out.write('%s\n' % line)

  print('%d symbols in the results.' % len(address_symbol))
618
619
def RunNm(binary, nm_binary):
  """Runs |nm_binary| over |binary| and returns its stdout.

  Raises:
    Exception: if nm exits non-zero; the message is nm's stderr (or its
        stdout when stderr is empty).
  """
  print('Starting nm')
  cmd = [nm_binary, '-C', '--print-size', '--size-sort', '--reverse-sort',
         binary]
  nm_process = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
  (process_output, err_output) = nm_process.communicate()

  if nm_process.returncode != 0:
    # Call-form raise replaces the deprecated py2-only 'raise Exception, x'
    # comma syntax; it is valid in both Python 2 and 3.
    raise Exception(err_output if err_output else process_output)

  print('Finished nm')
  return process_output
637
638
def GetNmSymbols(nm_infile, outfile, library, jobs, verbose,
                 addr2line_binary, nm_binary, disambiguate, src_path):
  """Returns the parsed symbol list for |library|.

  If |nm_infile| is None, symbolized nm output is generated into
  |outfile| first (a kept-around tempfile when |outfile| is None) and
  that file is then parsed."""
  if nm_infile is None:
    if outfile is None:
      # delete=False: the caller may want to inspect the file afterwards.
      outfile = tempfile.NamedTemporaryFile(delete=False).name

    if verbose:
      # print() call form works identically under Python 2.
      print('Running parallel addr2line, dumping symbols to ' + outfile)
    RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs,
                     disambiguate, src_path)

    nm_infile = outfile

  elif verbose:
    print('Using nm input from ' + nm_infile)
  # open() replaces the file() builtin, which was removed in Python 3.
  with open(nm_infile, 'r') as infile:
    return list(binary_size_utils.ParseNm(infile))
656
657
# Maps numeric pak resource ids to human-readable names; populated lazily
# by GetReadablePakResourceName. The 'inited' flag records whether the
# one-time resource-header scan has already been attempted.
PAK_RESOURCE_ID_TO_STRING = { "inited": False }
659
def LoadPakIdsFromResourceFile(filename):
  """Given a file name, it loads everything that looks like a resource id
  into PAK_RESOURCE_ID_TO_STRING."""
  with open(filename) as resource_header:
    for line in resource_header:
      if not line.startswith("#define "):
        continue
      fields = line.split()
      if len(fields) != 3:
        continue
      try:
        # Expected shape: '#define RESOURCE_NAME 1234'.
        PAK_RESOURCE_ID_TO_STRING[int(fields[2])] = fields[1]
      except ValueError:
        pass  # Value was not numeric; not a resource id line.
674
def GetReadablePakResourceName(pak_file, resource_id):
  """Pak resources have a numeric identifier. It is not helpful when
  trying to locate where footprint is generated. This does its best to
  map the number to a usable string."""
  if not PAK_RESOURCE_ID_TO_STRING['inited']:
    # One-time scan of resource header files generated by grit when
    # building the pak file: files named *resources.h containing lines
    # of the type:
    #    #define MY_RESOURCE_JS 1234
    PAK_RESOURCE_ID_TO_STRING['inited'] = True
    gen_dir = os.path.join(os.path.dirname(pak_file), 'gen')
    if os.path.isdir(gen_dir):
      header_paths = [
          os.path.join(dirname, filename)
          for dirname, _dirs, files in os.walk(gen_dir)
          for filename in files if filename.endswith('resources.h')]
      for header_path in header_paths:
        LoadPakIdsFromResourceFile(header_path)
  return PAK_RESOURCE_ID_TO_STRING.get(resource_id,
                                       'Pak Resource %d' % resource_id)
693
def AddPakData(symbols, pak_file):
  """Adds pseudo-symbols from a pak file.

  Appends one (name, 'd', size, pak_file) tuple per pak resource to
  |symbols|. Only pak file version 4 is supported (asserted below)."""
  pak_file = os.path.abspath(pak_file)
  with open(pak_file, 'rb') as pak:
    data = pak.read()

  PAK_FILE_VERSION = 4
  HEADER_LENGTH = 2 * 4 + 1  # Two uint32s. (file version, number of entries)
                             # and one uint8 (encoding of text resources)
  INDEX_ENTRY_SIZE = 2 + 4  # Each entry is a uint16 and a uint32.
  version, num_entries, _encoding = struct.unpack('<IIB', data[:HEADER_LENGTH])
  assert version == PAK_FILE_VERSION, ('Unsupported pak file '
                                       'version (%d) in %s. Only '
                                       'support version %d' %
                                       (version, pak_file, PAK_FILE_VERSION))
  if num_entries > 0:
    # Read the index and data.
    data = data[HEADER_LENGTH:]
    for _ in range(num_entries):
      resource_id, offset = struct.unpack('<HI', data[:INDEX_ENTRY_SIZE])
      data = data[INDEX_ENTRY_SIZE:]
      # Peek at the following index entry (without consuming it) to derive
      # this resource's size from the offset delta.
      _next_id, next_offset = struct.unpack('<HI', data[:INDEX_ENTRY_SIZE])
      resource_size = next_offset - offset

      symbol_name = GetReadablePakResourceName(pak_file, resource_id)
      symbol_path = pak_file
      symbol_type = 'd' # Data. Approximation.
      symbol_size = resource_size
      symbols.append((symbol_name, symbol_type, symbol_size, symbol_path))
723
724def _find_in_system_path(binary):
725  """Locate the full path to binary in the system path or return None
726  if not found."""
727  system_path = os.environ["PATH"].split(os.pathsep)
728  for path in system_path:
729    binary_path = os.path.join(path, binary)
730    if os.path.isfile(binary_path):
731      return binary_path
732  return None
733
def CheckDebugFormatSupport(library, addr2line_binary):
  """Kills the program if debug data is in an unsupported format.

  There are two common versions of the DWARF debug formats and
  since we are right now transitioning from DWARF2 to newer formats,
  it's possible to have a mix of tools that are not compatible. Detect
  that and abort rather than produce meaningless output."""
  tool_output = subprocess.check_output([addr2line_binary, '--version'])
  # Dots escaped so they only match the literal '.' between version
  # components (the original unescaped '.' matched any character).
  version_re = re.compile(r'^GNU [^ ]+ .* (\d+)\.(\d+).*?$', re.M)
  parsed_output = version_re.match(tool_output)
  major = int(parsed_output.group(1))
  minor = int(parsed_output.group(2))
  # binutils newer than 2.22 can read DWARF4.
  supports_dwarf4 = major > 2 or major == 2 and minor > 22

  if supports_dwarf4:
    return

  print('Checking version of debug information in %s.' % library)
  debug_info = subprocess.check_output(['readelf', '--debug-dump=info',
                                       '--dwarf-depth=1', library])
  dwarf_version_re = re.compile(r'^\s+Version:\s+(\d+)$', re.M)
  parsed_dwarf_format_output = dwarf_version_re.search(debug_info)
  version = int(parsed_dwarf_format_output.group(1))
  if version > 2:
    print('The supplied tools only support DWARF2 debug data but the binary\n' +
          'uses DWARF%d. Update the tools or compile the binary\n' % version +
          'with -gdwarf-2.')
    sys.exit(1)
762
763
764def main():
765  usage = """%prog [options]
766
767  Runs a spatial analysis on a given library, looking up the source locations
768  of its symbols and calculating how much space each directory, source file,
769  and so on is taking. The result is a report that can be used to pinpoint
770  sources of large portions of the binary, etceteras.
771
772  Under normal circumstances, you only need to pass two arguments, thusly:
773
774      %prog --library /path/to/library --destdir /path/to/output
775
776  In this mode, the program will dump the symbols from the specified library
777  and map those symbols back to source locations, producing a web-based
778  report in the specified output directory.
779
780  Other options are available via '--help'.
781  """
782  parser = optparse.OptionParser(usage=usage)
783  parser.add_option('--nm-in', metavar='PATH',
784                    help='if specified, use nm input from <path> instead of '
785                    'generating it. Note that source locations should be '
786                    'present in the file; i.e., no addr2line symbol lookups '
787                    'will be performed when this option is specified. '
788                    'Mutually exclusive with --library.')
789  parser.add_option('--destdir', metavar='PATH',
790                    help='write output to the specified directory. An HTML '
791                    'report is generated here along with supporting files; '
792                    'any existing report will be overwritten.')
793  parser.add_option('--library', metavar='PATH',
794                    help='if specified, process symbols in the library at '
795                    'the specified path. Mutually exclusive with --nm-in.')
796  parser.add_option('--pak', metavar='PATH',
797                    help='if specified, includes the contents of the '
798                    'specified *.pak file in the output.')
799  parser.add_option('--nm-binary',
800                    help='use the specified nm binary to analyze library. '
801                    'This is to be used when the nm in the path is not for '
802                    'the right architecture or of the right version.')
803  parser.add_option('--addr2line-binary',
804                    help='use the specified addr2line binary to analyze '
805                    'library. This is to be used when the addr2line in '
806                    'the path is not for the right architecture or '
807                    'of the right version.')
808  parser.add_option('--jobs', type='int',
809                    help='number of jobs to use for the parallel '
810                    'addr2line processing pool; defaults to 1. More '
811                    'jobs greatly improve throughput but eat RAM like '
812                    'popcorn, and take several gigabytes each. Start low '
813                    'and ramp this number up until your machine begins to '
814                    'struggle with RAM. '
815                    'This argument is only valid when using --library.')
816  parser.add_option('-v', dest='verbose', action='store_true',
817                    help='be verbose, printing lots of status information.')
818  parser.add_option('--nm-out', metavar='PATH',
819                    help='keep the nm output file, and store it at the '
820                    'specified path. This is useful if you want to see the '
821                    'fully processed nm output after the symbols have been '
822                    'mapped to source locations. By default, a tempfile is '
823                    'used and is deleted when the program terminates.'
824                    'This argument is only valid when using --library.')
825  parser.add_option('--legacy', action='store_true',
826                    help='emit legacy binary size report instead of modern')
827  parser.add_option('--disable-disambiguation', action='store_true',
828                    help='disables the disambiguation process altogether,'
829                    ' NOTE: this may, depending on your toolchain, produce'
830                    ' output with some symbols at the top layer if addr2line'
831                    ' could not get the entire source path.')
832  parser.add_option('--source-path', default='./',
833                    help='the path to the source code of the output binary, '
834                    'default set to current directory. Used in the'
835                    ' disambiguation process.')
836  opts, _args = parser.parse_args()
837
838  if ((not opts.library) and (not opts.nm_in)) or (opts.library and opts.nm_in):
839    parser.error('exactly one of --library or --nm-in is required')
840  if (opts.nm_in):
841    if opts.jobs:
842      print >> sys.stderr, ('WARNING: --jobs has no effect '
843                            'when used with --nm-in')
844  if not opts.destdir:
845    parser.error('--destdir is required argument')
846  if not opts.jobs:
847    # Use the number of processors but cap between 2 and 4 since raw
848    # CPU power isn't the limiting factor. It's I/O limited, memory
849    # bus limited and available-memory-limited. Too many processes and
850    # the computer will run out of memory and it will be slow.
851    opts.jobs = max(2, min(4, str(multiprocessing.cpu_count())))
852
853  if opts.addr2line_binary:
854    assert os.path.isfile(opts.addr2line_binary)
855    addr2line_binary = opts.addr2line_binary
856  else:
857    addr2line_binary = _find_in_system_path('addr2line')
858    assert addr2line_binary, 'Unable to find addr2line in the path. '\
859        'Use --addr2line-binary to specify location.'
860
861  if opts.nm_binary:
862    assert os.path.isfile(opts.nm_binary)
863    nm_binary = opts.nm_binary
864  else:
865    nm_binary = _find_in_system_path('nm')
866    assert nm_binary, 'Unable to find nm in the path. Use --nm-binary '\
867        'to specify location.'
868
869  if opts.pak:
870    assert os.path.isfile(opts.pak), 'Could not find ' % opts.pak
871
872  print('addr2line: %s' % addr2line_binary)
873  print('nm: %s' % nm_binary)
874
875  CheckDebugFormatSupport(opts.library, addr2line_binary)
876
877  symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library,
878                         opts.jobs, opts.verbose is True,
879                         addr2line_binary, nm_binary,
880                         opts.disable_disambiguation is None,
881                         opts.source_path)
882
883  if opts.pak:
884    AddPakData(symbols, opts.pak)
885
886  if not os.path.exists(opts.destdir):
887    os.makedirs(opts.destdir, 0755)
888
889
890  if opts.legacy: # legacy report
891    DumpTreemap(symbols, os.path.join(opts.destdir, 'treemap-dump.js'))
892    DumpLargestSymbols(symbols,
893                         os.path.join(opts.destdir, 'largest-symbols.js'), 100)
894    DumpLargestSources(symbols,
895                         os.path.join(opts.destdir, 'largest-sources.js'), 100)
896    DumpLargestVTables(symbols,
897                         os.path.join(opts.destdir, 'largest-vtables.js'), 100)
898    treemap_out = os.path.join(opts.destdir, 'webtreemap')
899    if not os.path.exists(treemap_out):
900      os.makedirs(treemap_out, 0755)
901    treemap_src = os.path.join('third_party', 'webtreemap', 'src')
902    shutil.copy(os.path.join(treemap_src, 'COPYING'), treemap_out)
903    shutil.copy(os.path.join(treemap_src, 'webtreemap.js'), treemap_out)
904    shutil.copy(os.path.join(treemap_src, 'webtreemap.css'), treemap_out)
905    shutil.copy(os.path.join('tools', 'binary_size', 'legacy_template',
906                             'index.html'), opts.destdir)
907  else: # modern report
908    if opts.library:
909      symbol_path_origin_dir = os.path.dirname(os.path.abspath(opts.library))
910    else:
911      # Just a guess. Hopefully all paths in the input file are absolute.
912      symbol_path_origin_dir = os.path.abspath(os.getcwd())
913    data_js_file_name = os.path.join(opts.destdir, 'data.js')
914    DumpCompactTree(symbols, symbol_path_origin_dir, data_js_file_name)
915    d3_out = os.path.join(opts.destdir, 'd3')
916    if not os.path.exists(d3_out):
917      os.makedirs(d3_out, 0755)
918    d3_src = os.path.join(os.path.dirname(__file__),
919                          '..',
920                          '..',
921                          'third_party', 'd3', 'src')
922    template_src = os.path.join(os.path.dirname(__file__),
923                                'template')
924    shutil.copy(os.path.join(d3_src, 'LICENSE'), d3_out)
925    shutil.copy(os.path.join(d3_src, 'd3.js'), d3_out)
926    shutil.copy(os.path.join(template_src, 'index.html'), opts.destdir)
927    shutil.copy(os.path.join(template_src, 'D3SymbolTreeMap.js'), opts.destdir)
928
929  print 'Report saved to ' + opts.destdir + '/index.html'
930
931
# Script entry point: run the analysis and propagate main()'s result
# as the process exit status.
if __name__ == '__main__':
  exit_status = main()
  sys.exit(exit_status)
934