1#!/usr/bin/env python
2# Copyright 2014 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Generate a spatial analysis against an arbitrary library.
7
8To use, build the 'binary_size_tool' target. Then run this tool, passing
9in the location of the library to be analyzed along with any other options
10you desire.
11"""
12
13import collections
14import json
15import logging
16import multiprocessing
17import optparse
18import os
19import re
20import shutil
21import struct
22import subprocess
23import sys
24import tempfile
25import time
26
27import binary_size_utils
28
# This path change is not beautiful. Temporary (I hope) measure until
30# the chromium project has figured out a proper way to organize the
31# library of python tools. http://crbug.com/375725
32elf_symbolizer_path = os.path.abspath(os.path.join(
33    os.path.dirname(__file__),
34    '..',
35    '..',
36    'build',
37    'android',
38    'pylib'))
39sys.path.append(elf_symbolizer_path)
40import symbols.elf_symbolizer as elf_symbolizer  # pylint: disable=F0401
41
42
# Node dictionary keys. These are output in json read by the webapp so
# keep them short to save file size.
# Note: If these change, the webapp must also change.
NODE_TYPE_KEY = 'k'  # Node kind: 'p' (path), 'b' (bucket) or 's' (symbol).
NODE_NAME_KEY = 'n'  # Display name of the node.
NODE_CHILDREN_KEY = 'children'  # Dict of children (converted to list later).
NODE_SYMBOL_TYPE_KEY = 't'  # nm symbol type character ('t', 'd', 'b', ...).
NODE_SYMBOL_SIZE_KEY = 'value'  # Symbol size in bytes.
NODE_MAX_DEPTH_KEY = 'maxDepth'  # Max depth of the tree; set on the root.
NODE_LAST_PATH_ELEMENT_KEY = 'lastPathElement'  # True on file (leaf) nodes.

# The display name of the bucket where we put symbols without path.
NAME_NO_PATH_BUCKET = '(No Path)'

# Try to keep data buckets smaller than this to avoid killing the
# graphing lib.
BIG_BUCKET_LIMIT = 3000
60
61
# TODO(andrewhayden): Only used for legacy reports. Delete.
def FormatBytes(byte_count):
  """Pretty-print a number of bytes, e.g. '1.5m', '2.0k' or '999'."""
  # Largest unit first; strictly-greater-than comparison, so exactly 1e6
  # still falls through to the 'k' case and exactly 1e3 prints verbatim.
  for threshold, divisor, suffix in ((1e6, 1.0e6, 'm'), (1e3, 1.0e3, 'k')):
    if byte_count > threshold:
      return '%.1f%s' % (byte_count / divisor, suffix)
  return str(byte_count)
72
73
# TODO(andrewhayden): Only used for legacy reports. Delete.
def SymbolTypeToHuman(symbol_type):
  """Convert a symbol type as printed by nm into a human-readable name.

  Raises KeyError for any type character not in the table below.
  """
  human_names = {
      'b': 'bss',
      'd': 'data',
      'r': 'read-only data',
      't': 'code',
      'w': 'weak symbol',
      'v': 'weak symbol',
  }
  return human_names[symbol_type]
83
84
def _MkChild(node, name):
  """Return the child of |node| called |name|, creating it if absent."""
  children = node[NODE_CHILDREN_KEY]
  if name not in children:
    children[name] = {NODE_NAME_KEY: name,
                      NODE_CHILDREN_KEY: {}}
  return children[name]
92
93
94
def SplitNoPathBucket(node):
  """NAME_NO_PATH_BUCKET can be too large for the graphing lib to
  handle. Split it into sub-buckets in that case."""
  root_children = node[NODE_CHILDREN_KEY]
  if NAME_NO_PATH_BUCKET in root_children:
    no_path_bucket = root_children[NAME_NO_PATH_BUCKET]
    old_children = no_path_bucket[NODE_CHILDREN_KEY]
    # Count all symbols across the per-type buckets under (No Path).
    count = 0
    for symbol_type, symbol_bucket in old_children.iteritems():
      count += len(symbol_bucket[NODE_CHILDREN_KEY])
    if count > BIG_BUCKET_LIMIT:
      # Too many symbols: rebuild (No Path) with numbered 'subgroup'
      # path nodes, each holding at most BIG_BUCKET_LIMIT symbols.
      new_children = {}
      no_path_bucket[NODE_CHILDREN_KEY] = new_children
      current_bucket = None
      index = 0
      for symbol_type, symbol_bucket in old_children.iteritems():
        for symbol_name, value in symbol_bucket[NODE_CHILDREN_KEY].iteritems():
          if index % BIG_BUCKET_LIMIT == 0:
            # Start a new subgroup; 1-based numbering. Integer division
            # (Python 2) keeps group_no an int.
            group_no = (index / BIG_BUCKET_LIMIT) + 1
            current_bucket = _MkChild(no_path_bucket,
                                      '%s subgroup %d' % (NAME_NO_PATH_BUCKET,
                                                          group_no))
            assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'p'
            node[NODE_TYPE_KEY] = 'p'  # p for path
          index += 1
          symbol_size = value[NODE_SYMBOL_SIZE_KEY]
          AddSymbolIntoFileNode(current_bucket, symbol_type,
                                symbol_name, symbol_size)
123
124
def MakeChildrenDictsIntoLists(node):
  """Recursively convert every NODE_CHILDREN_KEY dict into a plain list.

  The webapp json wants child lists, not name-keyed dicts. Returns the
  length of the largest child list found anywhere in the subtree rooted
  at |node| (0 for a leaf), so the caller can warn about huge sections.
  """
  largest_list_len = 0
  if NODE_CHILDREN_KEY in node:
    largest_list_len = len(node[NODE_CHILDREN_KEY])
    child_list = []
    # .values() (not the Python-2-only .itervalues()) behaves identically
    # here and keeps the code runnable under Python 3.
    for child in node[NODE_CHILDREN_KEY].values():
      child_largest_list_len = MakeChildrenDictsIntoLists(child)
      if child_largest_list_len > largest_list_len:
        largest_list_len = child_largest_list_len
      child_list.append(child)
    node[NODE_CHILDREN_KEY] = child_list

  return largest_list_len
138
139
def AddSymbolIntoFileNode(node, symbol_type, symbol_name, symbol_size):
  """Puts symbol into the file path node |node|.
  Returns the number of added levels in tree. I.e. returns 2."""

  # |node| is a file node; mark it as the last real path element.
  node[NODE_LAST_PATH_ELEMENT_KEY] = True

  # Descend into (or create) the symbol-type bucket under the file node.
  bucket = _MkChild(node, symbol_type)
  assert NODE_TYPE_KEY not in bucket or bucket[NODE_TYPE_KEY] == 'b'
  bucket[NODE_SYMBOL_TYPE_KEY] = symbol_type
  bucket[NODE_TYPE_KEY] = 'b'  # b for bucket

  # Create (or reuse) the leaf entry for the symbol itself.
  leaf = _MkChild(bucket, symbol_name)
  if NODE_CHILDREN_KEY in leaf:
    if leaf[NODE_CHILDREN_KEY]:
      logging.warning('A container node used as symbol for %s.' % symbol_name)
    # This is going to be used as a leaf so no use for child list.
    del leaf[NODE_CHILDREN_KEY]
  leaf[NODE_SYMBOL_SIZE_KEY] = symbol_size
  leaf[NODE_SYMBOL_TYPE_KEY] = symbol_type
  leaf[NODE_TYPE_KEY] = 's'  # s for symbol

  return 2  # Depth of the added subtree.
163
164
def MakeCompactTree(symbols, symbol_path_origin_dir):
  """Build the webapp tree from (name, type, size, path) symbol tuples.

  File paths become nested 'p' nodes; each file gets per-symbol-type 'b'
  bucket nodes holding 's' symbol leaves. The maximum tree depth is
  recorded on the root under NODE_MAX_DEPTH_KEY.
  """
  result = {NODE_NAME_KEY: '/',
            NODE_CHILDREN_KEY: {},
            NODE_TYPE_KEY: 'p',
            NODE_MAX_DEPTH_KEY: 0}
  seen_symbol_with_path = False
  cwd = os.path.abspath(os.getcwd())
  for symbol_name, symbol_type, symbol_size, file_path in symbols:

    if 'vtable for ' in symbol_name:
      symbol_type = '@'  # hack to categorize these separately
    # Take path like '/foo/bar/baz', convert to ['foo', 'bar', 'baz']
    if file_path and file_path != "??":
      file_path = os.path.abspath(os.path.join(symbol_path_origin_dir,
                                               file_path))
      # Let the output structure be relative to $CWD if inside $CWD,
      # otherwise relative to the disk root. This is to avoid
      # unnecessary click-through levels in the output.
      if file_path.startswith(cwd + os.sep):
        file_path = file_path[len(cwd):]
      if file_path.startswith('/'):
        file_path = file_path[1:]
      seen_symbol_with_path = True
    else:
      # No path (or nm's unknown '??'): group under a shared bucket.
      file_path = NAME_NO_PATH_BUCKET

    path_parts = file_path.split('/')

    # Find pre-existing node in tree, or update if it already exists
    node = result
    depth = 0
    while len(path_parts) > 0:
      path_part = path_parts.pop(0)
      if len(path_part) == 0:
        continue
      depth += 1
      node = _MkChild(node, path_part)
      assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'p'
      node[NODE_TYPE_KEY] = 'p'  # p for path

    # Attach the symbol under its file node (adds bucket + leaf levels).
    depth += AddSymbolIntoFileNode(node, symbol_type, symbol_name, symbol_size)
    result[NODE_MAX_DEPTH_KEY] = max(result[NODE_MAX_DEPTH_KEY], depth)

  if not seen_symbol_with_path:
    logging.warning('Symbols lack paths. Data will not be structured.')

  # The (no path) bucket can be extremely large if we failed to get
  # path information. Split it into subgroups if needed.
  SplitNoPathBucket(result)

  largest_list_len = MakeChildrenDictsIntoLists(result)

  if largest_list_len > BIG_BUCKET_LIMIT:
    logging.warning('There are sections with %d nodes. '
                    'Results might be unusable.' % largest_list_len)
  return result
221
222
# TODO(andrewhayden): Only used for legacy reports. Delete.
def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information
  along the way.

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., c++ files)
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, etceteras) and whose values are the size, in bytes, of
              those categories;
       size:  the total size, in bytes, of all the entries in the sizes dict
  2. Non-leaf nodes, representing directories
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size:     the total size, in bytes, of all the leaf nodes that are
                 contained within the children dict (recursively expanded)

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, e.g. a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the size of all
  leaf nodes within the data structure.
  """
  dirs = {'children': {}, 'size': 0}
  for sym, symbol_type, size, path in symbols:
    dirs['size'] += size
    if path:
      path = os.path.normpath(path)
      # Treat absolute paths as relative so everything hangs off one root.
      if path.startswith('/'):
        path = path[1:]

    parts = None
    if path:
      parts = path.split('/')

    if parts:
      assert path
      file_key = parts.pop()
      tree = dirs
      try:
        # Traverse the tree to the parent of the file node, creating as needed
        for part in parts:
          assert part != ''
          if part not in tree['children']:
            tree['children'][part] = {'children': {}, 'size': 0}
          tree = tree['children'][part]
          tree['size'] += size

        # Get (creating if necessary) the node for the file
        # This node doesn't have a 'children' attribute
        if file_key not in tree['children']:
          tree['children'][file_key] = {'sizes': collections.defaultdict(int),
                                        'size': 0}
        tree = tree['children'][file_key]
        tree['size'] += size

        # Accumulate size into a bucket within the file
        symbol_type = symbol_type.lower()
        if 'vtable for ' in sym:
          tree['sizes']['[vtable]'] += size
        elif 'r' == symbol_type:
          tree['sizes']['[rodata]'] += size
        elif 'd' == symbol_type:
          tree['sizes']['[data]'] += size
        elif 'b' == symbol_type:
          tree['sizes']['[bss]'] += size
        elif 't' == symbol_type:
          # 'text' in binary parlance means 'code'.
          tree['sizes']['[code]'] += size
        elif 'w' == symbol_type:
          tree['sizes']['[weak]'] += size
        else:
          tree['sizes']['[other]'] += size
      # Dump debug context to stderr before re-raising any failure above.
      except:
        print >> sys.stderr, sym, parts, file_key
        raise
    else:
      # Symbol has no path: group under one node, sub-bucketed by a coarse
      # classification of the symbol name.
      key = 'symbols without paths'
      if key not in dirs['children']:
        dirs['children'][key] = {'sizes': collections.defaultdict(int),
                                 'size': 0}
      tree = dirs['children'][key]
      subkey = 'misc'
      if (sym.endswith('::__FUNCTION__') or
        sym.endswith('::__PRETTY_FUNCTION__')):
        subkey = '__FUNCTION__'
      elif sym.startswith('CSWTCH.'):
        subkey = 'CSWTCH'
      elif '::' in sym:
        # Group by the leading 'Namespace::' prefix.
        subkey = sym[0:sym.find('::') + 2]
      tree['sizes'][subkey] = tree['sizes'].get(subkey, 0) + size
      tree['size'] += size
  return dirs
317
318
# TODO(andrewhayden): Only used for legacy reports. Delete.
def JsonifyTree(tree, name):
  """Convert TreeifySymbols output to a JSON treemap.

  The format is very similar, with the notable exceptions being
  lists of children instead of maps and some different attribute names."""
  children = []
  css_class_map = {
                  '[vtable]': 'vtable',
                  '[rodata]': 'read-only_data',
                  '[data]': 'data',
                  '[bss]': 'bss',
                  '[code]': 'code',
                  '[weak]': 'weak_symbol'
  }
  if 'children' in tree:
    # Non-leaf node. Recurse. (.items() rather than the Python-2-only
    # .iteritems(); behavior is identical.)
    for child_name, child in tree['children'].items():
      children.append(JsonifyTree(child, child_name))
  else:
    # Leaf node; dump per-file stats as entries in the treemap
    for kind, size in tree['sizes'].items():
      child_json = {'name': kind + ' (' + FormatBytes(size) + ')',
                   'data': { '$area': size }}
      css_class = css_class_map.get(kind)
      if css_class is not None:
        child_json['data']['$symbol'] = css_class
      children.append(child_json)
  # Sort children by size, largest to smallest.
  children.sort(key=lambda child: -child['data']['$area'])

  # For leaf nodes, the 'size' attribute is the size of the leaf;
  # Non-leaf nodes don't really have a size, but their 'size' attribute is
  # the sum of the sizes of all their children.
  return {'name': name + ' (' + FormatBytes(tree['size']) + ')',
          'data': { '$area': tree['size'] },
          'children': children }
356
def DumpCompactTree(symbols, symbol_path_origin_dir, outfile):
  """Serialize the compact symbol tree to |outfile| as 'var tree_data=...'."""
  compact_root = MakeCompactTree(symbols, symbol_path_origin_dir)
  # Use separators without whitespace to get a smaller file.
  serialized = json.dumps(compact_root, separators=(',', ':'))
  with open(outfile, 'w') as out:
    out.write('var tree_data=')
    out.write(serialized)
  print('Writing %d bytes json' % os.path.getsize(outfile))
364
365
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpTreemap(symbols, outfile):
  """Write the legacy treemap data for |symbols| as 'var kTree = ...' JS."""
  dirs = TreeifySymbols(symbols)
  with open(outfile, 'w') as out:
    out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))
375
376
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSymbols(symbols, outfile, n):
  """Write up to |n| largest non-bss, non-weak symbols to |outfile| as JS."""
  # |symbols| is a list of (sym, symbol_type, size, path); sort by size,
  # largest first.
  by_size = sorted(symbols, key=lambda record: -record[2])
  emitted = 0
  out = open(outfile, 'w')
  try:
    out.write('var largestSymbols = [\n')
    for sym, symbol_type, size, path in by_size:
      if symbol_type in ('b', 'w'):
        continue  # skip bss and weak symbols
      entry = {'size': FormatBytes(size),
               'symbol': sym,
               'type': SymbolTypeToHuman(symbol_type),
               'location': path if path is not None else ''}
      out.write(json.dumps(entry) + ',\n')
      emitted += 1
      if emitted >= n:
        return
  finally:
    # Runs even on the early return above, closing the JS array.
    out.write('];\n')
    out.flush()
    out.close()
403
404
def MakeSourceMap(symbols):
  """Aggregate |symbols| by normalized source path.

  Returns a dict mapping each normalized path (or '[no path]') to a record
  with the first-seen raw 'path', total 'size' and 'symbol_count'.
  """
  sources = {}
  for _sym, _symbol_type, size, path in symbols:
    key = os.path.normpath(path) if path else '[no path]'
    record = sources.setdefault(key,
                                {'path': path, 'symbol_count': 0, 'size': 0})
    record['size'] += size
    record['symbol_count'] += 1
  return sources
419
420
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSources(symbols, outfile, n):
  """Write up to |n| largest sources (by aggregated size) to |outfile| as JS."""
  by_size = sorted(MakeSourceMap(symbols).values(),
                   key=lambda record: -record['size'])
  emitted = 0
  out = open(outfile, 'w')
  try:
    out.write('var largestSources = [\n')
    for record in by_size:
      entry = {'size': FormatBytes(record['size']),
               'symbol_count': str(record['symbol_count']),
               'location': record['path']}
      out.write(json.dumps(entry) + ',\n')
      emitted += 1
      if emitted >= n:
        return
  finally:
    # Runs even on the early return above, closing the JS array.
    out.write('];\n')
    out.flush()
    out.close()
442
443
# TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestVTables(symbols, outfile, n):
  """Write up to |n| largest vtable symbols to |outfile| as JS."""
  vtables = [{'symbol': symbol, 'path': path, 'size': size}
             for symbol, _type, size, path in symbols
             if 'vtable for ' in symbol]
  vtables.sort(key=lambda record: -record['size'])
  emitted = 0
  out = open(outfile, 'w')
  try:
    out.write('var largestVTables = [\n')
    for record in vtables:
      entry = {'size': FormatBytes(record['size']),
               'symbol': record['symbol'],
               'location': record['path']}
      out.write(json.dumps(entry) + ',\n')
      emitted += 1
      if emitted >= n:
        return
  finally:
    # Runs even on the early return above, closing the JS array.
    out.write('];\n')
    out.flush()
    out.close()
468
469
# Regex for parsing "nm" output. A sample line looks like this:
# 0167b39c 00000018 t ACCESS_DESCRIPTION_free /path/file.c:95
#
# The fields are: address, size, type, name, source location
# Regular expression explained ( see also: https://xkcd.com/208 ):
# ([0-9a-f]{8,})    The address
# [\s]+             Whitespace separator
# ([0-9a-f]{8,})    The size. From here on out it's all optional.
# [\s]*             Whitespace separator (possibly empty)
# (\S?)             The symbol type, which is any non-whitespace char
# [\s*]             Single separator char: whitespace or a literal '*'
# ([^\t]*)          Symbol name, any non-tab character (spaces ok!)
# [\t]?             Tab separator
# (.*)              The location (filename[:linennum|?][ (discriminator n)]
sNmPattern = re.compile(
  r'([0-9a-f]{8,})[\s]+([0-9a-f]{8,})[\s]*(\S?)[\s*]([^\t]*)[\t]?(.*)')
486
class Progress():
  """Mutable counters used while mapping addresses to symbols."""
  def __init__(self):
    # Symbols processed so far, and nm lines skipped entirely.
    self.count = 0
    self.skip_count = 0
    # Number of addresses that resolved to more than one symbol.
    self.collisions = 0
    # Disambiguation statistics.
    self.disambiguations = 0
    self.was_ambiguous = 0
    # Throttling state for console progress output.
    self.time_last_output = time.time()
    self.count_last_output = 0
496
497
def RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs,
                     disambiguate, src_path):
  """Runs nm over |library| and symbolizes addresses via ELFSymbolizer.

  Writes to |outfile| one line per nm output line; lines whose address
  was resolved get the source path and line number appended after a tab.
  Progress is printed to stdout while lookups run.
  """
  nm_output = RunNm(library, nm_binary)
  nm_output_lines = nm_output.splitlines()
  nm_output_lines_len = len(nm_output_lines)
  address_symbol = {}
  progress = Progress()
  def map_address_symbol(symbol, addr):
    # Symbolizer callback: record |symbol| for |addr| and keep the
    # collision/disambiguation statistics up to date.
    progress.count += 1
    if addr in address_symbol:
      # 'Collision between %s and %s.' % (str(symbol.name),
      #                                   str(address_symbol[addr].name))
      progress.collisions += 1
    else:
      if symbol.disambiguated:
        progress.disambiguations += 1
      if symbol.was_ambiguous:
        progress.was_ambiguous += 1

      address_symbol[addr] = symbol

    progress_output()

  def progress_output():
    # Report status every |progress_chunk| symbols, at most once a second.
    progress_chunk = 100
    if progress.count % progress_chunk == 0:
      time_now = time.time()
      time_spent = time_now - progress.time_last_output
      if time_spent > 1.0:
        # Only output at most once per second.
        progress.time_last_output = time_now
        chunk_size = progress.count - progress.count_last_output
        progress.count_last_output = progress.count
        if time_spent > 0:
          speed = chunk_size / time_spent
        else:
          speed = 0
        progress_percent = (100.0 * (progress.count + progress.skip_count) /
                            nm_output_lines_len)
        disambiguation_percent = 0
        if progress.disambiguations != 0:
          disambiguation_percent = (100.0 * progress.disambiguations /
                                    progress.was_ambiguous)

        sys.stdout.write('\r%.1f%%: Looked up %d symbols (%d collisions, '
              '%d disambiguations where %.1f%% succeeded)'
              ' - %.1f lookups/s.' %
              (progress_percent, progress.count, progress.collisions,
               progress.disambiguations, disambiguation_percent, speed))

  # In case disambiguation was disabled, we remove the source path (which upon
  # being set signals the symbolizer to enable disambiguation)
  if not disambiguate:
    src_path = None
  symbolizer = elf_symbolizer.ELFSymbolizer(library, addr2line_binary,
                                            map_address_symbol,
                                            max_concurrent_jobs=jobs,
                                            source_root_path=src_path)
  user_interrupted = False
  try:
    for line in nm_output_lines:
      match = sNmPattern.match(line)
      if match:
        location = match.group(5)
        if not location:
          # nm gave no location; this address needs an addr2line lookup.
          addr = int(match.group(1), 16)
          size = int(match.group(2), 16)
          if addr in address_symbol:  # Already looked up, shortcut
                                      # ELFSymbolizer.
            map_address_symbol(address_symbol[addr], addr)
            continue
          elif size == 0:
            # Save time by not looking up empty symbols (do they even exist?)
            print('Empty symbol: ' + line)
          else:
            symbolizer.SymbolizeAsync(addr, addr)
            continue

      progress.skip_count += 1
  except KeyboardInterrupt:
    user_interrupted = True
    print('Interrupting - killing subprocesses. Please wait.')

  try:
    symbolizer.Join()
  except KeyboardInterrupt:
    # Don't want to abort here since we will be finished in a few seconds.
    user_interrupted = True
    print('Patience you must have my young padawan.')

  print ''

  if user_interrupted:
    print('Skipping the rest of the file mapping. '
          'Output will not be fully classified.')

  symbol_path_origin_dir = os.path.dirname(os.path.abspath(library))

  # Rewrite the nm output, appending 'path:line' to each line whose
  # address was successfully symbolized above.
  with open(outfile, 'w') as out:
    for line in nm_output_lines:
      match = sNmPattern.match(line)
      if match:
        location = match.group(5)
        if not location:
          addr = int(match.group(1), 16)
          symbol = address_symbol.get(addr)
          if symbol is not None:
            path = '??'
            if symbol.source_path is not None:
              path = os.path.abspath(os.path.join(symbol_path_origin_dir,
                                                  symbol.source_path))
            line_number = 0
            if symbol.source_line is not None:
              line_number = symbol.source_line
            out.write('%s\t%s:%d\n' % (line, path, line_number))
            continue

      out.write('%s\n' % line)

  print('%d symbols in the results.' % len(address_symbol))
618
619
def RunNm(binary, nm_binary):
  """Run nm over |binary| and return its stdout.

  Uses demangling (-C) and size-sorted, largest-first output so the
  downstream parser sees '<address> <size> <type> <name>' lines.

  Raises:
    Exception: if nm exits non-zero; the message is nm's stderr output,
        or its stdout when stderr is empty.
  """
  cmd = [nm_binary, '-C', '--print-size', '--size-sort', '--reverse-sort',
         binary]
  nm_process = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
  (process_output, err_output) = nm_process.communicate()

  if nm_process.returncode != 0:
    # raise Exception(...) is valid in both Python 2 and 3, unlike the
    # previous 'raise Exception, msg' form which is Python-2-only syntax.
    raise Exception(err_output or process_output)

  return process_output
635
636
def GetNmSymbols(nm_infile, outfile, library, jobs, verbose,
                 addr2line_binary, nm_binary, disambiguate, src_path):
  """Return the parsed symbol list for |library| or a prior nm dump.

  If |nm_infile| is None, runs nm/addr2line over |library| (writing the
  annotated nm output to |outfile|, or to a temp file when |outfile| is
  None) and parses the result; otherwise parses |nm_infile| directly.
  """
  if nm_infile is None:
    if outfile is None:
      # Caller did not ask to keep the nm output; use a throwaway file.
      outfile = tempfile.NamedTemporaryFile(delete=False).name

    if verbose:
      # print() calls work identically under Python 2 and 3 for a single
      # argument, unlike the bare print statement used previously.
      print('Running parallel addr2line, dumping symbols to ' + outfile)
    RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs,
                     disambiguate, src_path)

    nm_infile = outfile

  elif verbose:
    print('Using nm input from ' + nm_infile)

  # open() instead of the Python-2-only file() builtin.
  with open(nm_infile, 'r') as infile:
    return list(binary_size_utils.ParseNm(infile))
654
655
# Maps pak resource numeric ids to human-readable names. The 'inited' flag
# records whether the table has been populated yet (done lazily by
# GetReadablePakResourceName).
PAK_RESOURCE_ID_TO_STRING = { "inited": False }

def LoadPakIdsFromResourceFile(filename):
  """Given a file name, it loads everything that looks like a resource id
  into PAK_RESOURCE_ID_TO_STRING."""
  with open(filename) as resource_header:
    for line in resource_header:
      # Match grit-style lines of the form '#define RESOURCE_NAME 1234'.
      if line.startswith("#define "):
        line_data = line.split()
        if len(line_data) == 3:
          try:
            resource_number = int(line_data[2])
            resource_name = line_data[1]
            PAK_RESOURCE_ID_TO_STRING[resource_number] = resource_name
          except ValueError:
            # Value was not a plain decimal integer; skip this line.
            pass
672
def GetReadablePakResourceName(pak_file, resource_id):
  """Pak resources have a numeric identifier. It is not helpful when
  trying to locate where footprint is generated. This does its best to
  map the number to a usable string."""
  if not PAK_RESOURCE_ID_TO_STRING['inited']:
    # Lazily populate the id->name table (exactly once) by scanning the
    # grit-generated resource headers near the pak file for lines like:
    #    #define MY_RESOURCE_JS 1234
    PAK_RESOURCE_ID_TO_STRING['inited'] = True
    gen_dir = os.path.join(os.path.dirname(pak_file), 'gen')
    if os.path.isdir(gen_dir):
      for dirname, _dirs, files in os.walk(gen_dir):
        resource_headers = [f for f in files if f.endswith('resources.h')]
        for filename in resource_headers:
          LoadPakIdsFromResourceFile(os.path.join(dirname, filename))
  return PAK_RESOURCE_ID_TO_STRING.get(resource_id,
                                       'Pak Resource %d' % resource_id)
691
def AddPakData(symbols, pak_file):
  """Adds pseudo-symbols from a pak file."""
  pak_file = os.path.abspath(pak_file)
  with open(pak_file, 'rb') as pak:
    data = pak.read()

  PAK_FILE_VERSION = 4
  HEADER_LENGTH = 2 * 4 + 1  # Two uint32s. (file version, number of entries)
                             # and one uint8 (encoding of text resources)
  INDEX_ENTRY_SIZE = 2 + 4  # Each entry is a uint16 and a uint32.
  version, num_entries, _encoding = struct.unpack('<IIB', data[:HEADER_LENGTH])
  assert version == PAK_FILE_VERSION, ('Unsupported pak file '
                                       'version (%d) in %s. Only '
                                       'support version %d' %
                                       (version, pak_file, PAK_FILE_VERSION))
  if num_entries > 0:
    # Read the index and data.
    data = data[HEADER_LENGTH:]
    for _ in range(num_entries):
      resource_id, offset = struct.unpack('<HI', data[:INDEX_ENTRY_SIZE])
      data = data[INDEX_ENTRY_SIZE:]
      # Peek at the next index entry's offset to compute this resource's
      # size. NOTE(review): this assumes the index contains a terminating
      # entry after the last resource so the peek is always in bounds --
      # confirm against the pak file format spec.
      _next_id, next_offset = struct.unpack('<HI', data[:INDEX_ENTRY_SIZE])
      resource_size = next_offset - offset

      symbol_name = GetReadablePakResourceName(pak_file, resource_id)
      symbol_path = pak_file
      symbol_type = 'd' # Data. Approximation.
      symbol_size = resource_size
      symbols.append((symbol_name, symbol_type, symbol_size, symbol_path))
721
def _find_in_system_path(binary):
  """Locate the full path to binary in the system path or return None
  if not found."""
  for directory in os.environ["PATH"].split(os.pathsep):
    candidate = os.path.join(directory, binary)
    if os.path.isfile(candidate):
      return candidate
  return None
731
def CheckDebugFormatSupport(library, addr2line_binary):
  """Kills the program if debug data is in an unsupported format.

  There are two common versions of the DWARF debug formats and
  since we are right now transitioning from DWARF2 to newer formats,
  it's possible to have a mix of tools that are not compatible. Detect
  that and abort rather than produce meaningless output."""
  tool_output = subprocess.check_output([addr2line_binary, '--version'])
  # Fix: escape the dots so that '2.22' cannot match e.g. '2x22'; the
  # previous pattern used bare '.' which matches any character.
  # NOTE(review): .match() requires the 'GNU ...' version line to be the
  # first line of the tool output, else .group() below raises -- confirm
  # this holds for all supported binutils builds.
  version_re = re.compile(r'^GNU [^ ]+ .* (\d+)\.(\d+).*?$', re.M)
  parsed_output = version_re.match(tool_output)
  major = int(parsed_output.group(1))
  minor = int(parsed_output.group(2))
  # Versions newer than 2.22 are treated as DWARF4-capable.
  supports_dwarf4 = major > 2 or major == 2 and minor > 22

  if supports_dwarf4:
    return

  print('Checking version of debug information in %s.' % library)
  debug_info = subprocess.check_output(['readelf', '--debug-dump=info',
                                       '--dwarf-depth=1', library])
  dwarf_version_re = re.compile(r'^\s+Version:\s+(\d+)$', re.M)
  parsed_dwarf_format_output = dwarf_version_re.search(debug_info)
  version = int(parsed_dwarf_format_output.group(1))
  if version > 2:
    print('The supplied tools only support DWARF2 debug data but the binary\n' +
          'uses DWARF%d. Update the tools or compile the binary\n' % version +
          'with -gdwarf-2.')
    sys.exit(1)
760
761
762def main():
763  usage = """%prog [options]
764
765  Runs a spatial analysis on a given library, looking up the source locations
766  of its symbols and calculating how much space each directory, source file,
767  and so on is taking. The result is a report that can be used to pinpoint
768  sources of large portions of the binary, etceteras.
769
770  Under normal circumstances, you only need to pass two arguments, thusly:
771
772      %prog --library /path/to/library --destdir /path/to/output
773
774  In this mode, the program will dump the symbols from the specified library
775  and map those symbols back to source locations, producing a web-based
776  report in the specified output directory.
777
778  Other options are available via '--help'.
779  """
780  parser = optparse.OptionParser(usage=usage)
781  parser.add_option('--nm-in', metavar='PATH',
782                    help='if specified, use nm input from <path> instead of '
783                    'generating it. Note that source locations should be '
784                    'present in the file; i.e., no addr2line symbol lookups '
785                    'will be performed when this option is specified. '
786                    'Mutually exclusive with --library.')
787  parser.add_option('--destdir', metavar='PATH',
788                    help='write output to the specified directory. An HTML '
789                    'report is generated here along with supporting files; '
790                    'any existing report will be overwritten.')
791  parser.add_option('--library', metavar='PATH',
792                    help='if specified, process symbols in the library at '
793                    'the specified path. Mutually exclusive with --nm-in.')
794  parser.add_option('--pak', metavar='PATH',
795                    help='if specified, includes the contents of the '
796                    'specified *.pak file in the output.')
797  parser.add_option('--nm-binary',
798                    help='use the specified nm binary to analyze library. '
799                    'This is to be used when the nm in the path is not for '
800                    'the right architecture or of the right version.')
801  parser.add_option('--addr2line-binary',
802                    help='use the specified addr2line binary to analyze '
803                    'library. This is to be used when the addr2line in '
804                    'the path is not for the right architecture or '
805                    'of the right version.')
806  parser.add_option('--jobs', type='int',
807                    help='number of jobs to use for the parallel '
808                    'addr2line processing pool; defaults to 1. More '
809                    'jobs greatly improve throughput but eat RAM like '
810                    'popcorn, and take several gigabytes each. Start low '
811                    'and ramp this number up until your machine begins to '
812                    'struggle with RAM. '
813                    'This argument is only valid when using --library.')
814  parser.add_option('-v', dest='verbose', action='store_true',
815                    help='be verbose, printing lots of status information.')
816  parser.add_option('--nm-out', metavar='PATH',
817                    help='keep the nm output file, and store it at the '
818                    'specified path. This is useful if you want to see the '
819                    'fully processed nm output after the symbols have been '
820                    'mapped to source locations. By default, a tempfile is '
821                    'used and is deleted when the program terminates.'
822                    'This argument is only valid when using --library.')
823  parser.add_option('--legacy', action='store_true',
824                    help='emit legacy binary size report instead of modern')
825  parser.add_option('--disable-disambiguation', action='store_true',
826                    help='disables the disambiguation process altogether,'
827                    ' NOTE: this may, depending on your toolchain, produce'
828                    ' output with some symbols at the top layer if addr2line'
829                    ' could not get the entire source path.')
830  parser.add_option('--source-path', default='./',
831                    help='the path to the source code of the output binary, '
832                    'default set to current directory. Used in the'
833                    ' disambiguation process.')
834  opts, _args = parser.parse_args()
835
836  if ((not opts.library) and (not opts.nm_in)) or (opts.library and opts.nm_in):
837    parser.error('exactly one of --library or --nm-in is required')
838  if (opts.nm_in):
839    if opts.jobs:
840      print >> sys.stderr, ('WARNING: --jobs has no effect '
841                            'when used with --nm-in')
842  if not opts.destdir:
843    parser.error('--destdir is required argument')
844  if not opts.jobs:
845    # Use the number of processors but cap between 2 and 4 since raw
846    # CPU power isn't the limiting factor. It's I/O limited, memory
847    # bus limited and available-memory-limited. Too many processes and
848    # the computer will run out of memory and it will be slow.
849    opts.jobs = max(2, min(4, str(multiprocessing.cpu_count())))
850
851  if opts.addr2line_binary:
852    assert os.path.isfile(opts.addr2line_binary)
853    addr2line_binary = opts.addr2line_binary
854  else:
855    addr2line_binary = _find_in_system_path('addr2line')
856    assert addr2line_binary, 'Unable to find addr2line in the path. '\
857        'Use --addr2line-binary to specify location.'
858
859  if opts.nm_binary:
860    assert os.path.isfile(opts.nm_binary)
861    nm_binary = opts.nm_binary
862  else:
863    nm_binary = _find_in_system_path('nm')
864    assert nm_binary, 'Unable to find nm in the path. Use --nm-binary '\
865        'to specify location.'
866
867  if opts.pak:
868    assert os.path.isfile(opts.pak), 'Could not find ' % opts.pak
869
870  print('addr2line: %s' % addr2line_binary)
871  print('nm: %s' % nm_binary)
872
873  if opts.library:
874    CheckDebugFormatSupport(opts.library, addr2line_binary)
875
876  symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library,
877                         opts.jobs, opts.verbose is True,
878                         addr2line_binary, nm_binary,
879                         opts.disable_disambiguation is None,
880                         opts.source_path)
881
882  if opts.pak:
883    AddPakData(symbols, opts.pak)
884
885  if not os.path.exists(opts.destdir):
886    os.makedirs(opts.destdir, 0755)
887
888
889  if opts.legacy: # legacy report
890    DumpTreemap(symbols, os.path.join(opts.destdir, 'treemap-dump.js'))
891    DumpLargestSymbols(symbols,
892                         os.path.join(opts.destdir, 'largest-symbols.js'), 100)
893    DumpLargestSources(symbols,
894                         os.path.join(opts.destdir, 'largest-sources.js'), 100)
895    DumpLargestVTables(symbols,
896                         os.path.join(opts.destdir, 'largest-vtables.js'), 100)
897    treemap_out = os.path.join(opts.destdir, 'webtreemap')
898    if not os.path.exists(treemap_out):
899      os.makedirs(treemap_out, 0755)
900    treemap_src = os.path.join('third_party', 'webtreemap', 'src')
901    shutil.copy(os.path.join(treemap_src, 'COPYING'), treemap_out)
902    shutil.copy(os.path.join(treemap_src, 'webtreemap.js'), treemap_out)
903    shutil.copy(os.path.join(treemap_src, 'webtreemap.css'), treemap_out)
904    shutil.copy(os.path.join('tools', 'binary_size', 'legacy_template',
905                             'index.html'), opts.destdir)
906  else: # modern report
907    if opts.library:
908      symbol_path_origin_dir = os.path.dirname(os.path.abspath(opts.library))
909    else:
910      # Just a guess. Hopefully all paths in the input file are absolute.
911      symbol_path_origin_dir = os.path.abspath(os.getcwd())
912    data_js_file_name = os.path.join(opts.destdir, 'data.js')
913    DumpCompactTree(symbols, symbol_path_origin_dir, data_js_file_name)
914    d3_out = os.path.join(opts.destdir, 'd3')
915    if not os.path.exists(d3_out):
916      os.makedirs(d3_out, 0755)
917    d3_src = os.path.join(os.path.dirname(__file__),
918                          '..',
919                          '..',
920                          'third_party', 'd3', 'src')
921    template_src = os.path.join(os.path.dirname(__file__),
922                                'template')
923    shutil.copy(os.path.join(d3_src, 'LICENSE'), d3_out)
924    shutil.copy(os.path.join(d3_src, 'd3.js'), d3_out)
925    shutil.copy(os.path.join(template_src, 'index.html'), opts.destdir)
926    shutil.copy(os.path.join(template_src, 'D3SymbolTreeMap.js'), opts.destdir)
927
928  print 'Report saved to ' + opts.destdir + '/index.html'
929
930
# Standard script entry point: run main() and propagate its return value
# as the process exit status.
if __name__ == '__main__':
  sys.exit(main())
933