1#!/usr/bin/env python
2# Copyright 2014 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Generate a spatial analysis against an arbitrary library.
7
8Adapted for Skia's use case from
9chromium/src/tools/binary_size/run_binary_size_analysis.py. Main changes:
10
11-- Cleans up some deprecated codes.
12-- Always use relative code path so the tree root is Skia repo's root.
13-- Instead of outputting the standalone HTML/CSS/JS filesets, writes the
14    TreeMap JSON data into a Google Storage bucket.
15-- Adds githash and total_size to the JSON data.
16-- Outputs another summary data in JSON Bench format for skiaperf ingestion.
17
18The output JSON data for visualization is in the following format:
19
20{
21  "githash": 123abc,
22  "commit_ts": 1234567890,
23  "total_size": 1234567,
24  "key": {"source_type": "binary_size"},
25  "tree_data": {
26    "maxDepth": 9,
27    "k": "p", "children":[
28      {"k":"p","children":[
29        {"k":"p","children":[
30          {"k":"p","lastPathElement":true,"children":[
31            {"k":"b","t":"t","children":[
32              {"k":"s", "t":"t", "value":4029,
33               "n":"etc_encode_subblock_helper(unsigned char const*, ...)"
34              },
35          ......
36  }
37}
38
39Another JSON file is generated for size summaries to be used in skiaperf. The
40JSON format details can be found at:
41  https://github.com/google/skia/blob/master/bench/ResultsWriter.h#L54
42and:
43  https://skia.googlesource.com/buildbot/+/master/perf/go/ingester/nanobench.go
44
45In the binary size case, outputs look like:
46
47{
48  "gitHash": "123abc",
49  "key": {
50    "source_type": "binarysize"
51  }
52  "results: {
53    "src_lazy_global_weak_symbol": {
54      "memory": {
55        "bytes": 41,
56        "options": {
57          "path": "src_lazy",
58          "symbol": "global_weak_symbol"
59        }
60      }
61    },
62    "src_lazy_global_read_only_data": {
63      "memory": {
64        "bytes": 13476,
65        "options": {
66          "path": "src_lazy",
67          "symbol": "global_read_only_data"
68        }
69      }
70    },
71    ...
72  }
73}
74
75"""
76
77import collections
78import datetime
79import json
80import logging
81import multiprocessing
82import optparse
83import os
84import re
85import shutil
86import struct
87import subprocess
88import sys
89import tempfile
90import time
91import urllib2
92
93import binary_size_utils
94import elf_symbolizer
95
96from recipe_engine.types import freeze
97
98# Node dictionary keys. These are output in json read by the webapp so
99# keep them short to save file size.
100# Note: If these change, the webapp must also change.
101NODE_TYPE_KEY = 'k'
102NODE_NAME_KEY = 'n'
103NODE_CHILDREN_KEY = 'children'
104NODE_SYMBOL_TYPE_KEY = 't'
105NODE_SYMBOL_SIZE_KEY = 'value'
106NODE_MAX_DEPTH_KEY = 'maxDepth'
107NODE_LAST_PATH_ELEMENT_KEY = 'lastPathElement'
108
109# The display name of the bucket where we put symbols without path.
110NAME_NO_PATH_BUCKET = '(No Path)'
111
112# Try to keep data buckets smaller than this to avoid killing the
113# graphing lib.
114BIG_BUCKET_LIMIT = 3000
115
116# Skia addition: relative dir for libskia.so from code base.
117LIBSKIA_RELATIVE_PATH = os.path.join('out', 'Release', 'lib')
118
119# Skia addition: dictionary mapping symbol type code to symbol name.
120# See
121# https://code.google.com/p/chromium/codesearch#chromium/src/tools/binary_size/template/D3SymbolTreeMap.js&l=74
122SYMBOL_MAP = freeze({
123    'A': 'global_absolute',
124    'B': 'global_uninitialized_data',
125    'b': 'local_uninitialized_data',
126    'C': 'global_uninitialized_common',
127    'D': 'global_initialized_data',
128    'd': 'local_initialized_data',
129    'G': 'global_small initialized_data',
130    'g': 'local_small_initialized_data',
131    'i': 'indirect_function',
132    'N': 'debugging',
133    'p': 'stack_unwind',
134    'R': 'global_read_only_data',
135    'r': 'local_read_only_data',
136    'S': 'global_small_uninitialized_data',
137    's': 'local_small_uninitialized_data',
138    'T': 'global_code',
139    't': 'local_code',
140    'U': 'undefined',
141    'u': 'unique',
142    'V': 'global_weak_object',
143    'v': 'local_weak_object',
144    'W': 'global_weak_symbol',
145    'w': 'local_weak_symbol',
146    '@': 'vtable_entry',
147    '-': 'stabs_debugging',
148    '?': 'unrecognized',
149})
150
151
152def _MkChild(node, name):
153  child = node[NODE_CHILDREN_KEY].get(name)
154  if child is None:
155    child = {NODE_NAME_KEY: name,
156             NODE_CHILDREN_KEY: {}}
157    node[NODE_CHILDREN_KEY][name] = child
158  return child
159
160
161def SplitNoPathBucket(node):
162  """NAME_NO_PATH_BUCKET can be too large for the graphing lib to
163  handle. Split it into sub-buckets in that case."""
164  root_children = node[NODE_CHILDREN_KEY]
165  if NAME_NO_PATH_BUCKET in root_children:
166    no_path_bucket = root_children[NAME_NO_PATH_BUCKET]
167    old_children = no_path_bucket[NODE_CHILDREN_KEY]
168    count = 0
169    for symbol_type, symbol_bucket in old_children.iteritems():
170      count += len(symbol_bucket[NODE_CHILDREN_KEY])
171    if count > BIG_BUCKET_LIMIT:
172      new_children = {}
173      no_path_bucket[NODE_CHILDREN_KEY] = new_children
174      current_bucket = None
175      index = 0
176      for symbol_type, symbol_bucket in old_children.iteritems():
177        for symbol_name, value in symbol_bucket[NODE_CHILDREN_KEY].iteritems():
178          if index % BIG_BUCKET_LIMIT == 0:
179            group_no = (index / BIG_BUCKET_LIMIT) + 1
180            current_bucket = _MkChild(no_path_bucket,
181                                      '%s subgroup %d' % (NAME_NO_PATH_BUCKET,
182                                                          group_no))
183            assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'p'
184            node[NODE_TYPE_KEY] = 'p'  # p for path
185          index += 1
186          symbol_size = value[NODE_SYMBOL_SIZE_KEY]
187          AddSymbolIntoFileNode(current_bucket, symbol_type,
188                                symbol_name, symbol_size)
189
190
191def MakeChildrenDictsIntoLists(node):
192  largest_list_len = 0
193  if NODE_CHILDREN_KEY in node:
194    largest_list_len = len(node[NODE_CHILDREN_KEY])
195    child_list = []
196    for child in node[NODE_CHILDREN_KEY].itervalues():
197      child_largest_list_len = MakeChildrenDictsIntoLists(child)
198      if child_largest_list_len > largest_list_len:
199        largest_list_len = child_largest_list_len
200      child_list.append(child)
201    node[NODE_CHILDREN_KEY] = child_list
202
203  return largest_list_len
204
205
206def AddSymbolIntoFileNode(node, symbol_type, symbol_name, symbol_size):
207  """Puts symbol into the file path node |node|.
208  Returns the number of added levels in tree. I.e. returns 2."""
209
210  # 'node' is the file node and first step is to find its symbol-type bucket.
211  node[NODE_LAST_PATH_ELEMENT_KEY] = True
212  node = _MkChild(node, symbol_type)
213  assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'b'
214  node[NODE_SYMBOL_TYPE_KEY] = symbol_type
215  node[NODE_TYPE_KEY] = 'b'  # b for bucket
216
217  # 'node' is now the symbol-type bucket. Make the child entry.
218  node = _MkChild(node, symbol_name)
219  if NODE_CHILDREN_KEY in node:
220    if node[NODE_CHILDREN_KEY]:
221      logging.warning('A container node used as symbol for %s.' % symbol_name)
222    # This is going to be used as a leaf so no use for child list.
223    del node[NODE_CHILDREN_KEY]
224  node[NODE_SYMBOL_SIZE_KEY] = symbol_size
225  node[NODE_SYMBOL_TYPE_KEY] = symbol_type
226  node[NODE_TYPE_KEY] = 's'  # s for symbol
227
228  return 2  # Depth of the added subtree.
229
230
231def MakeCompactTree(symbols, symbol_path_origin_dir):
232  result = {NODE_NAME_KEY: '/',
233            NODE_CHILDREN_KEY: {},
234            NODE_TYPE_KEY: 'p',
235            NODE_MAX_DEPTH_KEY: 0}
236  seen_symbol_with_path = False
237  for symbol_name, symbol_type, symbol_size, file_path in symbols:
238
239    if 'vtable for ' in symbol_name:
240      symbol_type = '@'  # hack to categorize these separately
241    if file_path and file_path != "??":
242      seen_symbol_with_path = True
243    else:
244      file_path = NAME_NO_PATH_BUCKET
245
246    path_parts = file_path.split('/')
247
248    # Find pre-existing node in tree, or update if it already exists
249    node = result
250    depth = 0
251    while len(path_parts) > 0:
252      path_part = path_parts.pop(0)
253      if len(path_part) == 0:
254        continue
255      depth += 1
256      node = _MkChild(node, path_part)
257      assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'p'
258      node[NODE_TYPE_KEY] = 'p'  # p for path
259
260    depth += AddSymbolIntoFileNode(node, symbol_type, symbol_name, symbol_size)
261    result[NODE_MAX_DEPTH_KEY] = max(result[NODE_MAX_DEPTH_KEY], depth)
262
263  if not seen_symbol_with_path:
264    logging.warning('Symbols lack paths. Data will not be structured.')
265
266  # The (no path) bucket can be extremely large if we failed to get
267  # path information. Split it into subgroups if needed.
268  SplitNoPathBucket(result)
269
270  largest_list_len = MakeChildrenDictsIntoLists(result)
271
272  if largest_list_len > BIG_BUCKET_LIMIT:
273    logging.warning('There are sections with %d nodes. '
274                    'Results might be unusable.' % largest_list_len)
275  return result
276
277
278# Skia added: summarizes tree size by symbol type for the given root node.
279# Returns a dict keyed by symbol type, and value the type's overall size.
280# e.g., {"t": 12345, "W": 543}.
281def GetTreeSizes(node):
282  if 'children' not in node or not node['children']:
283    return {node['t']: node['value']}
284  dic = {}
285  for i in node['children']:
286    for k, v in GetTreeSizes(i).items():
287      dic.setdefault(k, 0)
288      dic[k] += v
289
290  return dic
291
292
293# Skia added: creates dict to be converted to JSON in bench format.
294# See top of file for the structure description.
295def GetBenchDict(githash, tree_root):
296  dic = {'gitHash': githash,
297         'key': {'source_type': 'binarysize'},
298         'results': {},}
299  for i in tree_root['children']:
300    if '(No Path)' == i['n']:  # Already at symbol summary level.
301      for k, v in GetTreeSizes(i).items():
302        dic['results']['no_path_' + SYMBOL_MAP[k]] = {
303            'memory': {
304              'bytes': v,
305              'options': {'path': 'no_path',
306                          'symbol': SYMBOL_MAP[k],},}}
307    else:  # We need to go deeper.
308      for c in i['children']:
309        path = i['n'] + '_' + c['n']
310        for k, v in GetTreeSizes(c).items():
311          dic['results'][path + '_' + SYMBOL_MAP[k]] = {
312              'memory': {
313                'bytes': v,
314                'options': {'path': path,
315                            'symbol': SYMBOL_MAP[k],}}}
316
317  return dic
318
319
320# Skia added: constructs 'gsutil cp' subprocess command list.
321def GetGsCopyCommandList(gsutil, src, dst):
322  return [gsutil, '-h', 'Content-Type:application/json', 'cp', '-a',
323          'public-read', src, dst]
324
325
326def DumpCompactTree(symbols, symbol_path_origin_dir, ha, ts, issue, gsutil):
327  tree_root = MakeCompactTree(symbols, symbol_path_origin_dir)
328  json_data = {'tree_data': tree_root,
329               'githash': ha,
330               'commit_ts': ts,
331               'key': {'source_type': 'binary_size'},
332               'total_size': sum(GetTreeSizes(tree_root).values()),}
333  tmpfile = tempfile.NamedTemporaryFile(delete=False).name
334  with open(tmpfile, 'w') as out:
335    # Use separators without whitespace to get a smaller file.
336    json.dump(json_data, out, separators=(',', ':'))
337
338  GS_PREFIX = 'gs://skia-perf/'
339  # Writes to Google Storage for visualization.
340  subprocess.check_call(GetGsCopyCommandList(
341      gsutil, tmpfile, GS_PREFIX + 'size/' + ha + '.json'))
342  # Updates the latest data.
343  if not issue:
344    subprocess.check_call(GetGsCopyCommandList(gsutil, tmpfile,
345                                               GS_PREFIX + 'size/latest.json'))
346  # Writes an extra copy using year/month/day/hour path for easy ingestion.
347  with open(tmpfile, 'w') as out:
348    json.dump(GetBenchDict(ha, tree_root), out, separators=(',', ':'))
349  now = datetime.datetime.utcnow()
350  ingest_path = '/'.join(('nano-json-v1', str(now.year).zfill(4),
351                          str(now.month).zfill(2), str(now.day).zfill(2),
352                          str(now.hour).zfill(2)))
353  if issue:
354    ingest_path = '/'.join('trybot', ingest_path, issue)
355  subprocess.check_call(GetGsCopyCommandList(gsutil, tmpfile,
356      GS_PREFIX + ingest_path + '/binarysize_' + ha + '.json'))
357
358
359def MakeSourceMap(symbols):
360  sources = {}
361  for _sym, _symbol_type, size, path in symbols:
362    key = None
363    if path:
364      key = os.path.normpath(path)
365    else:
366      key = '[no path]'
367    if key not in sources:
368      sources[key] = {'path': path, 'symbol_count': 0, 'size': 0}
369    record = sources[key]
370    record['size'] += size
371    record['symbol_count'] += 1
372  return sources
373
374
375# Regex for parsing "nm" output. A sample line looks like this:
376# 0167b39c 00000018 t ACCESS_DESCRIPTION_free /path/file.c:95
377#
378# The fields are: address, size, type, name, source location
379# Regular expression explained ( see also: https://xkcd.com/208 ):
380# ([0-9a-f]{8,}+)   The address
381# [\s]+             Whitespace separator
382# ([0-9a-f]{8,}+)   The size. From here on out it's all optional.
383# [\s]+             Whitespace separator
384# (\S?)             The symbol type, which is any non-whitespace char
385# [\s*]             Whitespace separator
386# ([^\t]*)          Symbol name, any non-tab character (spaces ok!)
387# [\t]?             Tab separator
388# (.*)              The location (filename[:linennum|?][ (discriminator n)]
389sNmPattern = re.compile(
390  r'([0-9a-f]{8,})[\s]+([0-9a-f]{8,})[\s]*(\S?)[\s*]([^\t]*)[\t]?(.*)')
391
392class Progress():
393  def __init__(self):
394    self.count = 0
395    self.skip_count = 0
396    self.collisions = 0
397    self.time_last_output = time.time()
398    self.count_last_output = 0
399    self.disambiguations = 0
400    self.was_ambiguous = 0
401
402
403def RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs,
404                     disambiguate, src_path):
405  nm_output = RunNm(library, nm_binary)
406  nm_output_lines = nm_output.splitlines()
407  nm_output_lines_len = len(nm_output_lines)
408  address_symbol = {}
409  progress = Progress()
410  def map_address_symbol(symbol, addr):
411    progress.count += 1
412    if addr in address_symbol:
413      # 'Collision between %s and %s.' % (str(symbol.name),
414      #                                   str(address_symbol[addr].name))
415      progress.collisions += 1
416    else:
417      if symbol.disambiguated:
418        progress.disambiguations += 1
419      if symbol.was_ambiguous:
420        progress.was_ambiguous += 1
421
422      address_symbol[addr] = symbol
423
424    progress_output()
425
426  def progress_output():
427    progress_chunk = 100
428    if progress.count % progress_chunk == 0:
429      time_now = time.time()
430      time_spent = time_now - progress.time_last_output
431      if time_spent > 1.0:
432        # Only output at most once per second.
433        progress.time_last_output = time_now
434        chunk_size = progress.count - progress.count_last_output
435        progress.count_last_output = progress.count
436        if time_spent > 0:
437          speed = chunk_size / time_spent
438        else:
439          speed = 0
440        progress_percent = (100.0 * (progress.count + progress.skip_count) /
441                            nm_output_lines_len)
442        disambiguation_percent = 0
443        if progress.disambiguations != 0:
444          disambiguation_percent = (100.0 * progress.disambiguations /
445                                    progress.was_ambiguous)
446
447        sys.stdout.write('\r%.1f%%: Looked up %d symbols (%d collisions, '
448              '%d disambiguations where %.1f%% succeeded)'
449              ' - %.1f lookups/s.' %
450              (progress_percent, progress.count, progress.collisions,
451               progress.disambiguations, disambiguation_percent, speed))
452
453  # In case disambiguation was disabled, we remove the source path (which upon
454  # being set signals the symbolizer to enable disambiguation)
455  if not disambiguate:
456    src_path = None
457  symbol_path_origin_dir = os.path.dirname(library)
458  # Skia specific.
459  symbol_path_prefix = symbol_path_origin_dir.replace(LIBSKIA_RELATIVE_PATH, '')
460  symbolizer = elf_symbolizer.ELFSymbolizer(library, addr2line_binary,
461                                            map_address_symbol,
462                                            max_concurrent_jobs=jobs,
463                                            source_root_path=src_path,
464                                            prefix_to_remove=symbol_path_prefix)
465  user_interrupted = False
466  try:
467    for line in nm_output_lines:
468      match = sNmPattern.match(line)
469      if match:
470        location = match.group(5)
471        if not location:
472          addr = int(match.group(1), 16)
473          size = int(match.group(2), 16)
474          if addr in address_symbol:  # Already looked up, shortcut
475                                      # ELFSymbolizer.
476            map_address_symbol(address_symbol[addr], addr)
477            continue
478          elif size == 0:
479            # Save time by not looking up empty symbols (do they even exist?)
480            print('Empty symbol: ' + line)
481          else:
482            symbolizer.SymbolizeAsync(addr, addr)
483            continue
484
485      progress.skip_count += 1
486  except KeyboardInterrupt:
487    user_interrupted = True
488    print('Interrupting - killing subprocesses. Please wait.')
489
490  try:
491    symbolizer.Join()
492  except KeyboardInterrupt:
493    # Don't want to abort here since we will be finished in a few seconds.
494    user_interrupted = True
495    print('Patience you must have my young padawan.')
496
497  print ''
498
499  if user_interrupted:
500    print('Skipping the rest of the file mapping. '
501          'Output will not be fully classified.')
502
503  symbol_path_origin_dir = os.path.dirname(library)
504  # Skia specific: path prefix to strip.
505  symbol_path_prefix = symbol_path_origin_dir.replace(LIBSKIA_RELATIVE_PATH, '')
506
507  with open(outfile, 'w') as out:
508    for line in nm_output_lines:
509      match = sNmPattern.match(line)
510      if match:
511        location = match.group(5)
512        if not location:
513          addr = int(match.group(1), 16)
514          symbol = address_symbol.get(addr)
515          if symbol is not None:
516            path = '??'
517            if symbol.source_path is not None:
518              path = symbol.source_path.replace(symbol_path_prefix, '')
519            line_number = 0
520            if symbol.source_line is not None:
521              line_number = symbol.source_line
522            out.write('%s\t%s:%d\n' % (line, path, line_number))
523            continue
524
525      out.write('%s\n' % line)
526
527  print('%d symbols in the results.' % len(address_symbol))
528
529
530def RunNm(binary, nm_binary):
531  cmd = [nm_binary, '-C', '--print-size', '--size-sort', '--reverse-sort',
532         binary]
533  nm_process = subprocess.Popen(cmd,
534                                stdout=subprocess.PIPE,
535                                stderr=subprocess.PIPE)
536  (process_output, err_output) = nm_process.communicate()
537
538  if nm_process.returncode != 0:
539    if err_output:
540      raise Exception, err_output
541    else:
542      raise Exception, process_output
543
544  return process_output
545
546
547def GetNmSymbols(nm_infile, outfile, library, jobs, verbose,
548                 addr2line_binary, nm_binary, disambiguate, src_path):
549  if nm_infile is None:
550    if outfile is None:
551      outfile = tempfile.NamedTemporaryFile(delete=False).name
552
553    if verbose:
554      print 'Running parallel addr2line, dumping symbols to ' + outfile
555    RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs,
556                     disambiguate, src_path)
557
558    nm_infile = outfile
559
560  elif verbose:
561    print 'Using nm input from ' + nm_infile
562  with file(nm_infile, 'r') as infile:
563    return list(binary_size_utils.ParseNm(infile))
564
565
566PAK_RESOURCE_ID_TO_STRING = { "inited": False }
567
568def LoadPakIdsFromResourceFile(filename):
569  """Given a file name, it loads everything that looks like a resource id
570  into PAK_RESOURCE_ID_TO_STRING."""
571  with open(filename) as resource_header:
572    for line in resource_header:
573      if line.startswith("#define "):
574        line_data = line.split()
575        if len(line_data) == 3:
576          try:
577            resource_number = int(line_data[2])
578            resource_name = line_data[1]
579            PAK_RESOURCE_ID_TO_STRING[resource_number] = resource_name
580          except ValueError:
581            pass
582
583def GetReadablePakResourceName(pak_file, resource_id):
584  """Pak resources have a numeric identifier. It is not helpful when
585  trying to locate where footprint is generated. This does its best to
586  map the number to a usable string."""
587  if not PAK_RESOURCE_ID_TO_STRING['inited']:
588    # Try to find resource header files generated by grit when
589    # building the pak file. We'll look for files named *resources.h"
590    # and lines of the type:
591    #    #define MY_RESOURCE_JS 1234
592    PAK_RESOURCE_ID_TO_STRING['inited'] = True
593    gen_dir = os.path.join(os.path.dirname(pak_file), 'gen')
594    if os.path.isdir(gen_dir):
595      for dirname, _dirs, files in os.walk(gen_dir):
596        for filename in files:
597          if filename.endswith('resources.h'):
598            LoadPakIdsFromResourceFile(os.path.join(dirname, filename))
599  return PAK_RESOURCE_ID_TO_STRING.get(resource_id,
600                                       'Pak Resource %d' % resource_id)
601
602def AddPakData(symbols, pak_file):
603  """Adds pseudo-symbols from a pak file."""
604  pak_file = os.path.abspath(pak_file)
605  with open(pak_file, 'rb') as pak:
606    data = pak.read()
607
608  PAK_FILE_VERSION = 4
609  HEADER_LENGTH = 2 * 4 + 1  # Two uint32s. (file version, number of entries)
610                             # and one uint8 (encoding of text resources)
611  INDEX_ENTRY_SIZE = 2 + 4  # Each entry is a uint16 and a uint32.
612  version, num_entries, _encoding = struct.unpack('<IIB', data[:HEADER_LENGTH])
613  assert version == PAK_FILE_VERSION, ('Unsupported pak file '
614                                       'version (%d) in %s. Only '
615                                       'support version %d' %
616                                       (version, pak_file, PAK_FILE_VERSION))
617  if num_entries > 0:
618    # Read the index and data.
619    data = data[HEADER_LENGTH:]
620    for _ in range(num_entries):
621      resource_id, offset = struct.unpack('<HI', data[:INDEX_ENTRY_SIZE])
622      data = data[INDEX_ENTRY_SIZE:]
623      _next_id, next_offset = struct.unpack('<HI', data[:INDEX_ENTRY_SIZE])
624      resource_size = next_offset - offset
625
626      symbol_name = GetReadablePakResourceName(pak_file, resource_id)
627      symbol_path = pak_file
628      symbol_type = 'd' # Data. Approximation.
629      symbol_size = resource_size
630      symbols.append((symbol_name, symbol_type, symbol_size, symbol_path))
631
632def _find_in_system_path(binary):
633  """Locate the full path to binary in the system path or return None
634  if not found."""
635  system_path = os.environ["PATH"].split(os.pathsep)
636  for path in system_path:
637    binary_path = os.path.join(path, binary)
638    if os.path.isfile(binary_path):
639      return binary_path
640  return None
641
642def CheckDebugFormatSupport(library, addr2line_binary):
643  """Kills the program if debug data is in an unsupported format.
644
645  There are two common versions of the DWARF debug formats and
646  since we are right now transitioning from DWARF2 to newer formats,
647  it's possible to have a mix of tools that are not compatible. Detect
648  that and abort rather than produce meaningless output."""
649  tool_output = subprocess.check_output([addr2line_binary, '--version'])
650  version_re = re.compile(r'^GNU [^ ]+ .* (\d+).(\d+).*?$', re.M)
651  parsed_output = version_re.match(tool_output)
652  major = int(parsed_output.group(1))
653  minor = int(parsed_output.group(2))
654  supports_dwarf4 = major > 2 or major == 2 and minor > 22
655
656  if supports_dwarf4:
657    return
658
659  print('Checking version of debug information in %s.' % library)
660  debug_info = subprocess.check_output(['readelf', '--debug-dump=info',
661                                       '--dwarf-depth=1', library])
662  dwarf_version_re = re.compile(r'^\s+Version:\s+(\d+)$', re.M)
663  parsed_dwarf_format_output = dwarf_version_re.search(debug_info)
664  version = int(parsed_dwarf_format_output.group(1))
665  if version > 2:
666    print('The supplied tools only support DWARF2 debug data but the binary\n' +
667          'uses DWARF%d. Update the tools or compile the binary\n' % version +
668          'with -gdwarf-2.')
669    sys.exit(1)
670
671
672def main():
673  usage = """%prog [options]
674
675  Runs a spatial analysis on a given library, looking up the source locations
676  of its symbols and calculating how much space each directory, source file,
677  and so on is taking. The result is a report that can be used to pinpoint
678  sources of large portions of the binary, etceteras.
679
680  Under normal circumstances, you only need to pass two arguments, thusly:
681
682      %prog --library /path/to/library --destdir /path/to/output
683
684  In this mode, the program will dump the symbols from the specified library
685  and map those symbols back to source locations, producing a web-based
686  report in the specified output directory.
687
688  Other options are available via '--help'.
689  """
690  parser = optparse.OptionParser(usage=usage)
691  parser.add_option('--nm-in', metavar='PATH',
692                    help='if specified, use nm input from <path> instead of '
693                    'generating it. Note that source locations should be '
694                    'present in the file; i.e., no addr2line symbol lookups '
695                    'will be performed when this option is specified. '
696                    'Mutually exclusive with --library.')
697  parser.add_option('--destdir', metavar='PATH',
698                    help='write output to the specified directory. An HTML '
699                    'report is generated here along with supporting files; '
700                    'any existing report will be overwritten. Not used in '
701                    'Skia.')
702  parser.add_option('--library', metavar='PATH',
703                    help='if specified, process symbols in the library at '
704                    'the specified path. Mutually exclusive with --nm-in.')
705  parser.add_option('--pak', metavar='PATH',
706                    help='if specified, includes the contents of the '
707                    'specified *.pak file in the output.')
708  parser.add_option('--nm-binary',
709                    help='use the specified nm binary to analyze library. '
710                    'This is to be used when the nm in the path is not for '
711                    'the right architecture or of the right version.')
712  parser.add_option('--addr2line-binary',
713                    help='use the specified addr2line binary to analyze '
714                    'library. This is to be used when the addr2line in '
715                    'the path is not for the right architecture or '
716                    'of the right version.')
717  parser.add_option('--jobs', type='int',
718                    help='number of jobs to use for the parallel '
719                    'addr2line processing pool; defaults to 1. More '
720                    'jobs greatly improve throughput but eat RAM like '
721                    'popcorn, and take several gigabytes each. Start low '
722                    'and ramp this number up until your machine begins to '
723                    'struggle with RAM. '
724                    'This argument is only valid when using --library.')
725  parser.add_option('-v', dest='verbose', action='store_true',
726                    help='be verbose, printing lots of status information.')
727  parser.add_option('--nm-out', metavar='PATH',
728                    help='keep the nm output file, and store it at the '
729                    'specified path. This is useful if you want to see the '
730                    'fully processed nm output after the symbols have been '
731                    'mapped to source locations. By default, a tempfile is '
732                    'used and is deleted when the program terminates.'
733                    'This argument is only valid when using --library.')
734  parser.add_option('--legacy', action='store_true',
735                    help='emit legacy binary size report instead of modern')
736  parser.add_option('--disable-disambiguation', action='store_true',
737                    help='disables the disambiguation process altogether,'
738                    ' NOTE: this may, depending on your toolchain, produce'
739                    ' output with some symbols at the top layer if addr2line'
740                    ' could not get the entire source path.')
741  parser.add_option('--source-path', default='./',
742                    help='the path to the source code of the output binary, '
743                    'default set to current directory. Used in the'
744                    ' disambiguation process.')
745  parser.add_option('--githash', default='latest',
746                    help='Git hash for the binary version. Added by Skia.')
747  parser.add_option('--commit_ts', type='int', default=-1,
748                    help='Timestamp for the commit. Added by Skia.')
749  parser.add_option('--issue_number', default='',
750                    help='The trybot issue number in string. Added by Skia.')
751  parser.add_option('--gsutil_path', default='gsutil',
752                    help='Path to gsutil binary. Added by Skia.')
753  opts, _args = parser.parse_args()
754
755  if ((not opts.library) and (not opts.nm_in)) or (opts.library and opts.nm_in):
756    parser.error('exactly one of --library or --nm-in is required')
757  if (opts.nm_in):
758    if opts.jobs:
759      print >> sys.stderr, ('WARNING: --jobs has no effect '
760                            'when used with --nm-in')
761  if not opts.jobs:
762    # Use the number of processors but cap between 2 and 4 since raw
763    # CPU power isn't the limiting factor. It's I/O limited, memory
764    # bus limited and available-memory-limited. Too many processes and
765    # the computer will run out of memory and it will be slow.
766    opts.jobs = max(2, min(4, str(multiprocessing.cpu_count())))
767
768  if opts.addr2line_binary:
769    assert os.path.isfile(opts.addr2line_binary)
770    addr2line_binary = opts.addr2line_binary
771  else:
772    addr2line_binary = _find_in_system_path('addr2line')
773    assert addr2line_binary, 'Unable to find addr2line in the path. '\
774        'Use --addr2line-binary to specify location.'
775
776  if opts.nm_binary:
777    assert os.path.isfile(opts.nm_binary)
778    nm_binary = opts.nm_binary
779  else:
780    nm_binary = _find_in_system_path('nm')
781    assert nm_binary, 'Unable to find nm in the path. Use --nm-binary '\
782        'to specify location.'
783
784  if opts.pak:
785    assert os.path.isfile(opts.pak), 'Could not find ' % opts.pak
786
787  print('addr2line: %s' % addr2line_binary)
788  print('nm: %s' % nm_binary)
789
790  if opts.library:
791    CheckDebugFormatSupport(opts.library, addr2line_binary)
792
793  symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library,
794                         opts.jobs, opts.verbose is True,
795                         addr2line_binary, nm_binary,
796                         opts.disable_disambiguation is None,
797                         opts.source_path)
798
799  if opts.pak:
800    AddPakData(symbols, opts.pak)
801
802  if opts.legacy: # legacy report
803    print 'Do Not set legacy flag.'
804
805  else: # modern report
806    if opts.library:
807      symbol_path_origin_dir = os.path.dirname(os.path.abspath(opts.library))
808    else:
809      # Just a guess. Hopefully all paths in the input file are absolute.
810      symbol_path_origin_dir = os.path.abspath(os.getcwd())
811    DumpCompactTree(symbols, symbol_path_origin_dir, opts.githash,
812                    opts.commit_ts, opts.issue_number, opts.gsutil_path)
813    print 'Report data uploaded to GS.'
814
815
816if __name__ == '__main__':
817  sys.exit(main())
818