1#!/usr/bin/env python
2# Copyright 2014 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Describe the size difference of two binaries.
7
8Generates a description of the size difference of two binaries based
9on the difference of the size of various symbols.
10
11This tool needs "nm" dumps of each binary with full symbol
12information. You can obtain the necessary dumps by running the
13run_binary_size_analysis.py script upon each binary, with the
14"--nm-out" parameter set to the location in which you want to save the
15dumps. Example:
16
17  # obtain symbol data from first binary in /tmp/nm1.dump
18  cd $CHECKOUT1_SRC
19  ninja -C out/Release binary_size_tool
20  tools/binary_size/run_binary_size_analysis \
21      --library <path_to_library>
22      --destdir /tmp/throwaway
23      --nm-out /tmp/nm1.dump
24
25  # obtain symbol data from second binary in /tmp/nm2.dump
26  cd $CHECKOUT2_SRC
27  ninja -C out/Release binary_size_tool
28  tools/binary_size/run_binary_size_analysis \
29      --library <path_to_library>
30      --destdir /tmp/throwaway
31      --nm-out /tmp/nm2.dump
32
33  # cleanup useless files
34  rm -r /tmp/throwaway
35
36  # run this tool
37  explain_binary_size_delta.py --nm1 /tmp/nm1.dump --nm2 /tmp/nm2.dump
38"""
39
40import collections
41import operator
42import optparse
43import os
44import sys
45
46import binary_size_utils
47
48
49def Compare(symbols1, symbols2):
50  """Executes a comparison of the symbols in symbols1 and symbols2.
51
52  Returns:
53      tuple of lists: (added_symbols, removed_symbols, changed_symbols, others)
54  """
55  added = [] # tuples
56  removed = [] # tuples
57  changed = [] # tuples
58  unchanged = [] # tuples
59
60  cache1 = {}
61  cache2 = {}
62  # Make a map of (file, symbol_type) : (symbol_name, symbol_size)
63  for cache, symbols in ((cache1, symbols1), (cache2, symbols2)):
64    for symbol_name, symbol_type, symbol_size, file_path in symbols:
65      if 'vtable for ' in symbol_name:
66        symbol_type = '@' # hack to categorize these separately
67      if file_path:
68        file_path = os.path.normpath(file_path)
69      else:
70        file_path = '(No Path)'
71      key = (file_path, symbol_type)
72      bucket = cache.setdefault(key, {})
73      size_list = bucket.setdefault(symbol_name, [])
74      size_list.append(symbol_size)
75
76  # Now diff them. We iterate over the elements in cache1. For each symbol
77  # that we find in cache2, we record whether it was deleted, changed, or
78  # unchanged. We then remove it from cache2; all the symbols that remain
79  # in cache2 at the end of the iteration over cache1 are the 'new' symbols.
80  for key, bucket1 in cache1.items():
81    bucket2 = cache2.get(key)
82    if not bucket2:
83      # A file was removed. Everything in bucket1 is dead.
84      for symbol_name, symbol_size_list in bucket1.items():
85        for symbol_size in symbol_size_list:
86          removed.append((key[0], key[1], symbol_name, symbol_size, None))
87    else:
88      # File still exists, look for changes within.
89      for symbol_name, symbol_size_list in bucket1.items():
90        size_list2 = bucket2.get(symbol_name)
91        if size_list2 is None:
92          # Symbol no longer exists in bucket2.
93          for symbol_size in symbol_size_list:
94            removed.append((key[0], key[1], symbol_name, symbol_size, None))
95        else:
96          del bucket2[symbol_name] # Symbol is not new, delete from cache2.
97          if len(symbol_size_list) == 1 and len(size_list2) == 1:
98            symbol_size = symbol_size_list[0]
99            size2 = size_list2[0]
100            if symbol_size != size2:
101              # Symbol has change size in bucket.
102              changed.append((key[0], key[1], symbol_name, symbol_size, size2))
103            else:
104              # Symbol is unchanged.
105              unchanged.append((key[0], key[1], symbol_name, symbol_size,
106                                size2))
107          else:
108            # Complex comparison for when a symbol exists multiple times
109            # in the same file (where file can be "unknown file").
110            symbol_size_counter = collections.Counter(symbol_size_list)
111            delta_counter = collections.Counter(symbol_size_list)
112            delta_counter.subtract(size_list2)
113            for symbol_size in sorted(delta_counter.keys()):
114              delta = delta_counter[symbol_size]
115              unchanged_count = symbol_size_counter[symbol_size]
116              if delta > 0:
117                unchanged_count -= delta
118              for _ in range(unchanged_count):
119                unchanged.append((key[0], key[1], symbol_name, symbol_size,
120                                  symbol_size))
121              if delta > 0: # Used to be more of these than there is now.
122                for _ in range(delta):
123                  removed.append((key[0], key[1], symbol_name, symbol_size,
124                                  None))
125              elif delta < 0: # More of this (symbol,size) now.
126                for _ in range(-delta):
127                  added.append((key[0], key[1], symbol_name, None, symbol_size))
128
129          if len(bucket2) == 0:
130            del cache1[key] # Entire bucket is empty, delete from cache2
131
132  # We have now analyzed all symbols that are in cache1 and removed all of
133  # the encountered symbols from cache2. What's left in cache2 is the new
134  # symbols.
135  for key, bucket2 in cache2.iteritems():
136    for symbol_name, symbol_size_list in bucket2.items():
137      for symbol_size in symbol_size_list:
138        added.append((key[0], key[1], symbol_name, None, symbol_size))
139  return (added, removed, changed, unchanged)
140
141def DeltaStr(number):
142  """Returns the number as a string with a '+' prefix if it's > 0 and
143  a '-' prefix if it's < 0."""
144  result = str(number)
145  if number > 0:
146    result = '+' + result
147  return result
148
149
150class CrunchStatsData(object):
151  """Stores a summary of data of a certain kind."""
152  def __init__(self, symbols):
153    self.symbols = symbols
154    self.sources = set()
155    self.before_size = 0
156    self.after_size = 0
157    self.symbols_by_path = {}
158
159
160def CrunchStats(added, removed, changed, unchanged, showsources, showsymbols):
161  """Outputs to stdout a summary of changes based on the symbol lists."""
162  # Split changed into grown and shrunk because that is easier to
163  # discuss.
164  grown = []
165  shrunk = []
166  for item in changed:
167    file_path, symbol_type, symbol_name, size1, size2 = item
168    if size1 < size2:
169      grown.append(item)
170    else:
171      shrunk.append(item)
172
173  new_symbols = CrunchStatsData(added)
174  removed_symbols = CrunchStatsData(removed)
175  grown_symbols = CrunchStatsData(grown)
176  shrunk_symbols = CrunchStatsData(shrunk)
177  sections = [new_symbols, removed_symbols, grown_symbols, shrunk_symbols]
178  for section in sections:
179    for file_path, symbol_type, symbol_name, size1, size2 in section.symbols:
180      section.sources.add(file_path)
181      if size1 is not None:
182        section.before_size += size1
183      if size2 is not None:
184        section.after_size += size2
185      bucket = section.symbols_by_path.setdefault(file_path, [])
186      bucket.append((symbol_name, symbol_type, size1, size2))
187
188  total_change = sum(s.after_size - s.before_size for s in sections)
189  summary = 'Total change: %s bytes' % DeltaStr(total_change)
190  print(summary)
191  print('=' * len(summary))
192  for section in sections:
193    if not section.symbols:
194      continue
195    if section.before_size == 0:
196      description = ('added, totalling %s bytes' % DeltaStr(section.after_size))
197    elif section.after_size == 0:
198      description = ('removed, totalling %s bytes' %
199                     DeltaStr(-section.before_size))
200    else:
201      if section.after_size > section.before_size:
202        type_str = 'grown'
203      else:
204        type_str = 'shrunk'
205      description = ('%s, for a net change of %s bytes '
206                     '(%d bytes before, %d bytes after)' %
207            (type_str, DeltaStr(section.after_size - section.before_size),
208             section.before_size, section.after_size))
209    print('  %d %s across %d sources' %
210          (len(section.symbols), description, len(section.sources)))
211
212  maybe_unchanged_sources = set()
213  unchanged_symbols_size = 0
214  for file_path, symbol_type, symbol_name, size1, size2 in unchanged:
215    maybe_unchanged_sources.add(file_path)
216    unchanged_symbols_size += size1 # == size2
217  print('  %d unchanged, totalling %d bytes' %
218        (len(unchanged), unchanged_symbols_size))
219
220  # High level analysis, always output.
221  unchanged_sources = maybe_unchanged_sources
222  for section in sections:
223    unchanged_sources = unchanged_sources - section.sources
224  new_sources = (new_symbols.sources -
225    maybe_unchanged_sources -
226    removed_symbols.sources)
227  removed_sources = (removed_symbols.sources -
228    maybe_unchanged_sources -
229    new_symbols.sources)
230  partially_changed_sources = (grown_symbols.sources |
231    shrunk_symbols.sources | new_symbols.sources |
232    removed_symbols.sources) - removed_sources - new_sources
233  allFiles = set()
234  for section in sections:
235    allFiles = allFiles | section.sources
236  allFiles = allFiles | maybe_unchanged_sources
237  print 'Source stats:'
238  print('  %d sources encountered.' % len(allFiles))
239  print('  %d completely new.' % len(new_sources))
240  print('  %d removed completely.' % len(removed_sources))
241  print('  %d partially changed.' % len(partially_changed_sources))
242  print('  %d completely unchanged.' % len(unchanged_sources))
243  remainder = (allFiles - new_sources - removed_sources -
244    partially_changed_sources - unchanged_sources)
245  assert len(remainder) == 0
246
247  if not showsources:
248    return  # Per-source analysis, only if requested
249  print 'Per-source Analysis:'
250  delta_by_path = {}
251  for section in sections:
252    for path in section.symbols_by_path:
253      entry = delta_by_path.get(path)
254      if not entry:
255        entry = {'plus': 0, 'minus': 0}
256        delta_by_path[path] = entry
257      for symbol_name, symbol_type, size1, size2 in \
258            section.symbols_by_path[path]:
259        if size1 is None:
260          delta = size2
261        elif size2 is None:
262          delta = -size1
263        else:
264          delta = size2 - size1
265
266        if delta > 0:
267          entry['plus'] += delta
268        else:
269          entry['minus'] += (-1 * delta)
270
271  def delta_sort_key(item):
272    _path, size_data = item
273    growth = size_data['plus'] - size_data['minus']
274    return growth
275
276  for path, size_data in sorted(delta_by_path.iteritems(), key=delta_sort_key,
277                                reverse=True):
278    gain = size_data['plus']
279    loss = size_data['minus']
280    delta = size_data['plus'] - size_data['minus']
281    header = ' %s - Source: %s - (gained %d, lost %d)' % (DeltaStr(delta),
282                                                          path, gain, loss)
283    divider = '-' * len(header)
284    print ''
285    print divider
286    print header
287    print divider
288    if showsymbols:
289      if path in new_symbols.symbols_by_path:
290        print '  New symbols:'
291        for symbol_name, symbol_type, size1, size2 in \
292            sorted(new_symbols.symbols_by_path[path],
293                   key=operator.itemgetter(3),
294                   reverse=True):
295          print ('   %8s: %s type=%s, size=%d bytes' %
296                 (DeltaStr(size2), symbol_name, symbol_type, size2))
297      if path in removed_symbols.symbols_by_path:
298        print '  Removed symbols:'
299        for symbol_name, symbol_type, size1, size2 in \
300            sorted(removed_symbols.symbols_by_path[path],
301                   key=operator.itemgetter(2)):
302          print ('   %8s: %s type=%s, size=%d bytes' %
303                 (DeltaStr(-size1), symbol_name, symbol_type, size1))
304      for (changed_symbols_by_path, type_str) in [
305        (grown_symbols.symbols_by_path, "Grown"),
306        (shrunk_symbols.symbols_by_path, "Shrunk")]:
307        if path in changed_symbols_by_path:
308          print '  %s symbols:' % type_str
309          def changed_symbol_sortkey(item):
310            symbol_name, _symbol_type, size1, size2 = item
311            return (size1 - size2, symbol_name)
312          for symbol_name, symbol_type, size1, size2 in \
313              sorted(changed_symbols_by_path[path], key=changed_symbol_sortkey):
314            print ('   %8s: %s type=%s, (was %d bytes, now %d bytes)'
315                   % (DeltaStr(size2 - size1), symbol_name,
316                      symbol_type, size1, size2))
317
318
319def main():
320  usage = """%prog [options]
321
322  Analyzes the symbolic differences between two binary files
323  (typically, not necessarily, two different builds of the same
324  library) and produces a detailed description of symbols that have
325  been added, removed, or whose size has changed.
326
327  Example:
328       explain_binary_size_delta.py --nm1 /tmp/nm1.dump --nm2 /tmp/nm2.dump
329
330  Options are available via '--help'.
331  """
332  parser = optparse.OptionParser(usage=usage)
333  parser.add_option('--nm1', metavar='PATH',
334                    help='the nm dump of the first library')
335  parser.add_option('--nm2', metavar='PATH',
336                    help='the nm dump of the second library')
337  parser.add_option('--showsources', action='store_true', default=False,
338                    help='show per-source statistics')
339  parser.add_option('--showsymbols', action='store_true', default=False,
340                    help='show all symbol information; implies --showfiles')
341  parser.add_option('--verbose', action='store_true', default=False,
342                    help='output internal debugging stuff')
343  opts, _args = parser.parse_args()
344
345  if not opts.nm1:
346    parser.error('--nm1 is required')
347  if not opts.nm2:
348    parser.error('--nm2 is required')
349  symbols = []
350  for path in [opts.nm1, opts.nm2]:
351    with file(path, 'r') as nm_input:
352      if opts.verbose:
353        print 'parsing ' + path + '...'
354      symbols.append(list(binary_size_utils.ParseNm(nm_input)))
355  (added, removed, changed, unchanged) = Compare(symbols[0], symbols[1])
356  CrunchStats(added, removed, changed, unchanged,
357    opts.showsources | opts.showsymbols, opts.showsymbols)
358
359if __name__ == '__main__':
360  sys.exit(main())
361