run_binary_size_analysis.py revision 5d1f7b1de12d16ceb2c938c56701a3e8bfa558f7
#!/usr/bin/python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Generate a spatial analysis against an arbitrary library.

To use, build the 'binary_size_tool' target. Then run this tool, passing
in the location of the library to be analyzed along with any other options
you desire.
"""

import collections
import fileinput
import itertools
import json
import optparse
import os
import pprint
import re
import shutil
import subprocess
import sys
import tempfile


# Human-readable names for the (lower-cased) symbol type letters printed by nm.
_SYMBOL_TYPE_NAMES = {
    'b': 'bss',
    'd': 'data',
    'r': 'read-only data',
    't': 'code',
    'w': 'weak symbol',
    'v': 'weak symbol',
}

# Per-file size bucket for each (lower-cased) nm symbol type letter.
_SIZE_BUCKETS = {
    'r': '[rodata]',
    'd': '[data]',
    'b': '[bss]',
    # 'text' in binary parlance means 'code'.
    't': '[code]',
    'w': '[weak]',
}

# Match lines with size, symbol, optional location, optional discriminator.
# nm prints 8 hex digits per column for 32-bit objects and 16 for 64-bit
# ones, hence '{8,}' rather than a fixed width.
_NM_SYM_RE = re.compile(
    r'^[0-9a-f]{8,} '            # address (8+ hex digits)
    r'([0-9a-f]{8,}) '           # size (8+ hex digits)
    r'(.) '                      # symbol type, one character
    r'([^\t]+)'                  # symbol name, separated from next by tab
    r'(?:\t(.*):[\d\?]+)?.*$')   # location
# Match lines with addr but no size.
_NM_ADDR_RE = re.compile(r'^[0-9a-f]{8,} (.) ([^\t]+)(?:\t.*)?$')
# Match lines that don't have an address at all -- typically external symbols.
_NM_NOADDR_RE = re.compile(r'^ {8,} (.) (.*)$')

# arch -> (NDK toolchain directory, binutils executable prefix).
_ANDROID_TOOLCHAINS = {
    'android-arm': ('arm-linux-androideabi-4.7', 'arm-linux-androideabi-'),
    'android-mips': ('mipsel-linux-android-4.7', 'mipsel-linux-android-'),
    'android-x86': ('x86-4.7', 'i686-linux-android-'),
}


def FormatBytes(bytes):
  """Pretty-print a number of bytes.

  Values above 1e6 are rendered in units of 1e6 with an 'm' suffix, values
  above 1e3 in units of 1e3 with a 'k' suffix (one decimal place each);
  anything else is returned as str(bytes).
  """
  if bytes > 1e6:
    return '%.1fm' % (bytes / 1.0e6)
  if bytes > 1e3:
    return '%.1fk' % (bytes / 1.0e3)
  return str(bytes)


def SymbolTypeToHuman(type):
  """Convert a symbol type as printed by nm into a human-readable name.

  Unknown type letters are reported as 'other (<letter>)' instead of raising,
  so that a single unusual symbol cannot abort report generation.
  """
  return _SYMBOL_TYPE_NAMES.get(type, 'other (%s)' % type)


def ParseNm(input):
  """Parse nm output.

  Argument: an iterable over lines of nm output.

  Yields: (symbol name, symbol type, symbol size, source file path).
  Path may be None if nm couldn't figure out the source file.

  The symbol type is lower-cased and 'v' is folded into 'w'. BSS symbols,
  symbols without a size and external/weak symbols without an address are
  skipped. Lines that match none of the known formats are reported on stderr.
  """
  for line in input:
    line = line.rstrip()
    match = _NM_SYM_RE.match(line)
    if match:
      size, symbol_type, sym = match.groups()[0:3]
      symbol_type = symbol_type.lower()
      if symbol_type == 'v':
        symbol_type = 'w'  # just call them all weak
      if symbol_type == 'b':
        continue  # skip all BSS for now
      yield sym, symbol_type, int(size, 16), match.group(4)
      continue
    if _NM_ADDR_RE.match(line):
      # No size == we don't care.
      continue
    match = _NM_NOADDR_RE.match(line)
    if match and match.group(1) in ('U', 'w'):
      # external or weak symbol
      continue

    sys.stderr.write('unparsed: %r\n' % (line,))


def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information
  along the way.

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., c++ files)
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, etceteras) and whose values are the size, in bytes, of
              those categories;
       size:  the total size, in bytes, of all the entries in the sizes dict
  2. Non-leaf nodes, representing directories
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size:     the total size, in bytes, of all the leaf nodes that are
                 contained within the children dict (recursively expanded)

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, e.g. a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the size of all
  leaf nodes within the data structure.

  Symbols that have no path are collected under a synthetic leaf named
  'symbols without paths', bucketed by a prefix derived from the symbol name.

  Argument: an iterable of (symbol name, type, size, path) tuples, as yielded
  by ParseNm.
  """
  dirs = {'children': {}, 'size': 0}
  for sym, symbol_type, size, path in symbols:
    dirs['size'] += size
    if path:
      # normpath() keeps a leading '//' intact, so strip every leading slash
      # rather than just the first one to avoid an empty path component.
      path = os.path.normpath(path).lstrip('/')

    parts = path.split('/') if path else None

    if parts:
      file_key = parts.pop()
      tree = dirs
      try:
        # Traverse the tree to the parent of the file node, creating as needed
        for part in parts:
          if part not in tree['children']:
            tree['children'][part] = {'children': {}, 'size': 0}
          tree = tree['children'][part]
          tree['size'] += size

        # Get (creating if necessary) the node for the file
        # This node doesn't have a 'children' attribute
        if file_key not in tree['children']:
          tree['children'][file_key] = {'sizes': collections.defaultdict(int),
                                        'size': 0}
        tree = tree['children'][file_key]
        tree['size'] += size

        # Accumulate size into a bucket within the file
        if 'vtable for ' in sym:
          bucket = '[vtable]'
        else:
          bucket = _SIZE_BUCKETS.get(symbol_type.lower(), '[other]')
        tree['sizes'][bucket] += size
      except Exception:
        # Give some context about the offending symbol, then propagate.
        sys.stderr.write('%s %s %s\n' % (sym, parts, file_key))
        raise
    else:
      key = 'symbols without paths'
      if key not in dirs['children']:
        dirs['children'][key] = {'sizes': collections.defaultdict(int),
                                 'size': 0}
      tree = dirs['children'][key]
      subkey = 'misc'
      if sym.endswith(('::__FUNCTION__', '::__PRETTY_FUNCTION__')):
        subkey = '__FUNCTION__'
      elif sym.startswith('CSWTCH.'):
        subkey = 'CSWTCH'
      elif '::' in sym:
        # Group by everything up to and including the first '::'.
        subkey = sym[0:sym.find('::') + 2]
      tree['sizes'][subkey] += size
      tree['size'] += size
  return dirs


def JsonifyTree(tree, name):
  """Convert TreeifySymbols output to a JSON treemap.

  The format is very similar, with the notable exceptions being
  lists of children instead of maps and some different attribute names.

  Arguments:
    tree: a node as produced by TreeifySymbols.
    name: the display name of that node.

  Returns: a dict with 'name', 'data' and 'children' entries, where children
  are sorted by size, largest first.
  """
  children = []
  css_class_map = {
      '[vtable]': 'vtable',
      '[rodata]': 'read-only_data',
      '[data]': 'data',
      '[bss]': 'bss',
      '[code]': 'code',
      '[weak]': 'weak_symbol',
  }
  if 'children' in tree:
    # Non-leaf node. Recurse.
    for child_name, child in tree['children'].items():
      children.append(JsonifyTree(child, child_name))
  else:
    # Leaf node; dump per-file stats as entries in the treemap
    for kind, size in tree['sizes'].items():
      child_json = {'name': kind + ' (' + FormatBytes(size) + ')',
                    'data': {'$area': size}}
      css_class = css_class_map.get(kind)
      if css_class is not None:
        child_json['data']['$symbol'] = css_class
      children.append(child_json)
  # Sort children by size, largest to smallest.
  children.sort(key=lambda child: -child['data']['$area'])

  # For leaf nodes, the 'size' attribute is the size of the leaf;
  # Non-leaf nodes don't really have a size, but their 'size' attribute is
  # the sum of the sizes of all their children.
  return {'name': name + ' (' + FormatBytes(tree['size']) + ')',
          'data': {'$area': tree['size']},
          'children': children}


def _WriteJsArray(outfile, var_name, entries):
  """Write entries to outfile as a JavaScript array assigned to var_name."""
  with open(outfile, 'w') as out:
    out.write('var %s = [\n' % var_name)
    for entry in entries:
      out.write(json.dumps(entry))
      out.write(',\n')
    out.write('];\n')


def DumpTreemap(symbols, outfile):
  """Write the treemap for symbols to outfile as 'var kTree = <json>'."""
  dirs = TreeifySymbols(symbols)
  with open(outfile, 'w') as out:
    out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))


def DumpLargestSymbols(symbols, outfile, n):
  """Write the n largest symbols to outfile as 'var largestSymbols = [...]'.

  symbols is a list of (sym, type, size, path); bss and weak symbols are
  skipped.
  """
  by_size = sorted(symbols, key=lambda x: -x[2])
  entries = ({'size': FormatBytes(size),
              'symbol': sym,
              'type': SymbolTypeToHuman(symbol_type),
              'location': '' if path is None else path}
             for sym, symbol_type, size, path in by_size
             if symbol_type not in ('b', 'w'))  # skip bss and weak symbols
  _WriteJsArray(outfile, 'largestSymbols',
                itertools.islice(entries, max(n, 0)))


def MakeSourceMap(symbols):
  """Aggregate symbols per source file.

  Returns: a dict keyed by normalized path ('[no path]' for symbols without
  one) whose values are dicts with the entries 'path' (the path of the first
  symbol seen for that key, as given), 'symbol_count' and 'size'.
  """
  sources = {}
  for _, _, size, path in symbols:
    key = os.path.normpath(path) if path else '[no path]'
    if key not in sources:
      sources[key] = {'path': path, 'symbol_count': 0, 'size': 0}
    record = sources[key]
    record['size'] += size
    record['symbol_count'] += 1
  return sources


def DumpLargestSources(symbols, outfile, n):
  """Write the n largest source files to outfile as
  'var largestSources = [...]'."""
  source_map = MakeSourceMap(symbols)
  by_size = sorted(source_map.values(), key=lambda x: -x['size'])
  entries = ({'size': FormatBytes(record['size']),
              'symbol_count': str(record['symbol_count']),
              'location': record['path']}
             for record in by_size)
  _WriteJsArray(outfile, 'largestSources',
                itertools.islice(entries, max(n, 0)))


def DumpLargestVTables(symbols, outfile, n):
  """Write the n largest vtables to outfile as 'var largestVTables = [...]'.

  A symbol counts as a vtable when its name contains 'vtable for '.
  """
  vtables = [(sym, size, path) for sym, _, size, path in symbols
             if 'vtable for ' in sym]
  vtables.sort(key=lambda x: -x[1])
  entries = ({'size': FormatBytes(size),
              'symbol': sym,
              'location': path}
             for sym, size, path in vtables)
  _WriteJsArray(outfile, 'largestVTables',
                itertools.islice(entries, max(n, 0)))


def RunParallelAddress2Line(outfile, library, arch, jobs, verbose):
  """Run a parallel addr2line processing engine to dump and resolve symbols.

  Arguments:
    outfile: path that the resolved nm output is written to.
    library: path of the library to analyze.
    arch: one of the --arch choices; for the android-* values the nm and
        addr2line binaries of the matching NDK toolchain are passed along,
        otherwise whatever is in PATH is used.
    jobs: number of threads to pass to the tool.
    verbose: if true, pass --verbose and print the command line.

  Raises: RuntimeError if the tool exits with a non-zero status.
  """
  out_dir = os.getenv('CHROMIUM_OUT_DIR', 'out')
  build_type = os.getenv('BUILDTYPE', 'Release')
  classpath = os.path.join(out_dir, build_type, 'lib.java',
                           'binary_size_java.jar')
  cmd = ['java',
         '-classpath', classpath,
         'org.chromium.tools.binary_size.ParallelAddress2Line',
         '--disambiguate',
         '--outfile', outfile,
         '--library', library,
         '--threads', str(jobs)]  # argv entries must be strings
  if verbose:
    cmd.append('--verbose')
  toolchain = _ANDROID_TOOLCHAINS.get(arch)
  if toolchain is not None:
    toolchain_dir, tool_prefix = toolchain
    prefix = os.path.join('third_party', 'android_tools', 'ndk', 'toolchains',
                          toolchain_dir, 'prebuilt', 'linux-x86_64', 'bin',
                          tool_prefix)
    cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])
  # else, use whatever is in PATH (don't pass --nm or --addr2line)

  if verbose:
    print(cmd)

  return_code = subprocess.call(cmd)
  if return_code:
    raise RuntimeError('Failed to run ParallelAddress2Line: returned ' +
                       str(return_code))


def GetNmSymbols(infile, outfile, library, arch, jobs, verbose):
  """Return the list of symbols parsed from nm output.

  If infile is given it is parsed directly. Otherwise the symbols of library
  are dumped via RunParallelAddress2Line into outfile, or into a temporary
  file (removed again before returning) if outfile is None, and parsed from
  there.
  """
  temp_path = None
  try:
    if infile is None:
      if outfile is None:
        handle = tempfile.NamedTemporaryFile(delete=False)
        handle.close()
        temp_path = handle.name
        infile = temp_path
      else:
        infile = outfile

      if verbose:
        print('Running parallel addr2line, dumping symbols to ' + infile)
      RunParallelAddress2Line(outfile=infile, library=library, arch=arch,
                              jobs=jobs, verbose=verbose)
    elif verbose:
      print('Using nm input from ' + infile)
    with open(infile, 'r') as nm_file:
      return list(ParseNm(nm_file))
  finally:
    # Only remove the file if we created it ourselves.
    if temp_path is not None and os.path.exists(temp_path):
      os.remove(temp_path)


def main():
  """Parse the command line and generate the report in --destdir.

  The webtreemap sources and the HTML template are copied from paths that are
  relative to the current working directory.
  """
  usage="""%prog [options]

  Runs a spatial analysis on a given library, looking up the source locations
  of its symbols and calculating how much space each directory, source file,
  and so on is taking. The result is a report that can be used to pinpoint
  sources of large portions of the binary, etceteras.

  Under normal circumstances, you only need to pass two arguments, thusly:

      %prog --library /path/to/library --destdir /path/to/output

  In this mode, the program will dump the symbols from the specified library
  and map those symbols back to source locations, producing a web-based
  report in the specified output directory.

  Other options are available via '--help'.
  """
  parser = optparse.OptionParser(usage=usage)
  parser.add_option('--nm-in', metavar='PATH',
                    help='if specified, use nm input from <path> instead of '
                    'generating it. Note that source locations should be '
                    'present in the file; i.e., no addr2line symbol lookups '
                    'will be performed when this option is specified. '
                    'Mutually exclusive with --library.')
  parser.add_option('--destdir', metavar='PATH',
                    help='write output to the specified directory. An HTML '
                    'report is generated here along with supporting files; '
                    'any existing report will be overwritten.')
  parser.add_option('--library', metavar='PATH',
                    help='if specified, process symbols in the library at '
                    'the specified path. Mutually exclusive with --nm-in.')
  parser.add_option('--arch',
                    help='the architecture that the library is targeted to. '
                    'Determines which nm/addr2line binaries are used. When '
                    '\'host-native\' is chosen, the program will use whichever '
                    'nm/addr2line binaries are on the PATH. This is '
                    'appropriate when you are analyzing a binary by and for '
                    'your computer. '
                    'This argument is only valid when using --library. '
                    'Default is \'host-native\'.',
                    choices=['host-native', 'android-arm',
                             'android-mips', 'android-x86'],)
  parser.add_option('--jobs',
                    help='number of jobs to use for the parallel '
                    'addr2line processing pool; defaults to 1. More '
                    'jobs greatly improve throughput but eat RAM like '
                    'popcorn, and take several gigabytes each. Start low '
                    'and ramp this number up until your machine begins to '
                    'struggle with RAM. '
                    'This argument is only valid when using --library.')
  parser.add_option('-v', dest='verbose', action='store_true',
                    help='be verbose, printing lots of status information.')
  parser.add_option('--nm-out', metavar='PATH',
                    help='keep the nm output file, and store it at the '
                    'specified path. This is useful if you want to see the '
                    'fully processed nm output after the symbols have been '
                    'mapped to source locations. By default, a tempfile is '
                    'used and is deleted when the program terminates. '
                    'This argument is only valid when using --library.')
  opts, args = parser.parse_args()

  if bool(opts.library) == bool(opts.nm_in):
    parser.error('exactly one of --library or --nm-in is required')
  if opts.nm_in:
    if opts.jobs:
      sys.stderr.write('WARNING: --jobs has no effect '
                       'when used with --nm-in\n')
    if opts.arch:
      sys.stderr.write('WARNING: --arch has no effect '
                       'when used with --nm-in\n')
  if not opts.destdir:
    parser.error('--destdir is required argument')
  if not opts.jobs:
    opts.jobs = '1'
  if not opts.arch:
    opts.arch = 'host-native'

  symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library, opts.arch,
                         opts.jobs, opts.verbose is True)
  if not os.path.exists(opts.destdir):
    os.makedirs(opts.destdir, 0o755)

  DumpTreemap(symbols, os.path.join(opts.destdir, 'treemap-dump.js'))
  DumpLargestSymbols(symbols,
                     os.path.join(opts.destdir, 'largest-symbols.js'), 100)
  DumpLargestSources(symbols,
                     os.path.join(opts.destdir, 'largest-sources.js'), 100)
  DumpLargestVTables(symbols,
                     os.path.join(opts.destdir, 'largest-vtables.js'), 100)

  # TODO(andrewhayden): Switch to D3 for greater flexibility
  treemap_out = os.path.join(opts.destdir, 'webtreemap')
  if not os.path.exists(treemap_out):
    os.makedirs(treemap_out, 0o755)
  treemap_src = os.path.join('third_party', 'webtreemap', 'src')
  for filename in ('COPYING', 'webtreemap.js', 'webtreemap.css'):
    shutil.copy(os.path.join(treemap_src, filename), treemap_out)
  shutil.copy(os.path.join('tools', 'binary_size', 'template', 'index.html'),
              opts.destdir)
  if opts.verbose:
    print('Report saved to ' + opts.destdir + '/index.html')


if __name__ == '__main__':
  sys.exit(main())