#!/usr/bin/env python
# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to transform and merge sancov files into a human-readable json
format.

The script supports three actions:
all: Writes a json file with all instrumented lines of all executables.
merge: Merges sancov files with coverage output into an existing json file.
split: Splits the json file into separate files per covered source file.

The json data is structured as follows:
{
  "version": 1,
  "tests": ["executable1", "executable2", ...],
  "files": {
    "file1": [[<instr line 1>, <bit_mask>], [<instr line 2>, <bit_mask>], ...],
    "file2": [...],
    ...
  }
}

The executables are sorted and determine the test bit mask. The executable
with (zero-based) index i corresponds to the bit value 1 << i, e.g.
executable1 = 1, executable3 = 4. Hence, a line covered by executable1 and
executable3 will have bit_mask == 5 == 0b101. The number of tests is
restricted to 52 in version 1, to allow JavaScript JSON parsing of the
bitsets encoded as numbers. The JS max safe integer is (1 << 53) - 1.

The line-number/bit_mask pairs are sorted by line number and don't contain
duplicates.

Split json data preserves the same format, but only contains one file per
json file.

The sancov tool is expected to be in the llvm compiler-rt third-party
directory. It's not checked out by default and must be added as a custom_deps
entry:
'v8/third_party/llvm/projects/compiler-rt':
    'https://chromium.googlesource.com/external/llvm.org/compiler-rt.git'
"""

import argparse
import json
import logging
import os
import re
import subprocess
import sys

from multiprocessing import Pool, cpu_count


logging.basicConfig(level=logging.INFO)

# Files to exclude from coverage. Dropping their data early speeds up
# processing. The contained cc files are already excluded from
# instrumentation, but inlined data is referenced through v8's object files.
EXCLUSIONS = [
  'buildtools',
  'src/third_party',
  'third_party',
  'test',
  'testing',
]

# Executables found in the build output for which no coverage is generated.
# Exclude them from the coverage data file.
EXE_BLACKLIST = [
  'generate-bytecode-expectations',
  'hello-world',
  'mksnapshot',
  'parser-shell',
  'process',
  'shell',
]

# V8 checkout directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))

# Executable location. TODO(machenbach): Only release is supported for now.
BUILD_DIR = os.path.join(BASE_DIR, 'out', 'Release')

# Path prefix added by the llvm symbolizer including trailing slash.
OUTPUT_PATH_PREFIX = os.path.join(BUILD_DIR, '..', '..', '')

# The sancov tool location.
SANCOV_TOOL = os.path.join(
    BASE_DIR, 'third_party', 'llvm', 'projects', 'compiler-rt',
    'lib', 'sanitizer_common', 'scripts', 'sancov.py')

# Simple script to sanitize the PCs from objdump.
SANITIZE_PCS = os.path.join(BASE_DIR, 'tools', 'sanitizers', 'sanitize_pcs.py')

# The llvm symbolizer location.
SYMBOLIZER = os.path.join(
    BASE_DIR, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
    'llvm-symbolizer')

# Number of cpus.
CPUS = cpu_count()

# Regexp to find sancov files as output by sancov_merger.py. Also grabs the
# executable name in group 1.
SANCOV_FILE_RE = re.compile(r'^(.*)\.result\.sancov$')

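# Illustrative example (not used by this script): decoding which tests
# covered a given line from the json data described in the module docstring.
# The helper name and data values below are made up; 'tests' and the
# [line, bit_mask] pairs are as specified above.
#
#   def covering_tests(data, file_name, line_number):
#     for line, mask in data['files'][file_name]:
#       if line == line_number:
#         return [t for i, t in enumerate(data['tests']) if mask & (1 << i)]
#     return None
#
# With tests == ['executable1', 'executable2', 'executable3'] and an entry
# [42, 5], this would return ['executable1', 'executable3'], since
# 5 == 0b101 sets bits 0 and 2.
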

def executables():
  """Iterates over executable files in the build directory."""
  for f in os.listdir(BUILD_DIR):
    file_path = os.path.join(BUILD_DIR, f)
    if (os.path.isfile(file_path) and
        os.access(file_path, os.X_OK) and
        f not in EXE_BLACKLIST):
      yield file_path


def process_symbolizer_output(output):
  """Post-processes llvm symbolizer output.

  Excludes files outside the v8 checkout or matching the exclusion list above
  from further processing. Drops the character index in each line.

  Returns: A mapping of file names to lists of line numbers. The file names
           have relative paths to the v8 base directory. The lists of line
           numbers don't contain duplicate lines and are sorted.
  """
  # Drop the path prefix when iterating lines. The path is redundant and
  # takes too much space. Drop files outside that path, e.g. generated files
  # in the build dir and absolute paths to c++ library headers.
  def iter_lines():
    for line in output.strip().splitlines():
      if line.startswith(OUTPUT_PATH_PREFIX):
        yield line[len(OUTPUT_PATH_PREFIX):]

  # Map file names to sets of instrumented line numbers.
  file_map = {}
  for line in iter_lines():
    # Drop the character number, we only care about line numbers. Each line
    # has the form: <file name>:<line number>:<character number>.
    file_name, number, _ = line.split(':')
    file_map.setdefault(file_name, set([])).add(int(number))

  # Remove exclusion patterns from the file map. It's cheaper to do this
  # after the mapping, as there are few excluded files and we don't want to
  # do this check for numerous lines in ordinary files.
  def keep(file_name):
    for e in EXCLUSIONS:
      if file_name.startswith(e):
        return False
    return True

  # Return in serializable form and filter.
  return {k: sorted(file_map[k]) for k in file_map if keep(k)}


def get_instrumented_lines(executable):
  """Returns the instrumented lines of an executable.

  Called through a multiprocessing pool.

  Returns: Post-processed llvm output as returned by process_symbolizer_output.
  """
  # The first two pipes are from llvm's tool sancov.py with 0x added to the
  # hex numbers. The results are piped into the llvm symbolizer, which
  # outputs for each PC:
  # <file name with abs path>:<line number>:<character number>.
  # We don't call the sancov tool to get more speed.
  process = subprocess.Popen(
      'objdump -d %s | '
      'grep \'^\s\+[0-9a-f]\+:.*\scall\(q\|\)\s\+[0-9a-f]\+ '
      '<__sanitizer_cov\(_with_check\|\)\(@plt\|\)>\' | '
      'grep \'^\s\+[0-9a-f]\+\' -o | '
      '%s | '
      '%s --obj %s -functions=none' %
          (executable, SANITIZE_PCS, SYMBOLIZER, executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output)

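# Illustrative data flow for the pipeline above (paths and values made up):
# the symbolizer emits one line per instrumented PC, e.g.
#   /.../out/Release/../../src/heap/heap.cc:42:7
# After stripping OUTPUT_PATH_PREFIX, dropping the character index, removing
# duplicates and filtering exclusions, process_symbolizer_output yields e.g.
#   {'src/heap/heap.cc': [42, 43, 91]}
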

def merge_instrumented_line_results(exe_list, results):
  """Merges multiprocessing results for all instrumented lines.

  Args:
    exe_list: List of all executable names with absolute paths.
    results: List of results as returned by get_instrumented_lines.

  Returns: Dict to be used as json data as specified at the top of this file.
           The dictionary contains all instrumented lines of all files
           referenced by all executables.
  """
  def merge_files(x, y):
    for file_name, lines in y.iteritems():
      x.setdefault(file_name, set([])).update(lines)
    return x
  result = reduce(merge_files, results, {})

  # Return data as file->lines mapping. The lines are saved as lists
  # with (line number, test bits (as int)). The test bits are initialized
  # with 0, meaning instrumented, but no coverage.
  # The order of the test bits is given by the key 'tests'. For now, these
  # are the executable names. We use a _list_ with two items instead of a
  # tuple to ease merging by allowing mutation of the second item.
  return {
    'version': 1,
    'tests': sorted(map(os.path.basename, exe_list)),
    'files': {f: map(lambda l: [l, 0], sorted(result[f])) for f in result},
  }


def write_instrumented(options):
  """Implements the 'all' action of this tool."""
  exe_list = list(executables())
  logging.info('Reading instrumented lines from %d executables.',
               len(exe_list))
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_instrumented_lines, exe_list)
  finally:
    pool.close()

  # Merge multiprocessing results and prepare output data.
  data = merge_instrumented_line_results(exe_list, results)

  logging.info('Read data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write json output.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def get_covered_lines(args):
  """Returns the covered lines of an executable.

  Called through a multiprocessing pool. The args are expected to unpack to:
    cov_dir: Folder with sancov files merged by sancov_merger.py.
    executable: The executable that was called to produce the given coverage
                data.
    sancov_file: The merged sancov file with coverage data.

  Returns: A tuple of post-processed llvm output as returned by
           process_symbolizer_output and the executable name.
  """
  cov_dir, executable, sancov_file = args

  # Let the sancov tool print the covered PCs and pipe them through the llvm
  # symbolizer.
  process = subprocess.Popen(
      '%s print %s 2> /dev/null | '
      '%s --obj %s -functions=none' %
          (SANCOV_TOOL,
           os.path.join(cov_dir, sancov_file),
           SYMBOLIZER,
           os.path.join(BUILD_DIR, executable)),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output), executable

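# Illustrative merge semantics (values made up): with tests == ['exe1',
# 'exe2'], an instrumented entry [42, 0] in the json data and coverage of
# line 42 reported for exe2 (test bit index 1), merging ORs in 1 << 1 and
# the entry becomes [42, 2]. A later result covering the same line with
# exe1 would OR in 1 << 0, giving [42, 3].
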

def merge_covered_line_results(data, results):
  """Merges multiprocessing results for covered lines.

  The data is mutated, the results are merged into it in place.

  Args:
    data: Existing coverage data from the json file containing all
          instrumented lines.
    results: List of results as returned by get_covered_lines.
  """

  # List of executables and mapping to the test bit mask. The number of
  # tests is restricted to 52, to allow JavaScript JSON parsing of
  # the bitsets encoded as numbers. The JS max safe integer is (1 << 53) - 1.
  exe_list = data['tests']
  assert len(exe_list) <= 52, 'Max 52 different tests are supported.'
  test_bit_masks = {exe: 1 << i for i, exe in enumerate(exe_list)}

  def merge_lines(old_lines, new_lines, mask):
    """Merges the coverage data of a list of lines.

    Args:
      old_lines: Lines as list of pairs with line number and test bit mask.
                 The new lines will be merged into the list in place.
      new_lines: List of new (covered) lines (sorted).
      mask: The bit to be set for covered lines. The bit index is the test
            index of the executable that covered the line.
    """
    i = 0
    # Iterate over old and new lines; both are sorted.
    for l in new_lines:
      while old_lines[i][0] < l:
        # Forward instrumented lines not present in this coverage data.
        i += 1
        # TODO: Add more context to the assert message.
        assert i < len(old_lines), 'Covered line %d not in input file.' % l
      assert old_lines[i][0] == l, 'Covered line %d not in input file.' % l

      # Add coverage information to the line.
      old_lines[i][1] |= mask

  def merge_files(data, result):
    """Merges result into data.

    The data is mutated in place.

    Args:
      data: Merged coverage data from the previous reduce step.
      result: New result to be merged in. The type is as returned by
              get_covered_lines.
    """
    file_map, executable = result
    files = data['files']
    for file_name, lines in file_map.iteritems():
      merge_lines(files[file_name], lines, test_bit_masks[executable])
    return data

  reduce(merge_files, results, data)


def merge(options):
  """Implements the 'merge' action of this tool."""

  # Check if the folder with coverage output exists.
  assert (os.path.exists(options.coverage_dir) and
          os.path.isdir(options.coverage_dir))

  # Inputs for multiprocessing. List of tuples of:
  # Coverage dir, executable name, sancov file name.
  inputs = []
  for f in os.listdir(options.coverage_dir):
    match = SANCOV_FILE_RE.match(f)
    if match:
      inputs.append((options.coverage_dir, match.group(1), f))

  logging.info('Merging %d sancov files into %s',
               len(inputs), options.json_input)

  # Post-process covered lines in parallel.
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_covered_lines, inputs)
  finally:
    pool.close()

  # Load the existing json data file for merging the results.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  # Merge multiprocessing results. Mutates data.
  merge_covered_line_results(data, results)

  logging.info('Merged data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write merged results to file.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)

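# Example invocations of the three actions, matching the options parsed in
# main() below (illustrative; the script name and all paths are
# placeholders):
#   ./<this script> all --json-output instrumented.json
#   ./<this script> merge --coverage-dir /tmp/coverage \
#       --json-input instrumented.json --json-output merged.json
#   ./<this script> split --json-input merged.json \
#       --output-dir /tmp/coverage-split
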

def split(options):
  """Implements the 'split' action of this tool."""
  # Load the existing json data file for splitting.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  logging.info('Splitting off %d coverage files from %s',
               len(data['files']), options.json_input)

  for file_name, coverage in data['files'].iteritems():
    # Preserve relative directories that are part of the file name.
    file_path = os.path.join(options.output_dir, file_name + '.json')
    try:
      os.makedirs(os.path.dirname(file_path))
    except OSError:
      # Ignore existing directories.
      pass

    with open(file_path, 'w') as f:
      # Flat-copy the old dict.
      new_data = dict(data)

      # Update the current file.
      new_data['files'] = {file_name: coverage}

      # Write json data.
      json.dump(new_data, f, sort_keys=True)


def main(args=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--coverage-dir',
                      help='Path to the sancov output files.')
  parser.add_argument('--json-input',
                      help='Path to an existing json file with coverage data.')
  parser.add_argument('--json-output',
                      help='Path to a file to write json output to.')
  parser.add_argument('--output-dir',
                      help='Directory to write the split output files to.')
  parser.add_argument('action', choices=['all', 'merge', 'split'],
                      help='Action to perform.')

  options = parser.parse_args(args)
  if options.action.lower() == 'all':
    if not options.json_output:
      print '--json-output is required'
      return 1
    write_instrumented(options)
  elif options.action.lower() == 'merge':
    if not options.coverage_dir:
      print '--coverage-dir is required'
      return 1
    if not options.json_input:
      print '--json-input is required'
      return 1
    if not options.json_output:
      print '--json-output is required'
      return 1
    merge(options)
  elif options.action.lower() == 'split':
    if not options.json_input:
      print '--json-input is required'
      return 1
    if not options.output_dir:
      print '--output-dir is required'
      return 1
    split(options)
  return 0


if __name__ == '__main__':
  sys.exit(main())