#!/usr/bin/env python
# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Script to transform and merge sancov files into a human-readable JSON
format.

The script supports three actions:
all: Writes a json file with all instrumented lines of all executables.
merge: Merges sancov files with coverage output into an existing json file.
split: Splits a json file into separate files per covered source file.

The json data is structured as follows:
{
  "version": 1,
  "tests": ["executable1", "executable2", ...],
  "files": {
    "file1": [[<instr line 1>, <bit_mask>], [<instr line 2>, <bit_mask>], ...],
    "file2": [...],
    ...
  }
}

The executables are sorted and determine the test bit mask. The executable at
zero-based index i sets bit 1 << i, e.g. executable1 = 1, executable3 = 4.
Hence, a line covered by executable1 and executable3 will have
bit_mask == 5 == 0b101. The number of tests is restricted to 52 in version 1,
to allow javascript JSON parsing of the bit sets encoded as numbers. JS max
safe int is (1 << 53) - 1.
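
For illustration (hypothetical executable names): with
tests == ["exe_a", "exe_b", "exe_c"], a line covered by exe_a and exe_c is
stored as [<line>, 5], since 1 (exe_a) | 4 (exe_c) == 0b101 == 5.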

The line-number-bit_mask pairs are sorted by line number and don't contain
duplicates.

Split json data preserves the same format, but each json file contains only
one source file.

The sancov tool is expected to be in the llvm compiler-rt third-party
directory. It's not checked out by default and must be added as a custom deps
entry:
'v8/third_party/llvm/projects/compiler-rt':
    'https://chromium.googlesource.com/external/llvm.org/compiler-rt.git'
"""

import argparse
import json
import logging
import os
import re
import subprocess
import sys

from multiprocessing import Pool, cpu_count


logging.basicConfig(level=logging.INFO)

# Files to exclude from coverage. Dropping their data early speeds up
# processing. The contained cc files are already excluded from
# instrumentation, but inlined data is referenced through v8's object files.
EXCLUSIONS = [
  'buildtools',
  'src/third_party',
  'third_party',
  'test',
  'testing',
]

# Executables found in the build output for which no coverage is generated.
# Exclude them from the coverage data file.
EXE_BLACKLIST = [
  'generate-bytecode-expectations',
  'hello-world',
  'mksnapshot',
  'parser-shell',
  'process',
  'shell',
]

# V8 checkout directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(
    os.path.abspath(__file__))))

# Executable location. TODO(machenbach): Only release is supported for now.
BUILD_DIR = os.path.join(BASE_DIR, 'out', 'Release')

# Path prefix added by the llvm symbolizer including trailing slash.
OUTPUT_PATH_PREFIX = os.path.join(BUILD_DIR, '..', '..', '')

# The sancov tool location.
SANCOV_TOOL = os.path.join(
    BASE_DIR, 'third_party', 'llvm', 'projects', 'compiler-rt',
    'lib', 'sanitizer_common', 'scripts', 'sancov.py')

# Simple script to sanitize the PCs from objdump.
SANITIZE_PCS = os.path.join(BASE_DIR, 'tools', 'sanitizers', 'sanitize_pcs.py')

# The llvm symbolizer location.
SYMBOLIZER = os.path.join(
    BASE_DIR, 'third_party', 'llvm-build', 'Release+Asserts', 'bin',
    'llvm-symbolizer')

# Number of cpus.
CPUS = cpu_count()

# Regexp to find sancov files as output by sancov_merger.py. Also grabs the
# executable name in group 1.
SANCOV_FILE_RE = re.compile(r'^(.*)\.result\.sancov$')
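# E.g. (name illustrative) 'd8.result.sancov' matches, with group 1 == 'd8',
# the executable that produced the coverage data.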


def executables():
  """Iterates over executable files in the build directory."""
  for f in os.listdir(BUILD_DIR):
    file_path = os.path.join(BUILD_DIR, f)
    if (os.path.isfile(file_path) and
        os.access(file_path, os.X_OK) and
        f not in EXE_BLACKLIST):
      yield file_path


def process_symbolizer_output(output):
  """Post-process llvm symbolizer output.

  Excludes files outside the v8 checkout or matching the exclusion list above
  from further processing. Drops the character index in each line.

  Returns: A mapping of file names to lists of line numbers. The file names
           are relative to the v8 base directory. The lists of line numbers
           don't contain duplicates and are sorted.
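
  Example (illustrative): given the symbolizer lines
    /foo/v8/src/api.cc:123:7
    /foo/v8/src/api.cc:123:9
    /usr/include/c++/x.h:1:1
  and OUTPUT_PATH_PREFIX == '/foo/v8/', the result is {'src/api.cc': [123]}.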
  """
  # Drop the path prefix when iterating lines. The path is redundant and takes
  # too much space. Drop files outside that path, e.g. generated files in
  # the build dir and absolute paths to c++ library headers.
  def iter_lines():
    for line in output.strip().splitlines():
      if line.startswith(OUTPUT_PATH_PREFIX):
        yield line[len(OUTPUT_PATH_PREFIX):]

  # Map file names to sets of instrumented line numbers.
  file_map = {}
  for line in iter_lines():
    # Drop the character number; we only care about line numbers. Each line
    # has the form: <file name>:<line number>:<character number>.
    file_name, number, _ = line.split(':')
    file_map.setdefault(file_name, set([])).add(int(number))

  # Remove excluded files from the file map. It's cheaper to do this after
  # the mapping, as there are few excluded files and we don't want to do this
  # check for the numerous lines in ordinary files.
  def keep(file_name):
    for e in EXCLUSIONS:
      if file_name.startswith(e):
        return False
    return True

  # Return in serializable form and filter.
  return {k: sorted(file_map[k]) for k in file_map if keep(k)}


def get_instrumented_lines(executable):
  """Return the instrumented lines of an executable.

  Called through the multiprocessing pool.

  Returns: Post-processed llvm output as returned by process_symbolizer_output.
  """
  # The first two pipes mimic llvm's tool sancov.py, with 0x added to the hex
  # numbers. The results are piped into the llvm symbolizer, which outputs for
  # each PC: <file name with abs path>:<line number>:<character number>.
  # We don't call the sancov tool itself, for speed.
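  # An objdump line matched by the first grep below looks roughly like this
  # (illustrative):
  #   4049f3:  e8 a8 c8 00 00  callq  4112a0 <__sanitizer_cov@plt>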
  process = subprocess.Popen(
      'objdump -d %s | '
      'grep \'^\s\+[0-9a-f]\+:.*\scall\(q\|\)\s\+[0-9a-f]\+ '
      '<__sanitizer_cov\(_with_check\|\)\(@plt\|\)>\' | '
      'grep \'^\s\+[0-9a-f]\+\' -o | '
      '%s | '
      '%s --obj %s -functions=none' %
          (executable, SANITIZE_PCS, SYMBOLIZER, executable),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output)


def merge_instrumented_line_results(exe_list, results):
  """Merge multiprocessing results for all instrumented lines.

  Args:
    exe_list: List of all executable names with absolute paths.
    results: List of results as returned by get_instrumented_lines.

  Returns: Dict to be used as json data as specified at the top of this file.
           The dictionary contains all instrumented lines of all files
           referenced by all executables.
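
  For illustration (hypothetical file names): merging the results
  {'src/a.cc': [1, 3]} and {'src/a.cc': [3], 'src/b.cc': [7]} yields
  'files': {'src/a.cc': [[1, 0], [3, 0]], 'src/b.cc': [[7, 0]]}.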
  """
  def merge_files(x, y):
    for file_name, lines in y.iteritems():
      x.setdefault(file_name, set([])).update(lines)
    return x
  result = reduce(merge_files, results, {})

  # Return data as a file->lines mapping. The lines are saved as lists
  # with (line number, test bits (as int)). The test bits are initialized with
  # 0, meaning instrumented, but no coverage.
  # The order of the test bits is given by the key 'tests'. For now, these are
  # the executable names. We use a _list_ with two items instead of a tuple to
  # ease merging by allowing mutation of the second item.
  return {
    'version': 1,
    'tests': sorted(map(os.path.basename, exe_list)),
    'files': {f: map(lambda l: [l, 0], sorted(result[f])) for f in result},
  }


def write_instrumented(options):
  """Implements the 'all' action of this tool."""
  exe_list = list(executables())
  logging.info('Reading instrumented lines from %d executables.',
               len(exe_list))
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_instrumented_lines, exe_list)
  finally:
    pool.close()

  # Merge multiprocessing results and prepare output data.
  data = merge_instrumented_line_results(exe_list, results)

  logging.info('Read data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write json output.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def get_covered_lines(args):
  """Return the covered lines of an executable.

  Called through the multiprocessing pool. The args are expected to unpack to:
    cov_dir: Folder with sancov files merged by sancov_merger.py.
    executable: The executable that was called to produce the given coverage
                data.
    sancov_file: The merged sancov file with coverage data.

  Returns: A tuple of post-processed llvm output as returned by
           process_symbolizer_output and the executable name.
  """
  cov_dir, executable, sancov_file = args

  # Let the sancov tool print the covered PCs and pipe them through the llvm
  # symbolizer.
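  # The expanded command looks roughly like this (paths illustrative):
  #   .../sancov.py print /cov/d8.result.sancov 2> /dev/null | \
  #     .../llvm-symbolizer --obj out/Release/d8 -functions=none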
  process = subprocess.Popen(
      '%s print %s 2> /dev/null | '
      '%s --obj %s -functions=none' %
          (SANCOV_TOOL,
           os.path.join(cov_dir, sancov_file),
           SYMBOLIZER,
           os.path.join(BUILD_DIR, executable)),
      stdout=subprocess.PIPE,
      stderr=subprocess.PIPE,
      stdin=subprocess.PIPE,
      cwd=BASE_DIR,
      shell=True,
  )
  output, _ = process.communicate()
  assert process.returncode == 0
  return process_symbolizer_output(output), executable


def merge_covered_line_results(data, results):
  """Merge multiprocessing results for covered lines.

  The data is mutated; the results are merged into it in place.

  Args:
    data: Existing coverage data from a json file containing all instrumented
          lines.
    results: List of results as returned by get_covered_lines.
  """

  # List of executables and mapping to the test bit mask. The number of
  # tests is restricted to 52, to allow javascript JSON parsing of
  # the bit sets encoded as numbers. JS max safe int is (1 << 53) - 1.
  exe_list = data['tests']
  assert len(exe_list) <= 52, 'Max 52 different tests are supported.'
  test_bit_masks = {exe: 1 << i for i, exe in enumerate(exe_list)}
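  # For illustration (hypothetical names): with data['tests'] ==
  # ['cctest', 'd8'], this yields {'cctest': 1, 'd8': 2}.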

  def merge_lines(old_lines, new_lines, mask):
    """Merge the coverage data of a list of lines.

    Args:
      old_lines: Lines as list of pairs with line number and test bit mask.
                 The new lines will be merged into the list in place.
      new_lines: List of new (covered) lines (sorted).
      mask: The bit to be set for covered lines. The bit index is the test
            index of the executable that covered the line.
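
    Example (illustrative): old_lines == [[5, 0], [7, 1]], new_lines == [7]
    and mask == 2 update old_lines in place to [[5, 0], [7, 3]].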
    """
    i = 0
    # Iterate over old and new lines; both are sorted.
    for l in new_lines:
      while old_lines[i][0] < l:
        # Forward instrumented lines not present in this coverage data.
        i += 1
        # TODO: Add more context to the assert message.
        assert i < len(old_lines), 'Covered line %d not in input file.' % l
      assert old_lines[i][0] == l, 'Covered line %d not in input file.' % l

      # Add coverage information to the line.
      old_lines[i][1] |= mask

  def merge_files(data, result):
    """Merge result into data.

    The data is mutated in place.

    Args:
      data: Merged coverage data from the previous reduce step.
      result: New result to be merged in. The type is as returned by
              get_covered_lines.
    """
    file_map, executable = result
    files = data['files']
    for file_name, lines in file_map.iteritems():
      merge_lines(files[file_name], lines, test_bit_masks[executable])
    return data

  reduce(merge_files, results, data)


def merge(options):
  """Implements the 'merge' action of this tool."""

  # Check that the folder with coverage output exists.
  assert (os.path.exists(options.coverage_dir) and
          os.path.isdir(options.coverage_dir))

  # Inputs for multiprocessing. List of tuples of:
  # coverage dir, executable name, sancov file name.
  inputs = []
  for f in os.listdir(options.coverage_dir):
    match = SANCOV_FILE_RE.match(f)
    if match:
      inputs.append((options.coverage_dir, match.group(1), f))
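  # E.g. (name illustrative) a file 'd8.result.sancov' in the coverage dir
  # yields the tuple (options.coverage_dir, 'd8', 'd8.result.sancov').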

  logging.info('Merging %d sancov files into %s',
               len(inputs), options.json_input)

  # Post-process covered lines in parallel.
  pool = Pool(CPUS)
  try:
    results = pool.imap_unordered(get_covered_lines, inputs)
  finally:
    pool.close()

  # Load the existing json data file for merging the results.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  # Merge multiprocessing results. Mutates data.
  merge_covered_line_results(data, results)

  logging.info('Merged data from %d executables, which covers %d files.',
               len(data['tests']), len(data['files']))
  logging.info('Writing results to %s', options.json_output)

  # Write merged results to file.
  with open(options.json_output, 'w') as f:
    json.dump(data, f, sort_keys=True)


def split(options):
  """Implements the 'split' action of this tool."""
  # Load existing json data file for splitting.
  with open(options.json_input, 'r') as f:
    data = json.load(f)

  logging.info('Splitting off %d coverage files from %s',
               len(data['files']), options.json_input)

  for file_name, coverage in data['files'].iteritems():
    # Preserve relative directories that are part of the file name.
    file_path = os.path.join(options.output_dir, file_name + '.json')
    try:
      os.makedirs(os.path.dirname(file_path))
    except OSError:
      # Ignore existing directories.
      pass

    with open(file_path, 'w') as f:
      # Flat-copy the old dict.
      new_data = dict(data)

      # Update current file.
      new_data['files'] = {file_name: coverage}

      # Write json data.
      json.dump(new_data, f, sort_keys=True)


def main(args=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--coverage-dir',
                      help='Path to the sancov output files.')
  parser.add_argument('--json-input',
                      help='Path to an existing json file with coverage data.')
  parser.add_argument('--json-output',
                      help='Path to a file to write json output to.')
  parser.add_argument('--output-dir',
                      help='Directory to write split output files to.')
  parser.add_argument('action', choices=['all', 'merge', 'split'],
                      help='Action to perform.')
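  # Example invocations (paths illustrative):
  #   sancov_formatter.py all --json-output=instrumented.json
  #   sancov_formatter.py merge --coverage-dir=/tmp/cov \
  #       --json-input=instrumented.json --json-output=merged.json
  #   sancov_formatter.py split --json-input=merged.json --output-dir=/tmp/out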

  options = parser.parse_args(args)
  if options.action.lower() == 'all':
    if not options.json_output:
      print '--json-output is required'
      return 1
    write_instrumented(options)
  elif options.action.lower() == 'merge':
    if not options.coverage_dir:
      print '--coverage-dir is required'
      return 1
    if not options.json_input:
      print '--json-input is required'
      return 1
    if not options.json_output:
      print '--json-output is required'
      return 1
    merge(options)
  elif options.action.lower() == 'split':
    if not options.json_input:
      print '--json-input is required'
      return 1
    if not options.output_dir:
      print '--output-dir is required'
      return 1
    split(options)
  return 0


if __name__ == '__main__':
  sys.exit(main())
