1# -*- coding: utf-8 -*-
2#                     The LLVM Compiler Infrastructure
3#
4# This file is distributed under the University of Illinois Open Source
5# License. See LICENSE.TXT for details.
""" This module is responsible for capturing the compiler invocations of any
build process. The result of that is a compilation database.
8
9This implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
10mechanisms provided by the dynamic linker. The related library is implemented
11in C language and can be found under 'libear' directory.
12
13The 'libear' library is capturing all child process creation and logging the
14relevant information about it into separate files in a specified directory.
15The parameter of this process is the output directory name, where the report
16files shall be placed. This parameter is passed as an environment variable.
17
18The module also implements compiler wrappers to intercept the compiler calls.
19
The module implements the build command execution and the post-processing of
the output files, which are condensed into a compilation database. """
22
23import sys
24import os
25import os.path
26import re
27import itertools
28import json
29import glob
30import argparse
31import logging
32import subprocess
33from libear import build_libear, TemporaryDirectory
34from libscanbuild import command_entry_point
35from libscanbuild import duplicate_check, tempdir, initialize_logging
36from libscanbuild.compilation import split_command
37from libscanbuild.shell import encode, decode
38
# Names exported by this module.
__all__ = ['capture', 'intercept_build_main', 'intercept_build_wrapper']

# Separator characters of the exec report files: every report (group) is
# terminated by GS, the fields of a report are separated by RS, and every
# argument of the reported command is terminated by US.
GS = chr(0x1d)
RS = chr(0x1e)
US = chr(0x1f)

# File names of the compiler wrapper executables. These are joined with the
# 'bin_dir' directory when the build environment is set up.
COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'
47
48
@command_entry_point
def intercept_build_main(bin_dir):
    """ Entry point for 'intercept-build' command.

    Parses the command line, sets up the logging and starts the capture of
    the given build command. When no build command was given, only the help
    message is printed and zero is returned. Otherwise the result of
    'capture' is returned. """

    parser = create_parser()
    args = parser.parse_args()

    initialize_logging(args.verbose)
    logging.debug('Parsed arguments: %s', args)

    # there is something to run: do the real work
    if args.build:
        return capture(args, bin_dir)

    # nothing to run: show the usage instead
    parser.print_help()
    return 0
64
65
def capture(args, bin_dir):
    """ The entry point of build command interception.

    Runs the build command in the environment prepared by
    'setup_environment', reads the exec reports ('*.cmd' files) written into
    a temporary directory and dumps the result as JSON into 'args.cdb'.

    :param args:    parsed command line arguments; reads 'build', 'cdb' and
                    (when present) 'append' and 'raw_entries'.
    :param bin_dir: passed on to 'setup_environment'.
    :return:        the exit code of the build command. """

    def post_processing(commands):
        """ To make a compilation database, it needs to filter out commands
        which are not compiler calls. Needs to find the source file name
        from the arguments. And do shell escaping on the command.

        To support incremental builds, it is desired to read elements from
        an existing compilation database from a previous run. These elements
        shall be merged with the new elements. """

        # create entries from the current run
        current = itertools.chain.from_iterable(
            # creates a sequence of entry generators from an exec,
            format_entry(command) for command in commands)
        # read entries from previous run (eagerly, because the very same
        # file is going to be overwritten by the caller)
        if 'append' in args and args.append and os.path.isfile(args.cdb):
            with open(args.cdb) as handle:
                previous = iter(json.load(handle))
        else:
            previous = iter([])
        # filter out duplicate entries from both, and also those entries
        # which are referring to non existing source files
        duplicate = duplicate_check(entry_hash)
        return (entry
                for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-', dir=tempdir()) as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir, bin_dir)
        logging.debug('run build in environment: %s', environment)
        exit_code = subprocess.call(args.build, env=environment)
        logging.info('build finished with exit code: %d', exit_code)
        # read the intercepted exec calls. The names produced by glob are
        # already prefixed with 'tmp_dir', so they must not be joined with
        # it again (that would duplicate a relative directory prefix).
        exec_traces = itertools.chain.from_iterable(
            parse_exec_trace(filename)
            for filename in sorted(glob.iglob(os.path.join(tmp_dir, '*.cmd'))))
        # do post processing only if that was requested
        if 'raw_entries' not in args or not args.raw_entries:
            entries = post_processing(exec_traces)
        else:
            entries = exec_traces
        # dump the compilation database. The lazy 'entries' sequence has to
        # be consumed here, while the temporary directory still exists.
        with open(args.cdb, 'w+') as handle:
            json.dump(list(entries), handle, sort_keys=True, indent=4)
        return exit_code
113
114
def setup_environment(args, destination, bin_dir):
    """ Prepare the environment variables for the build command.

    A copy of the current process environment is extended with the
    variables which make either the preloaded 'libear' library or the
    compiler 'wrapper' programs log the exec calls.

    :param args:        parsed command line arguments; reads
                        'override_compiler', 'verbose' and (when present)
                        'cc' and 'cxx'.
    :param destination: directory where the exec reports shall be written.
    :param bin_dir:     directory of the compiler wrapper executables.
    :return:            the environment as a dictionary. """

    c_compiler = args.cc if 'cc' in args else 'cc'
    cxx_compiler = args.cxx if 'cxx' in args else 'c++'

    # the library is built only when the preload method is usable
    if args.override_compiler or is_preload_disabled(sys.platform):
        libear_path = None
    else:
        libear_path = build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment['INTERCEPT_BUILD_TARGET_DIR'] = destination

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        wrapper_level = 'DEBUG' if args.verbose > 2 else 'INFO'
        environment['CC'] = os.path.join(bin_dir, COMPILER_WRAPPER_CC)
        environment['CXX'] = os.path.join(bin_dir, COMPILER_WRAPPER_CXX)
        environment['INTERCEPT_BUILD_CC'] = c_compiler
        environment['INTERCEPT_BUILD_CXX'] = cxx_compiler
        environment['INTERCEPT_BUILD_VERBOSE'] = wrapper_level
    elif sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment['DYLD_INSERT_LIBRARIES'] = libear_path
        environment['DYLD_FORCE_FLAT_NAMESPACE'] = '1'
    else:
        logging.debug('intercept gonna preload libear on UNIX')
        environment['LD_PRELOAD'] = libear_path

    return environment
151
152
def intercept_build_wrapper(cplusplus):
    """ Entry point for `intercept-cc` and `intercept-c++` compiler wrappers.

    First it appends an execution report about this very process into the
    target directory, then it runs the real compiler with the arguments of
    this process. The target directory, the real compilers and the log
    level are all taken from environment variables.

    The report has the same fields as the ones written by the 'libear'
    library. The fields which have no meaningful value here are faked: the
    parent process id is the process id, and the function name is 'wrapper'.

    :param cplusplus: true value selects the C++ compiler, otherwise the C
                      compiler is executed.
    :return:          the exit code of the real compiler. """

    # initialize wrapper logging
    log_level = os.getenv('INTERCEPT_BUILD_VERBOSE', 'INFO')
    logging.basicConfig(format='intercept: %(levelname)s: %(message)s',
                        level=log_level)
    # write report (a failure here shall not stop the compilation)
    try:
        report_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
        if not report_dir:
            raise UserWarning('exec report target directory not found')
        pid = str(os.getpid())
        report_file = os.path.join(report_dir, pid + '.cmd')
        logging.debug('writing exec report to: %s', report_file)
        with open(report_file, 'ab') as handle:
            fields = [pid, pid, 'wrapper', os.getcwd(),
                      US.join(sys.argv) + US]
            report = RS.join(fields) + GS
            handle.write(report.encode('utf-8'))
    except IOError:
        logging.exception('writing exec report failed')
    except UserWarning as warning:
        logging.warning(warning)
    # execute with real compiler
    if cplusplus:
        compiler = os.getenv('INTERCEPT_BUILD_CXX', 'c++')
    else:
        compiler = os.getenv('INTERCEPT_BUILD_CC', 'cc')
    compilation = [compiler]
    compilation.extend(sys.argv[1:])
    logging.debug('execute compiler: %s', compilation)
    return subprocess.call(compilation)
189
190
def parse_exec_trace(filename):
    """ Parse a report file of the interception library or wrapper command.

    The file content is a sequence of reports, each of them terminated by
    GS. A report consists of RS separated fields: process id, parent
    process id, function name, working directory and the command, where
    every command argument is terminated by US.

    :param filename: path of the report file.
    :return:         generator of dictionaries (one per report) with 'pid',
                     'ppid', 'function', 'directory' and 'command' keys.
                     The 'command' is a list of arguments, the other values
                     are strings. """

    logging.debug('parse exec trace file: %s', filename)
    with open(filename, 'r') as handler:
        content = handler.read()
        for report in content.split(GS):
            # the terminator after the last report leaves an empty chunk
            if not report:
                continue
            fields = report.split(RS)
            yield {
                'pid': fields[0],
                'ppid': fields[1],
                'function': fields[2],
                'directory': fields[3],
                # drop the empty string after the last US terminator
                'command': fields[4].split(US)[:-1]
            }
210
211
def format_entry(exec_trace):
    """ Generate the desired fields for compilation database entries.

    The command of the given exec trace is split up by 'split_command'.
    When the result of that is not empty, one entry is generated for each
    source file of the compilation.

    :param exec_trace: dictionary with 'command' and 'directory' keys.
    :return:           generator of dictionaries with 'directory', 'command'
                       and 'file' keys. The 'file' is a normalized path,
                       relative names are prefixed with the directory. """

    logging.debug('format this command: %s', exec_trace['command'])
    compilation = split_command(exec_trace['command'])
    if not compilation:
        # not a compiler call: no entry is generated
        return

    for source in compilation.files:
        if compilation.compiler == 'c++':
            compiler = 'c++'
        else:
            compiler = 'cc'
        command = [compiler, '-c'] + compilation.flags + [source]
        logging.debug('formated as: %s', command)

        directory = exec_trace['directory']
        encoded = encode(command)
        # create normalized absolute path from the source file name
        cwd = exec_trace['directory']
        fullname = source if os.path.isabs(source) \
            else os.path.join(cwd, source)
        yield {
            'directory': directory,
            'command': encoded,
            'file': os.path.normpath(fullname)
        }
232
233
def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether the output of executing 'csrutil status'
    contains 'System Integrity Protection status: enabled'.

    Same problem on linux when SELinux is enabled. The status query program
    'sestatus' and the output when it's enabled 'SELinux status: enabled'.

    :param platform: name of the platform (like the value of 'sys.platform').
    :return:         True when the status query program reports the enabled
                     state. False on any other platform, or when the status
                     could not be queried. """

    if platform == 'darwin':
        pattern = re.compile(r'System Integrity Protection status:\s+enabled')
        command = ['csrutil', 'status']
    elif platform in {'linux', 'linux2'}:
        pattern = re.compile(r'SELinux status:\s+enabled')
        command = ['sestatus']
    else:
        return False

    try:
        lines = subprocess.check_output(command).decode('utf-8')
    except Exception:
        # This is a best effort check: a missing query program, a non zero
        # exit code or an undecodable output all mean 'not detected'. But
        # KeyboardInterrupt and SystemExit shall not be swallowed here.
        logging.debug('status query failed: %s', command, exc_info=True)
        return False
    # the pattern has to match at the beginning of a line
    return any(pattern.match(line) for line in lines.splitlines())
258
259
def entry_hash(entry):
    """ Implement unique hash method for compilation database entries.

    :param entry: dictionary with 'file', 'directory' and 'command' keys.
    :return:      string made of the reverted file name, the reverted
                  directory name and the command without its first word,
                  joined by '<>'. """

    # File and directory names are reverted for faster lookup in set.
    reverted_file = entry['file'][::-1]
    reverted_directory = entry['directory'][::-1]
    # On OS X the 'cc' and 'c++' compilers are wrappers for 'clang',
    # therefore both calls would be logged. To avoid this the hash does
    # not contain the first word of the command.
    arguments = decode(entry['command'])[1:]

    parts = (reverted_file, reverted_directory, ' '.join(arguments))
    return '<>'.join(parts)
274
275
def create_parser():
    """ Command line argument parser factory method.

    The parser knows the general flags ('--verbose', '--cdb'), the mutually
    exclusive '--append' and '--disable-filter' flags, the 'advanced
    options' group ('--override-compiler', '--use-cc', '--use-c++') and
    takes every remaining argument as the build command.

    :return: the configured 'argparse.ArgumentParser' instance. """

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # general flags
    cli.add_argument(
        '--verbose', '-v', default=0, action='count',
        help="""Enable verbose output from '%(prog)s'. A second and third
                flag increases verbosity.""")
    cli.add_argument(
        '--cdb', default='compile_commands.json', metavar='<file>',
        help='The JSON compilation database.')

    # merging into an existing database and raw output can't go together
    exclusive = cli.add_mutually_exclusive_group()
    exclusive.add_argument(
        '--append', action='store_true',
        help='Append new entries to existing compilation database.')
    exclusive.add_argument(
        '--disable-filter', '-n', action='store_true', dest='raw_entries',
        help="""Intercepted child process creation calls (exec calls) are all
                logged to the output. The output is not a compilation database.
                This flag is for debug purposes.""")

    # flags to control the compiler wrappers
    expert = cli.add_argument_group('advanced options')
    expert.add_argument(
        '--override-compiler', action='store_true',
        help="""Always resort to the compiler wrapper even when better
                intercept methods are available.""")
    expert.add_argument(
        '--use-cc', dest='cc', default='cc', metavar='<path>',
        help="""When '%(prog)s' analyzes a project by interposing a compiler
                wrapper, which executes a real compiler for compilation and
                do other tasks (record the compiler invocation). Because of
                this interposing, '%(prog)s' does not know what compiler your
                project normally uses. Instead, it simply overrides the CC
                environment variable, and guesses your default compiler.

                If you need '%(prog)s' to use a specific compiler for
                *compilation* then you can use this option to specify a path
                to that compiler.""")
    expert.add_argument(
        '--use-c++', dest='cxx', default='c++', metavar='<path>',
        help='This is the same as "--use-cc" but for C++ code.')

    # everything else on the command line is the build command
    cli.add_argument(
        dest='build', nargs=argparse.REMAINDER, help='Command to run.')

    return cli
340