1# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
2# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
3
4"""Code parsing for coverage.py."""
5
6import collections
7import dis
8import re
9import token
10import tokenize
11
12from coverage.backward import range    # pylint: disable=redefined-builtin
13from coverage.backward import bytes_to_ints
14from coverage.bytecode import ByteCodes, CodeObjects
15from coverage.misc import contract, nice_pair, expensive, join_regex
16from coverage.misc import CoverageException, NoSource, NotPython
17from coverage.phystokens import compile_unicode, generate_tokens
18
19
class PythonParser(object):
    """Parse code to find executable lines, excluded lines, etc."""

    @contract(text='unicode|None')
    def __init__(self, text=None, filename=None, exclude=None):
        """
        Source can be provided as `text`, the text itself, or `filename`, from
        which the text will be read.  Excluded lines are those that match
        `exclude`, a regex.

        """
        assert text or filename, "PythonParser needs either text or filename"
        self.filename = filename or "<code>"
        self.text = text
        if not self.text:
            # Imported here to avoid a circular import with coverage.python.
            from coverage.python import get_python_source
            try:
                self.text = get_python_source(self.filename)
            except IOError as err:
                raise NoSource(
                    "No source for code: '%s': %s" % (self.filename, err)
                )

        self.exclude = exclude

        # Set to True to print every token while parsing (debugging aid).
        self.show_tokens = False

        # The text lines of the parsed code.
        self.lines = self.text.split('\n')

        # The line numbers of excluded lines of code.
        self.excluded = set()

        # The line numbers of docstring lines.
        self.docstrings = set()

        # The line numbers of class definitions.
        self.classdefs = set()

        # A dict mapping each line number of a multi-line statement to the
        # first line number of that statement (see `_raw_parse`).
        self.multiline = {}

        # The line numbers that start statements.
        self.statement_starts = set()

        # Lazily-created ByteParser and arc data.
        self._byte_parser = None
        self._all_arcs = None

    @property
    def byte_parser(self):
        """Create a ByteParser on demand."""
        if not self._byte_parser:
            self._byte_parser = ByteParser(self.text, filename=self.filename)
        return self._byte_parser

    def lines_matching(self, *regexes):
        """Find the lines matching one of a list of regexes.

        Returns a set of line numbers, the lines that contain a match for one
        of the regexes in `regexes`.  The entire line needn't match, just a
        part of it.

        """
        regex_c = re.compile(join_regex(regexes))
        matches = set()
        for i, ltext in enumerate(self.lines, start=1):
            if regex_c.search(ltext):
                matches.add(i)
        return matches

    def _raw_parse(self):
        """Parse the source to find the interesting facts about its lines.

        A handful of member fields are updated: `excluded`, `classdefs`,
        `docstrings`, `multiline`, and `statement_starts`.

        """
        # Find lines which match an exclusion pattern.
        if self.exclude:
            self.excluded = self.lines_matching(self.exclude)

        # Tokenize, to find excluded suites, to find docstrings, and to find
        # multi-line statements.
        indent = 0                      # Current indentation depth.
        exclude_indent = 0              # Indent where the excluded suite began.
        excluding = False               # Are we inside an excluded suite?
        # Starting at INDENT makes a string opening the module read as a
        # docstring below.
        prev_toktype = token.INDENT
        first_line = None               # First line of the current statement.
        empty = True                    # True until we see any real code.

        tokgen = generate_tokens(self.text)
        for toktype, ttext, (slineno, _), (elineno, _), ltext in tokgen:
            if self.show_tokens:                # pragma: not covered
                print("%10s %5s %-20r %r" % (
                    tokenize.tok_name.get(toktype, toktype),
                    nice_pair((slineno, elineno)), ttext, ltext
                ))
            if toktype == token.INDENT:
                indent += 1
            elif toktype == token.DEDENT:
                indent -= 1
            elif toktype == token.NAME and ttext == 'class':
                # Class definitions look like branches in the byte code, so
                # we need to exclude them.  The simplest way is to note the
                # lines with the 'class' keyword.
                self.classdefs.add(slineno)
            elif toktype == token.OP and ttext == ':':
                if not excluding and elineno in self.excluded:
                    # Start excluding a suite.  We trigger off of the colon
                    # token so that the #pragma comment will be recognized on
                    # the same line as the colon.
                    exclude_indent = indent
                    excluding = True
            elif toktype == token.STRING and prev_toktype == token.INDENT:
                # Strings that are first on an indented line are docstrings.
                # (a trick from trace.py in the stdlib.) This works for
                # 99.9999% of cases.  For the rest (!) see:
                # http://stackoverflow.com/questions/1769332/x/1769794#1769794
                self.docstrings.update(range(slineno, elineno+1))
            elif toktype == token.NEWLINE:
                if first_line is not None and elineno != first_line:
                    # We're at the end of a line, and we've ended on a
                    # different line than the first line of the statement,
                    # so record a multi-line range.
                    for l in range(first_line, elineno+1):
                        self.multiline[l] = first_line
                first_line = None

            if ttext.strip() and toktype != tokenize.COMMENT:
                # A non-whitespace token.
                empty = False
                if first_line is None:
                    # The token is not whitespace, and is the first in a
                    # statement.
                    first_line = slineno
                    # Check whether to end an excluded suite.
                    if excluding and indent <= exclude_indent:
                        excluding = False
                    if excluding:
                        self.excluded.add(elineno)

            prev_toktype = toktype

        # Find the starts of the executable statements.
        if not empty:
            self.statement_starts.update(self.byte_parser._find_statements())

    def first_line(self, line):
        """Return the first line number of the statement including `line`."""
        first_line = self.multiline.get(line)
        if first_line:
            return first_line
        else:
            return line

    def first_lines(self, lines):
        """Map the line numbers in `lines` to the correct first line of the
        statement.

        Returns a set of the first lines.

        """
        return set(self.first_line(l) for l in lines)

    def translate_lines(self, lines):
        """Implement `FileReporter.translate_lines`."""
        return self.first_lines(lines)

    def translate_arcs(self, arcs):
        """Implement `FileReporter.translate_arcs`."""
        return [
            (self.first_line(a), self.first_line(b))
            for (a, b) in arcs
        ]

    @expensive
    def parse_source(self):
        """Parse source text to find executable lines, excluded lines, etc.

        Return values are 1) a set of executable line numbers, and 2) a set of
        excluded line numbers.

        Reported line numbers are normalized to the first line of multi-line
        statements.

        Raises `NotPython` if the source can't be tokenized.

        """
        try:
            self._raw_parse()
        except (tokenize.TokenError, IndentationError) as err:
            if hasattr(err, "lineno"):
                lineno = err.lineno         # IndentationError
            else:
                lineno = err.args[1][0]     # TokenError
            raise NotPython(
                u"Couldn't parse '%s' as Python source: '%s' at line %d" % (
                    self.filename, err.args[0], lineno
                )
            )

        excluded_lines = self.first_lines(self.excluded)
        ignore = set()
        ignore.update(excluded_lines)
        ignore.update(self.docstrings)
        starts = self.statement_starts - ignore
        lines = self.first_lines(starts)
        lines -= ignore

        return lines, excluded_lines

    def arcs(self):
        """Get information about the arcs available in the code.

        Returns a set of line number pairs.  Line numbers have been normalized
        to the first line of multi-line statements.

        """
        if self._all_arcs is None:
            self._all_arcs = set()
            for l1, l2 in self.byte_parser._all_arcs():
                fl1 = self.first_line(l1)
                fl2 = self.first_line(l2)
                if fl1 != fl2:
                    # Arcs within a single statement aren't interesting.
                    self._all_arcs.add((fl1, fl2))
        return self._all_arcs

    def exit_counts(self):
        """Get a count of exits from each line.

        Excluded lines are excluded.

        """
        excluded_lines = self.first_lines(self.excluded)
        exit_counts = collections.defaultdict(int)
        for l1, l2 in self.arcs():
            if l1 < 0:
                # Don't ever report -1 as a line number
                continue
            if l1 in excluded_lines:
                # Don't report excluded lines as line numbers.
                continue
            if l2 in excluded_lines:
                # Arcs to excluded lines shouldn't count.
                continue
            exit_counts[l1] += 1

        # Class definitions have one extra exit, so remove one for each:
        for l in self.classdefs:
            # Ensure key is there: class definitions can include excluded lines.
            if l in exit_counts:
                exit_counts[l] -= 1

        return exit_counts
272
273
274## Opcodes that guide the ByteParser.
275
def _opcode(name):
    """Look up the numeric opcode for `name` in the dis module's table."""
    opcode = dis.opmap[name]
    return opcode
279
280
def _opcode_set(*names):
    """Return a set of opcodes by the names in `names`.

    Names not defined by this version of Python are silently skipped.

    """
    return set(
        dis.opmap[name] for name in names if name in dis.opmap
    )
290
# Opcodes that leave the code object.
OPS_CODE_END = _opcode_set('RETURN_VALUE')

# Opcodes that unconditionally end the code chunk.
OPS_CHUNK_END = _opcode_set(
    'JUMP_ABSOLUTE', 'JUMP_FORWARD', 'RETURN_VALUE', 'RAISE_VARARGS',
    'BREAK_LOOP', 'CONTINUE_LOOP',
)

# Opcodes that unconditionally begin a new code chunk.  By starting new chunks
# with unconditional jump instructions, we neatly deal with jumps to jumps
# properly.
OPS_CHUNK_BEGIN = _opcode_set('JUMP_ABSOLUTE', 'JUMP_FORWARD')

# Opcodes that push a block on the block stack.
OPS_PUSH_BLOCK = _opcode_set(
    'SETUP_LOOP', 'SETUP_EXCEPT', 'SETUP_FINALLY', 'SETUP_WITH'
)

# Block types for exception handling.
OPS_EXCEPT_BLOCKS = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY')

# Opcodes that pop a block from the block stack.
OPS_POP_BLOCK = _opcode_set('POP_BLOCK')

# Opcodes that have a jump destination, but aren't really a jump.
OPS_NO_JUMP = OPS_PUSH_BLOCK

# Individual opcodes we need below.
# NOTE(review): unlike `_opcode_set`, `_opcode` raises KeyError for opcodes
# the running interpreter doesn't define, so these lookups assume the opcodes
# exist on every Python version this module supports — confirm before
# targeting newer interpreters.
OP_BREAK_LOOP = _opcode('BREAK_LOOP')
OP_END_FINALLY = _opcode('END_FINALLY')
OP_COMPARE_OP = _opcode('COMPARE_OP')
COMPARE_EXCEPTION = 10  # just have to get this constant from the code.
OP_LOAD_CONST = _opcode('LOAD_CONST')
OP_RETURN_VALUE = _opcode('RETURN_VALUE')
326
327
class ByteParser(object):
    """Parse byte codes to understand the structure of code."""

    @contract(text='unicode')
    def __init__(self, text, code=None, filename=None):
        """
        `text` is the source text.  `code` is an optional code object built
        from that text; if omitted, `text` is compiled here, with `filename`
        used in any error message.

        Raises `NotPython` if the text can't be compiled.

        """
        self.text = text
        if code:
            self.code = code
        else:
            try:
                self.code = compile_unicode(text, filename, "exec")
            except SyntaxError as synerr:
                raise NotPython(
                    u"Couldn't parse '%s' as Python source: '%s' at line %d" % (
                        filename, synerr.msg, synerr.lineno
                    )
                )

        # Alternative Python implementations don't always provide all the
        # attributes on code objects that we need to do the analysis.
        for attr in ['co_lnotab', 'co_firstlineno', 'co_consts', 'co_code']:
            if not hasattr(self.code, attr):
                raise CoverageException(
                    "This implementation of Python doesn't support code analysis.\n"
                    "Run coverage.py under CPython for this command."
                )

    def child_parsers(self):
        """Iterate over all the code objects nested within this one.

        The iteration includes `self` as its first value.

        """
        children = CodeObjects(self.code)
        return (ByteParser(self.text, code=c) for c in children)

    def _bytes_lines(self):
        """Map byte offsets to line numbers in `code`.

        Uses co_lnotab described in Python/compile.c to map byte offsets to
        line numbers.  Produces a sequence: (b0, l0), (b1, l1), ...

        Only byte offsets that correspond to line numbers are included in the
        results.

        """
        # Adapted from dis.py in the standard library.
        # co_lnotab interleaves byte-offset deltas (even indexes) with
        # line-number deltas (odd indexes).
        byte_increments = bytes_to_ints(self.code.co_lnotab[0::2])
        line_increments = bytes_to_ints(self.code.co_lnotab[1::2])

        last_line_num = None
        line_num = self.code.co_firstlineno
        byte_num = 0
        for byte_incr, line_incr in zip(byte_increments, line_increments):
            if byte_incr:
                if line_num != last_line_num:
                    yield (byte_num, line_num)
                    last_line_num = line_num
                byte_num += byte_incr
            line_num += line_incr
        # Emit the final pending pair, if any.
        if line_num != last_line_num:
            yield (byte_num, line_num)

    def _find_statements(self):
        """Find the statements in `self.code`.

        Produce a sequence of line numbers that start statements.  Recurses
        into all code objects reachable from `self.code`.

        """
        for bp in self.child_parsers():
            # Get all of the lineno information from this code.
            for _, l in bp._bytes_lines():
                yield l

    def _block_stack_repr(self, block_stack):               # pragma: debugging
        """Get a string version of `block_stack`, for debugging."""
        blocks = ", ".join(
            "(%s, %r)" % (dis.opname[b[0]], b[1]) for b in block_stack
        )
        return "[" + blocks + "]"

    def _split_into_chunks(self):
        """Split the code object into a list of `Chunk` objects.

        Each chunk is only entered at its first instruction, though there can
        be many exits from a chunk.

        Returns a list of `Chunk` objects.

        """
        # The list of chunks so far, and the one we're working on.
        chunks = []
        chunk = None

        # A dict mapping byte offsets of line starts to the line numbers.
        bytes_lines_map = dict(self._bytes_lines())

        # The block stack: loops and try blocks get pushed here for the
        # implicit jumps that can occur.
        # Each entry is a tuple: (block type, destination)
        block_stack = []

        # Some op codes are followed by branches that should be ignored.  This
        # is a count of how many ignores are left.
        ignore_branch = 0

        # We have to handle the last two bytecodes specially.  `ult` is the
        # last ("ultimate") bytecode seen so far, `penult` the one before it.
        ult = penult = None

        # Get a set of all of the jump-to points.
        jump_to = set()
        bytecodes = list(ByteCodes(self.code.co_code))
        for bc in bytecodes:
            if bc.jump_to >= 0:
                jump_to.add(bc.jump_to)

        chunk_lineno = 0

        # Walk the byte codes building chunks.
        for bc in bytecodes:
            # Maybe have to start a new chunk.
            start_new_chunk = False
            first_chunk = False
            if bc.offset in bytes_lines_map:
                # Start a new chunk for each source line number.
                start_new_chunk = True
                chunk_lineno = bytes_lines_map[bc.offset]
                first_chunk = True
            elif bc.offset in jump_to:
                # To make chunks have a single entrance, we have to make a new
                # chunk when we get to a place some bytecode jumps to.
                start_new_chunk = True
            elif bc.op in OPS_CHUNK_BEGIN:
                # Jumps deserve their own unnumbered chunk.  This fixes
                # problems with jumps to jumps getting confused.
                start_new_chunk = True

            if not chunk or start_new_chunk:
                if chunk:
                    # Execution falls through from the old chunk to the new.
                    chunk.exits.add(bc.offset)
                chunk = Chunk(bc.offset, chunk_lineno, first_chunk)
                if not chunks:
                    # The very first chunk of a code object is always an
                    # entrance.
                    chunk.entrance = True
                chunks.append(chunk)

            # Look at the opcode.
            if bc.jump_to >= 0 and bc.op not in OPS_NO_JUMP:
                if ignore_branch:
                    # Someone earlier wanted us to ignore this branch.
                    ignore_branch -= 1
                else:
                    # The opcode has a jump, it's an exit for this chunk.
                    chunk.exits.add(bc.jump_to)

            if bc.op in OPS_CODE_END:
                # The opcode can exit the code object.  Exits from the code
                # object are encoded as negative line numbers.
                chunk.exits.add(-self.code.co_firstlineno)
            if bc.op in OPS_PUSH_BLOCK:
                # The opcode adds a block to the block_stack.
                block_stack.append((bc.op, bc.jump_to))
            if bc.op in OPS_POP_BLOCK:
                # The opcode pops a block from the block stack.
                block_stack.pop()
            if bc.op in OPS_CHUNK_END:
                # This opcode forces the end of the chunk.
                if bc.op == OP_BREAK_LOOP:
                    # A break is implicit: jump where the top of the
                    # block_stack points.
                    chunk.exits.add(block_stack[-1][1])
                chunk = None
            if bc.op == OP_END_FINALLY:
                # For the finally clause we need to find the closest exception
                # block, and use its jump target as an exit.
                for block in reversed(block_stack):
                    if block[0] in OPS_EXCEPT_BLOCKS:
                        chunk.exits.add(block[1])
                        break
            if bc.op == OP_COMPARE_OP and bc.arg == COMPARE_EXCEPTION:
                # This is an except clause.  We want to overlook the next
                # branch, so that except's don't count as branches.
                ignore_branch += 1

            penult = ult
            ult = bc

        if chunks:
            # The last two bytecodes could be a dummy "return None" that
            # shouldn't be counted as real code. Every Python code object seems
            # to end with a return, and a "return None" is inserted if there
            # isn't an explicit return in the source.
            if ult and penult:
                if penult.op == OP_LOAD_CONST and ult.op == OP_RETURN_VALUE:
                    if self.code.co_consts[penult.arg] is None:
                        # This is "return None", but is it dummy?  A real line
                        # would be a last chunk all by itself.
                        if chunks[-1].byte != penult.offset:
                            ex = -self.code.co_firstlineno
                            # Split the last chunk
                            last_chunk = chunks[-1]
                            last_chunk.exits.remove(ex)
                            last_chunk.exits.add(penult.offset)
                            chunk = Chunk(
                                penult.offset, last_chunk.line, False
                            )
                            chunk.exits.add(ex)
                            chunks.append(chunk)

            # Give all the chunks a length.  `bc` here is the last bytecode
            # from the loop above.
            chunks[-1].length = bc.next_offset - chunks[-1].byte
            for i in range(len(chunks)-1):
                chunks[i].length = chunks[i+1].byte - chunks[i].byte

        #self.validate_chunks(chunks)
        return chunks

    def validate_chunks(self, chunks):                      # pragma: debugging
        """Validate the rule that chunks have a single entrance."""
        # starts is the entrances to the chunks
        starts = set(ch.byte for ch in chunks)
        for ch in chunks:
            assert all((ex in starts or ex < 0) for ex in ch.exits)

    def _arcs(self):
        """Find the executable arcs in the code.

        Yields pairs: (from,to).  From and to are integer line numbers.  If
        from is < 0, then the arc is an entrance into the code object.  If to
        is < 0, the arc is an exit from the code object.

        """
        chunks = self._split_into_chunks()

        # A map from byte offsets to the chunk starting at that offset.
        byte_chunks = dict((c.byte, c) for c in chunks)

        # Traverse from the first chunk in each line, and yield arcs where
        # the trace function will be invoked.
        for chunk in chunks:
            if chunk.entrance:
                yield (-1, chunk.line)

            if not chunk.first:
                continue

            chunks_considered = set()
            chunks_to_consider = [chunk]
            while chunks_to_consider:
                # Get the chunk we're considering, and make sure we don't
                # consider it again.
                this_chunk = chunks_to_consider.pop()
                chunks_considered.add(this_chunk)

                # For each exit, add the line number if the trace function
                # would be triggered, or add the chunk to those being
                # considered if not.
                for ex in this_chunk.exits:
                    if ex < 0:
                        yield (chunk.line, ex)
                    else:
                        next_chunk = byte_chunks[ex]
                        if next_chunk in chunks_considered:
                            continue

                        # The trace function is invoked if visiting the first
                        # bytecode in a line, or if the transition is a
                        # backward jump.
                        backward_jump = next_chunk.byte < this_chunk.byte
                        if next_chunk.first or backward_jump:
                            if next_chunk.line != chunk.line:
                                yield (chunk.line, next_chunk.line)
                        else:
                            chunks_to_consider.append(next_chunk)

    def _all_chunks(self):
        """Returns a list of `Chunk` objects for this code and its children.

        See `_split_into_chunks` for details.

        """
        chunks = []
        for bp in self.child_parsers():
            chunks.extend(bp._split_into_chunks())

        return chunks

    def _all_arcs(self):
        """Get the set of all arcs in this code object and its children.

        See `_arcs` for details.

        """
        arcs = set()
        for bp in self.child_parsers():
            arcs.update(bp._arcs())

        return arcs
627
628
class Chunk(object):
    """A sequence of byte codes with a single entrance.

    Byte code is analyzed by dividing it into chunks: runs of byte codes that
    can only be entered at their first instruction.  This is nearly the CS
    notion of a `basic block`_, except a chunk may have any number of exits,
    and "basic block" is a more cumbersome term.

    .. _basic block: http://en.wikipedia.org/wiki/Basic_block

    Attributes:

    * `byte`: the offset of the bytecode that starts this chunk.
    * `line`: the source line number containing this chunk.
    * `first`: true if this is the first chunk in its source line.
    * `length`: the number of bytes covered by the chunk.
    * `entrance`: true if the code object can be entered at this chunk.
    * `exits`: a set of places this chunk can go.  An exit < 0 means the
      chunk can leave the code (return); its value is the negative of the
      starting line number of the code block.

    """
    def __init__(self, byte, line, first):
        self.byte = byte
        self.line = line
        self.first = first
        # Filled in later by the chunk splitter.
        self.length = 0
        self.entrance = False
        self.exits = set()

    def __repr__(self):
        # Flag characters: "!" marks the first chunk of a line, "v" marks a
        # code-object entrance.
        flags = ""
        if self.first:
            flags += "!"
        if self.entrance:
            flags += "v"
        return "<%d+%d @%d%s %r>" % (
            self.byte, self.length, self.line, flags, list(self.exits),
        )