1"""Code parsing for Coverage."""
2
3import opcode, re, sys, token, tokenize
4
5from coverage.backward import set, sorted, StringIO # pylint: disable=W0622
6from coverage.backward import open_source
7from coverage.bytecode import ByteCodes, CodeObjects
8from coverage.misc import nice_pair, expensive, join_regex
9from coverage.misc import CoverageException, NoSource, NotPython
10
11
12class CodeParser(object):
13    """Parse code to find executable lines, excluded lines, etc."""
14
15    def __init__(self, text=None, filename=None, exclude=None):
16        """
17        Source can be provided as `text`, the text itself, or `filename`, from
18        which the text will be read.  Excluded lines are those that match
19        `exclude`, a regex.
20
21        """
22        assert text or filename, "CodeParser needs either text or filename"
23        self.filename = filename or "<code>"
24        self.text = text
25        if not self.text:
26            try:
27                sourcef = open_source(self.filename)
28                try:
29                    self.text = sourcef.read()
30                finally:
31                    sourcef.close()
32            except IOError:
33                _, err, _ = sys.exc_info()
34                raise NoSource(
35                    "No source for code: %r: %s" % (self.filename, err)
36                    )
37
38        self.exclude = exclude
39
40        self.show_tokens = False
41
42        # The text lines of the parsed code.
43        self.lines = self.text.split('\n')
44
45        # The line numbers of excluded lines of code.
46        self.excluded = set()
47
48        # The line numbers of docstring lines.
49        self.docstrings = set()
50
51        # The line numbers of class definitions.
52        self.classdefs = set()
53
54        # A dict mapping line numbers to (lo,hi) for multi-line statements.
55        self.multiline = {}
56
57        # The line numbers that start statements.
58        self.statement_starts = set()
59
60        # Lazily-created ByteParser
61        self._byte_parser = None
62
63    def _get_byte_parser(self):
64        """Create a ByteParser on demand."""
65        if not self._byte_parser:
66            self._byte_parser = \
67                            ByteParser(text=self.text, filename=self.filename)
68        return self._byte_parser
69    byte_parser = property(_get_byte_parser)
70
71    def lines_matching(self, *regexes):
72        """Find the lines matching one of a list of regexes.
73
74        Returns a set of line numbers, the lines that contain a match for one
75        of the regexes in `regexes`.  The entire line needn't match, just a
76        part of it.
77
78        """
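        # For example (illustrative): lines_matching(r"#\s*pragma: no cover")
        # would return the 1-based numbers of the lines carrying that pragma.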
        regex_c = re.compile(join_regex(regexes))
        matches = set()
        for i, ltext in enumerate(self.lines):
            if regex_c.search(ltext):
                matches.add(i+1)
        return matches

    def _raw_parse(self):
        """Parse the source to find the interesting facts about its lines.

        Updates self.excluded, self.docstrings, self.classdefs,
        self.multiline, and self.statement_starts.

        """
        # Find lines which match an exclusion pattern.
        if self.exclude:
            self.excluded = self.lines_matching(self.exclude)

        # Tokenize, to find excluded suites, to find docstrings, and to find
        # multi-line statements.
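        # State for the scan: the current indent level, the indent at which an
        # excluded suite started, whether we are currently inside an excluded
        # suite, the previous token type, the first line of the statement in
        # progress, and whether any real token has been seen at all.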
        indent = 0
        exclude_indent = 0
        excluding = False
        prev_toktype = token.INDENT
        first_line = None
        empty = True

        tokgen = tokenize.generate_tokens(StringIO(self.text).readline)
        for toktype, ttext, (slineno, _), (elineno, _), ltext in tokgen:
            if self.show_tokens:                # pragma: no cover
                print("%10s %5s %-20r %r" % (
                    tokenize.tok_name.get(toktype, toktype),
                    nice_pair((slineno, elineno)), ttext, ltext
                    ))
            if toktype == token.INDENT:
                indent += 1
            elif toktype == token.DEDENT:
                indent -= 1
            elif toktype == token.NAME and ttext == 'class':
                # Class definitions look like branches in the byte code, so
                # we need to exclude them.  The simplest way is to note the
                # lines with the 'class' keyword.
                self.classdefs.add(slineno)
            elif toktype == token.OP and ttext == ':':
                if not excluding and elineno in self.excluded:
                    # Start excluding a suite.  We trigger off of the colon
                    # token so that the #pragma comment will be recognized on
                    # the same line as the colon.
                    exclude_indent = indent
                    excluding = True
            elif toktype == token.STRING and prev_toktype == token.INDENT:
                # Strings that are first on an indented line are docstrings.
                # (A trick from trace.py in the stdlib.) This works for
                # 99.9999% of cases.  For the rest (!) see:
                # http://stackoverflow.com/questions/1769332/x/1769794#1769794
                for i in range(slineno, elineno+1):
                    self.docstrings.add(i)
            elif toktype == token.NEWLINE:
                if first_line is not None and elineno != first_line:
                    # We're at the end of a line, and we've ended on a
                    # different line than the first line of the statement,
                    # so record a multi-line range.
                    rng = (first_line, elineno)
                    for l in range(first_line, elineno+1):
                        self.multiline[l] = rng
                first_line = None

            if ttext.strip() and toktype != tokenize.COMMENT:
                # A non-whitespace token.
                empty = False
                if first_line is None:
                    # The token is not whitespace, and is the first in a
                    # statement.
                    first_line = slineno
                    # Check whether to end an excluded suite.
                    if excluding and indent <= exclude_indent:
                        excluding = False
                    if excluding:
                        self.excluded.add(elineno)

            prev_toktype = toktype

        # Find the starts of the executable statements.
        if not empty:
            self.statement_starts.update(self.byte_parser._find_statements())

    def first_line(self, line):
        """Return the first line number of the statement including `line`."""
        rng = self.multiline.get(line)
        if rng:
            first_line = rng[0]
        else:
            first_line = line
        return first_line

    def first_lines(self, lines, ignore=None):
        """Map the line numbers in `lines` to the correct first line of the
        statement.

        Skip any line mentioned in `ignore`.

        Returns a sorted list of the first lines.

        """
        ignore = ignore or []
        lset = set()
        for l in lines:
            if l in ignore:
                continue
            new_l = self.first_line(l)
            if new_l not in ignore:
                lset.add(new_l)
        return sorted(lset)

    def parse_source(self):
        """Parse source text to find executable lines, excluded lines, etc.

        Return values are 1) a sorted list of executable line numbers, and
        2) a sorted list of excluded line numbers.

        Reported line numbers are normalized to the first line of multi-line
        statements.

        """
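        # Typical use (illustrative):
        #   parser = CodeParser(text=src, exclude=r"#\s*pragma: no cover")
        #   statements, excluded = parser.parse_source()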
        self._raw_parse()

        excluded_lines = self.first_lines(self.excluded)
        ignore = excluded_lines + list(self.docstrings)
        lines = self.first_lines(self.statement_starts, ignore)

        return lines, excluded_lines

    def arcs(self):
        """Get information about the arcs available in the code.

        Returns a sorted list of line number pairs.  Line numbers have been
        normalized to the first line of multiline statements.

        """
        all_arcs = []
        for l1, l2 in self.byte_parser._all_arcs():
            fl1 = self.first_line(l1)
            fl2 = self.first_line(l2)
            if fl1 != fl2:
                all_arcs.append((fl1, fl2))
        return sorted(all_arcs)
    arcs = expensive(arcs)

    def exit_counts(self):
        """Get a mapping from line numbers to count of exits from that line.

        Excluded lines are not included, either as sources or destinations
        of arcs.

        """
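        # For example (illustrative): a line starting an `if` statement has
        # two exits (condition true, condition false), so its count is 2.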
        excluded_lines = self.first_lines(self.excluded)
        exit_counts = {}
        for l1, l2 in self.arcs():
            if l1 < 0:
                # Don't ever report -1 as a line number
                continue
            if l1 in excluded_lines:
                # Don't report excluded lines as line numbers.
                continue
            if l2 in excluded_lines:
                # Arcs to excluded lines shouldn't count.
                continue
            if l1 not in exit_counts:
                exit_counts[l1] = 0
            exit_counts[l1] += 1

        # Class definitions have one extra exit, so remove one for each:
        for l in self.classdefs:
            # Ensure key is there: classdefs can include excluded lines.
            if l in exit_counts:
                exit_counts[l] -= 1

        return exit_counts
    exit_counts = expensive(exit_counts)


## Opcodes that guide the ByteParser.

def _opcode(name):
    """Return the opcode by name from the opcode module."""
    return opcode.opmap[name]

def _opcode_set(*names):
    """Return a set of opcodes by the names in `names`."""
    s = set()
    for name in names:
        try:
            s.add(_opcode(name))
        except KeyError:
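            # Not every opcode exists in every Python version (SETUP_WITH,
            # for example, is missing from older interpreters), so silently
            # skip names this interpreter doesn't know.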
            pass
    return s

# Opcodes that leave the code object.
OPS_CODE_END = _opcode_set('RETURN_VALUE')

# Opcodes that unconditionally end the code chunk.
OPS_CHUNK_END = _opcode_set(
    'JUMP_ABSOLUTE', 'JUMP_FORWARD', 'RETURN_VALUE', 'RAISE_VARARGS',
    'BREAK_LOOP', 'CONTINUE_LOOP',
    )

# Opcodes that unconditionally begin a new code chunk.  By starting new chunks
# with unconditional jump instructions, we deal neatly with jumps to jumps.
OPS_CHUNK_BEGIN = _opcode_set('JUMP_ABSOLUTE', 'JUMP_FORWARD')

# Opcodes that push a block on the block stack.
OPS_PUSH_BLOCK = _opcode_set(
    'SETUP_LOOP', 'SETUP_EXCEPT', 'SETUP_FINALLY', 'SETUP_WITH'
    )

# Block types for exception handling.
OPS_EXCEPT_BLOCKS = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY')

# Opcodes that pop a block from the block stack.
OPS_POP_BLOCK = _opcode_set('POP_BLOCK')

# Opcodes that have a jump destination, but aren't really jumps.
OPS_NO_JUMP = OPS_PUSH_BLOCK

# Individual opcodes we need below.
OP_BREAK_LOOP = _opcode('BREAK_LOOP')
OP_END_FINALLY = _opcode('END_FINALLY')
OP_COMPARE_OP = _opcode('COMPARE_OP')
COMPARE_EXCEPTION = 10  # the index of 'exception match' in dis.cmp_op.
OP_LOAD_CONST = _opcode('LOAD_CONST')
OP_RETURN_VALUE = _opcode('RETURN_VALUE')


class ByteParser(object):
    """Parse byte codes to understand the structure of code."""

    def __init__(self, code=None, text=None, filename=None):
        if code:
            self.code = code
            self.text = text
        else:
            if not text:
                assert filename, "If no code or text, need a filename"
                sourcef = open_source(filename)
                try:
                    text = sourcef.read()
                finally:
                    sourcef.close()
            self.text = text

            try:
                # Python 2.3 and 2.4 don't like partial last lines, so be sure
                # the text ends nicely for them.
                self.code = compile(text + '\n', filename, "exec")
            except SyntaxError:
                _, synerr, _ = sys.exc_info()
                raise NotPython(
                    "Couldn't parse '%s' as Python source: '%s' at line %d" %
                        (filename, synerr.msg, synerr.lineno)
                    )

        # Alternative Python implementations don't always provide all the
        # attributes on code objects that we need to do the analysis.
        for attr in ['co_lnotab', 'co_firstlineno', 'co_consts', 'co_code']:
            if not hasattr(self.code, attr):
                raise CoverageException(
                    "This implementation of Python doesn't support code "
                    "analysis.\n"
                    "Run coverage.py under CPython for this command."
                    )

    def child_parsers(self):
        """Iterate over all the code objects nested within this one.

        The iteration includes `self` as its first value.

        """
        children = CodeObjects(self.code)
        return [ByteParser(code=c, text=self.text) for c in children]

    # Getting numbers from the lnotab value changed in Py3.0.
    if sys.version_info >= (3, 0):
        def _lnotab_increments(self, lnotab):
            """Return a list of ints from the lnotab bytes in 3.x"""
            return list(lnotab)
    else:
        def _lnotab_increments(self, lnotab):
            """Return a list of ints from the lnotab string in 2.x"""
            return [ord(c) for c in lnotab]

    def _bytes_lines(self):
        """Map byte offsets to line numbers in `code`.

        Uses co_lnotab described in Python/compile.c to map byte offsets to
        line numbers.  Returns a list: [(b0, l0), (b1, l1), ...]

        """
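        # For example (illustrative): a co_lnotab of b'\x06\x01\x08\x02' with
        # co_firstlineno == 1 decodes to [(0, 1), (6, 2), (14, 4)].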
        # Adapted from dis.py in the standard library.
        byte_increments = self._lnotab_increments(self.code.co_lnotab[0::2])
        line_increments = self._lnotab_increments(self.code.co_lnotab[1::2])

        bytes_lines = []
        last_line_num = None
        line_num = self.code.co_firstlineno
        byte_num = 0
        for byte_incr, line_incr in zip(byte_increments, line_increments):
            if byte_incr:
                if line_num != last_line_num:
                    bytes_lines.append((byte_num, line_num))
                    last_line_num = line_num
                byte_num += byte_incr
            line_num += line_incr
        if line_num != last_line_num:
            bytes_lines.append((byte_num, line_num))
        return bytes_lines

    def _find_statements(self):
        """Find the statements in `self.code`.

        Return a set of line numbers that start statements.  Recurses into all
        code objects reachable from `self.code`.

        """
        stmts = set()
        for bp in self.child_parsers():
            # Get all of the lineno information from this code.
            for _, l in bp._bytes_lines():
                stmts.add(l)
        return stmts

    def _split_into_chunks(self):
        """Split the code object into a list of `Chunk` objects.

        Each chunk is only entered at its first instruction, though there can
        be many exits from a chunk.

        Returns a list of `Chunk` objects.

        """

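        # Chunk boundaries, as the loop below establishes them: a byte offset
        # that starts a new source line begins a line-numbered chunk, and an
        # unconditional jump begins an unnumbered one.  Exits are recorded as
        # byte offsets, or as negative line numbers for leaving the code
        # object entirely.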
        # The list of chunks so far, and the one we're working on.
        chunks = []
        chunk = None
        bytes_lines_map = dict(self._bytes_lines())

        # The block stack: loops and try blocks get pushed here for the
        # implicit jumps that can occur.
        # Each entry is a tuple: (block type, destination)
        block_stack = []

        # Some op codes are followed by branches that should be ignored.  This
        # is a count of how many ignores are left.
        ignore_branch = 0

        # We have to handle the last two bytecodes specially; `ult` and
        # `penult` hold the ultimate and penultimate bytecodes seen.
        ult = penult = None

        for bc in ByteCodes(self.code.co_code):
            # Maybe have to start a new chunk
            if bc.offset in bytes_lines_map:
                # Start a new chunk for each source line number.
                if chunk:
                    chunk.exits.add(bc.offset)
                chunk = Chunk(bc.offset, bytes_lines_map[bc.offset])
                chunks.append(chunk)
            elif bc.op in OPS_CHUNK_BEGIN:
                # Jumps deserve their own unnumbered chunk.  This fixes
                # problems with jumps to jumps getting confused.
                if chunk:
                    chunk.exits.add(bc.offset)
                chunk = Chunk(bc.offset)
                chunks.append(chunk)

            if not chunk:
                chunk = Chunk(bc.offset)
                chunks.append(chunk)

            # Look at the opcode
            if bc.jump_to >= 0 and bc.op not in OPS_NO_JUMP:
                if ignore_branch:
                    # Someone earlier wanted us to ignore this branch.
                    ignore_branch -= 1
                else:
                    # The opcode has a jump, it's an exit for this chunk.
                    chunk.exits.add(bc.jump_to)

            if bc.op in OPS_CODE_END:
                # The opcode can exit the code object.
                chunk.exits.add(-self.code.co_firstlineno)
            if bc.op in OPS_PUSH_BLOCK:
                # The opcode adds a block to the block_stack.
                block_stack.append((bc.op, bc.jump_to))
            if bc.op in OPS_POP_BLOCK:
                # The opcode pops a block from the block stack.
                block_stack.pop()
            if bc.op in OPS_CHUNK_END:
                # This opcode forces the end of the chunk.
                if bc.op == OP_BREAK_LOOP:
                    # A break is implicit: jump where the top of the
                    # block_stack points.
                    chunk.exits.add(block_stack[-1][1])
                chunk = None
            if bc.op == OP_END_FINALLY:
                if block_stack:
                    # A break that goes through a finally will jump to whatever
                    # block is on top of the stack.
                    chunk.exits.add(block_stack[-1][1])
                # For the finally clause we need to find the closest exception
                # block, and use its jump target as an exit.
                for iblock in range(len(block_stack)-1, -1, -1):
                    if block_stack[iblock][0] in OPS_EXCEPT_BLOCKS:
                        chunk.exits.add(block_stack[iblock][1])
                        break
            if bc.op == OP_COMPARE_OP and bc.arg == COMPARE_EXCEPTION:
                # This is an except clause.  We want to overlook the next
                # branch, so that except clauses don't count as branches.
                ignore_branch += 1

            penult = ult
            ult = bc

        if chunks:
            # The last two bytecodes could be a dummy "return None" that
            # shouldn't be counted as real code. Every Python code object seems
            # to end with a return, and a "return None" is inserted if there
            # isn't an explicit return in the source.
            if ult and penult:
                if penult.op == OP_LOAD_CONST and ult.op == OP_RETURN_VALUE:
                    if self.code.co_consts[penult.arg] is None:
                        # This is "return None", but is it a dummy?  A real
                        # line would be a last chunk all by itself.
                        if chunks[-1].byte != penult.offset:
                            ex = -self.code.co_firstlineno
                            # Split the last chunk
                            last_chunk = chunks[-1]
                            last_chunk.exits.remove(ex)
                            last_chunk.exits.add(penult.offset)
                            chunk = Chunk(penult.offset)
                            chunk.exits.add(ex)
                            chunks.append(chunk)

            # Give all the chunks a length.
            chunks[-1].length = bc.next_offset - chunks[-1].byte
            for i in range(len(chunks)-1):
                chunks[i].length = chunks[i+1].byte - chunks[i].byte

        return chunks

    def _arcs(self):
        """Find the executable arcs in the code.

        Returns a set of pairs, (from,to).  From and to are integer line
        numbers.  If from is < 0, then the arc is an entrance into the code
        object.  If to is < 0, the arc is an exit from the code object.

        """
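        # The work happens in two passes over the chunks: first compute, for
        # each chunk's starting byte offset, the set of source lines reachable
        # from it; then turn each line-numbered chunk's exits into (line, line)
        # pairs using that map.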
        chunks = self._split_into_chunks()

        # A map from byte offsets to chunks jumped into.
        byte_chunks = dict([(c.byte, c) for c in chunks])

        # Build a map from byte offsets to actual lines reached.
        byte_lines = {}
        bytes_to_add = set([c.byte for c in chunks])

        while bytes_to_add:
            byte_to_add = bytes_to_add.pop()
            if byte_to_add in byte_lines or byte_to_add < 0:
                continue

            # Which lines does this chunk lead to?
            bytes_considered = set()
            bytes_to_consider = [byte_to_add]
            lines = set()

            while bytes_to_consider:
                byte = bytes_to_consider.pop()
                bytes_considered.add(byte)

                # Find chunk for byte
                try:
                    ch = byte_chunks[byte]
                except KeyError:
                    for ch in chunks:
                        if ch.byte <= byte < ch.byte+ch.length:
                            break
                    else:
                        # No chunk for this byte!
                        raise Exception("Couldn't find chunk @ %d" % byte)
                    byte_chunks[byte] = ch

                if ch.line:
                    lines.add(ch.line)
                else:
                    for ex in ch.exits:
                        if ex < 0:
                            lines.add(ex)
                        elif ex not in bytes_considered:
                            bytes_to_consider.append(ex)

                bytes_to_add.update(ch.exits)

            byte_lines[byte_to_add] = lines

        # Figure out for each chunk where the exits go.
        arcs = set()
        for chunk in chunks:
            if chunk.line:
                for ex in chunk.exits:
                    if ex < 0:
                        exit_lines = [ex]
                    else:
                        exit_lines = byte_lines[ex]
                    for exit_line in exit_lines:
                        if chunk.line != exit_line:
                            arcs.add((chunk.line, exit_line))
        for line in byte_lines[0]:
            arcs.add((-1, line))

        return arcs

    def _all_chunks(self):
        """Returns a list of `Chunk` objects for this code and its children.

        See `_split_into_chunks` for details.

        """
        chunks = []
        for bp in self.child_parsers():
            chunks.extend(bp._split_into_chunks())

        return chunks

    def _all_arcs(self):
        """Get the set of all arcs in this code object and its children.

        See `_arcs` for details.

        """
        arcs = set()
        for bp in self.child_parsers():
            arcs.update(bp._arcs())

        return arcs


class Chunk(object):
    """A sequence of bytecodes with a single entrance.

    To analyze byte code, we have to divide it into chunks, sequences of byte
    codes such that each basic block has only one entrance, the first
    instruction in the block.

    This is almost the CS concept of `basic block`_, except that we're willing
    to have many exits from a chunk, and "basic block" is a more cumbersome
    term.

    .. _basic block: http://en.wikipedia.org/wiki/Basic_block

    An exit < 0 means the chunk can leave the code (return).  The exit is
    the negative of the starting line number of the code block.

    """
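    # For example (illustrative): a chunk starting at byte 0 for source line 1,
    # with length 6 and a single exit at byte offset 6, has the repr
    # "<0+6 @1 [6]>".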
    def __init__(self, byte, line=0):
        self.byte = byte
        self.line = line
        self.length = 0
        self.exits = set()

    def __repr__(self):
        return "<%d+%d @%d %r>" % (
            self.byte, self.length, self.line, list(self.exits)
            )