# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt

"""Code parsing for coverage.py."""

import collections
import dis
import re
import token
import tokenize

from coverage.backward import range    # pylint: disable=redefined-builtin
from coverage.backward import bytes_to_ints
from coverage.bytecode import ByteCodes, CodeObjects
from coverage.misc import contract, nice_pair, expensive, join_regex
from coverage.misc import CoverageException, NoSource, NotPython
from coverage.phystokens import compile_unicode, generate_tokens


class PythonParser(object):
    """Parse code to find executable lines, excluded lines, etc."""

    @contract(text='unicode|None')
    def __init__(self, text=None, filename=None, exclude=None):
        """
        Source can be provided as `text`, the text itself, or `filename`, from
        which the text will be read. Excluded lines are those that match
        `exclude`, a regex.

        """
        assert text or filename, "PythonParser needs either text or filename"
        self.filename = filename or "<code>"
        self.text = text
        if not self.text:
            from coverage.python import get_python_source
            try:
                self.text = get_python_source(self.filename)
            except IOError as err:
                raise NoSource(
                    "No source for code: '%s': %s" % (self.filename, err)
                )

        self.exclude = exclude

        self.show_tokens = False

        # The text lines of the parsed code.
        self.lines = self.text.split('\n')

        # The line numbers of excluded lines of code.
        self.excluded = set()

        # The line numbers of docstring lines.
        self.docstrings = set()

        # The line numbers of class definitions.
        self.classdefs = set()

        # A dict mapping each line number in a multi-line statement to the
        # first line number of that statement.
        self.multiline = {}

        # The line numbers that start statements.
        self.statement_starts = set()

        # Lazily-created ByteParser and arc data.
        self._byte_parser = None
        self._all_arcs = None

    @property
    def byte_parser(self):
        """Create a ByteParser on demand."""
        if not self._byte_parser:
            self._byte_parser = ByteParser(self.text, filename=self.filename)
        return self._byte_parser

    def lines_matching(self, *regexes):
        """Find the lines matching one of a list of regexes.

        Returns a set of line numbers, the lines that contain a match for one
        of the regexes in `regexes`. The entire line needn't match, just a
        part of it.

        """
        regex_c = re.compile(join_regex(regexes))
        matches = set()
        for i, ltext in enumerate(self.lines, start=1):
            if regex_c.search(ltext):
                matches.add(i)
        return matches

    def _raw_parse(self):
        """Parse the source to find the interesting facts about its lines.

        A handful of member fields are updated.

        """
        # Find lines which match an exclusion pattern.
        if self.exclude:
            self.excluded = self.lines_matching(self.exclude)

        # Tokenize, to find excluded suites, to find docstrings, and to find
        # multi-line statements.
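        # State for the token walk: the current indent depth, the indent at
        # which an excluded suite started, whether we are currently excluding,
        # the previous token type, the first line of the statement in
        # progress, and whether we have seen any real code yet.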
        indent = 0
        exclude_indent = 0
        excluding = False
        prev_toktype = token.INDENT
        first_line = None
        empty = True

        tokgen = generate_tokens(self.text)
        for toktype, ttext, (slineno, _), (elineno, _), ltext in tokgen:
            if self.show_tokens:                # pragma: not covered
                print("%10s %5s %-20r %r" % (
                    tokenize.tok_name.get(toktype, toktype),
                    nice_pair((slineno, elineno)), ttext, ltext
                ))
            if toktype == token.INDENT:
                indent += 1
            elif toktype == token.DEDENT:
                indent -= 1
            elif toktype == token.NAME and ttext == 'class':
                # Class definitions look like branches in the byte code, so
                # we need to exclude them. The simplest way is to note the
                # lines with the 'class' keyword.
                self.classdefs.add(slineno)
            elif toktype == token.OP and ttext == ':':
                if not excluding and elineno in self.excluded:
                    # Start excluding a suite. We trigger off of the colon
                    # token so that the #pragma comment will be recognized on
                    # the same line as the colon.
                    exclude_indent = indent
                    excluding = True
            elif toktype == token.STRING and prev_toktype == token.INDENT:
                # Strings that are first on an indented line are docstrings.
                # (a trick from trace.py in the stdlib.) This works for
                # 99.9999% of cases. For the rest (!) see:
                # http://stackoverflow.com/questions/1769332/x/1769794#1769794
                self.docstrings.update(range(slineno, elineno+1))
            elif toktype == token.NEWLINE:
                if first_line is not None and elineno != first_line:
                    # We're at the end of a line, and we've ended on a
                    # different line than the first line of the statement,
                    # so record a multi-line range.
                    for l in range(first_line, elineno+1):
                        self.multiline[l] = first_line
                first_line = None

            if ttext.strip() and toktype != tokenize.COMMENT:
                # A non-whitespace token.
                empty = False
                if first_line is None:
                    # The token is not whitespace, and is the first in a
                    # statement.
                    first_line = slineno
                    # Check whether to end an excluded suite.
                    if excluding and indent <= exclude_indent:
                        excluding = False
                    if excluding:
                        self.excluded.add(elineno)

            prev_toktype = toktype

        # Find the starts of the executable statements.
        if not empty:
            self.statement_starts.update(self.byte_parser._find_statements())

    def first_line(self, line):
        """Return the first line number of the statement including `line`."""
        first_line = self.multiline.get(line)
        if first_line:
            return first_line
        else:
            return line

    def first_lines(self, lines):
        """Map the line numbers in `lines` to the correct first line of the
        statement.

        Returns a set of the first lines.

        """
        return set(self.first_line(l) for l in lines)

    def translate_lines(self, lines):
        """Implement `FileReporter.translate_lines`."""
        return self.first_lines(lines)

    def translate_arcs(self, arcs):
        """Implement `FileReporter.translate_arcs`."""
        return [
            (self.first_line(a), self.first_line(b))
            for (a, b) in arcs
        ]

    @expensive
    def parse_source(self):
        """Parse source text to find executable lines, excluded lines, etc.

        Return values are 1) a set of executable line numbers, and 2) a set of
        excluded line numbers.

        Reported line numbers are normalized to the first line of multi-line
        statements.
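
        Raises `NotPython` if the source is not valid Python.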

        """
        try:
            self._raw_parse()
        except (tokenize.TokenError, IndentationError) as err:
            if hasattr(err, "lineno"):
                lineno = err.lineno         # IndentationError
            else:
                lineno = err.args[1][0]     # TokenError
            raise NotPython(
                u"Couldn't parse '%s' as Python source: '%s' at line %d" % (
                    self.filename, err.args[0], lineno
                )
            )

        excluded_lines = self.first_lines(self.excluded)
        ignore = set()
        ignore.update(excluded_lines)
        ignore.update(self.docstrings)
        starts = self.statement_starts - ignore
        lines = self.first_lines(starts)
        lines -= ignore

        return lines, excluded_lines

    def arcs(self):
        """Get information about the arcs available in the code.

        Returns a set of line number pairs. Line numbers have been normalized
        to the first line of multi-line statements.

        """
        if self._all_arcs is None:
            self._all_arcs = set()
            for l1, l2 in self.byte_parser._all_arcs():
                fl1 = self.first_line(l1)
                fl2 = self.first_line(l2)
                if fl1 != fl2:
                    self._all_arcs.add((fl1, fl2))
        return self._all_arcs

    def exit_counts(self):
        """Get a count of exits from each line.

        Excluded lines are excluded.

        """
        excluded_lines = self.first_lines(self.excluded)
        exit_counts = collections.defaultdict(int)
        for l1, l2 in self.arcs():
            if l1 < 0:
                # Don't ever report -1 as a line number
                continue
            if l1 in excluded_lines:
                # Don't report excluded lines as line numbers.
                continue
            if l2 in excluded_lines:
                # Arcs to excluded lines shouldn't count.
                continue
            exit_counts[l1] += 1

        # Class definitions have one extra exit, so remove one for each:
        for l in self.classdefs:
            # Ensure key is there: class definitions can include excluded lines.
            if l in exit_counts:
                exit_counts[l] -= 1

        return exit_counts


## Opcodes that guide the ByteParser.

def _opcode(name):
    """Return the opcode by name from the dis module."""
    return dis.opmap[name]


def _opcode_set(*names):
    """Return a set of opcodes by the names in `names`."""
    s = set()
    for name in names:
        try:
            s.add(_opcode(name))
        except KeyError:
            pass
    return s

# Opcodes that leave the code object.
OPS_CODE_END = _opcode_set('RETURN_VALUE')

# Opcodes that unconditionally end the code chunk.
OPS_CHUNK_END = _opcode_set(
    'JUMP_ABSOLUTE', 'JUMP_FORWARD', 'RETURN_VALUE', 'RAISE_VARARGS',
    'BREAK_LOOP', 'CONTINUE_LOOP',
)

# Opcodes that unconditionally begin a new code chunk. By starting new chunks
# with unconditional jump instructions, we neatly deal with jumps to jumps
# properly.
OPS_CHUNK_BEGIN = _opcode_set('JUMP_ABSOLUTE', 'JUMP_FORWARD')

# Opcodes that push a block on the block stack.
OPS_PUSH_BLOCK = _opcode_set(
    'SETUP_LOOP', 'SETUP_EXCEPT', 'SETUP_FINALLY', 'SETUP_WITH'
)

# Block types for exception handling.
OPS_EXCEPT_BLOCKS = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY')

# Opcodes that pop a block from the block stack.
OPS_POP_BLOCK = _opcode_set('POP_BLOCK')

# Opcodes that have a jump destination, but aren't really a jump.
OPS_NO_JUMP = OPS_PUSH_BLOCK

# Individual opcodes we need below.
OP_BREAK_LOOP = _opcode('BREAK_LOOP')
OP_END_FINALLY = _opcode('END_FINALLY')
OP_COMPARE_OP = _opcode('COMPARE_OP')
COMPARE_EXCEPTION = 10  # just have to get this constant from the code.
OP_LOAD_CONST = _opcode('LOAD_CONST')
OP_RETURN_VALUE = _opcode('RETURN_VALUE')


class ByteParser(object):
    """Parse byte codes to understand the structure of code."""

    @contract(text='unicode')
    def __init__(self, text, code=None, filename=None):
        self.text = text
        if code:
            self.code = code
        else:
            try:
                self.code = compile_unicode(text, filename, "exec")
            except SyntaxError as synerr:
                raise NotPython(
                    u"Couldn't parse '%s' as Python source: '%s' at line %d" % (
                        filename, synerr.msg, synerr.lineno
                    )
                )

        # Alternative Python implementations don't always provide all the
        # attributes on code objects that we need to do the analysis.
        for attr in ['co_lnotab', 'co_firstlineno', 'co_consts', 'co_code']:
            if not hasattr(self.code, attr):
                raise CoverageException(
                    "This implementation of Python doesn't support code analysis.\n"
                    "Run coverage.py under CPython for this command."
                )

    def child_parsers(self):
        """Iterate over all the code objects nested within this one.

        The iteration includes `self` as its first value.

        """
        children = CodeObjects(self.code)
        return (ByteParser(self.text, code=c) for c in children)

    def _bytes_lines(self):
        """Map byte offsets to line numbers in `code`.

        Uses co_lnotab described in Python/compile.c to map byte offsets to
        line numbers. Produces a sequence: (b0, l0), (b1, l1), ...

        Only byte offsets that correspond to line numbers are included in the
        results.

        """
        # Adapted from dis.py in the standard library.
        byte_increments = bytes_to_ints(self.code.co_lnotab[0::2])
        line_increments = bytes_to_ints(self.code.co_lnotab[1::2])

        last_line_num = None
        line_num = self.code.co_firstlineno
        byte_num = 0
        for byte_incr, line_incr in zip(byte_increments, line_increments):
            if byte_incr:
                if line_num != last_line_num:
                    yield (byte_num, line_num)
                    last_line_num = line_num
                byte_num += byte_incr
            line_num += line_incr
        if line_num != last_line_num:
            yield (byte_num, line_num)

    def _find_statements(self):
        """Find the statements in `self.code`.

        Produce a sequence of line numbers that start statements. Recurses
        into all code objects reachable from `self.code`.

        """
        for bp in self.child_parsers():
            # Get all of the lineno information from this code.
            for _, l in bp._bytes_lines():
                yield l

    def _block_stack_repr(self, block_stack):   # pragma: debugging
        """Get a string version of `block_stack`, for debugging."""
        blocks = ", ".join(
            "(%s, %r)" % (dis.opname[b[0]], b[1]) for b in block_stack
        )
        return "[" + blocks + "]"

    def _split_into_chunks(self):
        """Split the code object into a list of `Chunk` objects.

        Each chunk is only entered at its first instruction, though there can
        be many exits from a chunk.

        Returns a list of `Chunk` objects.

        """
        # The list of chunks so far, and the one we're working on.
        chunks = []
        chunk = None

        # A dict mapping byte offsets of line starts to the line numbers.
        bytes_lines_map = dict(self._bytes_lines())

        # The block stack: loops and try blocks get pushed here for the
        # implicit jumps that can occur.
        # Each entry is a tuple: (block type, destination)
        block_stack = []

        # Some op codes are followed by branches that should be ignored. This
        # is a count of how many ignores are left.
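        # (The COMPARE_OP check for an except clause below bumps this count.)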
        ignore_branch = 0

        # We have to handle the last two bytecodes specially.
        ult = penult = None

        # Get a set of all of the jump-to points.
        jump_to = set()
        bytecodes = list(ByteCodes(self.code.co_code))
        for bc in bytecodes:
            if bc.jump_to >= 0:
                jump_to.add(bc.jump_to)

        chunk_lineno = 0

        # Walk the byte codes building chunks.
        for bc in bytecodes:
            # Maybe have to start a new chunk.
            start_new_chunk = False
            first_chunk = False
            if bc.offset in bytes_lines_map:
                # Start a new chunk for each source line number.
                start_new_chunk = True
                chunk_lineno = bytes_lines_map[bc.offset]
                first_chunk = True
            elif bc.offset in jump_to:
                # To make chunks have a single entrance, we have to make a new
                # chunk when we get to a place some bytecode jumps to.
                start_new_chunk = True
            elif bc.op in OPS_CHUNK_BEGIN:
                # Jumps deserve their own unnumbered chunk. This fixes
                # problems with jumps to jumps getting confused.
                start_new_chunk = True

            if not chunk or start_new_chunk:
                if chunk:
                    chunk.exits.add(bc.offset)
                chunk = Chunk(bc.offset, chunk_lineno, first_chunk)
                if not chunks:
                    # The very first chunk of a code object is always an
                    # entrance.
                    chunk.entrance = True
                chunks.append(chunk)

            # Look at the opcode.
            if bc.jump_to >= 0 and bc.op not in OPS_NO_JUMP:
                if ignore_branch:
                    # Someone earlier wanted us to ignore this branch.
                    ignore_branch -= 1
                else:
                    # The opcode has a jump, it's an exit for this chunk.
                    chunk.exits.add(bc.jump_to)

            if bc.op in OPS_CODE_END:
                # The opcode can exit the code object.
                chunk.exits.add(-self.code.co_firstlineno)
            if bc.op in OPS_PUSH_BLOCK:
                # The opcode adds a block to the block_stack.
                block_stack.append((bc.op, bc.jump_to))
            if bc.op in OPS_POP_BLOCK:
                # The opcode pops a block from the block stack.
                block_stack.pop()
            if bc.op in OPS_CHUNK_END:
                # This opcode forces the end of the chunk.
                if bc.op == OP_BREAK_LOOP:
                    # A break is implicit: jump where the top of the
                    # block_stack points.
                    chunk.exits.add(block_stack[-1][1])
                chunk = None
            if bc.op == OP_END_FINALLY:
                # For the finally clause we need to find the closest exception
                # block, and use its jump target as an exit.
                for block in reversed(block_stack):
                    if block[0] in OPS_EXCEPT_BLOCKS:
                        chunk.exits.add(block[1])
                        break
            if bc.op == OP_COMPARE_OP and bc.arg == COMPARE_EXCEPTION:
                # This is an except clause. We want to overlook the next
                # branch, so that except clauses don't count as branches.
                ignore_branch += 1

            penult = ult
            ult = bc

        if chunks:
            # The last two bytecodes could be a dummy "return None" that
            # shouldn't be counted as real code. Every Python code object seems
            # to end with a return, and a "return None" is inserted if there
            # isn't an explicit return in the source.
            if ult and penult:
                if penult.op == OP_LOAD_CONST and ult.op == OP_RETURN_VALUE:
                    if self.code.co_consts[penult.arg] is None:
                        # This is "return None", but is it dummy? A real line
                        # would be a last chunk all by itself.
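                        # If the LOAD_CONST doesn't start its own chunk, the
                        # return was synthesized by the compiler: split it off
                        # and route the code-object exit through the new chunk.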
                        if chunks[-1].byte != penult.offset:
                            ex = -self.code.co_firstlineno
                            # Split the last chunk
                            last_chunk = chunks[-1]
                            last_chunk.exits.remove(ex)
                            last_chunk.exits.add(penult.offset)
                            chunk = Chunk(
                                penult.offset, last_chunk.line, False
                            )
                            chunk.exits.add(ex)
                            chunks.append(chunk)

            # Give all the chunks a length.
            chunks[-1].length = bc.next_offset - chunks[-1].byte
            for i in range(len(chunks)-1):
                chunks[i].length = chunks[i+1].byte - chunks[i].byte

        #self.validate_chunks(chunks)
        return chunks

    def validate_chunks(self, chunks):          # pragma: debugging
        """Validate the rule that chunks have a single entrance."""
        # starts is the entrances to the chunks
        starts = set(ch.byte for ch in chunks)
        for ch in chunks:
            assert all((ex in starts or ex < 0) for ex in ch.exits)

    def _arcs(self):
        """Find the executable arcs in the code.

        Yields pairs: (from,to). From and to are integer line numbers. If
        from is < 0, then the arc is an entrance into the code object. If to
        is < 0, the arc is an exit from the code object.

        """
        chunks = self._split_into_chunks()

        # A map from byte offsets to the chunk starting at that offset.
        byte_chunks = dict((c.byte, c) for c in chunks)

        # Traverse from the first chunk in each line, and yield arcs where
        # the trace function will be invoked.
        for chunk in chunks:
            if chunk.entrance:
                yield (-1, chunk.line)

            if not chunk.first:
                continue

            chunks_considered = set()
            chunks_to_consider = [chunk]
            while chunks_to_consider:
                # Get the chunk we're considering, and make sure we don't
                # consider it again.
                this_chunk = chunks_to_consider.pop()
                chunks_considered.add(this_chunk)

                # For each exit, add the line number if the trace function
                # would be triggered, or add the chunk to those being
                # considered if not.
                for ex in this_chunk.exits:
                    if ex < 0:
                        yield (chunk.line, ex)
                    else:
                        next_chunk = byte_chunks[ex]
                        if next_chunk in chunks_considered:
                            continue

                        # The trace function is invoked if visiting the first
                        # bytecode in a line, or if the transition is a
                        # backward jump.
                        backward_jump = next_chunk.byte < this_chunk.byte
                        if next_chunk.first or backward_jump:
                            if next_chunk.line != chunk.line:
                                yield (chunk.line, next_chunk.line)
                        else:
                            chunks_to_consider.append(next_chunk)

    def _all_chunks(self):
        """Returns a list of `Chunk` objects for this code and its children.

        See `_split_into_chunks` for details.

        """
        chunks = []
        for bp in self.child_parsers():
            chunks.extend(bp._split_into_chunks())

        return chunks

    def _all_arcs(self):
        """Get the set of all arcs in this code object and its children.

        See `_arcs` for details.

        """
        arcs = set()
        for bp in self.child_parsers():
            arcs.update(bp._arcs())

        return arcs


class Chunk(object):
    """A sequence of byte codes with a single entrance.

    To analyze byte code, we have to divide it into chunks, sequences of byte
    codes such that each chunk has only one entrance, the first instruction in
    the block.

    This is almost the CS concept of `basic block`_, except that we're willing
    to have many exits from a chunk, and "basic block" is a more cumbersome
    term.

    .. _basic block: http://en.wikipedia.org/wiki/Basic_block

    `byte` is the offset to the bytecode starting this chunk.

    `line` is the source line number containing this chunk.

    `first` is true if this is the first chunk in the source line.

    An exit < 0 means the chunk can leave the code (return). The exit is
    the negative of the starting line number of the code block.

    The `entrance` attribute is a boolean indicating whether the code object
    can be entered at this chunk.

    """
    def __init__(self, byte, line, first):
        self.byte = byte
        self.line = line
        self.first = first
        self.length = 0
        self.entrance = False
        self.exits = set()

    def __repr__(self):
        return "<%d+%d @%d%s%s %r>" % (
            self.byte,
            self.length,
            self.line,
            "!" if self.first else "",
            "v" if self.entrance else "",
            list(self.exits),
        )