"""Better tokenizing for coverage.py."""

import keyword
import re
import token
import tokenize

from coverage.backward import StringIO # pylint: disable=W0622


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't emit a token for the backslash
    that joins continued lines, so a straight replay of its output can't
    reproduce the original source exactly.  This generator wraps the
    token stream and re-inserts those missing backslash tokens.

    Yields the same 5-tuples as generate_tokens().

    """
    prev_line = None
    prev_end_lineno = -1
    prev_type = None
    for ttype, ttext, (start_line, start_col), (end_line, end_col), line_text in toks:
        if prev_end_lineno != end_line:
            # First token on a new line.
            if prev_line and prev_line[-2:] == "\\\n":
                # The previous physical line ended with a backslash, so
                # we probably need to synthesize a token for it.  But not
                # always: this code::
                #
                #     usage = """\
                #     HEY THERE
                #     """
                #
                # reaches this point, yet its token text is::
                #
                #     '"""\\\nHEY THERE\n"""'
                #
                # i.e. the backslash is already inside the string token,
                # so first work out whether one is really missing.
                need_backslash = True
                if prev_type == tokenize.COMMENT:
                    # Comments like this \
                    # never produce a continuation token.
                    need_backslash = False
                elif ttype == token.STRING:
                    first_piece = ttext.split('\n', 1)[0]
                    if "\n" in ttext and first_piece[-1] == '\\':
                        # A multiline string whose first line already
                        # ends with the backslash: nothing to inject.
                        need_backslash = False
                if need_backslash:
                    # The backslash sits at the end of the previous line;
                    # compute its column from that line's text.
                    bs_col = len(prev_line.split("\n")[-2]) - 1
                    # Emit the synthetic token with a made-up type.
                    yield (
                        99999, "\\\n",
                        (start_line, bs_col), (start_line, bs_col + 2),
                        prev_line
                    )
        prev_line = line_text
        prev_type = ttype
        yield ttype, ttext, (start_line, start_col), (end_line, end_col), line_text
        prev_end_lineno = end_line


def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    Concatenating all the token texts and joining them with newlines
    reconstructs the original `source`, with two exceptions: trailing
    whitespace is dropped, and a final line without a newline is
    indistinguishable from one with a newline.

    """
    whitespace_types = (token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL)
    current = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, start_col), (_, end_col), _ in phys_tokens(tokgen):
        at_token_start = True
        # A token can span several physical lines; split it on newlines
        # (kept by the capturing group) and deal with each piece.
        for piece in re.split('(\n)', ttext):
            advance = True
            if piece == '\n':
                yield current
                current = []
                col = 0
                advance = False
            elif piece == '':
                advance = False
            elif ttype in whitespace_types:
                advance = False
            else:
                if at_token_start and start_col > col:
                    # Pad up to where this token starts.
                    current.append(("ws", " " * (start_col - col)))
                    at_token_start = False
                # Three-letter class name from the token type, with
                # keywords reclassified from plain names.
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                current.append((tok_class, piece))
            # Pieces after the first all start at column zero.
            start_col = 0
            if advance:
                col = end_col

    if current:
        yield current