"""Better tokenizing for coverage.py."""

import keyword, re, token, tokenize
from coverage.backward import StringIO # pylint: disable=W0622

def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0].endswith('\\'):
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    # Token types that carry no visible text of their own; they must not
    # produce a pair in the output lines.  A frozenset gives O(1) membership
    # tests inside the per-token loop below.
    ws_tokens = frozenset([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0
    # Normalize tabs and line endings so column arithmetic is consistent.
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        # Split the token text on newlines, keeping the newlines as their own
        # parts, so multi-line tokens (strings) are emitted line by line.
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    # Re-create the whitespace between the previous token's
                    # end and this token's start.
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                # Token class is the first three letters of the token name,
                # e.g. NAME -> 'nam', OP -> 'op', STRING -> 'str'.
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            # Only the first part of a multi-line token starts at scol;
            # continuation parts start at column 0.
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line