1"""Better tokenizing for coverage.py."""
2
3import keyword, re, token, tokenize
4from coverage.backward import StringIO              # pylint: disable=W0622
5
def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line[-2:] == "\\\n":
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream.  Unfortunately, there's
                # more to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
            last_line = ltext
            last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


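# A minimal demo sketch, not part of coverage.py's API; the helper's name
# and the sample source are illustrative only.  It shows the extra token
# that `phys_tokens` injects for a continuation backslash, carrying the
# fake token type 99999.
def _demo_phys_tokens():
    """Print the physical token stream for a backslash-continued line."""
    src = "a = 1 + \\\n    2\n"
    tokgen = tokenize.generate_tokens(StringIO(src).readline)
    for ttype, ttext, _, _, _ in phys_tokens(tokgen):
        # The injected backslash token prints with type 99999.
        print("%5d %r" % (ttype, ttext))

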
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs; each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair is a token class and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    # Token types that produce only whitespace in the output.
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    # Normalize the source: expand tabs and unify line endings.
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = tokenize.generate_tokens(StringIO(source).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        # mark_start: do we still need to emit the whitespace leading up to
        # this token?  mark_end: should `col` advance past this part?
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            # Later parts of a multi-line token start at column 0.
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
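

# A hedged usage sketch: run the module directly to exercise both demos.
# The sample program passed to `source_token_lines` is illustrative only;
# each printed line is a list of (token class, token text) pairs.
if __name__ == '__main__':
    _demo_phys_tokens()
    for toks_line in source_token_lines("def hello():\n    return 'hi'\n"):
        print(toks_line)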