# tokenize.py, revision 49fd7fa4431da299196d74087df4a04f99f9c46f
1"""Tokenization help for Python programs.
2
3generate_tokens(readline) is a generator that breaks a stream of
4text into Python tokens.  It accepts a readline-like method which is called
5repeatedly to get the next line of input (or "" for EOF).  It generates
65-tuples with these members:
7
8    the token type (see token.py)
9    the token (a string)
10    the starting (row, column) indices of the token (a 2-tuple of ints)
11    the ending (row, column) indices of the token (a 2-tuple of ints)
12    the original line (string)
13
14It is designed to match the working of the Python tokenizer exactly, except
15that it produces COMMENT tokens for comments and gives type OP for all
16operators
17
18Older entry points
19    tokenize_loop(readline, tokeneater)
20    tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""
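
# A minimal usage sketch (illustrative; any readline-compatible callable
# works, e.g. one from the stdlib StringIO module):
#     from StringIO import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print tok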

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
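# For example (illustrative): group('a', 'b') == '(a|b)',
# any('a', 'b') == '(a|b)*', and maybe('a', 'b') == '(a|b)?'.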

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

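# Illustrative: Imagnumber is tried before Floatnumber and Intnumber so that
# re.match(Number, '3.14e-10j') consumes the whole literal instead of
# stopping at the float part.
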
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

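# Illustrative: the "tail end" patterns assume the opening quote was already
# consumed, e.g. re.match(Single, "spam'").group() == "spam'".
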
# Python's re module tries alternatives left to right and takes the first
# that matches, so be sure to put the longest operators first (e.g., if =
# came before ==, == would get recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

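# Illustrative: pseudoprog skips leading whitespace and reports the next
# token candidate as group 1, e.g.
#     pseudoprog.match('  spam = 42', 0).span(1) == (2, 6)
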
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t
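
# Illustrative: these dicts exist only for fast membership tests, e.g.
# "ur'''" in triple_quoted and 'r"' in single_quoted are both true.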

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
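
# Illustrative printtoken output for the source line "print 1" (columns are
# 0-based; fields are tab-separated):
#     1,0-1,5:        NAME    'print'
#     1,6-1,7:        NUMBER  '1'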

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

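# Illustrative usage ('example.py' is a hypothetical file name):
#     tokenize(open('example.py').readline)   # prints each token via printtoken
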
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """

    startline = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
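
# Illustrative round trip (a sketch using the stdlib StringIO module):
#     from StringIO import StringIO
#     source = "if x:\n    y = 1\n"
#     toks = [t[:2] for t in generate_tokens(StringIO(source).readline)]
#     newcode = untokenize(toks)
#     # newcode is not byte-identical to source, but tokenizes identically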


def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
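    # Tokenizer state: lnum counts physical lines; parenlev tracks bracket
    # nesting (a newline inside brackets yields NL, not NEWLINE); continued
    # flags a backslash continuation; contstr/contline accumulate a
    # multi-line string; indents is the stack of indentation columns.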
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level")
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

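# Example invocations (illustrative):
#     python tokenize.py script.py        # tokenize a file via printtoken
#     echo "x = 1" | python tokenize.py   # tokenize stdin
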
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)