tokenize.py revision 33856de84d1115a18b699e0ca93c3b921bc6a1af
1"""Tokenization help for Python programs.
2
3tokenize(readline) is a generator that breaks a stream of
4bytes into Python tokens. It decodes the bytes according to
5PEP-0263 for determining source file encoding.
6
7It accepts a readline-like method which is called
8repeatedly to get the next line of input (or b"" for EOF).  It generates
95-tuples with these members:
10
11    the token type (see token.py)
12    the token (a string)
13    the starting (row, column) indices of the token (a 2-tuple of ints)
14    the ending (row, column) indices of the token (a 2-tuple of ints)
15    the original line (string)
16
17It is designed to match the working of the Python tokenizer exactly, except
18that it produces COMMENT tokens for comments and gives type OP for all
19operators. Aditionally, all token lists start with an ENCODING token
20which tells you which encoding was used to decode the bytes stream."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import re, string, sys
from token import *
from codecs import lookup, BOM_UTF8
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__.extend(["COMMENT", "tokenize", "detect_encoding", "NL", "untokenize",
                "ENCODING", "TokenInfo"])
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

class TokenInfo(tuple):
    'TokenInfo(type, string, start, end, line)'

    __slots__ = ()

    _fields = ('type', 'string', 'start', 'end', 'line')

    def __new__(cls, type, string, start, end, line):
        return tuple.__new__(cls, (type, string, start, end, line))

    @classmethod
    def _make(cls, iterable, new=tuple.__new__, len=len):
        'Make a new TokenInfo object from a sequence or iterable'
        result = new(cls, iterable)
        if len(result) != 5:
            raise TypeError('Expected 5 arguments, got %d' % len(result))
        return result

    def __repr__(self):
        return 'TokenInfo(type=%r, string=%r, start=%r, end=%r, line=%r)' % self

    def _asdict(self):
        'Return a new dict which maps field names to their values'
        return dict(zip(self._fields, self))

    def _replace(self, **kwds):
        'Return a new TokenInfo object replacing specified fields with new values'
        result = self._make(map(kwds.pop, ('type', 'string', 'start', 'end', 'line'), self))
        if kwds:
            raise ValueError('Got unexpected field names: %r' % kwds.keys())
        return result

    def __getnewargs__(self):
        return tuple(self)

    type = property(lambda t: t[0])
    string = property(lambda t: t[1])
    start = property(lambda t: t[2])
    end = property(lambda t: t[3])
    line = property(lambda t: t[4])

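# Illustrative sketch (comments only): a TokenInfo behaves like a plain
# 5-tuple but also exposes named fields, so both access styles work, e.g.:
#
#     tok = TokenInfo(NAME, 'spam', (1, 0), (1, 4), 'spam = 1\n')
#     tok[1] == tok.string                     # 'spam'
#     tok_type, text, start, end, line = tok   # tuple unpacking still works
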
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Regex alternation tries choices left to right and takes the first one that
# matches (not the longest), so be sure to put the longest operators first
# (e.g., if = came before ==, == would get recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

tokenprog, pseudoprog, single3prog, double3prog = map(
    _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}
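
# Illustrative sketch (comments only) of how the scanner below uses
# pseudoprog: group 1 of each match is the token text after any leading
# whitespace, and match.span(1) gives its column range, e.g.:
#
#     pseudoprog.match("x = 1\n", 0).span(1)   # (0, 1) -> the NAME token 'x'
#     pseudoprog.match("x = 1\n", 1).span(1)   # (2, 3) -> the OP token '='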

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

del _compile

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        # Pad with spaces up to the token's start column.  Only spaces are
        # emitted here, so the token must not start beyond the row already
        # reached; row advances happen in untokenize() on NEWLINE/NL tokens.
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum == ENCODING:
            # Record the encoding carried by a leading ENCODING token so that
            # untokenize() can encode the result back to bytes; the token's
            # text itself is not part of the source.
            self.encoding = tokval
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only the first two elements of each token are passed, the resulting
    output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

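# Illustrative, self-contained round-trip sketch (comments only), using this
# module plus io.BytesIO:
#
#     from io import BytesIO
#     source = b"x = 1\nprint(x)\n"
#     toks = list(tokenize(BytesIO(source).readline))
#     assert untokenize(toks) == source          # full 5-tuples: exact match
#     t1 = [tok[:2] for tok in toks]
#     t2 = [tok[:2] for tok in tokenize(BytesIO(untokenize(t1)).readline)]
#     assert t1 == t2                            # 2-tuples: same token stream
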

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError is raised. Note that if a UTF-8 BOM is
    found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

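# Illustrative sketch (comments only): feeding detect_encoding() a readline
# taken from an in-memory buffer, e.g.:
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     detect_encoding(buf.readline)
#     # -> ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])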

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable that raises StopIteration when the input is exhausted:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)

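# Illustrative sketch (comments only): tokenizing an in-memory bytes buffer
# and printing each token, e.g.:
#
#     from io import BytesIO
#     for tok in tokenize(BytesIO(b"x = 1\n").readline):
#         print(tok_name[tok.type], repr(tok.string), tok.start, tok.end)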

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
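

# Illustrative sketch (comments only): generate_tokens() works like
# tokenize() but takes a readline returning str and emits no ENCODING
# token, e.g.:
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok_name[tok.type], repr(tok.string))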
552