tokenize.py revision 43e4ea1b17ac912e4f8e55e256b96be0c57a88ee
1"""Tokenization help for Python programs.
2
3tokenize(readline) is a generator that breaks a stream of bytes into
4Python tokens.  It decodes the bytes according to PEP-0263 for
5determining source file encoding.
6
7It accepts a readline-like method which is called repeatedly to get the
8next line of input (or b"" for EOF).  It generates 5-tuples with these
9members:
10
11    the token type (see token.py)
12    the token (a string)
13    the starting (row, column) indices of the token (a 2-tuple of ints)
14    the ending (row, column) indices of the token (a 2-tuple of ints)
15    the original line (string)
16
17It is designed to match the working of the Python tokenizer exactly, except
18that it produces COMMENT tokens for comments and gives type OP for all
19operators.  Additionally, all token lists start with an ENCODING token
20which tells you which encoding was used to decode the bytes stream.
21"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__.extend(["COMMENT", "tokenize", "detect_encoding", "NL", "untokenize",
                "ENCODING", "TokenInfo"])
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

class TokenInfo(tuple):
    'TokenInfo(type, string, start, end, line)'

    __slots__ = ()

    _fields = ('type', 'string', 'start', 'end', 'line')

    def __new__(cls, type, string, start, end, line):
        return tuple.__new__(cls, (type, string, start, end, line))

    @classmethod
    def _make(cls, iterable, new=tuple.__new__, len=len):
        'Make a new TokenInfo object from a sequence or iterable'
        result = new(cls, iterable)
        if len(result) != 5:
            raise TypeError('Expected 5 arguments, got %d' % len(result))
        return result

    def __repr__(self):
        return 'TokenInfo(type=%r, string=%r, start=%r, end=%r, line=%r)' % self

    def _asdict(self):
        'Return a new dict which maps field names to their values'
        return dict(zip(self._fields, self))

    def _replace(self, **kwds):
        'Return a new TokenInfo object replacing specified fields with new values'
        result = self._make(map(kwds.pop, ('type', 'string', 'start', 'end', 'line'), self))
        if kwds:
            raise ValueError('Got unexpected field names: %r' % kwds.keys())
        return result

    def __getnewargs__(self):
        return tuple(self)

    type = property(lambda t: t[0])
    string = property(lambda t: t[1])
    start = property(lambda t: t[2])
    end = property(lambda t: t[3])
    line = property(lambda t: t[4])

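# TokenInfo behaves like a named tuple; the properties above give the
# positional items readable names.  A rough illustration (the literal token
# below is made up for demonstration):
#
#     tok = TokenInfo(NAME, 'spam', (1, 0), (1, 4), 'spam = 1\n')
#     tok.type == tok[0] == NAME
#     tok._asdict()['string'] == 'spam'
#     tok._replace(string='eggs').string == 'eggs'
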
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
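# For example, group('0', '1') == '(0|1)', any('0', '1') == '(0|1)*', and
# maybe('0', '1') == '(0|1)?'.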

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
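# For example, with the ordering above re.match(Operator, "**=").group() is
# "**="; if the one-character alternatives came first, the match would stop
# at "*".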

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

tokenprog, pseudoprog, single3prog, double3prog = map(
    _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

del _compile

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
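
# A round-trip sketch for the limited (2-tuple) form described above
# (illustrative; assumes an in-memory source):
#
#     from io import BytesIO
#
#     source = b"if x:\n    y = 1\n"
#     pairs = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
#     rebuilt = untokenize(pairs)     # bytes, encoded per the ENCODING token
#     assert [t[:2] for t in tokenize(BytesIO(rebuilt).readline)] == pairs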


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
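
# A quick illustration (a sketch; assumes an in-memory file-like object):
#
#     from io import BytesIO
#
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     detect_encoding(buf.readline)
#     # -> ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])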


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
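
# generate_tokens() accepts a readline that returns str rather than bytes, so
# it can be used on already-decoded source (a sketch; io.StringIO stands in
# for a real file):
#
#     import io
#
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)
#
# No ENCODING token is emitted because no decoding takes place.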
553