14adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
24adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# All rights reserved.
34adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
44adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao"""Tokenization help for Python programs.
54adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
64adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaogenerate_tokens(readline) is a generator that breaks a stream of
74adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaotext into Python tokens.  It accepts a readline-like method which is called
84adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaorepeatedly to get the next line of input (or "" for EOF).  It generates
94adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao5-tuples with these members:
104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the token type (see token.py)
124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the token (a string)
134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the starting (row, column) indices of the token (a 2-tuple of ints)
144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the ending (row, column) indices of the token (a 2-tuple of ints)
154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the original line (string)
164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
174adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoIt is designed to match the working of the Python tokenizer exactly, except
184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaothat it produces COMMENT tokens for comments and gives type OP for all
194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaooperators
204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
214adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoOlder entry points
224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    tokenize_loop(readline, tokeneater)
234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    tokenize(readline, tokeneater=printtoken)
244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoare the same, except instead of generating tokens, tokeneater is a callback
254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaofunction to which the 5 fields described above are passed as 5 arguments,
264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoeach time a new token is found."""
274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao__author__ = 'Ka-Ping Yee <ping@lfw.org>'
294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao__credits__ = \
304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoimport string, re
334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaofrom codecs import BOM_UTF8, lookup
344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaofrom lib2to3.pgen2.token import *
354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaofrom . import token
374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao           "generate_tokens", "untokenize"]
394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodel token
404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaotry:
424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    bytes
434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoexcept NameError:
444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # valid Python 3 code.
464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    bytes = str
474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef group(*choices): return '(' + '|'.join(choices) + ')'
494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef any(*choices): return group(*choices) + '*'
504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef maybe(*choices): return group(*choices) + '?'
514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
524adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoWhitespace = r'[ \f\t]*'
534adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoComment = r'#[^\r\n]*'
544adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoIgnore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
554adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoName = r'[a-zA-Z_]\w*'
564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
574adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoBinnumber = r'0[bB][01]*'
584adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoHexnumber = r'0[xX][\da-fA-F]*[lL]?'
594adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoOctnumber = r'0[oO]?[0-7]*[lL]?'
604adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoDecnumber = r'[1-9]\d*[lL]?'
614adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoIntnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
624adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoExponent = r'[eE][-+]?\d+'
634adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoPointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
644adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoExpfloat = r'\d+' + Exponent
654adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoFloatnumber = group(Pointfloat, Expfloat)
664adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoImagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
674adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoNumber = group(Imagnumber, Floatnumber, Intnumber)
684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# Tail end of ' string.
704adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoSingle = r"[^'\\]*(?:\\.[^'\\]*)*'"
714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# Tail end of " string.
724adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoDouble = r'[^"\\]*(?:\\.[^"\\]*)*"'
734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# Tail end of ''' string.
744adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoSingle3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# Tail end of """ string.
764adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoDouble3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
774adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoTriple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# Single-line ' or " string.
794adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoString = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# Because of leftmost-then-longest match semantics, be sure to put the
834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# longest operators first (e.g., if = came before ==, == would get
844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# recognized as two instances of =).
854adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoOperator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                 r"//=?", r"->",
874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                 r"[+\-*/%&|^=<>]=?",
884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                 r"~")
894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
904adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoBracket = '[][(){}]'
914adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoSpecial = group(r'\r?\n', r'[:;.,`@]')
924adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoFunny = group(Operator, Bracket, Special)
934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
944adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoPlainToken = group(Number, Funny, String, Name)
954adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoToken = Ignore + PlainToken
964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# First (or only) line of ' or " string.
984adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                group("'", r'\\\r?\n'),
1004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
1014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                group('"', r'\\\r?\n'))
1024adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoPseudoExtras = group(r'\\\r?\n', Comment, Triple)
1034adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoPseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
1044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaotokenprog, pseudoprog, single3prog, double3prog = map(
1064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    re.compile, (Token, PseudoToken, Single3, Double3))
1074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoendprogs = {"'": re.compile(Single), '"': re.compile(Double),
1084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "'''": single3prog, '"""': double3prog,
1094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "r'''": single3prog, 'r"""': double3prog,
1104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "u'''": single3prog, 'u"""': double3prog,
1114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "b'''": single3prog, 'b"""': double3prog,
1124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "ur'''": single3prog, 'ur"""': double3prog,
1134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "br'''": single3prog, 'br"""': double3prog,
1144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "R'''": single3prog, 'R"""': double3prog,
1154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "U'''": single3prog, 'U"""': double3prog,
1164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "B'''": single3prog, 'B"""': double3prog,
1174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "uR'''": single3prog, 'uR"""': double3prog,
1184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "Ur'''": single3prog, 'Ur"""': double3prog,
1194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "UR'''": single3prog, 'UR"""': double3prog,
1204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "bR'''": single3prog, 'bR"""': double3prog,
1214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "Br'''": single3prog, 'Br"""': double3prog,
1224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            "BR'''": single3prog, 'BR"""': double3prog,
1234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            'r': None, 'R': None,
1244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            'u': None, 'U': None,
1254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            'b': None, 'B': None}
1264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaotriple_quoted = {}
1284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaofor t in ("'''", '"""',
1294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "r'''", 'r"""', "R'''", 'R"""',
1304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "u'''", 'u"""', "U'''", 'U"""',
1314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "b'''", 'b"""', "B'''", 'B"""',
1324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
1334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "uR'''", 'uR"""', "UR'''", 'UR"""',
1344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "br'''", 'br"""', "Br'''", 'Br"""',
1354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "bR'''", 'bR"""', "BR'''", 'BR"""',):
1364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    triple_quoted[t] = t
1374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaosingle_quoted = {}
1384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaofor t in ("'", '"',
1394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "r'", 'r"', "R'", 'R"',
1404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "u'", 'u"', "U'", 'U"',
1414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "b'", 'b"', "B'", 'B"',
1424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "ur'", 'ur"', "Ur'", 'Ur"',
1434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "uR'", 'uR"', "UR'", 'UR"',
1444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "br'", 'br"', "Br'", 'Br"',
1454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao          "bR'", 'bR"', "BR'", 'BR"', ):
1464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    single_quoted[t] = t
1474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaotabsize = 8
1494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoclass TokenError(Exception): pass
1514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoclass StopTokenizing(Exception): pass
1534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef printtoken(type, token, start, end, line): # for testing
1554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    (srow, scol) = start
1564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    (erow, ecol) = end
1574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    print "%d,%d-%d,%d:\t%s\t%s" % \
1584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        (srow, scol, erow, ecol, tok_name[type], repr(token))
1594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef tokenize(readline, tokeneater=printtoken):
1614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
1624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    The tokenize() function accepts two parameters: one representing the
1634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    input stream, and one providing an output mechanism for tokenize().
1644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    The first parameter, readline, must be a callable object which provides
1664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the same interface as the readline() method of built-in file objects.
1674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    Each call to the function should return one line of input as a string.
1684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    The second parameter, tokeneater, must also be a callable object. It is
1704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    called once for each token, with five arguments, corresponding to the
1714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    tuples generated by generate_tokens().
1724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
1734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    try:
1744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        tokenize_loop(readline, tokeneater)
1754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    except StopTokenizing:
1764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pass
1774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao# backwards compatible interface
1794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef tokenize_loop(readline, tokeneater):
1804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    for token_info in generate_tokens(readline):
1814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        tokeneater(*token_info)
1824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoclass Untokenizer:
1844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def __init__(self):
1864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.tokens = []
1874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.prev_row = 1
1884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        self.prev_col = 0
1894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def add_whitespace(self, start):
1914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        row, col = start
1924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        assert row <= self.prev_row
1934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        col_offset = col - self.prev_col
1944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if col_offset:
1954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.tokens.append(" " * col_offset)
1964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
1974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def untokenize(self, iterable):
1984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        for t in iterable:
1994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if len(t) == 2:
2004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                self.compat(t, iterable)
2014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                break
2024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            tok_type, token, start, end, line = t
2034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.add_whitespace(start)
2044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.tokens.append(token)
2054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            self.prev_row, self.prev_col = end
2064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if tok_type in (NEWLINE, NL):
2074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                self.prev_row += 1
2084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                self.prev_col = 0
2094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return "".join(self.tokens)
2104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def compat(self, token, iterable):
2124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        startline = False
2134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        indents = []
2144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        toks_append = self.tokens.append
2154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        toknum, tokval = token
2164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if toknum in (NAME, NUMBER):
2174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            tokval += ' '
2184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if toknum in (NEWLINE, NL):
2194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            startline = True
2204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        for tok in iterable:
2214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            toknum, tokval = tok[:2]
2224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if toknum in (NAME, NUMBER):
2244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                tokval += ' '
2254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if toknum == INDENT:
2274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                indents.append(tokval)
2284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                continue
2294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            elif toknum == DEDENT:
2304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                indents.pop()
2314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                continue
2324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            elif toknum in (NEWLINE, NL):
2334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                startline = True
2344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            elif startline and indents:
2354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                toks_append(indents[-1])
2364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                startline = False
2374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            toks_append(tokval)
2384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaocookie_re = re.compile("coding[:=]\s*([-\w.]+)")
2404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef _get_normal_name(orig_enc):
2424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """Imitates get_normal_name in tokenizer.c."""
2434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    # Only care about the first 12 characters.
2444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    enc = orig_enc[:12].lower().replace("_", "-")
2454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    if enc == "utf-8" or enc.startswith("utf-8-"):
2464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return "utf-8"
2474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
2484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
2494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return "iso-8859-1"
2504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    return orig_enc
2514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef detect_encoding(readline):
2534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
2544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    The detect_encoding() function is used to detect the encoding that should
2554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    be used to decode a Python source file. It requires one argment, readline,
2564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    in the same way as the tokenize() generator.
2574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    It will call readline a maximum of twice, and return the encoding used
2594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    (as a string) and a list of any lines (left as bytes) it has read
2604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    in.
2614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    It detects the encoding from the presence of a utf-8 bom or an encoding
2634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    cookie as specified in pep-0263. If both a bom and a cookie are present, but
2644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
2654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
2664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    'utf-8-sig' is returned.
2674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    If no encoding is specified, then the default of 'utf-8' will be returned.
2694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
2704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    bom_found = False
2714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    encoding = None
2724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    default = 'utf-8'
2734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def read_or_stop():
2744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        try:
2754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return readline()
2764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        except StopIteration:
2774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return bytes()
2784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    def find_cookie(line):
2804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        try:
2814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            line_string = line.decode('ascii')
2824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        except UnicodeDecodeError:
2834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return None
2844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        matches = cookie_re.findall(line_string)
2864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if not matches:
2874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            return None
2884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        encoding = _get_normal_name(matches[0])
2894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        try:
2904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            codec = lookup(encoding)
2914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        except LookupError:
2924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            # This behaviour mimics the Python interpreter
2934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            raise SyntaxError("unknown encoding: " + encoding)
2944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if bom_found:
2964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if codec.name != 'utf-8':
2974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                # This behaviour mimics the Python interpreter
2984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                raise SyntaxError('encoding problem: utf-8')
2994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            encoding += '-sig'
3004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return encoding
3014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    first = read_or_stop()
3034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    if first.startswith(BOM_UTF8):
3044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        bom_found = True
3054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        first = first[3:]
3064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        default = 'utf-8-sig'
3074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    if not first:
3084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return default, []
3094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    encoding = find_cookie(first)
3114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    if encoding:
3124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return encoding, [first]
3134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    second = read_or_stop()
3154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    if not second:
3164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return default, [first]
3174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    encoding = find_cookie(second)
3194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    if encoding:
3204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        return encoding, [first, second]
3214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    return default, [first, second]
3234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef untokenize(iterable):
3254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """Transform tokens back into Python source code.
3264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    Each element returned by the iterable must be a token sequence
3284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    with at least two elements, a token number and token value.  If
3294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    only two tokens are passed, the resulting output is poor.
3304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    Round-trip invariant for full input:
3324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        Untokenized source will match input source exactly
3334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    Round-trip invariant for limited intput:
3354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        # Output text will tokenize the back to the input
3364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
3374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        newcode = untokenize(t1)
3384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        readline = iter(newcode.splitlines(1)).next
3394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        t2 = [tok[:2] for tokin generate_tokens(readline)]
3404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        assert t1 == t2
3414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
3424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    ut = Untokenizer()
3434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    return ut.untokenize(iterable)
3444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef generate_tokens(readline):
3464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
3474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    The generate_tokens() generator requires one argment, readline, which
3484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    must be a callable object which provides the same interface as the
3494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    readline() method of built-in file objects. Each call to the function
3504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    should return one line of input as a string.  Alternately, readline
3514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    can be a callable function terminating with StopIteration:
3524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        readline = open(myfile).next    # Example of alternate readline
3534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    The generator produces 5-tuples with these members: the token type; the
3554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    token string; a 2-tuple (srow, scol) of ints specifying the row and
3564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    column where the token begins in the source; a 2-tuple (erow, ecol) of
3574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    ints specifying the row and column where the token ends in the source;
3584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    and the line on which the token was found. The line passed is the
3594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    logical line; continuation lines are included.
3604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
3614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    lnum = parenlev = continued = 0
3624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    namechars, numchars = string.ascii_letters + '_', '0123456789'
3634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    contstr, needcont = '', 0
3644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    contline = None
3654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    indents = [0]
3664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    while 1:                                   # loop over lines in stream
3684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        try:
3694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            line = readline()
3704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        except StopIteration:
3714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            line = ''
3724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        lnum = lnum + 1
3734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pos, max = 0, len(line)
3744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if contstr:                            # continued string
3764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if not line:
3774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                raise TokenError, ("EOF in multi-line string", strstart)
3784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            endmatch = endprog.match(line)
3794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if endmatch:
3804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                pos = end = endmatch.end(0)
3814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (STRING, contstr + line[:end],
3824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                       strstart, (lnum, end), contline + line)
3834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contstr, needcont = '', 0
3844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contline = None
3854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
3864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (ERRORTOKEN, contstr + line,
3874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           strstart, (lnum, len(line)), contline)
3884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contstr = ''
3894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contline = None
3904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                continue
3914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            else:
3924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contstr = contstr + line
3934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contline = contline + line
3944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                continue
3954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        elif parenlev == 0 and not continued:  # new statement
3974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if not line: break
3984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            column = 0
3994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            while pos < max:                   # measure leading whitespace
4004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if line[pos] == ' ': column = column + 1
4014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
4024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif line[pos] == '\f': column = 0
4034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else: break
4044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                pos = pos + 1
4054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if pos == max: break
4064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if line[pos] in '#\r\n':           # skip comments or blank lines
4084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if line[pos] == '#':
4094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    comment_token = line[pos:].rstrip('\r\n')
4104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    nl_pos = pos + len(comment_token)
4114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (COMMENT, comment_token,
4124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           (lnum, pos), (lnum, pos + len(comment_token)), line)
4134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (NL, line[nl_pos:],
4144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           (lnum, nl_pos), (lnum, len(line)), line)
4154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else:
4164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
4174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           (lnum, pos), (lnum, len(line)), line)
4184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                continue
4194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if column > indents[-1]:           # count indents or dedents
4214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                indents.append(column)
4224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
4234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            while column < indents[-1]:
4244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if column not in indents:
4254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    raise IndentationError(
4264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        "unindent does not match any outer indentation level",
4274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        ("<tokenize>", lnum, pos, line))
4284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                indents = indents[:-1]
4294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
4304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        else:                                  # continued statement
4324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if not line:
4334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
4344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            continued = 0
4354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        while pos < max:
4374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            pseudomatch = pseudoprog.match(line, pos)
4384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if pseudomatch:                                # scan for tokens
4394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                start, end = pseudomatch.span(1)
4404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                spos, epos, pos = (lnum, start), (lnum, end), end
4414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                token, initial = line[start:end], line[start]
4424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if initial in numchars or \
4444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                   (initial == '.' and token != '.'):      # ordinary number
4454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (NUMBER, token, spos, epos, line)
4464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial in '\r\n':
4474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    newline = NEWLINE
4484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if parenlev > 0:
4494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        newline = NL
4504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (newline, token, spos, epos, line)
4514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial == '#':
4524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    assert not token.endswith("\n")
4534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (COMMENT, token, spos, epos, line)
4544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif token in triple_quoted:
4554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    endprog = endprogs[token]
4564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    endmatch = endprog.match(line, pos)
4574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if endmatch:                           # all on one line
4584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        pos = endmatch.end(0)
4594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        token = line[start:pos]
4604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        yield (STRING, token, spos, (lnum, pos), line)
4614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    else:
4624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        strstart = (lnum, start)           # multiple lines
4634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        contstr = line[start:]
4644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        contline = line
4654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        break
4664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial in single_quoted or \
4674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    token[:2] in single_quoted or \
4684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    token[:3] in single_quoted:
4694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if token[-1] == '\n':                  # continued string
4704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        strstart = (lnum, start)
4714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        endprog = (endprogs[initial] or endprogs[token[1]] or
4724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                                   endprogs[token[2]])
4734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        contstr, needcont = line[start:], 1
4744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        contline = line
4754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        break
4764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    else:                                  # ordinary string
4774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        yield (STRING, token, spos, epos, line)
4784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial in namechars:                 # ordinary name
4794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (NAME, token, spos, epos, line)
4804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial == '\\':                      # continued stmt
4814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    # This yield is new; needed for better idempotency:
4824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (NL, token, spos, (lnum, pos), line)
4834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    continued = 1
4844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else:
4854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if initial in '([{': parenlev = parenlev + 1
4864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    elif initial in ')]}': parenlev = parenlev - 1
4874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (OP, token, spos, epos, line)
4884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            else:
4894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (ERRORTOKEN, line[pos],
4904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           (lnum, pos), (lnum, pos+1), line)
4914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                pos = pos + 1
4924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    for indent in indents[1:]:                 # pop remaining indent levels
4944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
4954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
4964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoif __name__ == '__main__':                     # testing
4984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    import sys
4994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
5004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    else: tokenize(sys.stdin.readline)
501