14adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao"""Tokenization help for Python programs.
24adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
34adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaogenerate_tokens(readline) is a generator that breaks a stream of
44adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaotext into Python tokens.  It accepts a readline-like method which is called
54adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaorepeatedly to get the next line of input (or "" for EOF).  It generates
64adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao5-tuples with these members:
74adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
84adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the token type (see token.py)
94adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the token (a string)
104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the starting (row, column) indices of the token (a 2-tuple of ints)
114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the ending (row, column) indices of the token (a 2-tuple of ints)
124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    the original line (string)
134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.
174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
184adfde8bc82dd39f59e0445588c3e599ada477dJosh GaoOlder entry points
194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    tokenize_loop(readline, tokeneater)
204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    tokenize(readline, tokeneater=printtoken)
214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoare the same, except instead of generating tokens, tokeneater is a callback
224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaofunction to which the 5 fields described above are passed as 5 arguments,
234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoeach time a new token is found."""
244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao__author__ = 'Ka-Ping Yee <ping@lfw.org>'
264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao               'Skip Montanaro, Raymond Hettinger')
284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoimport string, re
304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaofrom token import *
314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaoimport token
# Public API: every public name found in the token module, plus the names
# this module adds on top of it.
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
# Remove the comprehension's loop variable and the module reference now that
# __all__ has been built, so neither stays bound in this namespace.
del x
del token

# Two extra token types, numbered directly after the ones star-imported from
# the token module, and registered in tok_name so they can be displayed.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
# Account for the two types just added.
N_TOKENS += 2
434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
def group(*choices):
    """Return a parenthesised regex matching any one of the alternatives."""
    alternation = '|'.join(choices)
    return '(%s)' % alternation


def any(*choices):
    """Return a regex matching zero or more repetitions of the alternatives.

    Note: inside this module the name hides the builtin any().
    """
    return '%s*' % group(*choices)


def maybe(*choices):
    """Return a regex matching the alternatives zero or one time."""
    return '%s?' % group(*choices)
474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
# Horizontal whitespace only (space, form feed, tab); never matches a newline.
Whitespace = r'[ \f\t]*'
# A comment runs from '#' up to, but not including, the end of the line.
Comment = r'#[^\r\n]*'
# Everything that may precede a token: whitespace, any number of
# backslash-newline continuations (each followed by more whitespace), and an
# optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# Identifier: an ASCII letter or underscore followed by word characters.
Name = r'[a-zA-Z_]\w*'
524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
# Integer literals.  Every form accepts an optional long suffix ("l" or "L").
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
# Both octal spellings -- "0o17" and the legacy "017" (which also covers a
# bare "0") -- share one suffix.  The alternation is wrapped in a
# non-capturing group so that [lL]? applies to either branch; with a bare '|'
# the suffix would belong to the legacy branch only and "0o17L" would be cut
# off after "0o17".
Octnumber = r'(?:0[oO][0-7]+|0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Floating point literals: digits with a decimal point and an optional
# exponent, or digits with a mandatory exponent.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: plain digits or a float, followed by "j" or "J".
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried left to right and the first match wins, so the forms
# that extend a shorter one (imaginary, then float) must precede the integers.
Number = group(Imagnumber, Floatnumber, Intnumber)
644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
# The four "tail end" patterns match the remainder of a string literal up to
# and including its closing quote(s).  A backslash always consumes the
# character that follows it, so an escaped quote does not end the string.
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string: optional u/U/b/B and r/R prefixes
# followed by three quote characters.
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
# Alternatives are tried left to right and the first one that matches wins,
# so be sure to put the longest operators first (e.g., if = came before ==,
# == would get recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

# Character class for any single bracket; ']' comes first so that it is taken
# literally instead of closing the class.
Bracket = '[][(){}]'
# A newline (optionally preceded by a carriage return) or one punctuation
# character.
Special = group(r'\r?\n', r'[:;.,`@]')
# Every token that is not a number, string or name.
Funny = group(Operator, Bracket, Special)

# A complete token, optionally preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.  The match ends either at the
# closing quote or at a backslash-newline that carries the string over to the
# next line.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Extra things the line scanner must recognise: a backslash-newline or the
# end of the input, a comment, or the opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# Leading whitespace followed by a single outer capturing group holding the
# token itself; generate_tokens() reads the token's extent from group 1.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
1004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
# Compiled forms of the patterns used while scanning.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Maps a string opener to the compiled pattern that finds the matching end of
# the string.
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
# Every prefix/case combination of a triple-quoted opener shares the same two
# end patterns.
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR',
                'b', 'B', 'br', 'bR', 'Br', 'BR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
# Bare prefix letters map to None.
# NOTE(review): presumably this lets a lookup keyed on a prefix letter fall
# through to the quote character; the consumer is not visible here -- confirm.
for _prefix in 'rRuUbB':
    endprogs[_prefix] = None
del _prefix
1214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
# Lookup tables of every recognised string opener (prefix plus quote).  Each
# opener maps to itself; the tables are used for membership tests.
triple_quoted = {}
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t
    # The triple-quoted opener is the same prefix with its quote character
    # repeated three times.
    triple_quoted[t + 2 * t[-1]] = t + 2 * t[-1]

# Distance between tab stops when measuring leading whitespace.
tabsize = 8
1444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
class TokenError(Exception):
    """Raised by generate_tokens() when the input ends inside a multi-line
    string or a multi-line statement."""
1464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
class StopTokenizing(Exception):
    """Ends a tokenize() run early: tokenize() catches this exception and
    returns normally."""
1484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Print one token as "srow,scol-erow,ecol:<TAB>TYPENAME<TAB>repr(token)".

    This is the default tokeneater of tokenize().

    type      -- numeric token type, looked up in tok_name for display
    token     -- the token string
    srow_scol -- (row, column) pair where the token starts
    erow_ecol -- (row, column) pair where the token ends
    line      -- the source line the token came from (accepted but not printed)
    """
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    # A single parenthesised argument produces the same output whether print
    # is a statement or a function.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
1544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize an input stream, handing every token to a callback.

    readline   -- a callable with the same interface as the readline()
                  method of built-in file objects; each call should return
                  one line of input as a string.
    tokeneater -- a callable invoked once per token with five arguments,
                  corresponding to the tuples generated by
                  generate_tokens().  Defaults to printtoken.

    A StopTokenizing exception escaping from the run is caught here, so the
    function then simply returns.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # Treated as a normal end of tokenizing.
        return
1724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Pass every token produced by generate_tokens(readline) to tokeneater.

    Each token tuple is unpacked into the callback's positional arguments.
    No exception is caught here; anything raised propagates to the caller.
    """
    for fields in generate_tokens(readline):
        tokeneater(*fields)
1774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Five-element tokens are placed at their recorded positions.  As soon as
    a two-element (type, string) token is seen, that token and the rest of
    the stream are rendered by compat(), which has no positions to work with
    and spaces the tokens conservatively instead.
    """

    def __init__(self):
        self.tokens = []    # output fragments, joined by untokenize()
        self.prev_row = 1   # row on which the next token is expected
        self.prev_col = 0   # column just past the previously emitted token

    def add_whitespace(self, start):
        """Emit the filler needed to reach `start` from the previous token.

        start -- (row, col) at which the next token begins.
        """
        row, col = start
        row_offset = row - self.prev_row
        if row_offset > 0:
            # The token begins on a later row although no NEWLINE/NL token
            # was seen, which is what a backslash continuation looks like in
            # the token stream.  Re-create the continuation(s) and restart
            # the column count for the new row.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return the source text for the tokens in `iterable`.

        iterable -- tokens as five-element tuples
                    (type, string, start, end, line) or two-element
                    (type, string) sequences.
        """
        # Share one iterator with compat() so that it resumes exactly where
        # this loop stopped, whether `iterable` is a list or a generator.
        stream = iter(iterable)
        for t in stream:
            if len(t) == 2:
                self.compat(t, stream)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                # The next token starts at the beginning of the next row.
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Render tokens that carry no position information.

        token    -- the first token to render.
        iterable -- yields the tokens that follow `token`; only the first two
                    elements of each are used.

        Names and numbers get a trailing space, adjacent strings are kept
        apart by a space, and the most recent INDENT string is re-emitted at
        the start of each line inside an indented block.
        """
        from itertools import chain

        startline = False
        indents = []
        toks_append = self.tokens.append
        prevstring = False
        # `token` has already been taken from the stream by the caller, so it
        # is put back in front to be rendered like every other token.
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
2424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    # A fresh Untokenizer per call keeps position state from leaking between
    # independent conversions.
    return Untokenizer().untokenize(iterable)
2634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gaodef generate_tokens(readline):
2654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
2664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    The generate_tokens() generator requires one argment, readline, which
2674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    must be a callable object which provides the same interface as the
2684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    readline() method of built-in file objects. Each call to the function
2694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    should return one line of input as a string.  Alternately, readline
2704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    can be a callable function terminating with StopIteration:
2714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        readline = open(myfile).next    # Example of alternate readline
2724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    The generator produces 5-tuples with these members: the token type; the
2744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    token string; a 2-tuple (srow, scol) of ints specifying the row and
2754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    column where the token begins in the source; a 2-tuple (erow, ecol) of
2764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    ints specifying the row and column where the token ends in the source;
2774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    and the line on which the token was found. The line passed is the
2784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    logical line; continuation lines are included.
2794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    """
2804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    lnum = parenlev = continued = 0
2814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    namechars, numchars = string.ascii_letters + '_', '0123456789'
2824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    contstr, needcont = '', 0
2834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    contline = None
2844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    indents = [0]
2854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    while 1:                                   # loop over lines in stream
2874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        try:
2884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            line = readline()
2894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        except StopIteration:
2904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            line = ''
2914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        lnum += 1
2924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        pos, max = 0, len(line)
2934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
2944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        if contstr:                            # continued string
2954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if not line:
2964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                raise TokenError, ("EOF in multi-line string", strstart)
2974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            endmatch = endprog.match(line)
2984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if endmatch:
2994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                pos = end = endmatch.end(0)
3004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (STRING, contstr + line[:end],
3014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                       strstart, (lnum, end), contline + line)
3024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contstr, needcont = '', 0
3034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contline = None
3044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
3054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (ERRORTOKEN, contstr + line,
3064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           strstart, (lnum, len(line)), contline)
3074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contstr = ''
3084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contline = None
3094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                continue
3104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            else:
3114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contstr = contstr + line
3124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                contline = contline + line
3134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                continue
3144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        elif parenlev == 0 and not continued:  # new statement
3164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if not line: break
3174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            column = 0
3184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            while pos < max:                   # measure leading whitespace
3194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if line[pos] == ' ':
3204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    column += 1
3214adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif line[pos] == '\t':
3224adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    column = (column//tabsize + 1)*tabsize
3234adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif line[pos] == '\f':
3244adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    column = 0
3254adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else:
3264adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    break
3274adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                pos += 1
3284adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if pos == max:
3294adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                break
3304adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3314adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if line[pos] in '#\r\n':           # skip comments or blank lines
3324adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if line[pos] == '#':
3334adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    comment_token = line[pos:].rstrip('\r\n')
3344adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    nl_pos = pos + len(comment_token)
3354adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (COMMENT, comment_token,
3364adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           (lnum, pos), (lnum, pos + len(comment_token)), line)
3374adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (NL, line[nl_pos:],
3384adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           (lnum, nl_pos), (lnum, len(line)), line)
3394adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else:
3404adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
3414adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           (lnum, pos), (lnum, len(line)), line)
3424adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                continue
3434adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3444adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if column > indents[-1]:           # count indents or dedents
3454adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                indents.append(column)
3464adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
3474adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            while column < indents[-1]:
3484adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if column not in indents:
3494adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    raise IndentationError(
3504adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        "unindent does not match any outer indentation level",
3514adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        ("<tokenize>", lnum, pos, line))
3524adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                indents = indents[:-1]
3534adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
3544adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3554adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        else:                                  # continued statement
3564adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if not line:
3574adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
3584adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            continued = 0
3594adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3604adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        while pos < max:
3614adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            pseudomatch = pseudoprog.match(line, pos)
3624adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            if pseudomatch:                                # scan for tokens
3634adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                start, end = pseudomatch.span(1)
3644adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                spos, epos, pos = (lnum, start), (lnum, end), end
3654adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if start == end:
3664adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    continue
3674adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                token, initial = line[start:end], line[start]
3684adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
3694adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                if initial in numchars or \
3704adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                   (initial == '.' and token != '.'):      # ordinary number
3714adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (NUMBER, token, spos, epos, line)
3724adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial in '\r\n':
3734adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (NL if parenlev > 0 else NEWLINE,
3744adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           token, spos, epos, line)
3754adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial == '#':
3764adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    assert not token.endswith("\n")
3774adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (COMMENT, token, spos, epos, line)
3784adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif token in triple_quoted:
3794adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    endprog = endprogs[token]
3804adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    endmatch = endprog.match(line, pos)
3814adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if endmatch:                           # all on one line
3824adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        pos = endmatch.end(0)
3834adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        token = line[start:pos]
3844adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        yield (STRING, token, spos, (lnum, pos), line)
3854adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    else:
3864adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        strstart = (lnum, start)           # multiple lines
3874adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        contstr = line[start:]
3884adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        contline = line
3894adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        break
3904adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial in single_quoted or \
3914adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    token[:2] in single_quoted or \
3924adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    token[:3] in single_quoted:
3934adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if token[-1] == '\n':                  # continued string
3944adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        strstart = (lnum, start)
3954adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        endprog = (endprogs[initial] or endprogs[token[1]] or
3964adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                                   endprogs[token[2]])
3974adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        contstr, needcont = line[start:], 1
3984adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        contline = line
3994adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        break
4004adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    else:                                  # ordinary string
4014adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        yield (STRING, token, spos, epos, line)
4024adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial in namechars:                 # ordinary name
4034adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (NAME, token, spos, epos, line)
4044adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                elif initial == '\\':                      # continued stmt
4054adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    continued = 1
4064adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                else:
4074adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    if initial in '([{':
4084adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        parenlev += 1
4094adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    elif initial in ')]}':
4104adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                        parenlev -= 1
4114adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                    yield (OP, token, spos, epos, line)
4124adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao            else:
4134adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                yield (ERRORTOKEN, line[pos],
4144adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                           (lnum, pos), (lnum, pos+1), line)
4154adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao                pos += 1
4164adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
4174adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    for indent in indents[1:]:                 # pop remaining indent levels
4184adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
4194adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
4204adfde8bc82dd39f59e0445588c3e599ada477dJosh Gao
if __name__ == '__main__':                     # testing
    # Script entry point: tokenize the file named by the first command-line
    # argument, or standard input when no argument is given.  tokenize() is
    # called without a tokeneater, so its default callback is used.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even if tokenizing raises, rather than relying
            # on garbage collection to release the handle.
            source.close()
    else:
        tokenize(sys.stdin.readline)
427