"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import re
import string
from token import *

import token
# Re-export exactly the public names that ``from token import *`` brought in:
# the star-import honours ``token.__all__`` when the module defines one and
# otherwise takes every name without a leading underscore.  A generator
# expression (rather than a list comprehension) is used so that no loop
# variable is left behind in the module namespace.
__all__ = list(name for name in getattr(token, "__all__", dir(token))
               if not name.startswith("_"))
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del token

# Two token types that exist only in this module: comments, and newlines
# that do not terminate a logical line.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2


# Small helpers for assembling the regular expressions below.
def group(*choices):
    """Return a capturing group matching any one of *choices*."""
    return '(' + '|'.join(choices) + ')'


def any(*choices):
    """Return a pattern matching zero or more repetitions of *choices*."""
    return group(*choices) + '*'


def maybe(*choices):
    """Return a pattern matching at most one occurrence of *choices*."""
    return group(*choices) + '?'


Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# NOTE: generate_tokens() reads group 1 of this pattern (the group that
# follows the leading whitespace), so the group layout must not change.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

# Maps an opening quote (with optional prefix) to the compiled pattern that
# matches the remainder of that string.  The bare prefix letters map to None
# so that the "a or b or c" lookup in generate_tokens() falls through to the
# actual quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

# Every spelling of a triple-quote opener (used as a set).
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
# Every spelling of a single-quote opener (used as a set).
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

# Column width of a tab stop when measuring indentation.
tabsize = 8


class TokenError(Exception):
    """Raised by generate_tokens() when EOF is hit inside a multi-line
    string or statement."""


class StopTokenizing(Exception):
    """May be raised by a tokeneater callback; tokenize() swallows it and
    returns, ending tokenization early."""


def printtoken(type, token, srow_scol, erow_ecol, line):  # for testing
    """Default tokeneater: print one token's position, type name and repr."""
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    # A single parenthesised expression, so the output is the same whether
    # ``print`` is a statement or a function.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))


def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().

    A StopTokenizing exception raised by tokeneater is caught here and ends
    tokenization silently.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater(*token) for every token generate_tokens() produces."""
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


class Untokenizer:
    """Accumulates source text rebuilt from a stream of tokens.

    ``tokens`` collects the output fragments; ``prev_row``/``prev_col`` hold
    the position just past the last token written in full (5-tuple) mode.
    """

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        """Emit the filler needed to move from the previous token's end to
        *start*, a (row, col) pair.

        A token that begins on a later row than the previous one ended can
        only be joined to it by backslash continuations, so one ``\\``
        newline is written per skipped row.  Remaining horizontal distance
        is filled with spaces.  A start that does not lie past the previous
        end adds nothing.
        """
        row, col = start
        row_offset = row - self.prev_row
        if row_offset > 0:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset > 0:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return the source text for the tokens in *iterable*.

        5-tuples are placed using their recorded positions.  As soon as a
        2-tuple is seen, it and everything after it are handed to compat(),
        which guesses the spacing.
        """
        # One shared iterator: compat() must continue where this loop
        # stopped rather than restart (lists) or lose a token (generators).
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            if tok_type == DEDENT and start[0] > self.prev_row:
                # Zero-width DEDENT reported on the row after the last line
                # of an input lacking a final newline: nothing to write, and
                # it must not be mistaken for a continuation line.
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for position-less tokens.

        *token* is the first (type, string) pair and *iterable* yields the
        remaining tokens; only the first two fields of each are used.
        Names and numbers get a trailing space, adjacent strings are kept
        apart by a space, and the innermost INDENT string is re-emitted at
        the start of each line.
        """
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)


def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError on EOF inside a multi-line string or statement, and
    IndentationError when a dedent matches no enclosing indentation level.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, eol = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string continued by backslash whose next
                # line neither closes it nor continues it again.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < eol:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == eol:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # Blank line: the character here is '\r' or '\n'.
                    yield (NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < eol:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break does not end the statement.
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None, so this picks the
                        # pattern for the first real quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        # Close the file once every token has been printed.
        with open(sys.argv[1]) as source_file:
            tokenize(source_file.readline)
    else:
        tokenize(sys.stdin.readline)