# tokenize.py, revision 49fd7fa4431da299196d74087df4a04f99f9c46f
1"""Tokenization help for Python programs. 2 3generate_tokens(readline) is a generator that breaks a stream of 4text into Python tokens. It accepts a readline-like method which is called 5repeatedly to get the next line of input (or "" for EOF). It generates 65-tuples with these members: 7 8 the token type (see token.py) 9 the token (a string) 10 the starting (row, column) indices of the token (a 2-tuple of ints) 11 the ending (row, column) indices of the token (a 2-tuple of ints) 12 the original line (string) 13 14It is designed to match the working of the Python tokenizer exactly, except 15that it produces COMMENT tokens for comments and gives type OP for all 16operators 17 18Older entry points 19 tokenize_loop(readline, tokeneater) 20 tokenize(readline, tokeneater=printtoken) 21are the same, except instead of generating tokens, tokeneater is a callback 22function to which the 5 fields described above are passed as 5 arguments, 23each time a new token is found.""" 24 25__author__ = 'Ka-Ping Yee <ping@lfw.org>' 26__credits__ = \ 27 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' 28 29import string, re 30from token import * 31 32import token 33__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", 34 "generate_tokens", "NL", "untokenize"] 35del x 36del token 37 38COMMENT = N_TOKENS 39tok_name[COMMENT] = 'COMMENT' 40NL = N_TOKENS + 1 41tok_name[NL] = 'NL' 42N_TOKENS += 2 43 44def group(*choices): return '(' + '|'.join(choices) + ')' 45def any(*choices): return group(*choices) + '*' 46def maybe(*choices): return group(*choices) + '?' 47 48Whitespace = r'[ \f\t]*' 49Comment = r'#[^\r\n]*' 50Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 51Name = r'[a-zA-Z_]\w*' 52 53Hexnumber = r'0[xX][\da-fA-F]*[lL]?' 54Octnumber = r'0[0-7]*[lL]?' 55Decnumber = r'[1-9]\d*[lL]?' 56Intnumber = group(Hexnumber, Octnumber, Decnumber) 57Exponent = r'[eE][-+]?\d+' 58Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) 59Expfloat = r'\d+' + Exponent 60Floatnumber = group(Pointfloat, Expfloat) 61Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') 62Number = group(Imagnumber, Floatnumber, Intnumber) 63 64# Tail end of ' string. 65Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 66# Tail end of " string. 67Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 68# Tail end of ''' string. 69Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 70# Tail end of """ string. 71Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 72Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""') 73# Single-line ' or " string. 74String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 75 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 76 77# Because of leftmost-then-longest match semantics, be sure to put the 78# longest operators first (e.g., if = came before ==, == would get 79# recognized as two instances of =). 80Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", 81 r"//=?", 82 r"[+\-*/%&|^=<>]=?", 83 r"~") 84 85Bracket = '[][(){}]' 86Special = group(r'\r?\n', r'[:;.,`@]') 87Funny = group(Operator, Bracket, Special) 88 89PlainToken = group(Number, Funny, String, Name) 90Token = Ignore + PlainToken 91 92# First (or only) line of ' or " string. 

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
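
# Editor's sketch (not part of the original module): driving the callback
# interface with an in-memory stream.  The sample source string is
# arbitrary; any readline-like callable works the same way.
#
#     from StringIO import StringIO
#     tokenize(StringIO('x = 1\n').readline)
#     # prints one "srow,scol-erow,ecol: TYPE 'value'" line per token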

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """

    startline = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
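
# Editor's sketch (not part of the original module): checking the
# round-trip invariant documented above on a small in-memory source
# string.  The sample source is arbitrary.
#
#     from StringIO import StringIO
#     source = 'def f(a, b):\n    return a + b\n'
#     t1 = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
#     newcode = untokenize(t1)
#     readline = iter(newcode.splitlines(1)).next
#     t2 = [tok[:2] for tok in generate_tokens(readline)]
#     assert t1 == t2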

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level")
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)
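
# Editor's sketch (not part of the original module): iterating over tokens
# directly with the generator interface instead of the callback interface
# exercised above.  The sample source string is arbitrary.
#
#     from StringIO import StringIO
#     for toktype, tokval, spos, epos, logical_line in \
#             generate_tokens(StringIO('x = 1  # set x\n').readline):
#         print tok_name[toktype], repr(tokval)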