tokenize.py revision 33856de84d1115a18b699e0ca93c3b921bc6a1af
1"""Tokenization help for Python programs. 2 3tokenize(readline) is a generator that breaks a stream of 4bytes into Python tokens. It decodes the bytes according to 5PEP-0263 for determining source file encoding. 6 7It accepts a readline-like method which is called 8repeatedly to get the next line of input (or b"" for EOF). It generates 95-tuples with these members: 10 11 the token type (see token.py) 12 the token (a string) 13 the starting (row, column) indices of the token (a 2-tuple of ints) 14 the ending (row, column) indices of the token (a 2-tuple of ints) 15 the original line (string) 16 17It is designed to match the working of the Python tokenizer exactly, except 18that it produces COMMENT tokens for comments and gives type OP for all 19operators. Aditionally, all token lists start with an ENCODING token 20which tells you which encoding was used to decode the bytes stream.""" 21 22__author__ = 'Ka-Ping Yee <ping@lfw.org>' 23__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 24 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 25 'Michael Foord') 26import re, string, sys 27from token import * 28from codecs import lookup, BOM_UTF8 29cookie_re = re.compile("coding[:=]\s*([-\w.]+)") 30 31import token 32__all__ = [x for x in dir(token) if not x.startswith("_")] 33__all__.extend(["COMMENT", "tokenize", "detect_encoding", "NL", "untokenize", 34 "ENCODING", "TokenInfo"]) 35del token 36 37COMMENT = N_TOKENS 38tok_name[COMMENT] = 'COMMENT' 39NL = N_TOKENS + 1 40tok_name[NL] = 'NL' 41ENCODING = N_TOKENS + 2 42tok_name[ENCODING] = 'ENCODING' 43N_TOKENS += 3 44 45class TokenInfo(tuple): 46 'TokenInfo(type, string, start, end, line)' 47 48 __slots__ = () 49 50 _fields = ('type', 'string', 'start', 'end', 'line') 51 52 def __new__(cls, type, string, start, end, line): 53 return tuple.__new__(cls, (type, string, start, end, line)) 54 55 @classmethod 56 def _make(cls, iterable, new=tuple.__new__, len=len): 57 'Make a new TokenInfo object from a sequence or iterable' 58 result = new(cls, iterable) 59 if len(result) != 5: 60 raise TypeError('Expected 5 arguments, got %d' % len(result)) 61 return result 62 63 def __repr__(self): 64 return 'TokenInfo(type=%r, string=%r, start=%r, end=%r, line=%r)' % self 65 66 def _asdict(self): 67 'Return a new dict which maps field names to their values' 68 return dict(zip(self._fields, self)) 69 70 def _replace(self, **kwds): 71 'Return a new TokenInfo object replacing specified fields with new values' 72 result = self._make(map(kwds.pop, ('type', 'string', 'start', 'end', 'line'), self)) 73 if kwds: 74 raise ValueError('Got unexpected field names: %r' % kwds.keys()) 75 return result 76 77 def __getnewargs__(self): 78 return tuple(self) 79 80 type = property(lambda t: t[0]) 81 string = property(lambda t: t[1]) 82 start = property(lambda t: t[2]) 83 end = property(lambda t: t[3]) 84 line = property(lambda t: t[4]) 85 86def group(*choices): return '(' + '|'.join(choices) + ')' 87def any(*choices): return group(*choices) + '*' 88def maybe(*choices): return group(*choices) + '?' 89 90# Note: we use unicode matching for names ("\w") but ascii matching for 91# number literals. 

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

tokenprog, pseudoprog, single3prog, double3prog = map(
    _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

del _compile

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)
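
    # Illustrative note: with full 5-tuples, the recorded positions drive the
    # spacing.  For the source "x  =  1", the NAME token ends at column 1 and
    # the OP token starts at column 3, so add_whitespace() emits two spaces
    # before "=" and the round trip reproduces the original layout.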

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly.

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
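
# Illustrative examples, derived from the normalization rules above:
#     _get_normal_name("UTF-8")       -> "utf-8"
#     _get_normal_name("Latin-1")     -> "iso-8859-1"
#     _get_normal_name("iso_8859_1")  -> "iso-8859-1"
#     _get_normal_name("cp1252")      -> "cp1252"   (unknown names pass through)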

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present
    but disagree, a SyntaxError is raised.  If the encoding cookie is an
    invalid charset, a SyntaxError is raised.  Note that if a UTF-8 BOM is
    found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
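
# Illustrative usage sketch (kept as comments; the names below are examples
# only, not part of the API):
#
#     from io import BytesIO
#     source = b"# -*- coding: latin-1 -*-\nx = 1\n"
#     enc, consumed = detect_encoding(BytesIO(source).readline)
#     # enc == 'iso-8859-1'; consumed holds the raw line(s) already read
#     for tok in tokenize(BytesIO(source).readline):
#         print(tok.type, tok.string)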


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]
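
                # Classify the matched text by its first character (and, for
                # strings, by its quote prefix) to decide which token type to
                # yield below.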
                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and
                     token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
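

# Illustrative demo (a sketch, not an official command-line interface):
# running the module directly tokenizes a small in-memory example using the
# generator above.
if __name__ == "__main__":
    from io import BytesIO
    example = b"def add(a, b):\n    return a + b\n"
    for tok in tokenize(BytesIO(example).readline):
        print(tok)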