tokenize.py revision 43e4ea1b17ac912e4f8e55e256b96be0c57a88ee
1"""Tokenization help for Python programs. 2 3tokenize(readline) is a generator that breaks a stream of bytes into 4Python tokens. It decodes the bytes according to PEP-0263 for 5determining source file encoding. 6 7It accepts a readline-like method which is called repeatedly to get the 8next line of input (or b"" for EOF). It generates 5-tuples with these 9members: 10 11 the token type (see token.py) 12 the token (a string) 13 the starting (row, column) indices of the token (a 2-tuple of ints) 14 the ending (row, column) indices of the token (a 2-tuple of ints) 15 the original line (string) 16 17It is designed to match the working of the Python tokenizer exactly, except 18that it produces COMMENT tokens for comments and gives type OP for all 19operators. Additionally, all token lists start with an ENCODING token 20which tells you which encoding was used to decode the bytes stream. 21""" 22 23__author__ = 'Ka-Ping Yee <ping@lfw.org>' 24__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 26 'Michael Foord') 27import re 28import sys 29from token import * 30from codecs import lookup, BOM_UTF8 31cookie_re = re.compile("coding[:=]\s*([-\w.]+)") 32 33import token 34__all__ = [x for x in dir(token) if not x.startswith("_")] 35__all__.extend(["COMMENT", "tokenize", "detect_encoding", "NL", "untokenize", 36 "ENCODING", "TokenInfo"]) 37del token 38 39COMMENT = N_TOKENS 40tok_name[COMMENT] = 'COMMENT' 41NL = N_TOKENS + 1 42tok_name[NL] = 'NL' 43ENCODING = N_TOKENS + 2 44tok_name[ENCODING] = 'ENCODING' 45N_TOKENS += 3 46 47class TokenInfo(tuple): 48 'TokenInfo(type, string, start, end, line)' 49 50 __slots__ = () 51 52 _fields = ('type', 'string', 'start', 'end', 'line') 53 54 def __new__(cls, type, string, start, end, line): 55 return tuple.__new__(cls, (type, string, start, end, line)) 56 57 @classmethod 58 def _make(cls, iterable, new=tuple.__new__, len=len): 59 'Make a new TokenInfo object from a sequence or iterable' 60 result = new(cls, iterable) 61 if len(result) != 5: 62 raise TypeError('Expected 5 arguments, got %d' % len(result)) 63 return result 64 65 def __repr__(self): 66 return 'TokenInfo(type=%r, string=%r, start=%r, end=%r, line=%r)' % self 67 68 def _asdict(self): 69 'Return a new dict which maps field names to their values' 70 return dict(zip(self._fields, self)) 71 72 def _replace(self, **kwds): 73 'Return a new TokenInfo object replacing specified fields with new values' 74 result = self._make(map(kwds.pop, ('type', 'string', 'start', 'end', 'line'), self)) 75 if kwds: 76 raise ValueError('Got unexpected field names: %r' % kwds.keys()) 77 return result 78 79 def __getnewargs__(self): 80 return tuple(self) 81 82 type = property(lambda t: t[0]) 83 string = property(lambda t: t[1]) 84 start = property(lambda t: t[2]) 85 end = property(lambda t: t[3]) 86 line = property(lambda t: t[4]) 87 88def group(*choices): return '(' + '|'.join(choices) + ')' 89def any(*choices): return group(*choices) + '*' 90def maybe(*choices): return group(*choices) + '?' 91 92# Note: we use unicode matching for names ("\w") but ascii matching for 93# number literals. 
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

tokenprog, pseudoprog, single3prog, double3prog = map(
    _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

del _compile

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

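# Sketch of how the compiled patterns above are used (illustrative input
# only): pseudoprog, matched repeatedly by _tokenize() below, yields one
# token candidate at a time in group 1.  For the line "x = 42\n":
#
#   m = pseudoprog.match("x = 42\n", 0)
#   m.group(1)                                  # 'x'
#   m = pseudoprog.match("x = 42\n", m.end())
#   m.group(1)                                  # '='
#   m = pseudoprog.match("x = 42\n", m.end())
#   m.group(1)                                  # '42'
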

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements are passed per token, the spacing in the
    resulting output is only approximate.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

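# Illustrative behaviour of _get_normal_name (hypothetical inputs):
#
#   _get_normal_name("UTF-8")       ->  'utf-8'
#   _get_normal_name("utf-8-unix")  ->  'utf-8'        (Emacs-style variant)
#   _get_normal_name("Latin-1")     ->  'iso-8859-1'
#   _get_normal_name("cp1252")      ->  'cp1252'       (returned unchanged)
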
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)

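# Minimal usage sketch (illustrative source bytes, not part of the API):
#
#   from io import BytesIO
#   source = b"x = 1  # set x\n"
#   for tok in tokenize(BytesIO(source).readline):
#       print(tok.type, repr(tok.string))
#
# The first token is ENCODING ('utf-8' here, since there is no BOM or coding
# cookie); COMMENT tokens (and, for blank or comment-only lines, NL tokens)
# are reported as well, and the stream ends with NEWLINE and ENDMARKER.
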

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

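                # Dispatch on the first character of the candidate: numbers,
                # line endings, comments, triple- and single-quoted strings
                # (which may continue onto later lines), names, backslash
                # continuations, and finally operators/brackets.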
                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
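
# Round-trip usage sketch (illustrative source bytes; tokenize() and
# untokenize() are the public entry points defined above):
#
#   from io import BytesIO
#   source = b"def f(a, b):\n    return a + b\n"
#   toks = list(tokenize(BytesIO(source).readline))
#   assert untokenize(toks) == source       # full 5-tuples round-trip exactly
#   pairs = [t[:2] for t in toks]
#   approx = untokenize(pairs)              # 2-tuples: output is only approximate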