1"""Tokenization help for Python programs. 2 3tokenize(readline) is a generator that breaks a stream of bytes into 4Python tokens. It decodes the bytes according to PEP-0263 for 5determining source file encoding. 6 7It accepts a readline-like method which is called repeatedly to get the 8next line of input (or b"" for EOF). It generates 5-tuples with these 9members: 10 11 the token type (see token.py) 12 the token (a string) 13 the starting (row, column) indices of the token (a 2-tuple of ints) 14 the ending (row, column) indices of the token (a 2-tuple of ints) 15 the original line (string) 16 17It is designed to match the working of the Python tokenizer exactly, except 18that it produces COMMENT tokens for comments and gives type OP for all 19operators. Additionally, all token lists start with an ENCODING token 20which tells you which encoding was used to decode the bytes stream. 21""" 22 23__author__ = 'Ka-Ping Yee <ping@lfw.org>' 24__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 26 'Michael Foord') 27from builtins import open as _builtin_open 28from codecs import lookup, BOM_UTF8 29import collections 30from io import TextIOWrapper 31from itertools import chain 32import itertools as _itertools 33import re 34import sys 35from token import * 36 37cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) 38blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) 39 40import token 41__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding", 42 "NL", "untokenize", "ENCODING", "TokenInfo"] 43del token 44 45COMMENT = N_TOKENS 46tok_name[COMMENT] = 'COMMENT' 47NL = N_TOKENS + 1 48tok_name[NL] = 'NL' 49ENCODING = N_TOKENS + 2 50tok_name[ENCODING] = 'ENCODING' 51N_TOKENS += 3 52EXACT_TOKEN_TYPES = { 53 '(': LPAR, 54 ')': RPAR, 55 '[': LSQB, 56 ']': RSQB, 57 ':': COLON, 58 ',': COMMA, 59 ';': SEMI, 60 '+': PLUS, 61 '-': MINUS, 62 '*': STAR, 63 '/': SLASH, 64 '|': VBAR, 65 '&': AMPER, 66 '<': LESS, 67 '>': GREATER, 68 '=': EQUAL, 69 '.': DOT, 70 '%': PERCENT, 71 '{': LBRACE, 72 '}': RBRACE, 73 '==': EQEQUAL, 74 '!=': NOTEQUAL, 75 '<=': LESSEQUAL, 76 '>=': GREATEREQUAL, 77 '~': TILDE, 78 '^': CIRCUMFLEX, 79 '<<': LEFTSHIFT, 80 '>>': RIGHTSHIFT, 81 '**': DOUBLESTAR, 82 '+=': PLUSEQUAL, 83 '-=': MINEQUAL, 84 '*=': STAREQUAL, 85 '/=': SLASHEQUAL, 86 '%=': PERCENTEQUAL, 87 '&=': AMPEREQUAL, 88 '|=': VBAREQUAL, 89 '^=': CIRCUMFLEXEQUAL, 90 '<<=': LEFTSHIFTEQUAL, 91 '>>=': RIGHTSHIFTEQUAL, 92 '**=': DOUBLESTAREQUAL, 93 '//': DOUBLESLASH, 94 '//=': DOUBLESLASHEQUAL, 95 '@': AT, 96 '@=': ATEQUAL, 97} 98 99class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): 100 def __repr__(self): 101 annotated_type = '%d (%s)' % (self.type, tok_name[self.type]) 102 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % 103 self._replace(type=annotated_type)) 104 105 @property 106 def exact_type(self): 107 if self.type == OP and self.string in EXACT_TOKEN_TYPES: 108 return EXACT_TOKEN_TYPES[self.string] 109 else: 110 return self.type 111 112def group(*choices): return '(' + '|'.join(choices) + ')' 113def any(*choices): return group(*choices) + '*' 114def maybe(*choices): return group(*choices) + '?' 115 116# Note: we use unicode matching for names ("\w") but ascii matching for 117# number literals. 
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only contains the lower case versions,
    # and doesn't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = set([''])
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
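# A hedged illustration of the machinery above (editorial comments, not
# executed by the module):
#
#     _all_string_prefixes() yields '', 'b', 'B', 'u', 'U', 'f', 'F',
#     'rb', 'Rb', 'rB', 'RB', 'br', ... -- every casing and ordering of
#     the valid prefixes.
#     _compile(Number).match('0x_FF')       # matches via Hexnumber
#     _compile(Number).match('1_000.5e-2')  # matches via Pointfloat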
# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
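# A hedged trace of the lossy 2-tuple ("compat") path above (editorial,
# not executed on import):
#
#     toks = [(ENCODING, 'utf-8'), (NAME, 'x'), (OP, '='),
#             (NUMBER, '1'), (NEWLINE, '\n'), (ENDMARKER, '')]
#     untokenize(toks)  ->  b'x =1 \n'
#
# Positions are unknown for 2-tuples, so spacing is approximated by
# appending a space after NAME and NUMBER tokens.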
def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
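# Hedged examples of the normalization above (editorial comments):
#
#     _get_normal_name('UTF-8')       ->  'utf-8'
#     _get_normal_name('utf-8-unix')  ->  'utf-8'
#     _get_normal_name('Latin-1')     ->  'iso-8859-1'
#     _get_normal_name('cp1252')      ->  'cp1252'   (unrecognized: unchanged)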
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise
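# A hedged usage sketch for detect_encoding() (editorial; 'example.py' is
# a hypothetical file name):
#
#     with _builtin_open('example.py', 'rb') as f:
#         enc, consumed = detect_encoding(f.readline)
#     # enc is e.g. 'utf-8', 'utf-8-sig', or a cookie-declared codec;
#     # consumed holds the (at most two) bytes lines already read.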
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, nl_pos), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here, so this is always NL.
                    yield TokenInfo(NL, line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0
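        # Editorial note: the loop below repeatedly matches PseudoToken
        # at 'pos' and classifies each hit (NUMBER, NEWLINE/NL, COMMENT,
        # STRING, NAME, OP, ...) until the physical line is exhausted.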
        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                        (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)       # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set.  If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token.  If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token.  This is looking for the matching end
                        # regex for the correct type of quote
                        # character.  So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():   # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield TokenInfo(
                                ASYNC if token == 'async' else AWAIT,
                                token, spos, epos, line)
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed.type == NAME
                                and stashed.string == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield TokenInfo(ASYNC, stashed.string,
                                            stashed.start, stashed.end,
                                            stashed.line)
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
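# A hedged sketch of generate_tokens() with already-decoded input
# (editorial, not executed on import):
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)
#     # yields NAME, OP, NUMBER, NEWLINE, ENDMARKER -- no ENCODING token,
#     # because the input is str rather than bytes.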
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()
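# Editorial usage note for the command-line interface above
# ('hello.py' is a hypothetical file name):
#
#     python -m tokenize hello.py       # one token per line
#     python -m tokenize -e hello.py    # report exact types (LPAR, PLUS, ...)
#     python -m tokenize < hello.py     # tokenize stdin (decoded text)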