"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens. It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF). It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators. Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
2143e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xicluna""" 221aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum 23244c593598af4db19e410032fb10793617a895ceKa-Ping Yee__author__ = 'Ka-Ping Yee <ping@lfw.org>' 24428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 25428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 26428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 'Michael Foord') 27cf4a2f29adb6bdae0b18e983250d7c48d486c9d6Serhiy Storchakafrom builtins import open as _builtin_open 28433f32c3be3b23adc4ec389ff9e78f49c7288f3dBenjamin Petersonfrom codecs import lookup, BOM_UTF8 293fb79c747b0cd0884f2a6ede9e36673bec8745f2Raymond Hettingerimport collections 3058c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinnerfrom io import TextIOWrapper 315b8d2c3af76e704926cf5915ad0e6af59a232e61Terry Jan Reedyfrom itertools import chain 321c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. 
import itertools as _itertools
import re
import sys
from token import *

# Matches a PEP 263 encoding declaration comment; group 1 is the codec name.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Matches a (bytes) line that is empty, whitespace-only, or only a comment.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

# Three extra token types are appended after the ones defined by token.py.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

# Maps the text of an operator/delimiter token to its specific token type.
EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@': AT,
    '@=': ATEQUAL,
    # The operator regexes in this module also produce '->' and '...' as
    # OP tokens; without these entries exact_type reported plain OP for them.
    '->': RARROW,
    '...': ELLIPSIS,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    """A token as a named 5-tuple: (type, string, start, end, line)."""

    def __repr__(self):
        # Show the numeric type together with its symbolic name.
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        """Return the specific operator type for OP tokens, else self.type.

        An OP token whose string is a key of EXACT_TOKEN_TYPES yields the
        mapped token type; every other token yields its own type unchanged.
        """
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

def group(*choices):
    """Return a regex group matching any one of *choices*: '(a|b|...)'."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more repetitions of group(*choices)."""
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching zero or one occurrence of group(*choices)."""
    return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

def _all_string_prefixes():
    """Return a set holding '' plus every valid string-literal prefix.

    Each base prefix is expanded to all orderings of its characters and
    all upper/lower-case combinations (e.g. 'br' yields 'br', 'bR', 'Br',
    'BR', 'rb', 'rB', 'Rb', 'RB').
    """
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

def _compile(expr):
    """Compile *expr* with re.UNICODE and return the pattern object."""
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# The re module tries alternatives from left to right and accepts the first
# one that matches (it does not look for the longest), so be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Output fragments are accumulated in self.tokens; prev_row/prev_col hold
    the position just after the last fragment emitted, and encoding holds
    the string of the most recent ENCODING token seen (None if there was
    none).
    """

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        """Append the filler needed to reach position *start*.

        *start* is a (row, col) pair. Raises ValueError if it lies before
        the recorded previous end position.
        """
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            # Rows skipped without a NEWLINE/NL token are bridged with one
            # backslash-newline continuation per row.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return the source text for the tokens in *iterable*.

        Tokens are expected to be 5-tuples (type, string, start, end, line).
        As soon as a 2-tuple is met, it and the remaining tokens are handed
        to compat() and position-based processing stops.
        """
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                # Remember the indent text; it is emitted at the start of
                # the following lines rather than here.
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for tokens given only as (type, string) pairs.

        *token* is the first such token and *iterable* supplies the rest.
        Only the first two items of each token are used, so spacing is
        synthesized instead of being taken from positions.
        """
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c.

    Returns "utf-8" or "iso-8859-1" for the recognized spellings of those
    encodings (compared case-insensitively, with "_" treated as "-"), and
    *orig_enc* unchanged otherwise.
    """
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]
440428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 441428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson encoding = find_cookie(second) 442428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson if encoding: 443428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson return encoding, [first, second] 444428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 445689a55809818a846d2733241642572840d20570bBenjamin Peterson return default, [first, second] 446428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 447428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 44858c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinnerdef open(filename): 44958c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinner """Open a file in read only mode using the encoding detected by 45058c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinner detect_encoding(). 45158c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinner """ 452969175091c4556e5b7e128ba91ae39f0b80153afVictor Stinner buffer = _builtin_open(filename, 'rb') 453387729e183365a366c48fce7a9abfcaf4ec6ff4eVictor Stinner try: 454387729e183365a366c48fce7a9abfcaf4ec6ff4eVictor Stinner encoding, lines = detect_encoding(buffer.readline) 455387729e183365a366c48fce7a9abfcaf4ec6ff4eVictor Stinner buffer.seek(0) 456387729e183365a366c48fce7a9abfcaf4ec6ff4eVictor Stinner text = TextIOWrapper(buffer, encoding, line_buffering=True) 457387729e183365a366c48fce7a9abfcaf4ec6ff4eVictor Stinner text.mode = 'r' 458387729e183365a366c48fce7a9abfcaf4ec6ff4eVictor Stinner return text 459387729e183365a366c48fce7a9abfcaf4ec6ff4eVictor Stinner except: 460387729e183365a366c48fce7a9abfcaf4ec6ff4eVictor Stinner buffer.close() 461387729e183365a366c48fce7a9abfcaf4ec6ff4eVictor Stinner raise 46258c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinner 46358c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinner 464428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelsondef tokenize(readline): 465428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson """ 
466ff8d0873aabe54009af533f9f6a76fa91392a80aBerker Peksag The tokenize() generator requires one argument, readline, which 467d1fa3db52de5f337e9aae5f3baad16fe62da2d0fRaymond Hettinger must be a callable object which provides the same interface as the 46843e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xicluna readline() method of built-in file objects. Each call to the function 469ff8d0873aabe54009af533f9f6a76fa91392a80aBerker Peksag should return one line of input as bytes. Alternatively, readline 47068c04534182f2c09783b6506701a8bc25c98b4a9Raymond Hettinger can be a callable function terminating with StopIteration: 471428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson readline = open(myfile, 'rb').__next__ # Example of alternate readline 4728ac1495a6a1d18111a626cec0c7f2eb67df3edb3Tim Peters 473d1fa3db52de5f337e9aae5f3baad16fe62da2d0fRaymond Hettinger The generator produces 5-tuples with these members: the token type; the 474d1fa3db52de5f337e9aae5f3baad16fe62da2d0fRaymond Hettinger token string; a 2-tuple (srow, scol) of ints specifying the row and 475d1fa3db52de5f337e9aae5f3baad16fe62da2d0fRaymond Hettinger column where the token begins in the source; a 2-tuple (erow, ecol) of 476d1fa3db52de5f337e9aae5f3baad16fe62da2d0fRaymond Hettinger ints specifying the row and column where the token ends in the source; 47743e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xicluna and the line on which the token was found. The line passed is the 4788ac1495a6a1d18111a626cec0c7f2eb67df3edb3Tim Peters logical line; continuation lines are included. 479428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 480428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson The first token sequence will always be an ENCODING token 481428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson which tells you which encoding was used to decode the bytes stream. 
482d1fa3db52de5f337e9aae5f3baad16fe62da2d0fRaymond Hettinger """ 48321db77e396c00c0490b6344a130bdbcef62bfa73Benjamin Peterson # This import is here to avoid problems when the itertools module is not 48421db77e396c00c0490b6344a130bdbcef62bfa73Benjamin Peterson # built yet and tokenize is imported. 48581dd8b9594d88ff1d2c8f5efea687645bbc36d6fBenjamin Peterson from itertools import chain, repeat 486428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson encoding, consumed = detect_encoding(readline) 48781dd8b9594d88ff1d2c8f5efea687645bbc36d6fBenjamin Peterson rl_gen = iter(readline, b"") 48881dd8b9594d88ff1d2c8f5efea687645bbc36d6fBenjamin Peterson empty = repeat(b"") 48981dd8b9594d88ff1d2c8f5efea687645bbc36d6fBenjamin Peterson return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding) 490428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 491428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 492428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelsondef _tokenize(readline, encoding): 4931aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum lnum = parenlev = continued = 0 49433856de84d1115a18b699e0ca93c3b921bc6a1afBenjamin Peterson numchars = '0123456789' 495de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum contstr, needcont = '', 0 496a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum contline = None 497fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum indents = [0] 4981aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum 49996ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov # 'stashed' and 'async_*' are used for async/await parsing 5007544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed = None 50196ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def = False 50296ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def_indent = 0 50396ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def_nl = False 5047544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 
505428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson if encoding is not None: 506689a55809818a846d2733241642572840d20570bBenjamin Peterson if encoding == "utf-8-sig": 507689a55809818a846d2733241642572840d20570bBenjamin Peterson # BOM will already have been stripped. 508689a55809818a846d2733241642572840d20570bBenjamin Peterson encoding = "utf-8" 509a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') 5100fe14383a8576ee5eb4a6aa83c96484281b360fdBenjamin Peterson while True: # loop over lines in stream 51168c04534182f2c09783b6506701a8bc25c98b4a9Raymond Hettinger try: 51268c04534182f2c09783b6506701a8bc25c98b4a9Raymond Hettinger line = readline() 51368c04534182f2c09783b6506701a8bc25c98b4a9Raymond Hettinger except StopIteration: 514428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson line = b'' 515428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 516428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson if encoding is not None: 517428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson line = line.decode(encoding) 518a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson lnum += 1 519fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum pos, max = 0, len(line) 520fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum 521fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum if contstr: # continued string 522de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum if not line: 523ce36ad8a467d914eb5c91f33835b9eaea18ee93bCollin Winter raise TokenError("EOF in multi-line string", strstart) 5243b631775b26b866e072cd3340f303bf5903af883Guido van Rossum endmatch = endprog.match(line) 5253b631775b26b866e072cd3340f303bf5903af883Guido van Rossum if endmatch: 5263b631775b26b866e072cd3340f303bf5903af883Guido van Rossum pos = end = endmatch.end(0) 527a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(STRING, contstr + line[:end], 52889f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters 
strstart, (lnum, end), contline + line) 529de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum contstr, needcont = '', 0 530a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum contline = None 531de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 532a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(ERRORTOKEN, contstr + line, 533a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum strstart, (lnum, len(line)), contline) 534fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum contstr = '' 535a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum contline = None 536de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum continue 537fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum else: 538fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum contstr = contstr + line 539a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum contline = contline + line 540fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum continue 541fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum 5421aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum elif parenlev == 0 and not continued: # new statement 543fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum if not line: break 544fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum column = 0 5451aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum while pos < max: # measure leading whitespace 546a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson if line[pos] == ' ': 547a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson column += 1 548a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson elif line[pos] == '\t': 549a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson column = (column//tabsize + 1)*tabsize 550a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson elif line[pos] == '\f': 551a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson column = 0 
552a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson else: 553a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson break 554a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson pos += 1 555a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson if pos == max: 556a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson break 5571aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum 5581aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum if line[pos] in '#\r\n': # skip comments or blank lines 55989f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters if line[pos] == '#': 56089f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters comment_token = line[pos:].rstrip('\r\n') 56189f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters nl_pos = pos + len(comment_token) 562a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(COMMENT, comment_token, 56389f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters (lnum, pos), (lnum, pos + len(comment_token)), line) 564a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(NL, line[nl_pos:], 56589f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters (lnum, nl_pos), (lnum, len(line)), line) 56689f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters else: 567a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:], 5681aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum (lnum, pos), (lnum, len(line)), line) 5691aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum continue 570fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum 571fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum if column > indents[-1]: # count indents or dedents 572fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum indents.append(column) 573a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 574fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum 
while column < indents[-1]: 575da99d1cbfeedafd41263ac2d1b397d57c14ab28eRaymond Hettinger if column not in indents: 576da99d1cbfeedafd41263ac2d1b397d57c14ab28eRaymond Hettinger raise IndentationError( 57700ee7baf49430d8a6eed355a5fd7a05179325747Thomas Wouters "unindent does not match any outer indentation level", 57800ee7baf49430d8a6eed355a5fd7a05179325747Thomas Wouters ("<tokenize>", lnum, pos, line)) 579fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum indents = indents[:-1] 5807544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 58196ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov if async_def and async_def_indent >= indents[-1]: 58296ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def = False 58396ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def_nl = False 58496ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def_indent = 0 5857544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 586a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) 587fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum 58896ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov if async_def and async_def_nl and async_def_indent >= indents[-1]: 58996ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def = False 59096ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def_nl = False 59196ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def_indent = 0 59296ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov 593fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum else: # continued statement 594de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum if not line: 595ce36ad8a467d914eb5c91f33835b9eaea18ee93bCollin Winter raise TokenError("EOF in multi-line statement", (lnum, 0)) 596fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum continued = 0 597fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum 
598fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum while pos < max: 59910a99b024df0d30911b198146d0206c8f6d0d6c7Antoine Pitrou pseudomatch = _compile(PseudoToken).match(line, pos) 6003b631775b26b866e072cd3340f303bf5903af883Guido van Rossum if pseudomatch: # scan for tokens 6013b631775b26b866e072cd3340f303bf5903af883Guido van Rossum start, end = pseudomatch.span(1) 602de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum spos, epos, pos = (lnum, start), (lnum, end), end 6032cc3b4ba9ffa658784da03f14a0a068e2c61d1b3Ezio Melotti if start == end: 6042cc3b4ba9ffa658784da03f14a0a068e2c61d1b3Ezio Melotti continue 6051aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum token, initial = line[start:end], line[start] 606fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum 607dde002899db8d04ac25d630fcc3a27e8bbf282eaGeorg Brandl if (initial in numchars or # ordinary number 608dde002899db8d04ac25d630fcc3a27e8bbf282eaGeorg Brandl (initial == '.' and token != '.' and token != '...')): 609a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(NUMBER, token, spos, epos, line) 6101aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum elif initial in '\r\n': 6117544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov if stashed: 6127544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov yield stashed 6137544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed = None 61496ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov if parenlev > 0: 61596ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov yield TokenInfo(NL, token, spos, epos, line) 61696ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov else: 61796ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov yield TokenInfo(NEWLINE, token, spos, epos, line) 61896ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov if async_def: 61996ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def_nl = True 62096ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov 
6211aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum elif initial == '#': 62289f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters assert not token.endswith("\n") 6237544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov if stashed: 6247544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov yield stashed 6257544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed = None 626a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(COMMENT, token, spos, epos, line) 6271c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith 6289d6897accc49f40414fbecafeb1c65562c6e4647Guido van Rossum elif token in triple_quoted: 62910a99b024df0d30911b198146d0206c8f6d0d6c7Antoine Pitrou endprog = _compile(endpats[token]) 6303b631775b26b866e072cd3340f303bf5903af883Guido van Rossum endmatch = endprog.match(line, pos) 6313b631775b26b866e072cd3340f303bf5903af883Guido van Rossum if endmatch: # all on one line 6323b631775b26b866e072cd3340f303bf5903af883Guido van Rossum pos = endmatch.end(0) 6331aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum token = line[start:pos] 634a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(STRING, token, spos, (lnum, pos), line) 635fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum else: 6361aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum strstart = (lnum, start) # multiple lines 6371aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum contstr = line[start:] 638a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum contline = line 639fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum break 6401c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith 6411c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # Check up to the first 3 chars of the token to see if 6421c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # they're in the single_quoted set. If so, they start 6431c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # a string. 6441c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. 
Smith # We're using the first 3, because we're looking for 6451c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # "rb'" (for example) at the start of the token. If 6461c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # we switch to longer prefixes, this needs to be 6471c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # adjusted. 6481c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # Note that initial == token[:1]. 649a7161e7facdfa1d6f673beb16a95a647ce764b32Berker Peksag # Also note that single quote checking must come after 6501c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # triple quote checking (above). 6511c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith elif (initial in single_quoted or 6521c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith token[:2] in single_quoted or 6531c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith token[:3] in single_quoted): 654fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum if token[-1] == '\n': # continued string 6551aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum strstart = (lnum, start) 6561c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # Again, using the first 3 chars of the 6571c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # token. This is looking for the matching end 6581c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # regex for the correct type of quote 6591c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # character. So it's really looking for 6601c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # endpats["'"] or endpats['"'], by trying to 6611c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith # skip string prefix characters, if any. 6621c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith endprog = _compile(endpats.get(initial) or 6631c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith endpats.get(token[1]) or 6641c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. 
Smith endpats.get(token[2])) 665de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum contstr, needcont = line[start:], 1 666a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum contline = line 667fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum break 668fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum else: # ordinary string 669a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(STRING, token, spos, epos, line) 6701c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith 67133856de84d1115a18b699e0ca93c3b921bc6a1afBenjamin Peterson elif initial.isidentifier(): # ordinary name 6727544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov if token in ('async', 'await'): 67396ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov if async_def: 6747544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov yield TokenInfo( 6757544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov ASYNC if token == 'async' else AWAIT, 6767544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov token, spos, epos, line) 6777544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov continue 6787544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 6797544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov tok = TokenInfo(NAME, token, spos, epos, line) 6807544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov if token == 'async' and not stashed: 6817544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed = tok 6827544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov continue 6837544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 6847544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov if token == 'def': 6857544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov if (stashed 6867544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov and stashed.type == NAME 6877544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov and stashed.string == 'async'): 6887544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 68996ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov 
async_def = True 69096ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov async_def_indent = indents[-1] 6917544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 6927544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov yield TokenInfo(ASYNC, stashed.string, 6937544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed.start, stashed.end, 6947544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed.line) 6957544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed = None 6967544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 6977544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov if stashed: 6987544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov yield stashed 6997544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed = None 7007544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 7017544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov yield tok 7023b631775b26b866e072cd3340f303bf5903af883Guido van Rossum elif initial == '\\': # continued stmt 7033b631775b26b866e072cd3340f303bf5903af883Guido van Rossum continued = 1 704fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum else: 705a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson if initial in '([{': 706a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson parenlev += 1 707a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson elif initial in ')]}': 708a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson parenlev -= 1 7097544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov if stashed: 7107544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov yield stashed 7117544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed = None 712a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(OP, token, spos, epos, line) 713fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum else: 714a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(ERRORTOKEN, line[pos], 715de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum (lnum, 
pos), (lnum, pos+1), line) 716a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson pos += 1 717fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum 7187544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov if stashed: 7197544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov yield stashed 7207544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov stashed = None 7217544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov 722fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum for indent in indents[1:]: # pop remaining indent levels 723a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') 724a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') 725fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum 726428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson 727428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson# An undocumented, backwards compatible, API for all the places in the standard 728428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson# library that expect to be able to use tokenize with strings 729428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelsondef generate_tokens(readline): 730428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson return _tokenize(readline, None) 7316c60d099e5ed97ee0026687c1ec3401cca49c0c2Raymond Hettinger 73214c0f03b587e3ec9679cf19a0c5f598c45157429Meador Ingedef main(): 73314c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge import argparse 73414c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge 73514c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge # Helper error handling routines 73614c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge def perror(message): 73714c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge print(message, file=sys.stderr) 73814c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge 73914c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge def error(message, filename=None, location=None): 
74014c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge if location: 74114c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge args = (filename,) + location + (message,) 74214c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge perror("%s:%d:%d: error: %s" % args) 74314c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge elif filename: 74414c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge perror("%s: error: %s" % (filename, message)) 74514c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge else: 74614c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge perror("error: %s" % message) 74714c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge sys.exit(1) 74814c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge 74914c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge # Parse the arguments and options 75014c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge parser = argparse.ArgumentParser(prog='python -m tokenize') 75114c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge parser.add_argument(dest='filename', nargs='?', 75214c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge metavar='filename.py', 75314c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge help='the file to tokenize; defaults to stdin') 75400c7f85298b9803371b4a0019ce8732ed8a2dd3bMeador Inge parser.add_argument('-e', '--exact', dest='exact', action='store_true', 75500c7f85298b9803371b4a0019ce8732ed8a2dd3bMeador Inge help='display token names using the exact type') 75614c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge args = parser.parse_args() 75714c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge 75814c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge try: 75914c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge # Tokenize the input 76014c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge if args.filename: 76114c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge filename = args.filename 762969175091c4556e5b7e128ba91ae39f0b80153afVictor Stinner with _builtin_open(filename, 'rb') as f: 76314c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge 
tokens = list(tokenize(f.readline)) 76414c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge else: 76514c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge filename = "<stdin>" 76614c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge tokens = _tokenize(sys.stdin.readline, None) 76714c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge 76814c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge # Output the tokenization 76914c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge for token in tokens: 77000c7f85298b9803371b4a0019ce8732ed8a2dd3bMeador Inge token_type = token.type 77100c7f85298b9803371b4a0019ce8732ed8a2dd3bMeador Inge if args.exact: 77200c7f85298b9803371b4a0019ce8732ed8a2dd3bMeador Inge token_type = token.exact_type 77314c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge token_range = "%d,%d-%d,%d:" % (token.start + token.end) 77414c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge print("%-20s%-15s%-15r" % 77500c7f85298b9803371b4a0019ce8732ed8a2dd3bMeador Inge (token_range, tok_name[token_type], token.string)) 77614c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge except IndentationError as err: 77714c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge line, column = err.args[1][1:3] 77814c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge error(err.args[0], filename, (line, column)) 77914c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge except TokenError as err: 78014c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge line, column = err.args[1] 78114c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge error(err.args[0], filename, (line, column)) 78214c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge except SyntaxError as err: 78314c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge error(err, filename) 784f7a17b48d748e1835bcf9df86fb7fb318bb020f8Andrew Svetlov except OSError as err: 78514c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge error(err) 78614c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge except KeyboardInterrupt: 78714c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge 
print("interrupted\n") 78814c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge except Exception as err: 78914c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge perror("unexpected error: %s" % err) 79014c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge raise 79114c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge 7926c60d099e5ed97ee0026687c1ec3401cca49c0c2Raymond Hettingerif __name__ == "__main__": 79314c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge main() 794