1b51eaa183e048a928fb363bac4404e6acf0e3badGuido van Rossum"""Tokenization help for Python programs.
2b51eaa183e048a928fb363bac4404e6acf0e3badGuido van Rossum
343e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xiclunatokenize(readline) is a generator that breaks a stream of bytes into
443e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent XiclunaPython tokens.  It decodes the bytes according to PEP-0263 for
543e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xiclunadetermining source file encoding.
6428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson
743e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent XiclunaIt accepts a readline-like method which is called repeatedly to get the
843e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xiclunanext line of input (or b"" for EOF).  It generates 5-tuples with these
943e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xiclunamembers:
104efb6e964376a46aaa3acf365a6627a37af236bfTim Peters
114efb6e964376a46aaa3acf365a6627a37af236bfTim Peters    the token type (see token.py)
124efb6e964376a46aaa3acf365a6627a37af236bfTim Peters    the token (a string)
134efb6e964376a46aaa3acf365a6627a37af236bfTim Peters    the starting (row, column) indices of the token (a 2-tuple of ints)
144efb6e964376a46aaa3acf365a6627a37af236bfTim Peters    the ending (row, column) indices of the token (a 2-tuple of ints)
154efb6e964376a46aaa3acf365a6627a37af236bfTim Peters    the original line (string)
164efb6e964376a46aaa3acf365a6627a37af236bfTim Peters
174efb6e964376a46aaa3acf365a6627a37af236bfTim PetersIt is designed to match the working of the Python tokenizer exactly, except
184efb6e964376a46aaa3acf365a6627a37af236bfTim Petersthat it produces COMMENT tokens for comments and gives type OP for all
1943e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xiclunaoperators.  Additionally, all token lists start with an ENCODING token
2043e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xiclunawhich tells you which encoding was used to decode the bytes stream.
2143e4ea1b17ac912e4f8e55e256b96be0c57a88eeFlorent Xicluna"""
221aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum
23244c593598af4db19e410032fb10793617a895ceKa-Ping Yee__author__ = 'Ka-Ping Yee <ping@lfw.org>'
24428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
25428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
26428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson               'Michael Foord')
27cf4a2f29adb6bdae0b18e983250d7c48d486c9d6Serhiy Storchakafrom builtins import open as _builtin_open
28433f32c3be3b23adc4ec389ff9e78f49c7288f3dBenjamin Petersonfrom codecs import lookup, BOM_UTF8
293fb79c747b0cd0884f2a6ede9e36673bec8745f2Raymond Hettingerimport collections
3058c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinnerfrom io import TextIOWrapper
315b8d2c3af76e704926cf5915ad0e6af59a232e61Terry Jan Reedyfrom itertools import chain
321c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smithimport itertools as _itertools
335b8d2c3af76e704926cf5915ad0e6af59a232e61Terry Jan Reedyimport re
345b8d2c3af76e704926cf5915ad0e6af59a232e61Terry Jan Reedyimport sys
355b8d2c3af76e704926cf5915ad0e6af59a232e61Terry Jan Reedyfrom token import *
365b8d2c3af76e704926cf5915ad0e6af59a232e61Terry Jan Reedy
# Matches a source-encoding declaration comment (PEP 263): optional leading
# whitespace, a '#', then "coding" followed by ':' or '=' and the encoding
# name, which is captured in group 1.  re.ASCII restricts \w to ASCII.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Bytes pattern for a line holding only optional whitespace followed by the
# start of a comment, a line-ending character, or the end of the input.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
394d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum
# The public API is everything the token module exports plus the names listed
# here.  The token module object is needed only to read its __all__, so the
# name is removed from this namespace again straight away.
import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token
4440fc16059f04ee8fda0b5956cc4883eb21ca8f8cSkip Montanaro
# Extra token types, numbered directly after the ones imported from token.py.
# Their names are registered in tok_name and N_TOKENS is advanced past them.
COMMENT, NL, ENCODING = range(N_TOKENS, N_TOKENS + 3)
tok_name.update({COMMENT: 'COMMENT', NL: 'NL', ENCODING: 'ENCODING'})
N_TOKENS += 3
# Maps the text of an operator or delimiter to its specific token type.
# The tokenizer reports all of these with the generic type OP;
# TokenInfo.exact_type looks the token string up here to recover the
# precise type.  Every string the Operator, Bracket and Special patterns
# can produce (other than newlines) needs an entry, including '...' and '->'.
EXACT_TOKEN_TYPES = {
    '(':   LPAR,
    ')':   RPAR,
    '[':   LSQB,
    ']':   RSQB,
    ':':   COLON,
    ',':   COMMA,
    ';':   SEMI,
    '+':   PLUS,
    '-':   MINUS,
    '*':   STAR,
    '/':   SLASH,
    '|':   VBAR,
    '&':   AMPER,
    '<':   LESS,
    '>':   GREATER,
    '=':   EQUAL,
    '.':   DOT,
    '%':   PERCENT,
    '{':   LBRACE,
    '}':   RBRACE,
    '==':  EQEQUAL,
    '!=':  NOTEQUAL,
    '<=':  LESSEQUAL,
    '>=':  GREATEREQUAL,
    '~':   TILDE,
    '^':   CIRCUMFLEX,
    '<<':  LEFTSHIFT,
    '>>':  RIGHTSHIFT,
    '**':  DOUBLESTAR,
    '+=':  PLUSEQUAL,
    '-=':  MINEQUAL,
    '*=':  STAREQUAL,
    '/=':  SLASHEQUAL,
    '%=':  PERCENTEQUAL,
    '&=':  AMPEREQUAL,
    '|=':  VBAREQUAL,
    '^=':  CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//':  DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    # Previously missing: both are tokenized as OP but had no exact type.
    '...': ELLIPSIS,
    '->':  RARROW,
    '@':   AT,
    '@=':  ATEQUAL,
}
981aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    """Named 5-tuple describing one token: type, string, start, end, line."""

    def __repr__(self):
        # Show the numeric token type together with its symbolic name.
        type_label = '%d (%s)' % (self.type, tok_name[self.type])
        fields = (type_label, self.string, self.start, self.end, self.line)
        return 'TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % fields

    @property
    def exact_type(self):
        """Return the specific type of an OP token, else the plain type.

        OP tokens whose string is not listed in EXACT_TOKEN_TYPES keep
        their generic type.
        """
        if self.type != OP:
            return self.type
        return EXACT_TOKEN_TYPES.get(self.string, self.type)
11100c7f85298b9803371b4a0019ce8732ed8a2dd3bMeador Inge
def group(*choices):
    """Return a regex group that matches any one of *choices*."""
    return '(%s)' % '|'.join(choices)

def any(*choices):
    """Return a regex matching zero or more repetitions of the group."""
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching the group of *choices* zero or one time."""
    return group(*choices) + '?'
1154d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum
116fd036451bf0e0ade8783e21df801abf7be96d020Antoine Pitrou# Note: we use unicode matching for names ("\w") but ascii matching for
117fd036451bf0e0ade8783e21df801abf7be96d020Antoine Pitrou# number literals.
# Optional horizontal whitespace: spaces, form feeds and tabs.
Whitespace = r'[ \f\t]*'
# A comment: '#' and everything after it up to the end of the line.
Comment = r'#[^\r\n]*'
# Text skipped in front of a token: whitespace, any number of
# backslash-newline continuations (each followed by whitespace), and then
# an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# A name is one or more word characters.
Name = r'\w+'

# Integer literals.  The '(?:_?<digit>)' form permits one optional
# underscore in front of each digit.
Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
# Decimal: either nothing but zeros, or a non-zero digit and more digits.
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Floating point literals: either digits with a decimal point (optionally
# followed by an exponent), or digits followed by a mandatory exponent.
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: plain digits or a float, followed by 'j' or 'J'.
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried in this order: imaginary, float, integer.
Number = group(Imagnumber, Floatnumber, Intnumber)
1354d8e859e8f0a209a7e999ce9cc0988156c795949Guido van Rossum
def _all_string_prefixes():
    """Return a set holding the empty string plus every valid string prefix.

    Each base prefix is expanded into all orderings of its characters and,
    for each ordering, into all upper/lower case combinations.
    """
    # Base prefixes in lower case, one ordering each ('fr' is listed but
    # 'rf' is not); the remaining permutations are generated below.
    # If binary f-strings are ever added, add: ['fb', 'fbr']
    base_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    prefixes = {''}
    for base in base_prefixes:
        for ordering in _itertools.permutations(base):
            # One (lower, upper) pair per character of this ordering.
            case_choices = [(ch, ch.upper()) for ch in ordering]
            prefixes.update(''.join(combo)
                            for combo in _itertools.product(*case_choices))
    return prefixes
1511c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith
def _compile(expr):
    """Compile *expr* into a pattern object with the re.UNICODE flag set."""
    return re.compile(expr, flags=re.UNICODE)
1541c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith
1551c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith# Note that since _all_string_prefixes includes the empty string,
1561c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string: optional prefix plus ''' or """.
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Regex alternatives are tried from left to right and the first one that
# matches wins, so be sure to put the longest operators first (e.g., if =
# came before ==, == would get recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

# Any single bracket character: ] [ ( ) { }.
Bracket = '[][(){}]'
# A newline, an ellipsis, or one of the remaining punctuation characters.
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

# A complete token, and the same token preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.  The match ends either at the
# closing quote or at a backslash-newline that continues the string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# A backslash continuation or the end of the input, a comment, or the
# opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# Like Token, but strings may be incomplete: ContStr and Triple match only
# the first line or the opening quotes.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
1941aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum
# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
# The values are the uncompiled pattern strings Single, Double, Single3
#  and Double3.
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
# Note: the loop variables t and u remain defined at module level afterwards.
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

# NOTE(review): presumably the tab stop width used when computing columns;
# no use of this value is visible in this part of the file - confirm.
tabsize = 8
2169b8d801c37fa29420848ebc1b50c601893b36287Fred Drake
class TokenError(Exception):
    """Exception type for tokenizing errors; adds nothing to Exception."""
21828c62bbdb2545eddf04ba7e2f2daa0dcedbb6052Ka-Ping Yee
class StopTokenizing(Exception):
    """Plain Exception subclass; adds nothing to Exception."""
2209b8d801c37fa29420848ebc1b50c601893b36287Fred Drake
2215ca576ed0a0c697c7e7547adfd0b3af010fd2053Tim Peters
class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Output fragments are collected in ``self.tokens``.  ``prev_row`` and
    ``prev_col`` hold the position where the last emitted token ended, and
    ``encoding`` records the string of an ENCODING token once one is seen.
    """

    def __init__(self):
        self.tokens = []
        self.encoding = None
        self.prev_row, self.prev_col = 1, 0

    def add_whitespace(self, start):
        """Emit the filler needed to get from the previous end to *start*.

        Raises ValueError if *start* lies before the previous end position.
        """
        row, col = start
        if (row, col) < (self.prev_row, self.prev_col):
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        skipped_rows = row - self.prev_row
        if skipped_rows:
            # Rows that were skipped are written as backslash continuations.
            self.tokens.append("\\\n" * skipped_rows)
            self.prev_col = 0
        gap = col - self.prev_col
        if gap:
            self.tokens.append(" " * gap)

    def untokenize(self, iterable):
        """Return the source text for the tokens in *iterable*.

        Tokens are expected to be 5-tuples.  As soon as a 2-tuple is met,
        it and all remaining tokens are handed to compat() instead.
        """
        stream = iter(iterable)
        indent_stack = []
        at_line_start = False
        for item in stream:
            if len(item) == 2:
                self.compat(item, stream)
                break
            kind, text, start, end, _line = item
            if kind == ENCODING:
                self.encoding = text
                continue
            if kind == ENDMARKER:
                break
            if kind == INDENT:
                indent_stack.append(text)
                continue
            if kind == DEDENT:
                indent_stack.pop()
                self.prev_row, self.prev_col = end
                continue
            ends_line = kind in (NEWLINE, NL)
            if ends_line:
                at_line_start = True
            elif at_line_start and indent_stack:
                # First token of a line inside an indented block: write the
                # recorded indent text if the token starts at or beyond it.
                current_indent = indent_stack[-1]
                if start[1] >= len(current_indent):
                    self.tokens.append(current_indent)
                    self.prev_col = len(current_indent)
                at_line_start = False
            self.add_whitespace(start)
            self.tokens.append(text)
            self.prev_row, self.prev_col = end
            if ends_line:
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for tokens that carry no position information.

        *token* is processed first, followed by everything left in
        *iterable*; only the first two items of each token are used.
        """
        indent_stack = []
        emit = self.tokens.append
        at_line_start = token[0] in (NEWLINE, NL)
        after_string = False

        for entry in chain([token], iterable):
            kind, text = entry[:2]
            if kind == ENCODING:
                self.encoding = text
                continue

            if kind in (NAME, NUMBER, ASYNC, AWAIT):
                text += ' '

            # Insert a space between two consecutive strings
            is_string = kind == STRING
            if is_string and after_string:
                text = ' ' + text
            after_string = is_string

            if kind == INDENT:
                indent_stack.append(text)
                continue
            if kind == DEDENT:
                indent_stack.pop()
                continue
            if kind in (NEWLINE, NL):
                at_line_start = True
            elif at_line_start and indent_stack:
                emit(indent_stack[-1])
                at_line_start = False
            emit(text)
31568c04534182f2c09783b6506701a8bc25c98b4a9Raymond Hettinger
316428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson
def untokenize(iterable):
    """Turn a sequence of tokens back into Python source code.

    If the tokens include an ENCODING token (tokenize emits one first),
    the result is a bytes object encoded with that encoding; otherwise
    the result is returned as a str.

    Every element of the iterable must be a sequence of at least two
    items: the token number and the token value.  When tokens carry only
    those two items no positions are available, so the spacing of the
    output is only approximate.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    rebuilder = Untokenizer()
    source = rebuilder.untokenize(iterable)
    if rebuilder.encoding is None:
        return source
    return source.encode(rebuilder.encoding)
34268c04534182f2c09783b6506701a8bc25c98b4a9Raymond Hettinger
343428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c.

    Return "utf-8" or "iso-8859-1" when *orig_enc* is a recognised spelling
    of that encoding (optionally followed by "-<suffix>"); otherwise return
    *orig_enc* unchanged.
    """
    # Only the first 12 characters take part in the comparison, which
    # ignores case and treats "_" like "-".
    key = orig_enc[:12].lower().replace("_", "-")
    aliases = (
        ("utf-8", ("utf-8",)),
        ("iso-8859-1", ("latin-1", "iso-8859-1", "iso-latin-1")),
    )
    for canonical, spellings in aliases:
        suffixed = tuple(name + "-" for name in spellings)
        if key in spellings or key.startswith(suffixed):
            return canonical
    return orig_enc
354d3afadaa4908df544e0181c11199e59b1bfb5c37Benjamin Peterson
def detect_encoding(readline):
    """
    Work out which encoding should be used to decode a Python source file.

    The single argument, readline, is used in the same way as by the
    tokenize() generator.  It is called at most twice.

    The result is a 2-tuple: the encoding (as a string) and the list of
    lines (left as bytes) that were read while looking for it.  A leading
    utf-8 bom is removed from the first line before it is returned.

    The encoding comes from a utf-8 bom or from an encoding cookie as
    specified in pep-0263.  A SyntaxError is raised if a bom and a cookie
    are both present but disagree, if the cookie names an unknown charset,
    or if an inspected line cannot be decoded as utf-8.  When a utf-8 bom
    is found, 'utf-8-sig' is returned.

    If no encoding is specified, the default of 'utf-8' is returned.
    """
    # The file name, when available, is only used to enrich error messages.
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    default = 'utf-8'

    def next_line():
        # A readline that signals EOF with StopIteration is treated like
        # one that returns b''.
        try:
            return readline()
        except StopIteration:
            return b''

    def cookie_encoding(raw_line):
        # Return the encoding declared on raw_line, or None if there is no
        # cookie on it.
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            text = raw_line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        found = cookie_re.match(text)
        if not found:
            return None
        name = _get_normal_name(found.group(1))
        try:
            # Only the success of the lookup matters, not the codec itself.
            lookup(name)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + name
            else:
                msg = "unknown encoding for {!r}: {}".format(filename, name)
            raise SyntaxError(msg)

        if not bom_found:
            return name
        if name != 'utf-8':
            # This behaviour mimics the Python interpreter
            msg = ('encoding problem: utf-8' if filename is None
                   else 'encoding problem for {!r}: utf-8'.format(filename))
            raise SyntaxError(msg)
        return name + '-sig'

    first = next_line()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    declared = cookie_encoding(first)
    if declared:
        return declared, [first]
    # The second line is only inspected when the first one matches blank_re.
    if blank_re.match(first) is None:
        return default, [first]

    second = next_line()
    if not second:
        return default, [first]

    return cookie_encoding(second) or default, [first, second]
446428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson
447428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson
def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().

    The file is first opened in binary mode so detect_encoding() can look
    at its first lines, then rewound and wrapped in a line-buffered
    TextIOWrapper, which is returned.  If anything fails along the way the
    binary file is closed before the exception propagates.
    """
    raw = _builtin_open(filename, 'rb')
    try:
        # Only the encoding is needed; the consumed lines are re-read
        # after the seek below.
        encoding, _consumed = detect_encoding(raw.readline)
        raw.seek(0)
        wrapper = TextIOWrapper(raw, encoding, line_buffering=True)
        wrapper.mode = 'r'
    except BaseException:
        # Do not leak the underlying file, whatever went wrong.
        raw.close()
        raise
    return wrapper
46258c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinner
46358c0752a33253641c1423fac2d4ef3f623fbcb46Victor Stinner
def tokenize(readline):
    """
    Tokenize the Python source delivered by readline.

    readline must be a callable object which provides the same interface
    as the readline() method of built-in file objects: each call should
    return one line of input as bytes.  Alternatively, readline can be a
    callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The result is a generator of 5-tuples with these members: the token
    type; the token string; a 2-tuple (srow, scol) of ints specifying the
    row and column where the token begins in the source; a 2-tuple
    (erow, ecol) of ints specifying the row and column where the token
    ends in the source; and the line on which the token was found.  The
    line passed is the logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    import itertools

    encoding, consumed = detect_encoding(readline)
    # Replay the lines detect_encoding() already read, then the rest of the
    # input, then an endless supply of b"" so the line source never raises
    # StopIteration.
    remaining = iter(readline, b"")
    padding = itertools.repeat(b"")
    line_source = itertools.chain(consumed, remaining, padding)
    return _tokenize(line_source.__next__, encoding)
490428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson
491428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson
492428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelsondef _tokenize(readline, encoding):
4931aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum    lnum = parenlev = continued = 0
49433856de84d1115a18b699e0ca93c3b921bc6a1afBenjamin Peterson    numchars = '0123456789'
495de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum    contstr, needcont = '', 0
496a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum    contline = None
497fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum    indents = [0]
4981aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum
49996ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov    # 'stashed' and 'async_*' are used for async/await parsing
5007544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov    stashed = None
50196ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov    async_def = False
50296ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov    async_def_indent = 0
50396ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov    async_def_nl = False
5047544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
505428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson    if encoding is not None:
506689a55809818a846d2733241642572840d20570bBenjamin Peterson        if encoding == "utf-8-sig":
507689a55809818a846d2733241642572840d20570bBenjamin Peterson            # BOM will already have been stripped.
508689a55809818a846d2733241642572840d20570bBenjamin Peterson            encoding = "utf-8"
509a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
5100fe14383a8576ee5eb4a6aa83c96484281b360fdBenjamin Peterson    while True:             # loop over lines in stream
51168c04534182f2c09783b6506701a8bc25c98b4a9Raymond Hettinger        try:
51268c04534182f2c09783b6506701a8bc25c98b4a9Raymond Hettinger            line = readline()
51368c04534182f2c09783b6506701a8bc25c98b4a9Raymond Hettinger        except StopIteration:
514428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson            line = b''
515428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson
516428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson        if encoding is not None:
517428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson            line = line.decode(encoding)
518a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson        lnum += 1
519fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum        pos, max = 0, len(line)
520fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum
521fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum        if contstr:                            # continued string
522de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum            if not line:
523ce36ad8a467d914eb5c91f33835b9eaea18ee93bCollin Winter                raise TokenError("EOF in multi-line string", strstart)
5243b631775b26b866e072cd3340f303bf5903af883Guido van Rossum            endmatch = endprog.match(line)
5253b631775b26b866e072cd3340f303bf5903af883Guido van Rossum            if endmatch:
5263b631775b26b866e072cd3340f303bf5903af883Guido van Rossum                pos = end = endmatch.end(0)
527a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                yield TokenInfo(STRING, contstr + line[:end],
52889f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters                       strstart, (lnum, end), contline + line)
529de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum                contstr, needcont = '', 0
530a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum                contline = None
531de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
532a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                yield TokenInfo(ERRORTOKEN, contstr + line,
533a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum                           strstart, (lnum, len(line)), contline)
534fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                contstr = ''
535a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum                contline = None
536de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum                continue
537fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum            else:
538fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                contstr = contstr + line
539a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum                contline = contline + line
540fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                continue
541fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum
5421aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum        elif parenlev == 0 and not continued:  # new statement
543fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum            if not line: break
544fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum            column = 0
5451aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum            while pos < max:                   # measure leading whitespace
546a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                if line[pos] == ' ':
547a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                    column += 1
548a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                elif line[pos] == '\t':
549a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                    column = (column//tabsize + 1)*tabsize
550a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                elif line[pos] == '\f':
551a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                    column = 0
552a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                else:
553a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                    break
554a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                pos += 1
555a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson            if pos == max:
556a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                break
5571aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum
5581aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum            if line[pos] in '#\r\n':           # skip comments or blank lines
55989f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters                if line[pos] == '#':
56089f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters                    comment_token = line[pos:].rstrip('\r\n')
56189f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters                    nl_pos = pos + len(comment_token)
562a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                    yield TokenInfo(COMMENT, comment_token,
56389f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters                           (lnum, pos), (lnum, pos + len(comment_token)), line)
564a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                    yield TokenInfo(NL, line[nl_pos:],
56589f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters                           (lnum, nl_pos), (lnum, len(line)), line)
56689f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters                else:
567a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
5681aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum                           (lnum, pos), (lnum, len(line)), line)
5691aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum                continue
570fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum
571fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum            if column > indents[-1]:           # count indents or dedents
572fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                indents.append(column)
573a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
574fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum            while column < indents[-1]:
575da99d1cbfeedafd41263ac2d1b397d57c14ab28eRaymond Hettinger                if column not in indents:
576da99d1cbfeedafd41263ac2d1b397d57c14ab28eRaymond Hettinger                    raise IndentationError(
57700ee7baf49430d8a6eed355a5fd7a05179325747Thomas Wouters                        "unindent does not match any outer indentation level",
57800ee7baf49430d8a6eed355a5fd7a05179325747Thomas Wouters                        ("<tokenize>", lnum, pos, line))
579fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                indents = indents[:-1]
5807544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
58196ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                if async_def and async_def_indent >= indents[-1]:
58296ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                    async_def = False
58396ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                    async_def_nl = False
58496ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                    async_def_indent = 0
5857544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
586a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
587fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum
58896ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov            if async_def and async_def_nl and async_def_indent >= indents[-1]:
58996ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                async_def = False
59096ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                async_def_nl = False
59196ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                async_def_indent = 0
59296ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov
593fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum        else:                                  # continued statement
594de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum            if not line:
595ce36ad8a467d914eb5c91f33835b9eaea18ee93bCollin Winter                raise TokenError("EOF in multi-line statement", (lnum, 0))
596fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum            continued = 0
597fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum
598fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum        while pos < max:
59910a99b024df0d30911b198146d0206c8f6d0d6c7Antoine Pitrou            pseudomatch = _compile(PseudoToken).match(line, pos)
6003b631775b26b866e072cd3340f303bf5903af883Guido van Rossum            if pseudomatch:                                # scan for tokens
6013b631775b26b866e072cd3340f303bf5903af883Guido van Rossum                start, end = pseudomatch.span(1)
602de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum                spos, epos, pos = (lnum, start), (lnum, end), end
6032cc3b4ba9ffa658784da03f14a0a068e2c61d1b3Ezio Melotti                if start == end:
6042cc3b4ba9ffa658784da03f14a0a068e2c61d1b3Ezio Melotti                    continue
6051aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum                token, initial = line[start:end], line[start]
606fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum
607dde002899db8d04ac25d630fcc3a27e8bbf282eaGeorg Brandl                if (initial in numchars or                  # ordinary number
608dde002899db8d04ac25d630fcc3a27e8bbf282eaGeorg Brandl                    (initial == '.' and token != '.' and token != '...')):
609a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                    yield TokenInfo(NUMBER, token, spos, epos, line)
6101aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum                elif initial in '\r\n':
6117544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                    if stashed:
6127544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        yield stashed
6137544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        stashed = None
61496ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                    if parenlev > 0:
61596ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                        yield TokenInfo(NL, token, spos, epos, line)
61696ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                    else:
61796ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                        yield TokenInfo(NEWLINE, token, spos, epos, line)
61896ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                        if async_def:
61996ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                            async_def_nl = True
62096ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov
6211aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum                elif initial == '#':
62289f507fe8c497b3f70fdcecce8bc240f9af2bbe2Thomas Wouters                    assert not token.endswith("\n")
6237544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                    if stashed:
6247544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        yield stashed
6257544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        stashed = None
626a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                    yield TokenInfo(COMMENT, token, spos, epos, line)
6271c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith
6289d6897accc49f40414fbecafeb1c65562c6e4647Guido van Rossum                elif token in triple_quoted:
62910a99b024df0d30911b198146d0206c8f6d0d6c7Antoine Pitrou                    endprog = _compile(endpats[token])
6303b631775b26b866e072cd3340f303bf5903af883Guido van Rossum                    endmatch = endprog.match(line, pos)
6313b631775b26b866e072cd3340f303bf5903af883Guido van Rossum                    if endmatch:                           # all on one line
6323b631775b26b866e072cd3340f303bf5903af883Guido van Rossum                        pos = endmatch.end(0)
6331aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum                        token = line[start:pos]
634a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
635fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                    else:
6361aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum                        strstart = (lnum, start)           # multiple lines
6371aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum                        contstr = line[start:]
638a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum                        contline = line
639fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                        break
6401c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith
6411c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                # Check up to the first 3 chars of the token to see if
6421c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                #  they're in the single_quoted set. If so, they start
6431c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                #  a string.
6441c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                # We're using the first 3, because we're looking for
6451c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                #  "rb'" (for example) at the start of the token. If
6461c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                #  we switch to longer prefixes, this needs to be
6471c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                #  adjusted.
6481c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                # Note that initial == token[:1].
649a7161e7facdfa1d6f673beb16a95a647ce764b32Berker Peksag                # Also note that single quote checking must come after
6501c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                #  triple quote checking (above).
6511c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                elif (initial in single_quoted or
6521c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                      token[:2] in single_quoted or
6531c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                      token[:3] in single_quoted):
654fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                    if token[-1] == '\n':                  # continued string
6551aec32363f25693e0c3ff81feddf620850b4955dGuido van Rossum                        strstart = (lnum, start)
6561c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                        # Again, using the first 3 chars of the
6571c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                        #  token. This is looking for the matching end
6581c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                        #  regex for the correct type of quote
6591c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                        #  character. So it's really looking for
6601c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                        #  endpats["'"] or endpats['"'], by trying to
6611c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                        #  skip string prefix characters, if any.
6621c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                        endprog = _compile(endpats.get(initial) or
6631c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                                           endpats.get(token[1]) or
6641c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith                                           endpats.get(token[2]))
665de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum                        contstr, needcont = line[start:], 1
666a90c78b9186f5ba8d91d3be0e684f81f2068c771Guido van Rossum                        contline = line
667fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                        break
668fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                    else:                                  # ordinary string
669a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                        yield TokenInfo(STRING, token, spos, epos, line)
6701c8222c80a8a534bd9357aafc3fba7b6927efc15Eric V. Smith
67133856de84d1115a18b699e0ca93c3b921bc6a1afBenjamin Peterson                elif initial.isidentifier():               # ordinary name
6727544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                    if token in ('async', 'await'):
67396ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                        if async_def:
6747544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                            yield TokenInfo(
6757544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                                ASYNC if token == 'async' else AWAIT,
6767544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                                token, spos, epos, line)
6777544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                            continue
6787544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
6797544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                    tok = TokenInfo(NAME, token, spos, epos, line)
6807544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                    if token == 'async' and not stashed:
6817544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        stashed = tok
6827544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        continue
6837544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
6847544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                    if token == 'def':
6857544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        if (stashed
6867544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                                and stashed.type == NAME
6877544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                                and stashed.string == 'async'):
6887544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
68996ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                            async_def = True
69096ec934e755355cfc5af036db8641646b7ddb45eYury Selivanov                            async_def_indent = indents[-1]
6917544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
6927544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                            yield TokenInfo(ASYNC, stashed.string,
6937544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                                            stashed.start, stashed.end,
6947544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                                            stashed.line)
6957544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                            stashed = None
6967544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
6977544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                    if stashed:
6987544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        yield stashed
6997544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        stashed = None
7007544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
7017544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                    yield tok
7023b631775b26b866e072cd3340f303bf5903af883Guido van Rossum                elif initial == '\\':                      # continued stmt
7033b631775b26b866e072cd3340f303bf5903af883Guido van Rossum                    continued = 1
704fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum                else:
705a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                    if initial in '([{':
706a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                        parenlev += 1
707a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                    elif initial in ')]}':
708a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                        parenlev -= 1
7097544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                    if stashed:
7107544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        yield stashed
7117544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov                        stashed = None
712a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                    yield TokenInfo(OP, token, spos, epos, line)
713fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum            else:
714a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger                yield TokenInfo(ERRORTOKEN, line[pos],
715de65527e4b0925692f0d75f388116b7958a390bbGuido van Rossum                           (lnum, pos), (lnum, pos+1), line)
716a0dfa82eca0f4b9855b6e234f9b21e5d60c88a10Benjamin Peterson                pos += 1
717fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum
7187544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov    if stashed:
7197544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov        yield stashed
7207544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov        stashed = None
7217544508f0245173bff5866aa1598c8f6cce1fc5fYury Selivanov
722fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum    for indent in indents[1:]:                 # pop remaining indent levels
723a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
724a48db39992aaf4d83759135e4c9a2c9757764e62Raymond Hettinger    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
725fc6f5339a99d103928bce9eda605564f2a9e8477Guido van Rossum
726428de65ca99492436130165bfbaeb56d6d1daec7Trent Nelson
# An undocumented, backwards-compatible API kept for all the places in the
# standard library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    """Tokenize the input supplied by the *readline* callable.

    Thin wrapper that forwards *readline* to _tokenize(), passing None as
    its second argument, and returns whatever _tokenize() returns.
    """
    return _tokenize(readline, None)
7316c60d099e5ed97ee0026687c1ec3401cca49c0c2Raymond Hettinger
def main():
    """Command-line entry point: print the tokens of a file or of stdin.

    Parses an optional filename and the -e/--exact flag from the command
    line, tokenizes the chosen input and prints one line per token with
    its position range, type name and string.

    Tokenization, syntax and OS errors are reported on stderr and the
    process exits with status 1.  A KeyboardInterrupt prints "interrupted"
    and returns normally.  Any other exception is reported on stderr and
    re-raised.
    """
    import argparse

    def perror(message):
        # Every diagnostic goes to stderr.
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        # Build the most specific message the available information allows,
        # emit it, then terminate with exit status 1.
        if location:
            text = "%s:%d:%d: error: %s" % ((filename,) + location + (message,))
        elif filename:
            text = "%s: error: %s" % (filename, message)
        else:
            text = "error: %s" % message
        perror(text)
        sys.exit(1)

    # Command-line interface definition.
    cli = argparse.ArgumentParser(prog='python -m tokenize')
    cli.add_argument(
        dest='filename',
        nargs='?',
        metavar='filename.py',
        help='the file to tokenize; defaults to stdin',
    )
    cli.add_argument(
        '-e', '--exact',
        dest='exact',
        action='store_true',
        help='display token names using the exact type',
    )
    options = cli.parse_args()

    try:
        # Pick the input source and tokenize it.
        if options.filename:
            filename = options.filename
            # The file is closed when the with-block ends, so the tokens
            # are collected into a list while it is still open.
            with _builtin_open(filename, 'rb') as source:
                tokens = list(tokenize(source.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Emit one formatted line per token.
        for tok in tokens:
            kind = tok.exact_type if options.exact else tok.type
            span = "%d,%d-%d,%d:" % (tok.start + tok.end)
            print("%-20s%-15s%-15r" % (span, tok_name[kind], tok.string))
    except IndentationError as exc:
        # Must precede SyntaxError: IndentationError is a subclass of it.
        row, col = exc.args[1][1:3]
        error(exc.args[0], filename, (row, col))
    except TokenError as exc:
        row, col = exc.args[1]
        error(exc.args[0], filename, (row, col))
    except SyntaxError as exc:
        error(exc, filename)
    except OSError as exc:
        error(exc)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as exc:
        perror("unexpected error: %s" % exc)
        raise
79114c0f03b587e3ec9679cf19a0c5f598c45157429Meador Inge
# Run the command-line interface only when executed as a script
# (e.g. ``python -m tokenize``), not when imported as a module.
if __name__ == "__main__":
    main()
794