1"""Tokenization help for Python programs.
2
3tokenize(readline) is a generator that breaks a stream of bytes into
4Python tokens.  It decodes the bytes according to PEP-0263 for
5determining source file encoding.
6
7It accepts a readline-like method which is called repeatedly to get the
8next line of input (or b"" for EOF).  It generates 5-tuples with these
9members:
10
11    the token type (see token.py)
12    the token (a string)
13    the starting (row, column) indices of the token (a 2-tuple of ints)
14    the ending (row, column) indices of the token (a 2-tuple of ints)
15    the original line (string)
16
17It is designed to match the working of the Python tokenizer exactly, except
18that it produces COMMENT tokens for comments and gives type OP for all
19operators.  Additionally, all token lists start with an ENCODING token
20which tells you which encoding was used to decode the bytes stream.
21"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import itertools as _itertools
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(':   LPAR,
    ')':   RPAR,
    '[':   LSQB,
    ']':   RSQB,
    ':':   COLON,
    ',':   COMMA,
    ';':   SEMI,
    '+':   PLUS,
    '-':   MINUS,
    '*':   STAR,
    '/':   SLASH,
    '|':   VBAR,
    '&':   AMPER,
    '<':   LESS,
    '>':   GREATER,
    '=':   EQUAL,
    '.':   DOT,
    '%':   PERCENT,
    '{':   LBRACE,
    '}':   RBRACE,
    '==':  EQEQUAL,
    '!=':  NOTEQUAL,
    '<=':  LESSEQUAL,
    '>=':  GREATEREQUAL,
    '~':   TILDE,
    '^':   CIRCUMFLEX,
    '<<':  LEFTSHIFT,
    '>>':  RIGHTSHIFT,
    '**':  DOUBLESTAR,
    '+=':  PLUSEQUAL,
    '-=':  MINEQUAL,
    '*=':  STAREQUAL,
    '/=':  SLASHEQUAL,
    '%=':  PERCENTEQUAL,
    '&=':  AMPEREQUAL,
    '|=':  VBAREQUAL,
    '^=':  CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//':  DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@':   AT,
    '@=':  ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

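# Illustrative sketch (not part of the original module): exact_type refines
# the generic OP type into the specific operator constant, while .type stays
# OP.
#
#     from io import BytesIO
#     toks = list(tokenize(BytesIO(b"1 + 2\n").readline))
#     op = toks[2]                 # the '+' token
#     op.type == OP                # True
#     op.exact_type == PLUS        # True
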
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
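# For example, group('a', 'b') yields '(a|b)', any('a', 'b') yields '(a|b)*',
# and maybe('a', 'b') yields '(a|b)?'.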

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
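# For reference (not part of the original module): literals such as '0x_1F',
# '0b1010', '1_000', '3.14e-2' and '2j' all match Number, including the
# PEP 515 underscore separators.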

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes.  Only the lower case versions are listed
    #  here, and no permutations are included ('fr' appears, but not 'rf');
    #  the case and ordering permutations are generated below.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = set([''])
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
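
# For reference (not part of the original module): the generated set contains
# the empty string plus every case/ordering variant of the prefixes above,
# e.g. 'b', 'B', 'rb', 'bR', 'Rb', 'f', 'fR', 'RF', ...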

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element sequences are passed, exact position information
    is unavailable, so the output only approximates the original
    whitespace.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
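
# A round-trip sketch for the limited (two-element) form described above
# (illustrative, not part of the original module):
#
#     from io import BytesIO
#     src = b"def f(a, b):\n    return a + b\n"
#     t1 = [tok[:2] for tok in tokenize(BytesIO(src).readline)]
#     new_src = untokenize(t1)      # bytes, encoded per the ENCODING token
#     t2 = [tok[:2] for tok in tokenize(BytesIO(new_src).readline)]
#     assert t1 == t2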


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present
    but disagree, a SyntaxError will be raised.  A SyntaxError will also be
    raised if the encoding cookie names an invalid charset.  Note that if a
    UTF-8 BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
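
# Illustrative sketch (not part of the original module): detect_encoding()
# reads at most two lines and reports the declared encoding plus the raw
# lines it consumed.
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     encoding, lines = detect_encoding(buf.readline)
#     # encoding == 'iso-8859-1'; lines == [b'# -*- coding: latin-1 -*-\n']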


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise
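
# Illustrative sketch (not part of the original module; the file name is
# hypothetical).  Note that this open() shadows the built-in within this
# module:
#
#     with open("example.py") as f:
#         print(f.encoding)        # e.g. 'utf-8' or 'utf-8-sig'
#         source = f.read()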


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
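
# A short sketch of the token stream for a one-line module (illustrative,
# not part of the original module):
#
#     from io import BytesIO
#     for tok in tokenize(BytesIO(b"x = 1 + 2\n").readline):
#         print(tok.type, tok.string, tok.start, tok.end)
#
# This yields ENCODING 'utf-8', NAME 'x', OP '=', NUMBER '1', OP '+',
# NUMBER '2', NEWLINE, and finally ENDMARKER.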


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield TokenInfo(
                                ASYNC if token == 'async' else AWAIT,
                                token, spos, epos, line)
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed.type == NAME
                                and stashed.string == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield TokenInfo(ASYNC, stashed.string,
                                            stashed.start, stashed.end,
                                            stashed.line)
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
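
# Illustrative sketch (not part of the original module): the string-based API
# takes a readline returning str and emits no ENCODING token, since no
# decoding takes place.
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)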

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()
