1
2/* Tokenizer implementation */
3
4#include "Python.h"
5#include "pgenheaders.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#ifndef PGEN
14#include "unicodeobject.h"
15#include "bytesobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
/* True if C may begin an identifier: an ASCII letter, an underscore, or
   any value >= 128.
   Every use of the argument is parenthesised so that an expression
   argument (e.g. `x & 0xff`) is compared as a whole.  The argument is
   still evaluated several times, so it must be free of side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
26
/* True if C may appear inside an identifier: an ASCII letter or digit, an
   underscore, or any value >= 128.
   Every use of the argument is parenthesised so that an expression
   argument (e.g. `x & 0xff`) is compared as a whole.  The argument is
   still evaluated several times, so it must be free of side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
33
34extern char *PyOS_Readline(FILE *, FILE *, const char *);
35/* Return malloc'ed string including trailing \n;
36   empty malloc'ed string for EOF;
37   NULL if interrupted */
38
39/* Don't ever change this -- it would break the portability of Python code */
40#define TABSIZE 8
41
42/* Forward */
43static struct tok_state *tok_new(void);
44static int tok_nextc(struct tok_state *tok);
45static void tok_backup(struct tok_state *tok, int c);
46
47
/* Token names: one printable name per token type, in table order.
   The index ranges in the comments below are positions in this array. */

const char *_PyParser_TokenNames[] = {
    /* 0-6 */
    "ENDMARKER", "NAME", "NUMBER", "STRING", "NEWLINE", "INDENT", "DEDENT",
    /* 7-13 */
    "LPAR", "RPAR", "LSQB", "RSQB", "COLON", "COMMA", "SEMI",
    /* 14-24 */
    "PLUS", "MINUS", "STAR", "SLASH", "VBAR", "AMPER",
    "LESS", "GREATER", "EQUAL", "DOT", "PERCENT",
    /* 25-26 */
    "LBRACE", "RBRACE",
    /* 27-30 */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    /* 31-35 */
    "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* 36-46 */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL", "PERCENTEQUAL",
    "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    /* 47-52 */
    "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT", "ATEQUAL", "RARROW", "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    /* 53-57 */
    "OP", "AWAIT", "ASYNC", "<ERRORTOKEN>", "<N_TOKENS>"
};
111
112
113/* Create and initialize a new tok_state structure */
114
115static struct tok_state *
116tok_new(void)
117{
118    struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
119                                            sizeof(struct tok_state));
120    if (tok == NULL)
121        return NULL;
122    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
123    tok->done = E_OK;
124    tok->fp = NULL;
125    tok->input = NULL;
126    tok->tabsize = TABSIZE;
127    tok->indent = 0;
128    tok->indstack[0] = 0;
129
130    tok->atbol = 1;
131    tok->pendin = 0;
132    tok->prompt = tok->nextprompt = NULL;
133    tok->lineno = 0;
134    tok->level = 0;
135    tok->altwarning = 1;
136    tok->alterror = 1;
137    tok->alttabsize = 1;
138    tok->altindstack[0] = 0;
139    tok->decoding_state = STATE_INIT;
140    tok->decoding_erred = 0;
141    tok->read_coding_spec = 0;
142    tok->enc = NULL;
143    tok->encoding = NULL;
144    tok->cont_line = 0;
145#ifndef PGEN
146    tok->filename = NULL;
147    tok->decoding_readline = NULL;
148    tok->decoding_buffer = NULL;
149#endif
150
151    tok->async_def = 0;
152    tok->async_def_indent = 0;
153    tok->async_def_nl = 0;
154
155    return tok;
156}
157
158static char *
159new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
160{
161    char* result = (char *)PyMem_MALLOC(len + 1);
162    if (!result) {
163        tok->done = E_NOMEM;
164        return NULL;
165    }
166    memcpy(result, s, len);
167    result[len] = '\0';
168    return result;
169}
170
171#ifdef PGEN
172
173static char *
174decoding_fgets(char *s, int size, struct tok_state *tok)
175{
176    return fgets(s, size, tok->fp);
177}
178
179static int
180decoding_feof(struct tok_state *tok)
181{
182    return feof(tok->fp);
183}
184
/* PGEN build: no decoding is done; return a fresh copy of STR made with
   new_string (NULL if that fails).  EXEC_INPUT is unused here. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t nbytes = strlen(str);

    return new_string(str, nbytes, tok);
}
190
191#else /* PGEN */
192
193static char *
194error_ret(struct tok_state *tok) /* XXX */
195{
196    tok->decoding_erred = 1;
197    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
198        PyMem_FREE(tok->buf);
199    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
200    tok->done = E_DECODE;
201    return NULL;                /* as if it were EOF */
202}
203
204
/* Map the usual spellings of UTF-8 and Latin-1 to one canonical name.

   The first 12 bytes of S are lower-cased and '_' is replaced by '-'
   before comparing.  Returns the literal "utf-8" for "utf-8" and anything
   starting with "utf-8-", the literal "iso-8859-1" for "latin-1",
   "iso-8859-1", "iso-latin-1" and anything starting with one of those
   followed by '-'.  Any other name is returned unchanged, i.e. the result
   is the pointer S itself. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a non-ASCII byte where plain char is signed) is undefined. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
233
234/* Return the coding spec in S, or NULL if none is found.  */
235
236static int
237get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
238{
239    Py_ssize_t i;
240    *spec = NULL;
241    /* Coding spec must be in a comment, and that comment must be
242     * the only statement on the source code line. */
243    for (i = 0; i < size - 6; i++) {
244        if (s[i] == '#')
245            break;
246        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
247            return 1;
248    }
249    for (; i < size - 6; i++) { /* XXX inefficient search */
250        const char* t = s + i;
251        if (strncmp(t, "coding", 6) == 0) {
252            const char* begin = NULL;
253            t += 6;
254            if (t[0] != ':' && t[0] != '=')
255                continue;
256            do {
257                t++;
258            } while (t[0] == '\x20' || t[0] == '\t');
259
260            begin = t;
261            while (Py_ISALNUM(t[0]) ||
262                   t[0] == '-' || t[0] == '_' || t[0] == '.')
263                t++;
264
265            if (begin < t) {
266                char* r = new_string(begin, t - begin, tok);
267                const char* q;
268                if (!r)
269                    return 0;
270                q = get_normal_name(r);
271                if (r != q) {
272                    PyMem_FREE(r);
273                    r = new_string(q, strlen(q), tok);
274                    if (!r)
275                        return 0;
276                }
277                *spec = r;
278                break;
279            }
280        }
281    }
282    return 1;
283}
284
285/* Check whether the line contains a coding spec. If it does,
286   invoke the set_readline function for the new encoding.
287   This function receives the tok_state and the new encoding.
288   Return 1 on success, 0 on failure.  */
289
290static int
291check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
292                  int set_readline(struct tok_state *, const char *))
293{
294    char *cs;
295    int r = 1;
296
297    if (tok->cont_line) {
298        /* It's a continuation line, so it can't be a coding spec. */
299        tok->read_coding_spec = 1;
300        return 1;
301    }
302    if (!get_coding_spec(line, &cs, size, tok))
303        return 0;
304    if (!cs) {
305        Py_ssize_t i;
306        for (i = 0; i < size; i++) {
307            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
308                break;
309            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
310                /* Stop checking coding spec after a line containing
311                 * anything except a comment. */
312                tok->read_coding_spec = 1;
313                break;
314            }
315        }
316        return 1;
317    }
318    tok->read_coding_spec = 1;
319    if (tok->encoding == NULL) {
320        assert(tok->decoding_state == STATE_RAW);
321        if (strcmp(cs, "utf-8") == 0) {
322            tok->encoding = cs;
323        } else {
324            r = set_readline(tok, cs);
325            if (r) {
326                tok->encoding = cs;
327                tok->decoding_state = STATE_NORMAL;
328            }
329            else {
330                PyErr_Format(PyExc_SyntaxError,
331                             "encoding problem: %s", cs);
332                PyMem_FREE(cs);
333            }
334        }
335    } else {                /* then, compare cs with BOM */
336        r = (strcmp(tok->encoding, cs) == 0);
337        if (!r)
338            PyErr_Format(PyExc_SyntaxError,
339                         "encoding problem: %s with BOM", cs);
340        PyMem_FREE(cs);
341    }
342    return r;
343}
344
345/* See whether the file starts with a BOM. If it does,
346   invoke the set_readline function with the new encoding.
347   Return 1 on success, 0 on failure.  */
348
349static int
350check_bom(int get_char(struct tok_state *),
351          void unget_char(int, struct tok_state *),
352          int set_readline(struct tok_state *, const char *),
353          struct tok_state *tok)
354{
355    int ch1, ch2, ch3;
356    ch1 = get_char(tok);
357    tok->decoding_state = STATE_RAW;
358    if (ch1 == EOF) {
359        return 1;
360    } else if (ch1 == 0xEF) {
361        ch2 = get_char(tok);
362        if (ch2 != 0xBB) {
363            unget_char(ch2, tok);
364            unget_char(ch1, tok);
365            return 1;
366        }
367        ch3 = get_char(tok);
368        if (ch3 != 0xBF) {
369            unget_char(ch3, tok);
370            unget_char(ch2, tok);
371            unget_char(ch1, tok);
372            return 1;
373        }
374#if 0
375    /* Disable support for UTF-16 BOMs until a decision
376       is made whether this needs to be supported.  */
377    } else if (ch1 == 0xFE) {
378        ch2 = get_char(tok);
379        if (ch2 != 0xFF) {
380            unget_char(ch2, tok);
381            unget_char(ch1, tok);
382            return 1;
383        }
384        if (!set_readline(tok, "utf-16-be"))
385            return 0;
386        tok->decoding_state = STATE_NORMAL;
387    } else if (ch1 == 0xFF) {
388        ch2 = get_char(tok);
389        if (ch2 != 0xFE) {
390            unget_char(ch2, tok);
391            unget_char(ch1, tok);
392            return 1;
393        }
394        if (!set_readline(tok, "utf-16-le"))
395            return 0;
396        tok->decoding_state = STATE_NORMAL;
397#endif
398    } else {
399        unget_char(ch1, tok);
400        return 1;
401    }
402    if (tok->encoding != NULL)
403        PyMem_FREE(tok->encoding);
404    tok->encoding = new_string("utf-8", 5, tok);
405    if (!tok->encoding)
406        return 0;
407    /* No need to set_readline: input is already utf-8 */
408    return 1;
409}
410
411/* Read a line of text from TOK into S, using the stream in TOK.
412   Return NULL on failure, else S.
413
414   On entry, tok->decoding_buffer will be one of:
415     1) NULL: need to call tok->decoding_readline to get a new line
416     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
417       stored the result in tok->decoding_buffer
418     3) PyByteArrayObject *: previous call to fp_readl did not have enough room
419       (in the s buffer) to copy entire contents of the line read
420       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
421       In this case, fp_readl is called in a loop (with an expanded buffer)
422       until the buffer ends with a '\n' (or until the end of the file is
423       reached): see tok_nextc and its calls to decoding_fgets.
424*/
425
426static char *
427fp_readl(char *s, int size, struct tok_state *tok)
428{
429    PyObject* bufobj;
430    const char *buf;
431    Py_ssize_t buflen;
432
433    /* Ask for one less byte so we can terminate it */
434    assert(size > 0);
435    size--;
436
437    if (tok->decoding_buffer) {
438        bufobj = tok->decoding_buffer;
439        Py_INCREF(bufobj);
440    }
441    else
442    {
443        bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
444        if (bufobj == NULL)
445            goto error;
446    }
447    if (PyUnicode_CheckExact(bufobj))
448    {
449        buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
450        if (buf == NULL) {
451            goto error;
452        }
453    }
454    else
455    {
456        buf = PyByteArray_AsString(bufobj);
457        if (buf == NULL) {
458            goto error;
459        }
460        buflen = PyByteArray_GET_SIZE(bufobj);
461    }
462
463    Py_XDECREF(tok->decoding_buffer);
464    if (buflen > size) {
465        /* Too many chars, the rest goes into tok->decoding_buffer */
466        tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
467                                                         buflen-size);
468        if (tok->decoding_buffer == NULL)
469            goto error;
470        buflen = size;
471    }
472    else
473        tok->decoding_buffer = NULL;
474
475    memcpy(s, buf, buflen);
476    s[buflen] = '\0';
477    if (buflen == 0) /* EOF */
478        s = NULL;
479    Py_DECREF(bufobj);
480    return s;
481
482error:
483    Py_XDECREF(bufobj);
484    return error_ret(tok);
485}
486
487/* Set the readline function for TOK to a StreamReader's
488   readline function. The StreamReader is named ENC.
489
490   This function is called from check_bom and check_coding_spec.
491
492   ENC is usually identical to the future value of tok->encoding,
493   except for the (currently unsupported) case of UTF-16.
494
495   Return 1 on success, 0 on failure. */
496
497static int
498fp_setreadl(struct tok_state *tok, const char* enc)
499{
500    PyObject *readline, *io, *stream;
501    _Py_IDENTIFIER(open);
502    _Py_IDENTIFIER(readline);
503    int fd;
504    long pos;
505
506    fd = fileno(tok->fp);
507    /* Due to buffering the file offset for fd can be different from the file
508     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
509     * its file position counts CRLF as one char and can't be directly mapped
510     * to the file offset for fd.  Instead we step back one byte and read to
511     * the end of line.*/
512    pos = ftell(tok->fp);
513    if (pos == -1 ||
514        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
515        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
516        return 0;
517    }
518
519    io = PyImport_ImportModuleNoBlock("io");
520    if (io == NULL)
521        return 0;
522
523    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
524                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
525    Py_DECREF(io);
526    if (stream == NULL)
527        return 0;
528
529    readline = _PyObject_GetAttrId(stream, &PyId_readline);
530    Py_DECREF(stream);
531    if (readline == NULL)
532        return 0;
533    Py_XSETREF(tok->decoding_readline, readline);
534
535    if (pos > 0) {
536        PyObject *bufobj = PyObject_CallObject(readline, NULL);
537        if (bufobj == NULL)
538            return 0;
539        Py_DECREF(bufobj);
540    }
541
542    return 1;
543}
544
545/* Fetch the next byte from TOK. */
546
547static int fp_getc(struct tok_state *tok) {
548    return getc(tok->fp);
549}
550
551/* Unfetch the last byte back into TOK.  */
552
553static void fp_ungetc(int c, struct tok_state *tok) {
554    ungetc(c, tok->fp);
555}
556
/* Check whether the bytes at S start a well-formed UTF-8 sequence.
   Return the number of bytes forming the sequence (1 to 4) if yes, 0 if
   not.

   Only the class of the lead byte and the 10xxxxxx shape of the
   continuation bytes are checked.  S must be NUL-terminated: the
   continuation bytes are examined front to back, so a sequence cut short
   by the terminator is rejected at the NUL and no byte after it is
   read.  */
static int valid_utf8(const unsigned char* s)
{
    int length;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        length = 2;
    else if (*s < 0xF0)
        length = 3;
    else if (*s < 0xF8)
        length = 4;
    else
        return 0;
    for (i = 1; i < length; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return length;
}
584
585/* Read a line of input from TOK. Determine encoding
586   if necessary.  */
587
588static char *
589decoding_fgets(char *s, int size, struct tok_state *tok)
590{
591    char *line = NULL;
592    int badchar = 0;
593    for (;;) {
594        if (tok->decoding_state == STATE_NORMAL) {
595            /* We already have a codec associated with
596               this input. */
597            line = fp_readl(s, size, tok);
598            break;
599        } else if (tok->decoding_state == STATE_RAW) {
600            /* We want a 'raw' read. */
601            line = Py_UniversalNewlineFgets(s, size,
602                                            tok->fp, NULL);
603            break;
604        } else {
605            /* We have not yet determined the encoding.
606               If an encoding is found, use the file-pointer
607               reader functions from now on. */
608            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
609                return error_ret(tok);
610            assert(tok->decoding_state != STATE_INIT);
611        }
612    }
613    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
614        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
615            return error_ret(tok);
616        }
617    }
618#ifndef PGEN
619    /* The default encoding is UTF-8, so make sure we don't have any
620       non-UTF-8 sequences in it. */
621    if (line && !tok->encoding) {
622        unsigned char *c;
623        int length;
624        for (c = (unsigned char *)line; *c; c += length)
625            if (!(length = valid_utf8(c))) {
626                badchar = *c;
627                break;
628            }
629    }
630    if (badchar) {
631        /* Need to add 1 to the line number, since this line
632           has not been counted, yet.  */
633        PyErr_Format(PyExc_SyntaxError,
634                "Non-UTF-8 code starting with '\\x%.2x' "
635                "in file %U on line %i, "
636                "but no encoding declared; "
637                "see http://python.org/dev/peps/pep-0263/ for details",
638                badchar, tok->filename, tok->lineno + 1);
639        return error_ret(tok);
640    }
641#endif
642    return line;
643}
644
645static int
646decoding_feof(struct tok_state *tok)
647{
648    if (tok->decoding_state != STATE_NORMAL) {
649        return feof(tok->fp);
650    } else {
651        PyObject* buf = tok->decoding_buffer;
652        if (buf == NULL) {
653            buf = PyObject_CallObject(tok->decoding_readline, NULL);
654            if (buf == NULL) {
655                error_ret(tok);
656                return 1;
657            } else {
658                tok->decoding_buffer = buf;
659            }
660        }
661        return PyObject_Length(buf) == 0;
662    }
663}
664
665/* Fetch a byte from TOK, using the string buffer. */
666
667static int
668buf_getc(struct tok_state *tok) {
669    return Py_CHARMASK(*tok->str++);
670}
671
672/* Unfetch a byte from TOK, using the string buffer. */
673
674static void
675buf_ungetc(int c, struct tok_state *tok) {
676    tok->str--;
677    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
678}
679
680/* Set the readline function for TOK to ENC. For the string-based
681   tokenizer, this means to just record the encoding. */
682
683static int
684buf_setreadl(struct tok_state *tok, const char* enc) {
685    tok->enc = enc;
686    return 1;
687}
688
689/* Return a UTF-8 encoding Python string object from the
690   C byte string STR, which is encoded with ENC. */
691
692static PyObject *
693translate_into_utf8(const char* str, const char* enc) {
694    PyObject *utf8;
695    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
696    if (buf == NULL)
697        return NULL;
698    utf8 = PyUnicode_AsUTF8String(buf);
699    Py_DECREF(buf);
700    return utf8;
701}
702
703
704static char *
705translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
706    int skip_next_lf = 0;
707    size_t needed_length = strlen(s) + 2, final_length;
708    char *buf, *current;
709    char c = '\0';
710    buf = PyMem_MALLOC(needed_length);
711    if (buf == NULL) {
712        tok->done = E_NOMEM;
713        return NULL;
714    }
715    for (current = buf; *s; s++, current++) {
716        c = *s;
717        if (skip_next_lf) {
718            skip_next_lf = 0;
719            if (c == '\n') {
720                c = *++s;
721                if (!c)
722                    break;
723            }
724        }
725        if (c == '\r') {
726            skip_next_lf = 1;
727            c = '\n';
728        }
729        *current = c;
730    }
731    /* If this is exec input, add a newline to the end of the string if
732       there isn't one already. */
733    if (exec_input && c != '\n') {
734        *current = '\n';
735        current++;
736    }
737    *current = '\0';
738    final_length = current - buf + 1;
739    if (final_length < needed_length && final_length)
740        /* should never fail */
741        buf = PyMem_REALLOC(buf, final_length);
742    return buf;
743}
744
745/* Decode a byte string STR for use as the buffer of TOK.
746   Look for encoding declarations inside STR, and record them
747   inside TOK.  */
748
749static const char *
750decode_str(const char *input, int single, struct tok_state *tok)
751{
752    PyObject* utf8 = NULL;
753    const char *str;
754    const char *s;
755    const char *newl[2] = {NULL, NULL};
756    int lineno = 0;
757    tok->input = str = translate_newlines(input, single, tok);
758    if (str == NULL)
759        return NULL;
760    tok->enc = NULL;
761    tok->str = str;
762    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
763        return error_ret(tok);
764    str = tok->str;             /* string after BOM if any */
765    assert(str);
766    if (tok->enc != NULL) {
767        utf8 = translate_into_utf8(str, tok->enc);
768        if (utf8 == NULL)
769            return error_ret(tok);
770        str = PyBytes_AsString(utf8);
771    }
772    for (s = str;; s++) {
773        if (*s == '\0') break;
774        else if (*s == '\n') {
775            assert(lineno < 2);
776            newl[lineno] = s;
777            lineno++;
778            if (lineno == 2) break;
779        }
780    }
781    tok->enc = NULL;
782    /* need to check line 1 and 2 separately since check_coding_spec
783       assumes a single line as input */
784    if (newl[0]) {
785        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
786            return error_ret(tok);
787        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
788            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
789                                   tok, buf_setreadl))
790                return error_ret(tok);
791        }
792    }
793    if (tok->enc != NULL) {
794        assert(utf8 == NULL);
795        utf8 = translate_into_utf8(str, tok->enc);
796        if (utf8 == NULL)
797            return error_ret(tok);
798        str = PyBytes_AS_STRING(utf8);
799    }
800    assert(tok->decoding_buffer == NULL);
801    tok->decoding_buffer = utf8; /* CAUTION */
802    return str;
803}
804
805#endif /* PGEN */
806
807/* Set up tokenizer for string */
808
809struct tok_state *
810PyTokenizer_FromString(const char *str, int exec_input)
811{
812    struct tok_state *tok = tok_new();
813    if (tok == NULL)
814        return NULL;
815    str = decode_str(str, exec_input, tok);
816    if (str == NULL) {
817        PyTokenizer_Free(tok);
818        return NULL;
819    }
820
821    /* XXX: constify members. */
822    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
823    return tok;
824}
825
826struct tok_state *
827PyTokenizer_FromUTF8(const char *str, int exec_input)
828{
829    struct tok_state *tok = tok_new();
830    if (tok == NULL)
831        return NULL;
832#ifndef PGEN
833    tok->input = str = translate_newlines(str, exec_input, tok);
834#endif
835    if (str == NULL) {
836        PyTokenizer_Free(tok);
837        return NULL;
838    }
839    tok->decoding_state = STATE_RAW;
840    tok->read_coding_spec = 1;
841    tok->enc = NULL;
842    tok->str = str;
843    tok->encoding = (char *)PyMem_MALLOC(6);
844    if (!tok->encoding) {
845        PyTokenizer_Free(tok);
846        return NULL;
847    }
848    strcpy(tok->encoding, "utf-8");
849
850    /* XXX: constify members. */
851    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
852    return tok;
853}
854
855/* Set up tokenizer for file */
856
857struct tok_state *
858PyTokenizer_FromFile(FILE *fp, const char* enc,
859                     const char *ps1, const char *ps2)
860{
861    struct tok_state *tok = tok_new();
862    if (tok == NULL)
863        return NULL;
864    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
865        PyTokenizer_Free(tok);
866        return NULL;
867    }
868    tok->cur = tok->inp = tok->buf;
869    tok->end = tok->buf + BUFSIZ;
870    tok->fp = fp;
871    tok->prompt = ps1;
872    tok->nextprompt = ps2;
873    if (enc != NULL) {
874        /* Must copy encoding declaration since it
875           gets copied into the parse tree. */
876        tok->encoding = PyMem_MALLOC(strlen(enc)+1);
877        if (!tok->encoding) {
878            PyTokenizer_Free(tok);
879            return NULL;
880        }
881        strcpy(tok->encoding, enc);
882        tok->decoding_state = STATE_NORMAL;
883    }
884    return tok;
885}
886
887
888/* Free a tok_state structure */
889
890void
891PyTokenizer_Free(struct tok_state *tok)
892{
893    if (tok->encoding != NULL)
894        PyMem_FREE(tok->encoding);
895#ifndef PGEN
896    Py_XDECREF(tok->decoding_readline);
897    Py_XDECREF(tok->decoding_buffer);
898    Py_XDECREF(tok->filename);
899#endif
900    if (tok->fp != NULL && tok->buf != NULL)
901        PyMem_FREE(tok->buf);
902    if (tok->input)
903        PyMem_FREE((char *)tok->input);
904    PyMem_FREE(tok);
905}
906
907/* Get next char, updating state; error code goes into tok->done */
908
909static int
910tok_nextc(struct tok_state *tok)
911{
912    for (;;) {
913        if (tok->cur != tok->inp) {
914            return Py_CHARMASK(*tok->cur++); /* Fast path */
915        }
916        if (tok->done != E_OK)
917            return EOF;
918        if (tok->fp == NULL) {
919            char *end = strchr(tok->inp, '\n');
920            if (end != NULL)
921                end++;
922            else {
923                end = strchr(tok->inp, '\0');
924                if (end == tok->inp) {
925                    tok->done = E_EOF;
926                    return EOF;
927                }
928            }
929            if (tok->start == NULL)
930                tok->buf = tok->cur;
931            tok->line_start = tok->cur;
932            tok->lineno++;
933            tok->inp = end;
934            return Py_CHARMASK(*tok->cur++);
935        }
936        if (tok->prompt != NULL) {
937            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
938#ifndef PGEN
939            if (newtok != NULL) {
940                char *translated = translate_newlines(newtok, 0, tok);
941                PyMem_FREE(newtok);
942                if (translated == NULL)
943                    return EOF;
944                newtok = translated;
945            }
946            if (tok->encoding && newtok && *newtok) {
947                /* Recode to UTF-8 */
948                Py_ssize_t buflen;
949                const char* buf;
950                PyObject *u = translate_into_utf8(newtok, tok->encoding);
951                PyMem_FREE(newtok);
952                if (!u) {
953                    tok->done = E_DECODE;
954                    return EOF;
955                }
956                buflen = PyBytes_GET_SIZE(u);
957                buf = PyBytes_AS_STRING(u);
958                newtok = PyMem_MALLOC(buflen+1);
959                strcpy(newtok, buf);
960                Py_DECREF(u);
961            }
962#endif
963            if (tok->nextprompt != NULL)
964                tok->prompt = tok->nextprompt;
965            if (newtok == NULL)
966                tok->done = E_INTR;
967            else if (*newtok == '\0') {
968                PyMem_FREE(newtok);
969                tok->done = E_EOF;
970            }
971            else if (tok->start != NULL) {
972                size_t start = tok->start - tok->buf;
973                size_t oldlen = tok->cur - tok->buf;
974                size_t newlen = oldlen + strlen(newtok);
975                char *buf = tok->buf;
976                buf = (char *)PyMem_REALLOC(buf, newlen+1);
977                tok->lineno++;
978                if (buf == NULL) {
979                    PyMem_FREE(tok->buf);
980                    tok->buf = NULL;
981                    PyMem_FREE(newtok);
982                    tok->done = E_NOMEM;
983                    return EOF;
984                }
985                tok->buf = buf;
986                tok->cur = tok->buf + oldlen;
987                tok->line_start = tok->cur;
988                strcpy(tok->buf + oldlen, newtok);
989                PyMem_FREE(newtok);
990                tok->inp = tok->buf + newlen;
991                tok->end = tok->inp + 1;
992                tok->start = tok->buf + start;
993            }
994            else {
995                tok->lineno++;
996                if (tok->buf != NULL)
997                    PyMem_FREE(tok->buf);
998                tok->buf = newtok;
999                tok->cur = tok->buf;
1000                tok->line_start = tok->buf;
1001                tok->inp = strchr(tok->buf, '\0');
1002                tok->end = tok->inp + 1;
1003            }
1004        }
1005        else {
1006            int done = 0;
1007            Py_ssize_t cur = 0;
1008            char *pt;
1009            if (tok->start == NULL) {
1010                if (tok->buf == NULL) {
1011                    tok->buf = (char *)
1012                        PyMem_MALLOC(BUFSIZ);
1013                    if (tok->buf == NULL) {
1014                        tok->done = E_NOMEM;
1015                        return EOF;
1016                    }
1017                    tok->end = tok->buf + BUFSIZ;
1018                }
1019                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1020                          tok) == NULL) {
1021                    if (!tok->decoding_erred)
1022                        tok->done = E_EOF;
1023                    done = 1;
1024                }
1025                else {
1026                    tok->done = E_OK;
1027                    tok->inp = strchr(tok->buf, '\0');
1028                    done = tok->inp == tok->buf || tok->inp[-1] == '\n';
1029                }
1030            }
1031            else {
1032                cur = tok->cur - tok->buf;
1033                if (decoding_feof(tok)) {
1034                    tok->done = E_EOF;
1035                    done = 1;
1036                }
1037                else
1038                    tok->done = E_OK;
1039            }
1040            tok->lineno++;
1041            /* Read until '\n' or EOF */
1042            while (!done) {
1043                Py_ssize_t curstart = tok->start == NULL ? -1 :
1044                          tok->start - tok->buf;
1045                Py_ssize_t curvalid = tok->inp - tok->buf;
1046                Py_ssize_t newsize = curvalid + BUFSIZ;
1047                char *newbuf = tok->buf;
1048                newbuf = (char *)PyMem_REALLOC(newbuf,
1049                                               newsize);
1050                if (newbuf == NULL) {
1051                    tok->done = E_NOMEM;
1052                    tok->cur = tok->inp;
1053                    return EOF;
1054                }
1055                tok->buf = newbuf;
1056                tok->cur = tok->buf + cur;
1057                tok->line_start = tok->cur;
1058                tok->inp = tok->buf + curvalid;
1059                tok->end = tok->buf + newsize;
1060                tok->start = curstart < 0 ? NULL :
1061                         tok->buf + curstart;
1062                if (decoding_fgets(tok->inp,
1063                               (int)(tok->end - tok->inp),
1064                               tok) == NULL) {
1065                    /* Break out early on decoding
1066                       errors, as tok->buf will be NULL
1067                     */
1068                    if (tok->decoding_erred)
1069                        return EOF;
1070                    /* Last line does not end in \n,
1071                       fake one */
1072                    strcpy(tok->inp, "\n");
1073                }
1074                tok->inp = strchr(tok->inp, '\0');
1075                done = tok->inp[-1] == '\n';
1076            }
1077            if (tok->buf != NULL) {
1078                tok->cur = tok->buf + cur;
1079                tok->line_start = tok->cur;
1080                /* replace "\r\n" with "\n" */
1081                /* For Mac leave the \r, giving a syntax error */
1082                pt = tok->inp - 2;
1083                if (pt >= tok->buf && *pt == '\r') {
1084                    *pt++ = '\n';
1085                    *pt = '\0';
1086                    tok->inp = pt;
1087                }
1088            }
1089        }
1090        if (tok->done != E_OK) {
1091            if (tok->prompt != NULL)
1092                PySys_WriteStderr("\n");
1093            tok->cur = tok->inp;
1094            return EOF;
1095        }
1096    }
1097    /*NOTREACHED*/
1098}
1099
1100
1101/* Back-up one character */
1102
1103static void
1104tok_backup(struct tok_state *tok, int c)
1105{
1106    if (c != EOF) {
1107        if (--tok->cur < tok->buf)
1108            Py_FatalError("tok_backup: beginning of buffer");
1109        if (*tok->cur != c)
1110            *tok->cur = c;
1111    }
1112}
1113
1114
1115/* Return the token corresponding to a single character */
1116
1117int
1118PyToken_OneChar(int c)
1119{
1120    switch (c) {
1121    case '(':           return LPAR;
1122    case ')':           return RPAR;
1123    case '[':           return LSQB;
1124    case ']':           return RSQB;
1125    case ':':           return COLON;
1126    case ',':           return COMMA;
1127    case ';':           return SEMI;
1128    case '+':           return PLUS;
1129    case '-':           return MINUS;
1130    case '*':           return STAR;
1131    case '/':           return SLASH;
1132    case '|':           return VBAR;
1133    case '&':           return AMPER;
1134    case '<':           return LESS;
1135    case '>':           return GREATER;
1136    case '=':           return EQUAL;
1137    case '.':           return DOT;
1138    case '%':           return PERCENT;
1139    case '{':           return LBRACE;
1140    case '}':           return RBRACE;
1141    case '^':           return CIRCUMFLEX;
1142    case '~':           return TILDE;
1143    case '@':           return AT;
1144    default:            return OP;
1145    }
1146}
1147
1148
1149int
1150PyToken_TwoChars(int c1, int c2)
1151{
1152    switch (c1) {
1153    case '=':
1154        switch (c2) {
1155        case '=':               return EQEQUAL;
1156        }
1157        break;
1158    case '!':
1159        switch (c2) {
1160        case '=':               return NOTEQUAL;
1161        }
1162        break;
1163    case '<':
1164        switch (c2) {
1165        case '>':               return NOTEQUAL;
1166        case '=':               return LESSEQUAL;
1167        case '<':               return LEFTSHIFT;
1168        }
1169        break;
1170    case '>':
1171        switch (c2) {
1172        case '=':               return GREATEREQUAL;
1173        case '>':               return RIGHTSHIFT;
1174        }
1175        break;
1176    case '+':
1177        switch (c2) {
1178        case '=':               return PLUSEQUAL;
1179        }
1180        break;
1181    case '-':
1182        switch (c2) {
1183        case '=':               return MINEQUAL;
1184        case '>':               return RARROW;
1185        }
1186        break;
1187    case '*':
1188        switch (c2) {
1189        case '*':               return DOUBLESTAR;
1190        case '=':               return STAREQUAL;
1191        }
1192        break;
1193    case '/':
1194        switch (c2) {
1195        case '/':               return DOUBLESLASH;
1196        case '=':               return SLASHEQUAL;
1197        }
1198        break;
1199    case '|':
1200        switch (c2) {
1201        case '=':               return VBAREQUAL;
1202        }
1203        break;
1204    case '%':
1205        switch (c2) {
1206        case '=':               return PERCENTEQUAL;
1207        }
1208        break;
1209    case '&':
1210        switch (c2) {
1211        case '=':               return AMPEREQUAL;
1212        }
1213        break;
1214    case '^':
1215        switch (c2) {
1216        case '=':               return CIRCUMFLEXEQUAL;
1217        }
1218        break;
1219    case '@':
1220        switch (c2) {
1221        case '=':               return ATEQUAL;
1222        }
1223        break;
1224    }
1225    return OP;
1226}
1227
1228int
1229PyToken_ThreeChars(int c1, int c2, int c3)
1230{
1231    switch (c1) {
1232    case '<':
1233        switch (c2) {
1234        case '<':
1235            switch (c3) {
1236            case '=':
1237                return LEFTSHIFTEQUAL;
1238            }
1239            break;
1240        }
1241        break;
1242    case '>':
1243        switch (c2) {
1244        case '>':
1245            switch (c3) {
1246            case '=':
1247                return RIGHTSHIFTEQUAL;
1248            }
1249            break;
1250        }
1251        break;
1252    case '*':
1253        switch (c2) {
1254        case '*':
1255            switch (c3) {
1256            case '=':
1257                return DOUBLESTAREQUAL;
1258            }
1259            break;
1260        }
1261        break;
1262    case '/':
1263        switch (c2) {
1264        case '/':
1265            switch (c3) {
1266            case '=':
1267                return DOUBLESLASHEQUAL;
1268            }
1269            break;
1270        }
1271        break;
1272    case '.':
1273        switch (c2) {
1274        case '.':
1275            switch (c3) {
1276            case '.':
1277                return ELLIPSIS;
1278            }
1279            break;
1280        }
1281        break;
1282    }
1283    return OP;
1284}
1285
1286static int
1287indenterror(struct tok_state *tok)
1288{
1289    if (tok->alterror) {
1290        tok->done = E_TABSPACE;
1291        tok->cur = tok->inp;
1292        return 1;
1293    }
1294    if (tok->altwarning) {
1295#ifdef PGEN
1296        PySys_WriteStderr("inconsistent use of tabs and spaces "
1297                          "in indentation\n");
1298#else
1299        PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
1300                          "in indentation\n", tok->filename);
1301#endif
1302        tok->altwarning = 0;
1303    }
1304    return 0;
1305}
1306
1307#ifdef PGEN
1308#define verify_identifier(tok) 1
1309#else
1310/* Verify that the identifier follows PEP 3131.
1311   All identifier strings are guaranteed to be "ready" unicode objects.
1312 */
1313static int
1314verify_identifier(struct tok_state *tok)
1315{
1316    PyObject *s;
1317    int result;
1318    if (tok->decoding_erred)
1319        return 0;
1320    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1321    if (s == NULL || PyUnicode_READY(s) == -1) {
1322        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1323            PyErr_Clear();
1324            tok->done = E_IDENTIFIER;
1325        } else {
1326            tok->done = E_ERROR;
1327        }
1328        return 0;
1329    }
1330    result = PyUnicode_IsIdentifier(s);
1331    Py_DECREF(s);
1332    if (result == 0)
1333        tok->done = E_IDENTIFIER;
1334    return result;
1335}
1336#endif
1337
1338static int
1339tok_decimal_tail(struct tok_state *tok)
1340{
1341    int c;
1342
1343    while (1) {
1344        do {
1345            c = tok_nextc(tok);
1346        } while (isdigit(c));
1347        if (c != '_') {
1348            break;
1349        }
1350        c = tok_nextc(tok);
1351        if (!isdigit(c)) {
1352            tok->done = E_TOKEN;
1353            tok_backup(tok, c);
1354            return 0;
1355        }
1356    }
1357    return c;
1358}
1359
1360/* Get next token, after space stripping etc. */
1361
1362static int
1363tok_get(struct tok_state *tok, char **p_start, char **p_end)
1364{
1365    int c;
1366    int blankline, nonascii;
1367
1368    *p_start = *p_end = NULL;
1369  nextline:
1370    tok->start = NULL;
1371    blankline = 0;
1372
1373    /* Get indentation level */
1374    if (tok->atbol) {
1375        int col = 0;
1376        int altcol = 0;
1377        tok->atbol = 0;
1378        for (;;) {
1379            c = tok_nextc(tok);
1380            if (c == ' ') {
1381                col++, altcol++;
1382            }
1383            else if (c == '\t') {
1384                col = (col/tok->tabsize + 1) * tok->tabsize;
1385                altcol = (altcol/tok->alttabsize + 1)
1386                    * tok->alttabsize;
1387            }
1388            else if (c == '\014')  {/* Control-L (formfeed) */
1389                col = altcol = 0; /* For Emacs users */
1390            }
1391            else {
1392                break;
1393            }
1394        }
1395        tok_backup(tok, c);
1396        if (c == '#' || c == '\n') {
1397            /* Lines with only whitespace and/or comments
1398               shouldn't affect the indentation and are
1399               not passed to the parser as NEWLINE tokens,
1400               except *totally* empty lines in interactive
1401               mode, which signal the end of a command group. */
1402            if (col == 0 && c == '\n' && tok->prompt != NULL) {
1403                blankline = 0; /* Let it through */
1404            }
1405            else {
1406                blankline = 1; /* Ignore completely */
1407            }
1408            /* We can't jump back right here since we still
1409               may need to skip to the end of a comment */
1410        }
1411        if (!blankline && tok->level == 0) {
1412            if (col == tok->indstack[tok->indent]) {
1413                /* No change */
1414                if (altcol != tok->altindstack[tok->indent]) {
1415                    if (indenterror(tok)) {
1416                        return ERRORTOKEN;
1417                    }
1418                }
1419            }
1420            else if (col > tok->indstack[tok->indent]) {
1421                /* Indent -- always one */
1422                if (tok->indent+1 >= MAXINDENT) {
1423                    tok->done = E_TOODEEP;
1424                    tok->cur = tok->inp;
1425                    return ERRORTOKEN;
1426                }
1427                if (altcol <= tok->altindstack[tok->indent]) {
1428                    if (indenterror(tok)) {
1429                        return ERRORTOKEN;
1430                    }
1431                }
1432                tok->pendin++;
1433                tok->indstack[++tok->indent] = col;
1434                tok->altindstack[tok->indent] = altcol;
1435            }
1436            else /* col < tok->indstack[tok->indent] */ {
1437                /* Dedent -- any number, must be consistent */
1438                while (tok->indent > 0 &&
1439                    col < tok->indstack[tok->indent]) {
1440                    tok->pendin--;
1441                    tok->indent--;
1442                }
1443                if (col != tok->indstack[tok->indent]) {
1444                    tok->done = E_DEDENT;
1445                    tok->cur = tok->inp;
1446                    return ERRORTOKEN;
1447                }
1448                if (altcol != tok->altindstack[tok->indent]) {
1449                    if (indenterror(tok)) {
1450                        return ERRORTOKEN;
1451                    }
1452                }
1453            }
1454        }
1455    }
1456
1457    tok->start = tok->cur;
1458
1459    /* Return pending indents/dedents */
1460    if (tok->pendin != 0) {
1461        if (tok->pendin < 0) {
1462            tok->pendin++;
1463            return DEDENT;
1464        }
1465        else {
1466            tok->pendin--;
1467            return INDENT;
1468        }
1469    }
1470
1471    if (tok->async_def
1472        && !blankline
1473        && tok->level == 0
1474        /* There was a NEWLINE after ASYNC DEF,
1475           so we're past the signature. */
1476        && tok->async_def_nl
1477        /* Current indentation level is less than where
1478           the async function was defined */
1479        && tok->async_def_indent >= tok->indent)
1480    {
1481        tok->async_def = 0;
1482        tok->async_def_indent = 0;
1483        tok->async_def_nl = 0;
1484    }
1485
1486 again:
1487    tok->start = NULL;
1488    /* Skip spaces */
1489    do {
1490        c = tok_nextc(tok);
1491    } while (c == ' ' || c == '\t' || c == '\014');
1492
1493    /* Set start of current token */
1494    tok->start = tok->cur - 1;
1495
1496    /* Skip comment */
1497    if (c == '#') {
1498        while (c != EOF && c != '\n') {
1499            c = tok_nextc(tok);
1500        }
1501    }
1502
1503    /* Check for EOF and errors now */
1504    if (c == EOF) {
1505        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1506    }
1507
1508    /* Identifier (most frequent token!) */
1509    nonascii = 0;
1510    if (is_potential_identifier_start(c)) {
1511        /* Process the various legal combinations of b"", r"", u"", and f"". */
1512        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1513        while (1) {
1514            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1515                saw_b = 1;
1516            /* Since this is a backwards compatibility support literal we don't
1517               want to support it in arbitrary order like byte literals. */
1518            else if (!(saw_b || saw_u || saw_r || saw_f)
1519                     && (c == 'u'|| c == 'U')) {
1520                saw_u = 1;
1521            }
1522            /* ur"" and ru"" are not supported */
1523            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1524                saw_r = 1;
1525            }
1526            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1527                saw_f = 1;
1528            }
1529            else {
1530                break;
1531            }
1532            c = tok_nextc(tok);
1533            if (c == '"' || c == '\'') {
1534                goto letter_quote;
1535            }
1536        }
1537        while (is_potential_identifier_char(c)) {
1538            if (c >= 128) {
1539                nonascii = 1;
1540            }
1541            c = tok_nextc(tok);
1542        }
1543        tok_backup(tok, c);
1544        if (nonascii && !verify_identifier(tok)) {
1545            return ERRORTOKEN;
1546        }
1547        *p_start = tok->start;
1548        *p_end = tok->cur;
1549
1550        /* async/await parsing block. */
1551        if (tok->cur - tok->start == 5) {
1552            /* Current token length is 5. */
1553            if (tok->async_def) {
1554                /* We're inside an 'async def' function. */
1555                if (memcmp(tok->start, "async", 5) == 0) {
1556                    return ASYNC;
1557                }
1558                if (memcmp(tok->start, "await", 5) == 0) {
1559                    return AWAIT;
1560                }
1561            }
1562            else if (memcmp(tok->start, "async", 5) == 0) {
1563                /* The current token is 'async'.
1564                   Look ahead one token.*/
1565
1566                struct tok_state ahead_tok;
1567                char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1568                int ahead_tok_kind;
1569
1570                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1571                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1572                                         &ahead_tok_end);
1573
1574                if (ahead_tok_kind == NAME
1575                    && ahead_tok.cur - ahead_tok.start == 3
1576                    && memcmp(ahead_tok.start, "def", 3) == 0)
1577                {
1578                    /* The next token is going to be 'def', so instead of
1579                       returning 'async' NAME token, we return ASYNC. */
1580                    tok->async_def_indent = tok->indent;
1581                    tok->async_def = 1;
1582                    return ASYNC;
1583                }
1584            }
1585        }
1586
1587        return NAME;
1588    }
1589
1590    /* Newline */
1591    if (c == '\n') {
1592        tok->atbol = 1;
1593        if (blankline || tok->level > 0) {
1594            goto nextline;
1595        }
1596        *p_start = tok->start;
1597        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1598        tok->cont_line = 0;
1599        if (tok->async_def) {
1600            /* We're somewhere inside an 'async def' function, and
1601               we've encountered a NEWLINE after its signature. */
1602            tok->async_def_nl = 1;
1603        }
1604        return NEWLINE;
1605    }
1606
1607    /* Period or number starting with period? */
1608    if (c == '.') {
1609        c = tok_nextc(tok);
1610        if (isdigit(c)) {
1611            goto fraction;
1612        } else if (c == '.') {
1613            c = tok_nextc(tok);
1614            if (c == '.') {
1615                *p_start = tok->start;
1616                *p_end = tok->cur;
1617                return ELLIPSIS;
1618            }
1619            else {
1620                tok_backup(tok, c);
1621            }
1622            tok_backup(tok, '.');
1623        }
1624        else {
1625            tok_backup(tok, c);
1626        }
1627        *p_start = tok->start;
1628        *p_end = tok->cur;
1629        return DOT;
1630    }
1631
1632    /* Number */
1633    if (isdigit(c)) {
1634        if (c == '0') {
1635            /* Hex, octal or binary -- maybe. */
1636            c = tok_nextc(tok);
1637            if (c == 'x' || c == 'X') {
1638                /* Hex */
1639                c = tok_nextc(tok);
1640                do {
1641                    if (c == '_') {
1642                        c = tok_nextc(tok);
1643                    }
1644                    if (!isxdigit(c)) {
1645                        tok->done = E_TOKEN;
1646                        tok_backup(tok, c);
1647                        return ERRORTOKEN;
1648                    }
1649                    do {
1650                        c = tok_nextc(tok);
1651                    } while (isxdigit(c));
1652                } while (c == '_');
1653            }
1654            else if (c == 'o' || c == 'O') {
1655                /* Octal */
1656                c = tok_nextc(tok);
1657                do {
1658                    if (c == '_') {
1659                        c = tok_nextc(tok);
1660                    }
1661                    if (c < '0' || c >= '8') {
1662                        tok->done = E_TOKEN;
1663                        tok_backup(tok, c);
1664                        return ERRORTOKEN;
1665                    }
1666                    do {
1667                        c = tok_nextc(tok);
1668                    } while ('0' <= c && c < '8');
1669                } while (c == '_');
1670            }
1671            else if (c == 'b' || c == 'B') {
1672                /* Binary */
1673                c = tok_nextc(tok);
1674                do {
1675                    if (c == '_') {
1676                        c = tok_nextc(tok);
1677                    }
1678                    if (c != '0' && c != '1') {
1679                        tok->done = E_TOKEN;
1680                        tok_backup(tok, c);
1681                        return ERRORTOKEN;
1682                    }
1683                    do {
1684                        c = tok_nextc(tok);
1685                    } while (c == '0' || c == '1');
1686                } while (c == '_');
1687            }
1688            else {
1689                int nonzero = 0;
1690                /* maybe old-style octal; c is first char of it */
1691                /* in any case, allow '0' as a literal */
1692                while (1) {
1693                    if (c == '_') {
1694                        c = tok_nextc(tok);
1695                        if (!isdigit(c)) {
1696                            tok->done = E_TOKEN;
1697                            tok_backup(tok, c);
1698                            return ERRORTOKEN;
1699                        }
1700                    }
1701                    if (c != '0') {
1702                        break;
1703                    }
1704                    c = tok_nextc(tok);
1705                }
1706                if (isdigit(c)) {
1707                    nonzero = 1;
1708                    c = tok_decimal_tail(tok);
1709                    if (c == 0) {
1710                        return ERRORTOKEN;
1711                    }
1712                }
1713                if (c == '.') {
1714                    c = tok_nextc(tok);
1715                    goto fraction;
1716                }
1717                else if (c == 'e' || c == 'E') {
1718                    goto exponent;
1719                }
1720                else if (c == 'j' || c == 'J') {
1721                    goto imaginary;
1722                }
1723                else if (nonzero) {
1724                    /* Old-style octal: now disallowed. */
1725                    tok->done = E_TOKEN;
1726                    tok_backup(tok, c);
1727                    return ERRORTOKEN;
1728                }
1729            }
1730        }
1731        else {
1732            /* Decimal */
1733            c = tok_decimal_tail(tok);
1734            if (c == 0) {
1735                return ERRORTOKEN;
1736            }
1737            {
1738                /* Accept floating point numbers. */
1739                if (c == '.') {
1740                    c = tok_nextc(tok);
1741        fraction:
1742                    /* Fraction */
1743                    if (isdigit(c)) {
1744                        c = tok_decimal_tail(tok);
1745                        if (c == 0) {
1746                            return ERRORTOKEN;
1747                        }
1748                    }
1749                }
1750                if (c == 'e' || c == 'E') {
1751                    int e;
1752                  exponent:
1753                    e = c;
1754                    /* Exponent part */
1755                    c = tok_nextc(tok);
1756                    if (c == '+' || c == '-') {
1757                        c = tok_nextc(tok);
1758                        if (!isdigit(c)) {
1759                            tok->done = E_TOKEN;
1760                            tok_backup(tok, c);
1761                            return ERRORTOKEN;
1762                        }
1763                    } else if (!isdigit(c)) {
1764                        tok_backup(tok, c);
1765                        tok_backup(tok, e);
1766                        *p_start = tok->start;
1767                        *p_end = tok->cur;
1768                        return NUMBER;
1769                    }
1770                    c = tok_decimal_tail(tok);
1771                    if (c == 0) {
1772                        return ERRORTOKEN;
1773                    }
1774                }
1775                if (c == 'j' || c == 'J') {
1776                    /* Imaginary part */
1777        imaginary:
1778                    c = tok_nextc(tok);
1779                }
1780            }
1781        }
1782        tok_backup(tok, c);
1783        *p_start = tok->start;
1784        *p_end = tok->cur;
1785        return NUMBER;
1786    }
1787
1788  letter_quote:
1789    /* String */
1790    if (c == '\'' || c == '"') {
1791        int quote = c;
1792        int quote_size = 1;             /* 1 or 3 */
1793        int end_quote_size = 0;
1794
1795        /* Find the quote size and start of string */
1796        c = tok_nextc(tok);
1797        if (c == quote) {
1798            c = tok_nextc(tok);
1799            if (c == quote) {
1800                quote_size = 3;
1801            }
1802            else {
1803                end_quote_size = 1;     /* empty string found */
1804            }
1805        }
1806        if (c != quote) {
1807            tok_backup(tok, c);
1808        }
1809
1810        /* Get rest of string */
1811        while (end_quote_size != quote_size) {
1812            c = tok_nextc(tok);
1813            if (c == EOF) {
1814                if (quote_size == 3) {
1815                    tok->done = E_EOFS;
1816                }
1817                else {
1818                    tok->done = E_EOLS;
1819                }
1820                tok->cur = tok->inp;
1821                return ERRORTOKEN;
1822            }
1823            if (quote_size == 1 && c == '\n') {
1824                tok->done = E_EOLS;
1825                tok->cur = tok->inp;
1826                return ERRORTOKEN;
1827            }
1828            if (c == quote) {
1829                end_quote_size += 1;
1830            }
1831            else {
1832                end_quote_size = 0;
1833                if (c == '\\') {
1834                    tok_nextc(tok);  /* skip escaped char */
1835                }
1836            }
1837        }
1838
1839        *p_start = tok->start;
1840        *p_end = tok->cur;
1841        return STRING;
1842    }
1843
1844    /* Line continuation */
1845    if (c == '\\') {
1846        c = tok_nextc(tok);
1847        if (c != '\n') {
1848            tok->done = E_LINECONT;
1849            tok->cur = tok->inp;
1850            return ERRORTOKEN;
1851        }
1852        tok->cont_line = 1;
1853        goto again; /* Read next line */
1854    }
1855
1856    /* Check for two-character token */
1857    {
1858        int c2 = tok_nextc(tok);
1859        int token = PyToken_TwoChars(c, c2);
1860        if (token != OP) {
1861            int c3 = tok_nextc(tok);
1862            int token3 = PyToken_ThreeChars(c, c2, c3);
1863            if (token3 != OP) {
1864                token = token3;
1865            }
1866            else {
1867                tok_backup(tok, c3);
1868            }
1869            *p_start = tok->start;
1870            *p_end = tok->cur;
1871            return token;
1872        }
1873        tok_backup(tok, c2);
1874    }
1875
1876    /* Keep track of parentheses nesting level */
1877    switch (c) {
1878    case '(':
1879    case '[':
1880    case '{':
1881        tok->level++;
1882        break;
1883    case ')':
1884    case ']':
1885    case '}':
1886        tok->level--;
1887        break;
1888    }
1889
1890    /* Punctuation character */
1891    *p_start = tok->start;
1892    *p_end = tok->cur;
1893    return PyToken_OneChar(c);
1894}
1895
1896int
1897PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1898{
1899    int result = tok_get(tok, p_start, p_end);
1900    if (tok->decoding_erred) {
1901        result = ERRORTOKEN;
1902        tok->done = E_DECODE;
1903    }
1904    return result;
1905}
1906
1907/* Get the encoding of a Python file. Check for the coding cookie and check if
1908   the file starts with a BOM.
1909
1910   PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1911   encoding in the first or second line of the file (in which case the encoding
1912   should be assumed to be UTF-8).
1913
1914   The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1915   by the caller. */
1916
1917char *
1918PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1919{
1920    struct tok_state *tok;
1921    FILE *fp;
1922    char *p_start =NULL , *p_end =NULL , *encoding = NULL;
1923
1924#ifndef PGEN
1925    fd = _Py_dup(fd);
1926#else
1927    fd = dup(fd);
1928#endif
1929    if (fd < 0) {
1930        return NULL;
1931    }
1932
1933    fp = fdopen(fd, "r");
1934    if (fp == NULL) {
1935        return NULL;
1936    }
1937    tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1938    if (tok == NULL) {
1939        fclose(fp);
1940        return NULL;
1941    }
1942#ifndef PGEN
1943    if (filename != NULL) {
1944        Py_INCREF(filename);
1945        tok->filename = filename;
1946    }
1947    else {
1948        tok->filename = PyUnicode_FromString("<string>");
1949        if (tok->filename == NULL) {
1950            fclose(fp);
1951            PyTokenizer_Free(tok);
1952            return encoding;
1953        }
1954    }
1955#endif
1956    while (tok->lineno < 2 && tok->done == E_OK) {
1957        PyTokenizer_Get(tok, &p_start, &p_end);
1958    }
1959    fclose(fp);
1960    if (tok->encoding) {
1961        encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1962        if (encoding)
1963        strcpy(encoding, tok->encoding);
1964    }
1965    PyTokenizer_Free(tok);
1966    return encoding;
1967}
1968
1969char *
1970PyTokenizer_FindEncoding(int fd)
1971{
1972    return PyTokenizer_FindEncodingFilename(fd, NULL);
1973}
1974
1975#ifdef Py_DEBUG
1976
1977void
1978tok_dump(int type, char *start, char *end)
1979{
1980    printf("%s", _PyParser_TokenNames[type]);
1981    if (type == NAME || type == NUMBER || type == STRING || type == OP)
1982        printf("(%.*s)", (int)(end - start), start);
1983}
1984
1985#endif
1986