
/* Tokenizer implementation */

#include "Python.h"
#include "pgenheaders.h"

#include <ctype.h>
#include <assert.h>

#include "tokenizer.h"
#include "errcode.h"

#ifndef PGEN
#include "unicodeobject.h"
#include "stringobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
#include "pydebug.h"
#endif /* PGEN */

extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

/* Token names */

char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};

/* Create and initialize a new tok_state structure */

static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;
    tok->filename = NULL;
    tok->altwarning = 0;
    tok->alterror = 0;
    tok->alttabsize = 1;
    tok->altindstack[0] = 0;
    tok->decoding_state = 0;
    tok->decoding_erred = 0;
    tok->read_coding_spec = 0;
    tok->encoding = NULL;
    tok->cont_line = 0;
#ifndef PGEN
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
#endif
    return tok;
}

static char *
new_string(const char *s, Py_ssize_t len)
{
    char* result = (char *)PyMem_MALLOC(len + 1);
    if (result != NULL) {
        memcpy(result, s, len);
        result[len] = '\0';
    }
    return result;
}

#ifdef PGEN

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}

static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}

static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    return new_string(str, strlen(str));
}

#else /* PGEN */

static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = NULL;
    return NULL;                /* as if it were EOF */
}


static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}

/* Return the coding spec in S, or NULL if none is found.  */

static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
    Py_ssize_t i;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return NULL;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            begin = t;
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin);
                char* q = get_normal_name(r);
                if (r != q) {
                    PyMem_FREE(r);
                    r = new_string(q, strlen(q));
                }
                return r;
            }
        }
    }
    return NULL;
}

/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.  */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->read_coding_spec = 1;
        return 1;
    }
    cs = get_coding_spec(line, size);
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->read_coding_spec = 1;
                break;
            }
        }
    } else {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == 1); /* raw */
            if (strcmp(cs, "utf-8") == 0 ||
                strcmp(cs, "iso-8859-1") == 0) {
                tok->encoding = cs;
            } else {
#ifdef Py_USING_UNICODE
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;
                    tok->decoding_state = -1;
                }
                else {
                    PyErr_Format(PyExc_SyntaxError,
                                 "encoding problem: %s", cs);
                    PyMem_FREE(cs);
                }
#else
                /* Without Unicode support, we cannot
                   process the coding spec. Since there
                   won't be any Unicode literals, that
                   won't matter. */
                PyMem_FREE(cs);
#endif
            }
        } else {                /* then, compare cs with BOM */
            r = (strcmp(tok->encoding, cs) == 0);
            if (!r)
                PyErr_Format(PyExc_SyntaxError,
                             "encoding problem: %s with BOM", cs);
            PyMem_FREE(cs);
        }
    }
    return r;
}

/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.  */
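/* Note (derived from the code below): tok->decoding_state stays 0 until the
   BOM check has run, is set to 1 for raw byte-for-byte reads, and becomes -1
   once a codec readline has been installed; decoding_fgets dispatches on it. */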

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = 1;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
    /* Disable support for UTF-16 BOMs until a decision
       is made whether this needs to be supported.  */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = -1;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = -1;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5);     /* resulting string is in utf-8 */
    return 1;
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
       stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
       (in the s buffer) to copy entire contents of the line read
       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
       In this case, fp_readl is called in a loop (with an expanded buffer)
       until the buffer ends with a '\n' (or until the end of the file is
       reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(buf)) {
            Py_DECREF(buf);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    } else {
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}

/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is looked up using the
   encoding ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *reader, *stream, *readline;

    /* XXX: constify filename argument. */
    stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
    if (stream == NULL)
        return 0;

    reader = PyCodec_StreamReader(enc, stream, NULL);
    Py_DECREF(stream);
    if (reader == NULL)
        return 0;

    readline = PyObject_GetAttrString(reader, "readline");
    Py_DECREF(reader);
    if (readline == NULL)
        return 0;

    tok->decoding_readline = readline;
    return 1;
}

/* Fetch the next byte from TOK. */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}

/* Unfetch the last byte back into TOK.  */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}

/* Read a line of input from TOK. Determine encoding
   if necessary.  */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state < 0) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state > 0) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != 0);
        }
    }
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is ASCII, so make sure we don't have any
       non-ASCII bytes in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        for (c = (unsigned char *)line; *c; c++)
            if (*c > 127) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted yet.  */
        sprintf(buf,
            "Non-ASCII character '\\x%.2x' "
            "in file %.200s on line %i, "
            "but no encoding declared; "
            "see http://python.org/dev/peps/pep-0263/ for details",
            badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}

static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state >= 0) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}

/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}
#endif


static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
    int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
    char *buf, *current;
    char c = '\0';
    buf = PyMem_MALLOC(needed_length);
    if (buf == NULL) {
        tok->done = E_NOMEM;
        return NULL;
    }
    for (current = buf; *s; s++, current++) {
        c = *s;
        if (skip_next_lf) {
            skip_next_lf = 0;
            if (c == '\n') {
                c = *++s;
                if (!c)
                    break;
            }
        }
        if (c == '\r') {
            skip_next_lf = 1;
            c = '\n';
        }
        *current = c;
    }
    /* If this is exec input, add a newline to the end of the string if
       there isn't one already. */
    if (exec_input && c != '\n') {
        *current = '\n';
        current++;
    }
    *current = '\0';
    final_length = current - buf + 1;
    if (final_length < needed_length && final_length)
        /* should never fail */
        buf = PyMem_REALLOC(buf, final_length);
    return buf;
}

/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.  */

static const char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
#ifdef Py_USING_UNICODE
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyString_AsString(utf8);
    }
#endif
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

#endif /* PGEN */

/* Set up tokenizer for string */

struct tok_state *
PyTokenizer_FromString(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    str = (char *)decode_str(str, exec_input, tok);
    if (str == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    /* XXX: constify members. */
    tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
    return tok;
}


/* Set up tokenizer for file */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    return tok;
}


/* Free a tok_state structure */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    if (tok->input)
        PyMem_FREE((char *)tok->input);
    PyMem_FREE(tok);
}

#if !defined(PGEN) && defined(Py_USING_UNICODE)
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    Py_DECREF(enc);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
#endif

/* Get next char, updating state; error code goes into tok->done */

static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
#if !defined(PGEN) && defined(Py_USING_UNICODE)
            else if (tok_stdin_decode(tok, &newtok) != 0)
                PyMem_FREE(newtok);
#endif
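            /* Note: when tok->start is non-NULL, a token begun on an earlier
               line is still open, so the new input is appended to the existing
               buffer (keeping tok->start valid) rather than replacing it. */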
            else if (tok->start != NULL) {
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->line_start = tok->buf;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                          tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';
                }
            }
            else {
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                         tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                               (int)(tok->end - tok->inp),
                               tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                     */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}


/* Back-up one character */

static void
tok_backup(register struct tok_state *tok, register int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf)
            Py_FatalError("tok_backup: beginning of buffer");
        if (*tok->cur != c)
            *tok->cur = c;
    }
}


/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c)
{
    switch (c) {
    case '(':           return LPAR;
    case ')':           return RPAR;
    case '[':           return LSQB;
    case ']':           return RSQB;
    case ':':           return COLON;
    case ',':           return COMMA;
    case ';':           return SEMI;
    case '+':           return PLUS;
    case '-':           return MINUS;
    case '*':           return STAR;
    case '/':           return SLASH;
    case '|':           return VBAR;
    case '&':           return AMPER;
    case '<':           return LESS;
    case '>':           return GREATER;
    case '=':           return EQUAL;
    case '.':           return DOT;
    case '%':           return PERCENT;
    case '`':           return BACKQUOTE;
    case '{':           return LBRACE;
    case '}':           return RBRACE;
    case '^':           return CIRCUMFLEX;
    case '~':           return TILDE;
    case '@':           return AT;
    default:            return OP;
    }
}


int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '=':
        switch (c2) {
        case '=':               return EQEQUAL;
        }
        break;
    case '!':
        switch (c2) {
        case '=':               return NOTEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '>':               return NOTEQUAL;
        case '=':               return LESSEQUAL;
        case '<':               return LEFTSHIFT;
        }
        break;
    case '>':
        switch (c2) {
        case '=':               return GREATEREQUAL;
        case '>':               return RIGHTSHIFT;
        }
        break;
    case '+':
        switch (c2) {
        case '=':               return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=':               return MINEQUAL;
        }
        break;
    case '*':
        switch (c2) {
        case '*':               return DOUBLESTAR;
        case '=':               return STAREQUAL;
        }
        break;
    case '/':
        switch (c2) {
        case '/':               return DOUBLESLASH;
        case '=':               return SLASHEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=':               return VBAREQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=':               return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=':               return AMPEREQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=':               return CIRCUMFLEXEQUAL;
        }
        break;
    }
    return OP;
}

int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '<':
        switch (c2) {
        case '<':
            switch (c3) {
            case '=':
                return LEFTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '>':
        switch (c2) {
        case '>':
            switch (c3) {
            case '=':
                return RIGHTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '*':
        switch (c2) {
        case '*':
            switch (c3) {
            case '=':
                return DOUBLESTAREQUAL;
            }
            break;
        }
        break;
    case '/':
        switch (c2) {
        case '/':
            switch (c3) {
            case '=':
                return DOUBLESLASHEQUAL;
            }
            break;
        }
        break;
    }
    return OP;
}

static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0;
    }
    return 0;
}

/* Get next token, after space stripping etc. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
    register int c;
    int blankline;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        register int col = 0;
        register int altcol = 0;
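        /* Note: col counts columns with tabs expanded to tok->tabsize, while
           altcol counts a tab as one column (alttabsize is 1); comparing the
           two indent stacks below flags ambiguous tab/space mixtures. */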
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ')
                col++, altcol++;
            else if (c == '\t') {
                col = (col/tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol/tok->alttabsize + 1)
                    * tok->alttabsize;
            }
            else if (c == '\014') /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            else
                break;
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL)
                blankline = 0; /* Let it through */
            else
                blankline = 1; /* Ignore completely */
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    if (indenterror(tok))
                        return ERRORTOKEN;
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, while looking for tab-setting magic */
    if (c == '#') {
        static char *tabforms[] = {
            "tab-width:",                       /* Emacs */
            ":tabstop=",                        /* vim, full form */
            ":ts=",                             /* vim, abbreviated form */
            "set tabsize=",                     /* will vi never die? */
        /* more templates can be added here to support other editors */
        };
        char cbuf[80];
        char *tp, **cp;
        tp = cbuf;
        do {
            *tp++ = c = tok_nextc(tok);
        } while (c != EOF && c != '\n' &&
                 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
        *tp = '\0';
        for (cp = tabforms;
             cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
             cp++) {
            if ((tp = strstr(cbuf, *cp))) {
                int newsize = atoi(tp + strlen(*cp));

                if (newsize >= 1 && newsize <= 40) {
                    tok->tabsize = newsize;
                    if (Py_VerboseFlag)
                        PySys_WriteStderr(
                        "Tab size set to %d\n",
                        newsize);
                }
            }
        }
        while (c != EOF && c != '\n')
            c = tok_nextc(tok);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    if (Py_ISALPHA(c) || c == '_') {
        /* Process b"", r"", u"" and the combined br"", ur"" prefixes */
        switch (c) {
        case 'b':
        case 'B':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'r':
        case 'R':
            c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        case 'u':
        case 'U':
            c = tok_nextc(tok);
            if (c == 'r' || c == 'R')
                c = tok_nextc(tok);
            if (c == '"' || c == '\'')
                goto letter_quote;
            break;
        }
        while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0)
            goto nextline;
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        }
        else {
            tok_backup(tok, c);
            *p_start = tok->start;
            *p_end = tok->cur;
            return DOT;
        }
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == '.')
                goto fraction;
#ifndef WITHOUT_COMPLEX
            if (c == 'j' || c == 'J')
                goto imaginary;
#endif
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                if (!isxdigit(c)) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (isxdigit(c));
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                if (c < '0' || c >= '8') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while ('0' <= c && c < '8');
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                if (c != '0' && c != '1') {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                do {
                    c = tok_nextc(tok);
                } while (c == '0' || c == '1');
            }
            else {
                int found_decimal = 0;
                /* Octal; c is first char of it */
                /* There's no 'isoctdigit' macro, sigh */
                while ('0' <= c && c < '8') {
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    found_decimal = 1;
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == '.')
                    goto fraction;
                else if (c == 'e' || c == 'E')
                    goto exponent;
#ifndef WITHOUT_COMPLEX
                else if (c == 'j' || c == 'J')
                    goto imaginary;
#endif
                else if (found_decimal) {
                    tok->done = E_TOKEN;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
            }
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
        }
        else {
            /* Decimal */
            do {
                c = tok_nextc(tok);
            } while (isdigit(c));
            if (c == 'l' || c == 'L')
                c = tok_nextc(tok);
            else {
                /* Accept floating point numbers. */
                if (c == '.') {
        fraction:
                    /* Fraction */
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok->done = E_TOKEN;
                            tok_backup(tok, c);
                            return ERRORTOKEN;
                        }
                    } else if (!isdigit(c)) {
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isdigit(c));
                }
#ifndef WITHOUT_COMPLEX
                if (c == 'j' || c == 'J')
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
#endif
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        Py_ssize_t quote2 = tok->cur - tok->start + 1;
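        /* Note: quote2 is the offset tok->cur will have if the character
           immediately after the opening quote is another quote, i.e. an
           empty string literal or the start of a triple-quoted string. */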
        int quote = c;
        int triple = 0;
        int tripcount = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == '\n') {
                if (!triple) {
                    tok->done = E_EOLS;
                    tok_backup(tok, c);
                    return ERRORTOKEN;
                }
                tripcount = 0;
                tok->cont_line = 1; /* multiline string. */
            }
            else if (c == EOF) {
                if (triple)
                    tok->done = E_EOFS;
                else
                    tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            else if (c == quote) {
                tripcount++;
                if (tok->cur - tok->start == quote2) {
                    c = tok_nextc(tok);
                    if (c == quote) {
                        triple = 1;
                        tripcount = 0;
                        continue;
                    }
                    tok_backup(tok, c);
                }
                if (!triple || tripcount == 3)
                    break;
            }
            else if (c == '\\') {
                tripcount = 0;
                c = tok_nextc(tok);
                if (c == EOF) {
                    tok->done = E_EOLS;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
            }
            else
                tripcount = 0;
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
#ifndef PGEN
        if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
            if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
                                   "<> not supported in 3.x; use !=",
                                   tok->filename, tok->lineno,
                                   NULL, NULL)) {
                return ERRORTOKEN;
            }
        }
#endif
        if (token != OP) {
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            } else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        tok->level--;
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}

int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}

/* This function is only called from parsetok. However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */

#if defined(PGEN) || !defined(Py_USING_UNICODE)
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
    return NULL;
}
#else
#ifdef Py_USING_UNICODE
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
    PyObject *ret = NULL;
    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
    if (unicode_text) {
        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
        Py_DECREF(unicode_text);
    }
    if (!ret) {
        PyErr_Clear();
    }
    return ret;
}
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
    char *text = NULL;
    if (tok->encoding) {
        /* convert source to original encoding */
        PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
        if (lineobj != NULL) {
            int linelen = PyString_Size(lineobj);
            const char *line = PyString_AsString(lineobj);
            text = PyObject_MALLOC(linelen + 1);
            if (text != NULL && line != NULL) {
                if (linelen)
                    strncpy(text, line, linelen);
                text[linelen] = '\0';
            }
            Py_DECREF(lineobj);

            /* adjust error offset */
            if (*offset > 1) {
                PyObject *offsetobj = dec_utf8(tok->encoding,
                                               tok->buf, *offset-1);
                if (offsetobj) {
                    *offset = PyString_Size(offsetobj) + 1;
                    Py_DECREF(offsetobj);
                }
            }
        }
    }
    return text;
}
#endif /* defined(Py_USING_UNICODE) */
#endif


#ifdef Py_DEBUG

void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif