tokenizer.c revision 4ceeeb09d8ff445888b24aa324bc06175d141cb9
1
2/* Tokenizer implementation */
3
4#include "Python.h"
5#include "pgenheaders.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#include "pydebug.h"
20#endif /* PGEN */
21
22extern char *PyOS_Readline(FILE *, FILE *, char *);
23/* Return malloc'ed string including trailing \n;
24   empty malloc'ed string for EOF;
25   NULL if interrupted */
26
27/* Don't ever change this -- it would break the portability of Python code */
28#define TABSIZE 8
29
30/* Forward */
31static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
34
35/* Token names */
36
37char *_PyParser_TokenNames[] = {
38	"ENDMARKER",
39	"NAME",
40	"NUMBER",
41	"STRING",
42	"NEWLINE",
43	"INDENT",
44	"DEDENT",
45	"LPAR",
46	"RPAR",
47	"LSQB",
48	"RSQB",
49	"COLON",
50	"COMMA",
51	"SEMI",
52	"PLUS",
53	"MINUS",
54	"STAR",
55	"SLASH",
56	"VBAR",
57	"AMPER",
58	"LESS",
59	"GREATER",
60	"EQUAL",
61	"DOT",
62	"PERCENT",
63	"BACKQUOTE",
64	"LBRACE",
65	"RBRACE",
66	"EQEQUAL",
67	"NOTEQUAL",
68	"LESSEQUAL",
69	"GREATEREQUAL",
70	"TILDE",
71	"CIRCUMFLEX",
72	"LEFTSHIFT",
73	"RIGHTSHIFT",
74	"DOUBLESTAR",
75	"PLUSEQUAL",
76	"MINEQUAL",
77	"STAREQUAL",
78	"SLASHEQUAL",
79	"PERCENTEQUAL",
80	"AMPEREQUAL",
81	"VBAREQUAL",
82	"CIRCUMFLEXEQUAL",
83	"LEFTSHIFTEQUAL",
84	"RIGHTSHIFTEQUAL",
85	"DOUBLESTAREQUAL",
86	"DOUBLESLASH",
87	"DOUBLESLASHEQUAL",
88	"AT",
89	/* This table must match the #defines in token.h! */
90	"OP",
91	"<ERRORTOKEN>",
92	"<N_TOKENS>"
93};
94
95
96/* Ensure that the locale does not interfere with tokenization. */
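/* The <ctype.h> isalpha()/isalnum() predicates depend on the current
   LC_CTYPE locale; the two helpers below accept ASCII letters and digits
   only, so identifier and coding-spec scanning behave the same way in every
   locale. */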
97
98static int
99ascii_isalpha(int c)
100{
101	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
102}
103
104static int
105ascii_isalnum(int c)
106{
107	return ascii_isalpha(c) || ('0' <= c && c <= '9');
108}
109
110
111/* Create and initialize a new tok_state structure */
112
113static struct tok_state *
114tok_new(void)
115{
116	struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
117                                                sizeof(struct tok_state));
118	if (tok == NULL)
119		return NULL;
120	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
121	tok->done = E_OK;
122	tok->fp = NULL;
123	tok->input = NULL;
124	tok->tabsize = TABSIZE;
125	tok->indent = 0;
126	tok->indstack[0] = 0;
127	tok->atbol = 1;
128	tok->pendin = 0;
129	tok->prompt = tok->nextprompt = NULL;
130	tok->lineno = 0;
131	tok->level = 0;
132	tok->filename = NULL;
133	tok->altwarning = 0;
134	tok->alterror = 0;
135	tok->alttabsize = 1;
136	tok->altindstack[0] = 0;
137	tok->decoding_state = 0;
138	tok->decoding_erred = 0;
139	tok->read_coding_spec = 0;
140	tok->encoding = NULL;
141        tok->cont_line = 0;
142#ifndef PGEN
143	tok->decoding_readline = NULL;
144	tok->decoding_buffer = NULL;
145#endif
146	return tok;
147}
148
149static char *
150new_string(const char *s, Py_ssize_t len)
151{
152	char* result = (char *)PyMem_MALLOC(len + 1);
153	if (result != NULL) {
154		memcpy(result, s, len);
155		result[len] = '\0';
156	}
157	return result;
158}
159
160#ifdef PGEN
161
162static char *
163decoding_fgets(char *s, int size, struct tok_state *tok)
164{
165	return fgets(s, size, tok->fp);
166}
167
168static int
169decoding_feof(struct tok_state *tok)
170{
171	return feof(tok->fp);
172}
173
174static char *
175decode_str(const char *str, int exec_input, struct tok_state *tok)
176{
177	return new_string(str, strlen(str));
178}
179
180#else /* PGEN */
181
182static char *
183error_ret(struct tok_state *tok) /* XXX */
184{
185	tok->decoding_erred = 1;
186	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
187		PyMem_FREE(tok->buf);
188	tok->buf = NULL;
189	return NULL;		/* as if it were EOF */
190}
191
192
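/* Normalize an encoding name for comparison: lowercase it and turn '_' into
   '-', then fold the common spellings of utf-8 and latin-1 onto a canonical
   form (e.g. "UTF_8" -> "utf-8", "Latin-1" -> "iso-8859-1").  Any other name
   is returned unchanged. */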
193static char *
194get_normal_name(char *s)	/* for utf-8 and latin-1 */
195{
196	char buf[13];
197	int i;
198	for (i = 0; i < 12; i++) {
199		int c = s[i];
200		if (c == '\0')
201			break;
202		else if (c == '_')
203			buf[i] = '-';
204		else
205			buf[i] = tolower(c);
206	}
207	buf[i] = '\0';
208	if (strcmp(buf, "utf-8") == 0 ||
209	    strncmp(buf, "utf-8-", 6) == 0)
210		return "utf-8";
211	else if (strcmp(buf, "latin-1") == 0 ||
212		 strcmp(buf, "iso-8859-1") == 0 ||
213		 strcmp(buf, "iso-latin-1") == 0 ||
214		 strncmp(buf, "latin-1-", 8) == 0 ||
215		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
216		 strncmp(buf, "iso-latin-1-", 12) == 0)
217		return "iso-8859-1";
218	else
219		return s;
220}
221
222/* Return the coding spec in S, or NULL if none is found.  */
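/* In the style of PEP 263 the spec sits in a comment near the top of the
   file, for example

	# -*- coding: utf-8 -*-
	# vim: set fileencoding=latin-1 :

   for which this function returns "utf-8" and (after normalization by
   get_normal_name) "iso-8859-1" respectively. */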
223
224static char *
225get_coding_spec(const char *s, Py_ssize_t size)
226{
227	Py_ssize_t i;
228	/* Coding spec must be in a comment, and that comment must be
229         * the only statement on the source code line. */
230        for (i = 0; i < size - 6; i++) {
231		if (s[i] == '#')
232			break;
233		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
234			return NULL;
235	}
236	for (; i < size - 6; i++) { /* XXX inefficient search */
237		const char* t = s + i;
238		if (strncmp(t, "coding", 6) == 0) {
239			const char* begin = NULL;
240			t += 6;
241			if (t[0] != ':' && t[0] != '=')
242				continue;
243			do {
244				t++;
245			} while (t[0] == '\x20' || t[0] == '\t');
246
247			begin = t;
248			while (ascii_isalnum(Py_CHARMASK(t[0])) ||
249			       t[0] == '-' || t[0] == '_' || t[0] == '.')
250				t++;
251
252			if (begin < t) {
253				char* r = new_string(begin, t - begin);
254				char* q = get_normal_name(r);
255				if (r != q) {
256					PyMem_FREE(r);
257					r = new_string(q, strlen(q));
258				}
259				return r;
260			}
261		}
262	}
263	return NULL;
264}
265
266/* Check whether the line contains a coding spec. If it does,
267   invoke the set_readline function for the new encoding.
268   This function receives the tok_state and the new encoding.
269   Return 1 on success, 0 on failure.  */
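/* Note on the branches below: "utf-8" and "iso-8859-1" are accepted without
   installing a codec (the raw bytes are used as-is); any other declared
   encoding installs a decoding readline via set_readline(); and if an
   encoding was already established (e.g. from a BOM) the declared spec must
   agree with it. */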
270
271static int
272check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
273		  int set_readline(struct tok_state *, const char *))
274{
275	char * cs;
276	int r = 1;
277
278        if (tok->cont_line)
279		/* It's a continuation line, so it can't be a coding spec. */
280		return 1;
281	cs = get_coding_spec(line, size);
282	if (cs != NULL) {
283		tok->read_coding_spec = 1;
284		if (tok->encoding == NULL) {
285			assert(tok->decoding_state == 1); /* raw */
286			if (strcmp(cs, "utf-8") == 0 ||
287			    strcmp(cs, "iso-8859-1") == 0) {
288				tok->encoding = cs;
289			} else {
290#ifdef Py_USING_UNICODE
291				r = set_readline(tok, cs);
292				if (r) {
293					tok->encoding = cs;
294					tok->decoding_state = -1;
295				}
296				else
297					PyMem_FREE(cs);
298#else
299                                /* Without Unicode support, we cannot
300                                   process the coding spec. Since there
301                                   won't be any Unicode literals, that
302                                   won't matter. */
303				PyMem_FREE(cs);
304#endif
305			}
306		} else {	/* then, compare cs with BOM */
307			r = (strcmp(tok->encoding, cs) == 0);
308			PyMem_FREE(cs);
309		}
310	}
311	if (!r) {
312		cs = tok->encoding;
313		if (!cs)
314			cs = "with BOM";
315		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
316	}
317	return r;
318}
319
320/* See whether the file starts with a BOM. If it does,
321   invoke the set_readline function with the new encoding.
322   Return 1 on success, 0 on failure.  */
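/* Only the UTF-8 BOM (0xEF 0xBB 0xBF) is acted upon here; the UTF-16 BOM
   handling below is compiled out with #if 0 until it is decided whether it
   should be supported at all. */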
323
324static int
325check_bom(int get_char(struct tok_state *),
326	  void unget_char(int, struct tok_state *),
327	  int set_readline(struct tok_state *, const char *),
328	  struct tok_state *tok)
329{
330	int ch1, ch2, ch3;
331	ch1 = get_char(tok);
332	tok->decoding_state = 1;
333	if (ch1 == EOF) {
334		return 1;
335	} else if (ch1 == 0xEF) {
336		ch2 = get_char(tok);
337		if (ch2 != 0xBB) {
338			unget_char(ch2, tok);
339			unget_char(ch1, tok);
340			return 1;
341		}
342		ch3 = get_char(tok);
343		if (ch3 != 0xBF) {
344			unget_char(ch3, tok);
345			unget_char(ch2, tok);
346			unget_char(ch1, tok);
347			return 1;
348		}
349#if 0
350	/* Disable support for UTF-16 BOMs until a decision
351	   is made whether this needs to be supported.  */
352	} else if (ch1 == 0xFE) {
353		ch2 = get_char(tok);
354		if (ch2 != 0xFF) {
355			unget_char(ch2, tok);
356			unget_char(ch1, tok);
357			return 1;
358		}
359		if (!set_readline(tok, "utf-16-be"))
360			return 0;
361		tok->decoding_state = -1;
362	} else if (ch1 == 0xFF) {
363		ch2 = get_char(tok);
364		if (ch2 != 0xFE) {
365			unget_char(ch2, tok);
366			unget_char(ch1, tok);
367			return 1;
368		}
369		if (!set_readline(tok, "utf-16-le"))
370			return 0;
371		tok->decoding_state = -1;
372#endif
373	} else {
374		unget_char(ch1, tok);
375		return 1;
376	}
377	if (tok->encoding != NULL)
378		PyMem_FREE(tok->encoding);
379	tok->encoding = new_string("utf-8", 5);	/* resulting string is in utf-8 */
380	return 1;
381}
382
383/* Read a line of text from TOK into S, using the stream in TOK.
384   Return NULL on failure, else S.
385
386   On entry, tok->decoding_buffer will be one of:
387     1) NULL: need to call tok->decoding_readline to get a new line
388     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
389           stored the result in tok->decoding_buffer
390     3) PyStringObject *: previous call to fp_readl did not have enough room
391           (in the s buffer) to copy entire contents of the line read
392           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
393           In this case, fp_readl is called in a loop (with an expanded buffer)
394           until the buffer ends with a '\n' (or until the end of the file is
395           reached): see tok_nextc and its calls to decoding_fgets.
396*/
397
398static char *
399fp_readl(char *s, int size, struct tok_state *tok)
400{
401#ifndef Py_USING_UNICODE
402	/* In a non-Unicode build, this should never be called. */
403	Py_FatalError("fp_readl should not be called in this build.");
404	return NULL; /* Keep compiler happy (not reachable) */
405#else
406	PyObject* utf8 = NULL;
407	PyObject* buf = tok->decoding_buffer;
408	char *str;
409	Py_ssize_t utf8len;
410
411	/* Ask for one less byte so we can terminate it */
412	assert(size > 0);
413	size--;
414
415	if (buf == NULL) {
416		buf = PyObject_CallObject(tok->decoding_readline, NULL);
417		if (buf == NULL)
418			return error_ret(tok);
419	} else {
420		tok->decoding_buffer = NULL;
421		if (PyString_CheckExact(buf))
422			utf8 = buf;
423	}
424	if (utf8 == NULL) {
425		utf8 = PyUnicode_AsUTF8String(buf);
426		Py_DECREF(buf);
427		if (utf8 == NULL)
428			return error_ret(tok);
429	}
430	str = PyString_AsString(utf8);
431	utf8len = PyString_GET_SIZE(utf8);
432	if (utf8len > size) {
433		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
434		if (tok->decoding_buffer == NULL) {
435			Py_DECREF(utf8);
436			return error_ret(tok);
437		}
438		utf8len = size;
439	}
440	memcpy(s, str, utf8len);
441	s[utf8len] = '\0';
442	Py_DECREF(utf8);
443	if (utf8len == 0)
444		return NULL; /* EOF */
445	return s;
446#endif
447}
448
449/* Set the readline function for TOK to a StreamReader's
450   readline function. The StreamReader is named ENC.
451
452   This function is called from check_bom and check_coding_spec.
453
454   ENC is usually identical to the future value of tok->encoding,
455   except for the (currently unsupported) case of UTF-16.
456
457   Return 1 on success, 0 on failure. */
458
459static int
460fp_setreadl(struct tok_state *tok, const char* enc)
461{
462	PyObject *reader, *stream, *readline;
463
464	/* XXX: constify filename argument. */
465	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
466	if (stream == NULL)
467		return 0;
468
469	reader = PyCodec_StreamReader(enc, stream, NULL);
470	Py_DECREF(stream);
471	if (reader == NULL)
472		return 0;
473
474	readline = PyObject_GetAttrString(reader, "readline");
475	Py_DECREF(reader);
476	if (readline == NULL)
477		return 0;
478
479	tok->decoding_readline = readline;
480	return 1;
481}
482
483/* Fetch the next byte from TOK. */
484
485static int fp_getc(struct tok_state *tok) {
486	return getc(tok->fp);
487}
488
489/* Unfetch the last byte back into TOK.  */
490
491static void fp_ungetc(int c, struct tok_state *tok) {
492	ungetc(c, tok->fp);
493}
494
495/* Read a line of input from TOK. Determine encoding
496   if necessary.  */
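/* tok->decoding_state selects the reader below: negative means a codec
   readline has been installed (fp_readl), positive means a plain read via
   Py_UniversalNewlineFgets, and zero means the encoding has not been
   determined yet, so the BOM check runs first. */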
497
498static char *
499decoding_fgets(char *s, int size, struct tok_state *tok)
500{
501	char *line = NULL;
502	int badchar = 0;
503	for (;;) {
504		if (tok->decoding_state < 0) {
505			/* We already have a codec associated with
506			   this input. */
507			line = fp_readl(s, size, tok);
508			break;
509		} else if (tok->decoding_state > 0) {
510			/* We want a 'raw' read. */
511			line = Py_UniversalNewlineFgets(s, size,
512							tok->fp, NULL);
513			break;
514		} else {
515			/* We have not yet determined the encoding.
516			   If an encoding is found, use the file-pointer
517			   reader functions from now on. */
518			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
519				return error_ret(tok);
520			assert(tok->decoding_state != 0);
521		}
522	}
523	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
524		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
525			return error_ret(tok);
526		}
527	}
528#ifndef PGEN
529	/* The default encoding is ASCII, so make sure we don't have any
530           non-ASCII bytes in it. */
531	if (line && !tok->encoding) {
532		unsigned char *c;
533		for (c = (unsigned char *)line; *c; c++)
534			if (*c > 127) {
535				badchar = *c;
536				break;
537			}
538	}
539	if (badchar) {
540		char buf[500];
541		/* Need to add 1 to the line number, since this line
542		   has not been counted, yet.  */
543		sprintf(buf,
544			"Non-ASCII character '\\x%.2x' "
545			"in file %.200s on line %i, "
546			"but no encoding declared; "
547			"see http://www.python.org/peps/pep-0263.html for details",
548			badchar, tok->filename, tok->lineno + 1);
549		PyErr_SetString(PyExc_SyntaxError, buf);
550		return error_ret(tok);
551	}
552#endif
553	return line;
554}
555
556static int
557decoding_feof(struct tok_state *tok)
558{
559	if (tok->decoding_state >= 0) {
560		return feof(tok->fp);
561	} else {
562		PyObject* buf = tok->decoding_buffer;
563		if (buf == NULL) {
564			buf = PyObject_CallObject(tok->decoding_readline, NULL);
565			if (buf == NULL) {
566				error_ret(tok);
567				return 1;
568			} else {
569				tok->decoding_buffer = buf;
570			}
571		}
572		return PyObject_Length(buf) == 0;
573	}
574}
575
576/* Fetch a byte from TOK, using the string buffer. */
577
578static int
579buf_getc(struct tok_state *tok) {
580	return Py_CHARMASK(*tok->str++);
581}
582
583/* Unfetch a byte from TOK, using the string buffer. */
584
585static void
586buf_ungetc(int c, struct tok_state *tok) {
587	tok->str--;
588	assert(Py_CHARMASK(*tok->str) == c);	/* tok->cur may point to read-only segment */
589}
590
591/* Set the readline function for TOK to ENC. For the string-based
592   tokenizer, this means to just record the encoding. */
593
594static int
595buf_setreadl(struct tok_state *tok, const char* enc) {
596	tok->enc = enc;
597	return 1;
598}
599
600/* Return a UTF-8 encoded Python string object from the
601   C byte string STR, which is encoded with ENC. */
602
603#ifdef Py_USING_UNICODE
604static PyObject *
605translate_into_utf8(const char* str, const char* enc) {
606	PyObject *utf8;
607	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
608	if (buf == NULL)
609		return NULL;
610	utf8 = PyUnicode_AsUTF8String(buf);
611	Py_DECREF(buf);
612	return utf8;
613}
614#endif
615
616
617static char *
618translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
619	int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
620	char *buf, *current;
621	char c = '\0';
622	buf = PyMem_MALLOC(needed_length);
623	if (buf == NULL) {
624		tok->done = E_NOMEM;
625		return NULL;
626	}
627	for (current = buf; *s; s++, current++) {
628		c = *s;
629		if (skip_next_lf) {
630			skip_next_lf = 0;
631			if (c == '\n') {
632				c = *++s;
633				if (!c)
634					break;
635			}
636		}
637		if (c == '\r') {
638			skip_next_lf = 1;
639			c = '\n';
640		}
641		*current = c;
642	}
643	/* If this is exec input, add a newline to the end of the string if
644	   there isn't one already. */
645	if (exec_input && c != '\n') {
646		*current = '\n';
647		current++;
648	}
649	*current = '\0';
650	final_length = current - buf + 1;
651	if (final_length < needed_length && final_length)
652		/* should never fail */
653		buf = PyMem_REALLOC(buf, final_length);
654	return buf;
655}
656
657/* Decode a byte string STR for use as the buffer of TOK.
658   Look for encoding declarations inside STR, and record them
659   inside TOK.  */
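/* The steps mirror the file-based path: translate newlines, check for a
   UTF-8 BOM, scan the first two lines for a coding spec, and recode the
   whole string to UTF-8 (when built with Unicode support) if an encoding
   other than utf-8 or iso-8859-1 was declared. */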
660
661static const char *
662decode_str(const char *input, int single, struct tok_state *tok)
663{
664	PyObject* utf8 = NULL;
665	const char *str;
666	const char *s;
667	const char *newl[2] = {NULL, NULL};
668	int lineno = 0;
669	tok->input = str = translate_newlines(input, single, tok);
670	if (str == NULL)
671		return NULL;
672	tok->enc = NULL;
673	tok->str = str;
674	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
675		return error_ret(tok);
676	str = tok->str;		/* string after BOM if any */
677	assert(str);
678#ifdef Py_USING_UNICODE
679	if (tok->enc != NULL) {
680		utf8 = translate_into_utf8(str, tok->enc);
681		if (utf8 == NULL)
682			return error_ret(tok);
683		str = PyString_AsString(utf8);
684	}
685#endif
686	for (s = str;; s++) {
687		if (*s == '\0') break;
688		else if (*s == '\n') {
689			assert(lineno < 2);
690			newl[lineno] = s;
691			lineno++;
692			if (lineno == 2) break;
693		}
694	}
695	tok->enc = NULL;
696	/* need to check line 1 and 2 separately since check_coding_spec
697	   assumes a single line as input */
698	if (newl[0]) {
699		if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
700			return error_ret(tok);
701		if (tok->enc == NULL && newl[1]) {
702			if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
703					       tok, buf_setreadl))
704				return error_ret(tok);
705		}
706	}
707#ifdef Py_USING_UNICODE
708	if (tok->enc != NULL) {
709		assert(utf8 == NULL);
710		utf8 = translate_into_utf8(str, tok->enc);
711		if (utf8 == NULL)
712			return error_ret(tok);
713		str = PyString_AsString(utf8);
714	}
715#endif
716	assert(tok->decoding_buffer == NULL);
717	tok->decoding_buffer = utf8; /* CAUTION: str points into this object; it is DECREF'd in PyTokenizer_Free */
718	return str;
719}
720
721#endif /* PGEN */
722
723/* Set up tokenizer for string */
724
725struct tok_state *
726PyTokenizer_FromString(const char *str, int exec_input)
727{
728	struct tok_state *tok = tok_new();
729	if (tok == NULL)
730		return NULL;
731	str = (char *)decode_str(str, exec_input, tok);
732	if (str == NULL) {
733		PyTokenizer_Free(tok);
734		return NULL;
735	}
736
737	/* XXX: constify members. */
738	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
739	return tok;
740}
741
742
743/* Set up tokenizer for file */
744
745struct tok_state *
746PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
747{
748	struct tok_state *tok = tok_new();
749	if (tok == NULL)
750		return NULL;
751	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
752		PyTokenizer_Free(tok);
753		return NULL;
754	}
755	tok->cur = tok->inp = tok->buf;
756	tok->end = tok->buf + BUFSIZ;
757	tok->fp = fp;
758	tok->prompt = ps1;
759	tok->nextprompt = ps2;
760	return tok;
761}
762
763
764/* Free a tok_state structure */
765
766void
767PyTokenizer_Free(struct tok_state *tok)
768{
769	if (tok->encoding != NULL)
770		PyMem_FREE(tok->encoding);
771#ifndef PGEN
772	Py_XDECREF(tok->decoding_readline);
773	Py_XDECREF(tok->decoding_buffer);
774#endif
775	if (tok->fp != NULL && tok->buf != NULL)
776		PyMem_FREE(tok->buf);
777	if (tok->input)
778		PyMem_FREE((char *)tok->input);
779	PyMem_FREE(tok);
780}
781
782#if !defined(PGEN) && defined(Py_USING_UNICODE)
783static int
784tok_stdin_decode(struct tok_state *tok, char **inp)
785{
786	PyObject *enc, *sysstdin, *decoded, *utf8;
787	const char *encoding;
788	char *converted;
789
790	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
791		return 0;
792	sysstdin = PySys_GetObject("stdin");
793	if (sysstdin == NULL || !PyFile_Check(sysstdin))
794		return 0;
795
796	enc = ((PyFileObject *)sysstdin)->f_encoding;
797	if (enc == NULL || !PyString_Check(enc))
798		return 0;
799	Py_INCREF(enc);
800
801	encoding = PyString_AsString(enc);
802	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
803	if (decoded == NULL)
804		goto error_clear;
805
806	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
807	Py_DECREF(decoded);
808	if (utf8 == NULL)
809		goto error_clear;
810
811	assert(PyString_Check(utf8));
812	converted = new_string(PyString_AS_STRING(utf8),
813			       PyString_GET_SIZE(utf8));
814	Py_DECREF(utf8);
815	if (converted == NULL)
816		goto error_nomem;
817
818	PyMem_FREE(*inp);
819	*inp = converted;
820	if (tok->encoding != NULL)
821		PyMem_FREE(tok->encoding);
822	tok->encoding = new_string(encoding, strlen(encoding));
823	if (tok->encoding == NULL)
824		goto error_nomem;
825
826	Py_DECREF(enc);
827	return 0;
828
829error_nomem:
830	Py_DECREF(enc);
831	tok->done = E_NOMEM;
832	return -1;
833
834error_clear:
835	Py_DECREF(enc);
836	if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
837		tok->done = E_ERROR;
838		return -1;
839	}
840	/* Fall back to iso-8859-1 for backward compatibility */
841	PyErr_Clear();
842	return 0;
843}
844#endif
845
846/* Get next char, updating state; error code goes into tok->done */
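/* Three input sources are handled below: an in-memory string (tok->fp is
   NULL), interactive input read through PyOS_Readline when a prompt is set,
   and an ordinary file read line by line through decoding_fgets.  In every
   case characters are served from the buffer via tok->cur and tok->inp. */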
847
848static int
849tok_nextc(register struct tok_state *tok)
850{
851	for (;;) {
852		if (tok->cur != tok->inp) {
853			return Py_CHARMASK(*tok->cur++); /* Fast path */
854		}
855		if (tok->done != E_OK)
856			return EOF;
857		if (tok->fp == NULL) {
858			char *end = strchr(tok->inp, '\n');
859			if (end != NULL)
860				end++;
861			else {
862				end = strchr(tok->inp, '\0');
863				if (end == tok->inp) {
864					tok->done = E_EOF;
865					return EOF;
866				}
867			}
868			if (tok->start == NULL)
869				tok->buf = tok->cur;
870			tok->line_start = tok->cur;
871			tok->lineno++;
872			tok->inp = end;
873			return Py_CHARMASK(*tok->cur++);
874		}
875		if (tok->prompt != NULL) {
876			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
877			if (tok->nextprompt != NULL)
878				tok->prompt = tok->nextprompt;
879			if (newtok == NULL)
880				tok->done = E_INTR;
881			else if (*newtok == '\0') {
882				PyMem_FREE(newtok);
883				tok->done = E_EOF;
884			}
885#if !defined(PGEN) && defined(Py_USING_UNICODE)
886			else if (tok_stdin_decode(tok, &newtok) != 0)
887				PyMem_FREE(newtok);
888#endif
889			else if (tok->start != NULL) {
890				size_t start = tok->start - tok->buf;
891				size_t oldlen = tok->cur - tok->buf;
892				size_t newlen = oldlen + strlen(newtok);
893				char *buf = tok->buf;
894				buf = (char *)PyMem_REALLOC(buf, newlen+1);
895				tok->lineno++;
896				if (buf == NULL) {
897					PyMem_FREE(tok->buf);
898					tok->buf = NULL;
899					PyMem_FREE(newtok);
900					tok->done = E_NOMEM;
901					return EOF;
902				}
903				tok->buf = buf;
904				tok->cur = tok->buf + oldlen;
905				tok->line_start = tok->cur;
906				strcpy(tok->buf + oldlen, newtok);
907				PyMem_FREE(newtok);
908				tok->inp = tok->buf + newlen;
909				tok->end = tok->inp + 1;
910				tok->start = tok->buf + start;
911			}
912			else {
913				tok->lineno++;
914				if (tok->buf != NULL)
915					PyMem_FREE(tok->buf);
916				tok->buf = newtok;
917				tok->line_start = tok->buf;
918				tok->cur = tok->buf;
919				tok->line_start = tok->buf;
920				tok->inp = strchr(tok->buf, '\0');
921				tok->end = tok->inp + 1;
922			}
923		}
924		else {
925			int done = 0;
926			Py_ssize_t cur = 0;
927			char *pt;
928			if (tok->start == NULL) {
929				if (tok->buf == NULL) {
930					tok->buf = (char *)
931						PyMem_MALLOC(BUFSIZ);
932					if (tok->buf == NULL) {
933						tok->done = E_NOMEM;
934						return EOF;
935					}
936					tok->end = tok->buf + BUFSIZ;
937				}
938				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
939					  tok) == NULL) {
940					tok->done = E_EOF;
941					done = 1;
942				}
943				else {
944					tok->done = E_OK;
945					tok->inp = strchr(tok->buf, '\0');
946					done = tok->inp[-1] == '\n';
947				}
948			}
949			else {
950				cur = tok->cur - tok->buf;
951				if (decoding_feof(tok)) {
952					tok->done = E_EOF;
953					done = 1;
954				}
955				else
956					tok->done = E_OK;
957			}
958			tok->lineno++;
959			/* Read until '\n' or EOF */
960			while (!done) {
961				Py_ssize_t curstart = tok->start == NULL ? -1 :
962					          tok->start - tok->buf;
963				Py_ssize_t curvalid = tok->inp - tok->buf;
964				Py_ssize_t newsize = curvalid + BUFSIZ;
965				char *newbuf = tok->buf;
966				newbuf = (char *)PyMem_REALLOC(newbuf,
967							       newsize);
968				if (newbuf == NULL) {
969					tok->done = E_NOMEM;
970					tok->cur = tok->inp;
971					return EOF;
972				}
973				tok->buf = newbuf;
974				tok->inp = tok->buf + curvalid;
975				tok->end = tok->buf + newsize;
976				tok->start = curstart < 0 ? NULL :
977					     tok->buf + curstart;
978				if (decoding_fgets(tok->inp,
979					       (int)(tok->end - tok->inp),
980					       tok) == NULL) {
981					/* Break out early on decoding
982					   errors, as tok->buf will be NULL
983					 */
984					if (tok->decoding_erred)
985						return EOF;
986					/* Last line does not end in \n,
987					   fake one */
988					strcpy(tok->inp, "\n");
989				}
990				tok->inp = strchr(tok->inp, '\0');
991				done = tok->inp[-1] == '\n';
992			}
993			if (tok->buf != NULL) {
994				tok->cur = tok->buf + cur;
995				tok->line_start = tok->cur;
996				/* replace "\r\n" with "\n" */
997				/* For Mac leave the \r, giving a syntax error */
998				pt = tok->inp - 2;
999				if (pt >= tok->buf && *pt == '\r') {
1000					*pt++ = '\n';
1001					*pt = '\0';
1002					tok->inp = pt;
1003				}
1004			}
1005		}
1006		if (tok->done != E_OK) {
1007			if (tok->prompt != NULL)
1008				PySys_WriteStderr("\n");
1009			tok->cur = tok->inp;
1010			return EOF;
1011		}
1012	}
1013	/*NOTREACHED*/
1014}
1015
1016
1017/* Back-up one character */
1018
1019static void
1020tok_backup(register struct tok_state *tok, register int c)
1021{
1022	if (c != EOF) {
1023		if (--tok->cur < tok->buf)
1024			Py_FatalError("tok_backup: beginning of buffer");
1025		if (*tok->cur != c)
1026			*tok->cur = c;
1027	}
1028}
1029
1030
1031/* Return the token corresponding to a single character */
1032
1033int
1034PyToken_OneChar(int c)
1035{
1036	switch (c) {
1037	case '(':	return LPAR;
1038	case ')':	return RPAR;
1039	case '[':	return LSQB;
1040	case ']':	return RSQB;
1041	case ':':	return COLON;
1042	case ',':	return COMMA;
1043	case ';':	return SEMI;
1044	case '+':	return PLUS;
1045	case '-':	return MINUS;
1046	case '*':	return STAR;
1047	case '/':	return SLASH;
1048	case '|':	return VBAR;
1049	case '&':	return AMPER;
1050	case '<':	return LESS;
1051	case '>':	return GREATER;
1052	case '=':	return EQUAL;
1053	case '.':	return DOT;
1054	case '%':	return PERCENT;
1055	case '`':	return BACKQUOTE;
1056	case '{':	return LBRACE;
1057	case '}':	return RBRACE;
1058	case '^':	return CIRCUMFLEX;
1059	case '~':	return TILDE;
1060	case '@':       return AT;
1061	default:	return OP;
1062	}
1063}
1064
1065
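/* Return the token corresponding to a two-character operator, or OP if the
   pair is not one; e.g. "==" gives EQEQUAL, "<>" gives NOTEQUAL, "<<" gives
   LEFTSHIFT. */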
1066int
1067PyToken_TwoChars(int c1, int c2)
1068{
1069	switch (c1) {
1070	case '=':
1071		switch (c2) {
1072		case '=':	return EQEQUAL;
1073		}
1074		break;
1075	case '!':
1076		switch (c2) {
1077		case '=':	return NOTEQUAL;
1078		}
1079		break;
1080	case '<':
1081		switch (c2) {
1082		case '>':	return NOTEQUAL;
1083		case '=':	return LESSEQUAL;
1084		case '<':	return LEFTSHIFT;
1085		}
1086		break;
1087	case '>':
1088		switch (c2) {
1089		case '=':	return GREATEREQUAL;
1090		case '>':	return RIGHTSHIFT;
1091		}
1092		break;
1093	case '+':
1094		switch (c2) {
1095		case '=':	return PLUSEQUAL;
1096		}
1097		break;
1098	case '-':
1099		switch (c2) {
1100		case '=':	return MINEQUAL;
1101		}
1102		break;
1103	case '*':
1104		switch (c2) {
1105		case '*':	return DOUBLESTAR;
1106		case '=':	return STAREQUAL;
1107		}
1108		break;
1109	case '/':
1110		switch (c2) {
1111		case '/':	return DOUBLESLASH;
1112		case '=':	return SLASHEQUAL;
1113		}
1114		break;
1115	case '|':
1116		switch (c2) {
1117		case '=':	return VBAREQUAL;
1118		}
1119		break;
1120	case '%':
1121		switch (c2) {
1122		case '=':	return PERCENTEQUAL;
1123		}
1124		break;
1125	case '&':
1126		switch (c2) {
1127		case '=':	return AMPEREQUAL;
1128		}
1129		break;
1130	case '^':
1131		switch (c2) {
1132		case '=':	return CIRCUMFLEXEQUAL;
1133		}
1134		break;
1135	}
1136	return OP;
1137}
1138
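/* Return the token corresponding to a three-character operator
   ("<<=", ">>=", "**=", "//="), or OP for anything else. */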
1139int
1140PyToken_ThreeChars(int c1, int c2, int c3)
1141{
1142	switch (c1) {
1143	case '<':
1144		switch (c2) {
1145		case '<':
1146			switch (c3) {
1147			case '=':
1148				return LEFTSHIFTEQUAL;
1149			}
1150			break;
1151		}
1152		break;
1153	case '>':
1154		switch (c2) {
1155		case '>':
1156			switch (c3) {
1157			case '=':
1158				return RIGHTSHIFTEQUAL;
1159			}
1160			break;
1161		}
1162		break;
1163	case '*':
1164		switch (c2) {
1165		case '*':
1166			switch (c3) {
1167			case '=':
1168				return DOUBLESTAREQUAL;
1169			}
1170			break;
1171		}
1172		break;
1173	case '/':
1174		switch (c2) {
1175		case '/':
1176			switch (c3) {
1177			case '=':
1178				return DOUBLESLASHEQUAL;
1179			}
1180			break;
1181		}
1182		break;
1183	}
1184	return OP;
1185}
1186
1187static int
1188indenterror(struct tok_state *tok)
1189{
1190	if (tok->alterror) {
1191		tok->done = E_TABSPACE;
1192		tok->cur = tok->inp;
1193		return 1;
1194	}
1195	if (tok->altwarning) {
1196		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1197                                  "in indentation\n", tok->filename);
1198		tok->altwarning = 0;
1199	}
1200	return 0;
1201}
1202
1203/* Get next token, after space stripping etc. */
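/* On return *p_start and *p_end delimit the token text inside tok->buf
   (tokens such as INDENT and DEDENT leave them NULL), and the return value
   is the token type matching the #defines in token.h. */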
1204
1205static int
1206tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1207{
1208	register int c;
1209	int blankline;
1210
1211	*p_start = *p_end = NULL;
1212  nextline:
1213	tok->start = NULL;
1214	blankline = 0;
1215
1216	/* Get indentation level */
1217	if (tok->atbol) {
1218		register int col = 0;
1219		register int altcol = 0;
1220		tok->atbol = 0;
1221		for (;;) {
1222			c = tok_nextc(tok);
1223			if (c == ' ')
1224				col++, altcol++;
1225			else if (c == '\t') {
1226				col = (col/tok->tabsize + 1) * tok->tabsize;
1227				altcol = (altcol/tok->alttabsize + 1)
1228					* tok->alttabsize;
1229			}
1230			else if (c == '\014') /* Control-L (formfeed) */
1231				col = altcol = 0; /* For Emacs users */
1232			else
1233				break;
1234		}
1235		tok_backup(tok, c);
1236		if (c == '#' || c == '\n') {
1237			/* Lines with only whitespace and/or comments
1238			   shouldn't affect the indentation and are
1239			   not passed to the parser as NEWLINE tokens,
1240			   except *totally* empty lines in interactive
1241			   mode, which signal the end of a command group. */
1242			if (col == 0 && c == '\n' && tok->prompt != NULL)
1243				blankline = 0; /* Let it through */
1244			else
1245				blankline = 1; /* Ignore completely */
1246			/* We can't jump back right here since we still
1247			   may need to skip to the end of a comment */
1248		}
1249		if (!blankline && tok->level == 0) {
1250			if (col == tok->indstack[tok->indent]) {
1251				/* No change */
1252				if (altcol != tok->altindstack[tok->indent]) {
1253					if (indenterror(tok))
1254						return ERRORTOKEN;
1255				}
1256			}
1257			else if (col > tok->indstack[tok->indent]) {
1258				/* Indent -- always one */
1259				if (tok->indent+1 >= MAXINDENT) {
1260					tok->done = E_TOODEEP;
1261					tok->cur = tok->inp;
1262					return ERRORTOKEN;
1263				}
1264				if (altcol <= tok->altindstack[tok->indent]) {
1265					if (indenterror(tok))
1266						return ERRORTOKEN;
1267				}
1268				tok->pendin++;
1269				tok->indstack[++tok->indent] = col;
1270				tok->altindstack[tok->indent] = altcol;
1271			}
1272			else /* col < tok->indstack[tok->indent] */ {
1273				/* Dedent -- any number, must be consistent */
1274				while (tok->indent > 0 &&
1275					col < tok->indstack[tok->indent]) {
1276					tok->pendin--;
1277					tok->indent--;
1278				}
1279				if (col != tok->indstack[tok->indent]) {
1280					tok->done = E_DEDENT;
1281					tok->cur = tok->inp;
1282					return ERRORTOKEN;
1283				}
1284				if (altcol != tok->altindstack[tok->indent]) {
1285					if (indenterror(tok))
1286						return ERRORTOKEN;
1287				}
1288			}
1289		}
1290	}
1291
1292	tok->start = tok->cur;
1293
1294	/* Return pending indents/dedents */
1295	if (tok->pendin != 0) {
1296		if (tok->pendin < 0) {
1297			tok->pendin++;
1298			return DEDENT;
1299		}
1300		else {
1301			tok->pendin--;
1302			return INDENT;
1303		}
1304	}
1305
1306 again:
1307	tok->start = NULL;
1308	/* Skip spaces */
1309	do {
1310		c = tok_nextc(tok);
1311	} while (c == ' ' || c == '\t' || c == '\014');
1312
1313	/* Set start of current token */
1314	tok->start = tok->cur - 1;
1315
1316	/* Skip comment, while looking for tab-setting magic */
1317	if (c == '#') {
1318		static char *tabforms[] = {
1319			"tab-width:",		/* Emacs */
1320			":tabstop=",		/* vim, full form */
1321			":ts=",			/* vim, abbreviated form */
1322			"set tabsize=",		/* will vi never die? */
1323		/* more templates can be added here to support other editors */
1324		};
1325		char cbuf[80];
1326		char *tp, **cp;
1327		tp = cbuf;
1328		do {
1329			*tp++ = c = tok_nextc(tok);
1330		} while (c != EOF && c != '\n' &&
1331			 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1332		*tp = '\0';
1333		for (cp = tabforms;
1334		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1335		     cp++) {
1336			if ((tp = strstr(cbuf, *cp))) {
1337				int newsize = atoi(tp + strlen(*cp));
1338
1339				if (newsize >= 1 && newsize <= 40) {
1340					tok->tabsize = newsize;
1341					if (Py_VerboseFlag)
1342					    PySys_WriteStderr(
1343						"Tab size set to %d\n",
1344						newsize);
1345				}
1346			}
1347		}
1348		while (c != EOF && c != '\n')
1349			c = tok_nextc(tok);
1350	}
1351
1352	/* Check for EOF and errors now */
1353	if (c == EOF) {
1354		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1355	}
1356
1357	/* Identifier (most frequent token!) */
1358	if (ascii_isalpha(c) || c == '_') {
1359		/* Process b"", r"", u"", br"" and ur"" string prefixes */
1360		switch (c) {
1361		case 'b':
1362		case 'B':
1363			c = tok_nextc(tok);
1364			if (c == 'r' || c == 'R')
1365				c = tok_nextc(tok);
1366			if (c == '"' || c == '\'')
1367				goto letter_quote;
1368			break;
1369		case 'r':
1370		case 'R':
1371			c = tok_nextc(tok);
1372			if (c == '"' || c == '\'')
1373				goto letter_quote;
1374			break;
1375		case 'u':
1376		case 'U':
1377			c = tok_nextc(tok);
1378			if (c == 'r' || c == 'R')
1379				c = tok_nextc(tok);
1380			if (c == '"' || c == '\'')
1381				goto letter_quote;
1382			break;
1383		}
1384		while (ascii_isalnum(c) || c == '_') {
1385			c = tok_nextc(tok);
1386		}
1387		tok_backup(tok, c);
1388		*p_start = tok->start;
1389		*p_end = tok->cur;
1390		return NAME;
1391	}
1392
1393	/* Newline */
1394	if (c == '\n') {
1395		tok->atbol = 1;
1396		if (blankline || tok->level > 0)
1397			goto nextline;
1398		*p_start = tok->start;
1399		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
1400                tok->cont_line = 0;
1401		return NEWLINE;
1402	}
1403
1404	/* Period or number starting with period? */
1405	if (c == '.') {
1406		c = tok_nextc(tok);
1407		if (isdigit(c)) {
1408			goto fraction;
1409		}
1410		else {
1411			tok_backup(tok, c);
1412			*p_start = tok->start;
1413			*p_end = tok->cur;
1414			return DOT;
1415		}
1416	}
1417
1418	/* Number */
1419	if (isdigit(c)) {
1420		if (c == '0') {
1421			/* Hex, octal or binary -- maybe. */
1422			c = tok_nextc(tok);
1423			if (c == '.')
1424				goto fraction;
1425#ifndef WITHOUT_COMPLEX
1426			if (c == 'j' || c == 'J')
1427				goto imaginary;
1428#endif
1429			if (c == 'x' || c == 'X') {
1430
1431				/* Hex */
1432				c = tok_nextc(tok);
1433				if (!isxdigit(c)) {
1434					tok->done = E_TOKEN;
1435					tok_backup(tok, c);
1436					return ERRORTOKEN;
1437				}
1438				do {
1439					c = tok_nextc(tok);
1440				} while (isxdigit(c));
1441			}
1442                        else if (c == 'o' || c == 'O') {
1443				/* Octal */
1444				c = tok_nextc(tok);
1445				if (c < '0' || c >= '8') {
1446					tok->done = E_TOKEN;
1447					tok_backup(tok, c);
1448					return ERRORTOKEN;
1449				}
1450				do {
1451					c = tok_nextc(tok);
1452				} while ('0' <= c && c < '8');
1453			}
1454			else if (c == 'b' || c == 'B') {
1455				/* Binary */
1456				c = tok_nextc(tok);
1457				if (c != '0' && c != '1') {
1458					tok->done = E_TOKEN;
1459					tok_backup(tok, c);
1460					return ERRORTOKEN;
1461				}
1462				do {
1463					c = tok_nextc(tok);
1464				} while (c == '0' || c == '1');
1465			}
1466			else {
1467				int found_decimal = 0;
1468				/* Octal; c is first char of it */
1469				/* There's no 'isoctdigit' macro, sigh */
1470				while ('0' <= c && c < '8') {
1471					c = tok_nextc(tok);
1472				}
1473				if (isdigit(c)) {
1474					found_decimal = 1;
1475					do {
1476						c = tok_nextc(tok);
1477					} while (isdigit(c));
1478				}
1479				if (c == '.')
1480					goto fraction;
1481				else if (c == 'e' || c == 'E')
1482					goto exponent;
1483#ifndef WITHOUT_COMPLEX
1484				else if (c == 'j' || c == 'J')
1485					goto imaginary;
1486#endif
1487				else if (found_decimal) {
1488					tok->done = E_TOKEN;
1489					tok_backup(tok, c);
1490					return ERRORTOKEN;
1491				}
1492			}
1493			if (c == 'l' || c == 'L')
1494				c = tok_nextc(tok);
1495		}
1496		else {
1497			/* Decimal */
1498			do {
1499				c = tok_nextc(tok);
1500			} while (isdigit(c));
1501			if (c == 'l' || c == 'L')
1502				c = tok_nextc(tok);
1503			else {
1504				/* Accept floating point numbers. */
1505				if (c == '.') {
1506		fraction:
1507					/* Fraction */
1508					do {
1509						c = tok_nextc(tok);
1510					} while (isdigit(c));
1511				}
1512				if (c == 'e' || c == 'E') {
1513		exponent:
1514					/* Exponent part */
1515					c = tok_nextc(tok);
1516					if (c == '+' || c == '-')
1517						c = tok_nextc(tok);
1518					if (!isdigit(c)) {
1519						tok->done = E_TOKEN;
1520						tok_backup(tok, c);
1521						return ERRORTOKEN;
1522					}
1523					do {
1524						c = tok_nextc(tok);
1525					} while (isdigit(c));
1526				}
1527#ifndef WITHOUT_COMPLEX
1528				if (c == 'j' || c == 'J')
1529					/* Imaginary part */
1530		imaginary:
1531					c = tok_nextc(tok);
1532#endif
1533			}
1534		}
1535		tok_backup(tok, c);
1536		*p_start = tok->start;
1537		*p_end = tok->cur;
1538		return NUMBER;
1539	}
1540
1541  letter_quote:
1542	/* String */
1543	if (c == '\'' || c == '"') {
1544		Py_ssize_t quote2 = tok->cur - tok->start + 1;
1545		int quote = c;
1546		int triple = 0;
1547		int tripcount = 0;
1548		for (;;) {
1549			c = tok_nextc(tok);
1550			if (c == '\n') {
1551				if (!triple) {
1552					tok->done = E_EOLS;
1553					tok_backup(tok, c);
1554					return ERRORTOKEN;
1555				}
1556				tripcount = 0;
1557                                tok->cont_line = 1; /* multiline string. */
1558			}
1559			else if (c == EOF) {
1560				if (triple)
1561					tok->done = E_EOFS;
1562				else
1563					tok->done = E_EOLS;
1564				tok->cur = tok->inp;
1565				return ERRORTOKEN;
1566			}
1567			else if (c == quote) {
1568				tripcount++;
1569				if (tok->cur - tok->start == quote2) {
1570					c = tok_nextc(tok);
1571					if (c == quote) {
1572						triple = 1;
1573						tripcount = 0;
1574						continue;
1575					}
1576					tok_backup(tok, c);
1577				}
1578				if (!triple || tripcount == 3)
1579					break;
1580			}
1581			else if (c == '\\') {
1582				tripcount = 0;
1583				c = tok_nextc(tok);
1584				if (c == EOF) {
1585					tok->done = E_EOLS;
1586					tok->cur = tok->inp;
1587					return ERRORTOKEN;
1588				}
1589			}
1590			else
1591				tripcount = 0;
1592		}
1593		*p_start = tok->start;
1594		*p_end = tok->cur;
1595		return STRING;
1596	}
1597
1598	/* Line continuation */
1599	if (c == '\\') {
1600		c = tok_nextc(tok);
1601		if (c != '\n') {
1602			tok->done = E_LINECONT;
1603			tok->cur = tok->inp;
1604			return ERRORTOKEN;
1605		}
1606                tok->cont_line = 1;
1607		goto again; /* Read next line */
1608	}
1609
1610	/* Check for two-character token */
1611	{
1612		int c2 = tok_nextc(tok);
1613		int token = PyToken_TwoChars(c, c2);
1614#ifndef PGEN
1615		if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1616			if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1617					       "<> not supported in 3.x; use !=",
1618					       tok->filename, tok->lineno,
1619					       NULL, NULL)) {
1620				return ERRORTOKEN;
1621			}
1622		}
1623#endif
1624		if (token != OP) {
1625			int c3 = tok_nextc(tok);
1626			int token3 = PyToken_ThreeChars(c, c2, c3);
1627			if (token3 != OP) {
1628				token = token3;
1629			} else {
1630				tok_backup(tok, c3);
1631			}
1632			*p_start = tok->start;
1633			*p_end = tok->cur;
1634			return token;
1635		}
1636		tok_backup(tok, c2);
1637	}
1638
1639	/* Keep track of parentheses nesting level */
1640	switch (c) {
1641	case '(':
1642	case '[':
1643	case '{':
1644		tok->level++;
1645		break;
1646	case ')':
1647	case ']':
1648	case '}':
1649		tok->level--;
1650		break;
1651	}
1652
1653	/* Punctuation character */
1654	*p_start = tok->start;
1655	*p_end = tok->cur;
1656	return PyToken_OneChar(c);
1657}
1658
1659int
1660PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1661{
1662	int result = tok_get(tok, p_start, p_end);
1663	if (tok->decoding_erred) {
1664		result = ERRORTOKEN;
1665		tok->done = E_DECODE;
1666	}
1667	return result;
1668}
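
/* A minimal sketch of a driver loop, in the spirit of how the parser uses
   this API (illustrative only; "source" is a caller-supplied string and
   handle_token() stands for whatever the caller does with the text between
   start and end):

	struct tok_state *tok = PyTokenizer_FromString(source, 1);
	char *start, *end;
	int type;
	do {
		type = PyTokenizer_Get(tok, &start, &end);
		handle_token(type, start, end);
	} while (type != ENDMARKER && type != ERRORTOKEN);
	PyTokenizer_Free(tok);
*/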
1669
1670/* This function is only called from parsetok. However, it cannot live
1671   there, as it must be empty for PGEN, and we can check for PGEN only
1672   in this file. */
1673
1674#if defined(PGEN) || !defined(Py_USING_UNICODE)
1675char*
1676PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1677{
1678	return NULL;
1679}
1680#else
1681#ifdef Py_USING_UNICODE
1682static PyObject *
1683dec_utf8(const char *enc, const char *text, size_t len) {
1684	PyObject *ret = NULL;
1685	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1686	if (unicode_text) {
1687		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1688		Py_DECREF(unicode_text);
1689	}
1690	if (!ret) {
1691		PyErr_Clear();
1692	}
1693	return ret;
1694}
1695char *
1696PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1697{
1698	char *text = NULL;
1699	if (tok->encoding) {
1700		/* convert source back to its original encoding */
1701		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1702		if (lineobj != NULL) {
1703			int linelen = PyString_Size(lineobj);
1704			const char *line = PyString_AsString(lineobj);
1705			text = PyObject_MALLOC(linelen + 1);
1706			if (text != NULL && line != NULL) {
1707				if (linelen)
1708					strncpy(text, line, linelen);
1709				text[linelen] = '\0';
1710			}
1711			Py_DECREF(lineobj);
1712
1713			/* adjust error offset */
1714			if (*offset > 1) {
1715				PyObject *offsetobj = dec_utf8(tok->encoding,
1716							       tok->buf, *offset-1);
1717				if (offsetobj) {
1718					*offset = PyString_Size(offsetobj) + 1;
1719					Py_DECREF(offsetobj);
1720				}
1721			}
1722
1723		}
1724	}
1725	return text;
1726
1727}
1728#endif /* defined(Py_USING_UNICODE) */
1729#endif
1730
1731
1732#ifdef Py_DEBUG
1733
1734void
1735tok_dump(int type, char *start, char *end)
1736{
1737	printf("%s", _PyParser_TokenNames[type]);
1738	if (type == NAME || type == NUMBER || type == STRING || type == OP)
1739		printf("(%.*s)", (int)(end - start), start);
1740}
1741
1742#endif
1743