tokenizer.c revision 9ff19b54346d39d15cdcf75e9d66ab46ea6064d6
1
2/* Tokenizer implementation */
3
4#include "Python.h"
5#include "pgenheaders.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#include "pydebug.h"
20#endif /* PGEN */
21
22extern char *PyOS_Readline(FILE *, FILE *, char *);
23/* Return malloc'ed string including trailing \n;
24   empty malloc'ed string for EOF;
25   NULL if interrupted */
26
27/* Don't ever change this -- it would break the portability of Python code */
28#define TABSIZE 8
29
30/* Convert a possibly signed character to a nonnegative int */
31/* XXX This assumes characters are 8 bits wide */
32#ifdef __CHAR_UNSIGNED__
33#define Py_CHARMASK(c)		(c)
34#else
35#define Py_CHARMASK(c)		((c) & 0xff)
36#endif
37
38/* Forward */
39static struct tok_state *tok_new(void);
40static int tok_nextc(struct tok_state *tok);
41static void tok_backup(struct tok_state *tok, int c);
42
43/* Token names */
44
45char *_PyParser_TokenNames[] = {
46	"ENDMARKER",
47	"NAME",
48	"NUMBER",
49	"STRING",
50	"NEWLINE",
51	"INDENT",
52	"DEDENT",
53	"LPAR",
54	"RPAR",
55	"LSQB",
56	"RSQB",
57	"COLON",
58	"COMMA",
59	"SEMI",
60	"PLUS",
61	"MINUS",
62	"STAR",
63	"SLASH",
64	"VBAR",
65	"AMPER",
66	"LESS",
67	"GREATER",
68	"EQUAL",
69	"DOT",
70	"PERCENT",
71	"BACKQUOTE",
72	"LBRACE",
73	"RBRACE",
74	"EQEQUAL",
75	"NOTEQUAL",
76	"LESSEQUAL",
77	"GREATEREQUAL",
78	"TILDE",
79	"CIRCUMFLEX",
80	"LEFTSHIFT",
81	"RIGHTSHIFT",
82	"DOUBLESTAR",
83	"PLUSEQUAL",
84	"MINEQUAL",
85	"STAREQUAL",
86	"SLASHEQUAL",
87	"PERCENTEQUAL",
88	"AMPEREQUAL",
89	"VBAREQUAL",
90	"CIRCUMFLEXEQUAL",
91	"LEFTSHIFTEQUAL",
92	"RIGHTSHIFTEQUAL",
93	"DOUBLESTAREQUAL",
94	"DOUBLESLASH",
95	"DOUBLESLASHEQUAL",
96	"AT",
97	/* This table must match the #defines in token.h! */
98	"OP",
99	"<ERRORTOKEN>",
100	"<N_TOKENS>"
101};
102
103
104/* Create and initialize a new tok_state structure */
105
106static struct tok_state *
107tok_new(void)
108{
109	struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
110                                                sizeof(struct tok_state));
111	if (tok == NULL)
112		return NULL;
113	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
114	tok->done = E_OK;
115	tok->fp = NULL;
116	tok->tabsize = TABSIZE;
117	tok->indent = 0;
118	tok->indstack[0] = 0;
119	tok->atbol = 1;
120	tok->pendin = 0;
121	tok->prompt = tok->nextprompt = NULL;
122	tok->lineno = 0;
123	tok->level = 0;
124	tok->filename = NULL;
125	tok->altwarning = 0;
126	tok->alterror = 0;
127	tok->alttabsize = 1;
128	tok->altindstack[0] = 0;
129	tok->decoding_state = 0;	/* 0: not yet determined; >0: raw input; <0: decode via tok->decoding_readline */
130	tok->decoding_erred = 0;
131	tok->read_coding_spec = 0;
132	tok->encoding = NULL;
133        tok->cont_line = 0;
134#ifndef PGEN
135	tok->decoding_readline = NULL;
136	tok->decoding_buffer = NULL;
137#endif
138	return tok;
139}
140
141#ifdef PGEN
142
143static char *
144decoding_fgets(char *s, int size, struct tok_state *tok)
145{
146	return fgets(s, size, tok->fp);
147}
148
149static int
150decoding_feof(struct tok_state *tok)
151{
152	return feof(tok->fp);
153}
154
155static const char *
156decode_str(const char *str, struct tok_state *tok)
157{
158	return str;
159}
160
161#else /* PGEN */
162
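/* Record a decoding error on TOK, free its line buffer if it owns one
   (see PyTokenizer_Free), and return NULL so callers treat it as EOF. */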
163static char *
164error_ret(struct tok_state *tok) /* XXX */
165{
166	tok->decoding_erred = 1;
167	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
168		PyMem_FREE(tok->buf);
169	tok->buf = NULL;
170	return NULL;		/* as if it were EOF */
171}
172
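/* Return a NUL-terminated heap copy of the first LEN bytes of S,
   or NULL if allocation fails. */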
173static char *
174new_string(const char *s, Py_ssize_t len)
175{
176	char* result = (char *)PyMem_MALLOC(len + 1);
177	if (result != NULL) {
178		memcpy(result, s, len);
179		result[len] = '\0';
180	}
181	return result;
182}
183
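/* Normalize an encoding name: fold the common spellings of utf-8 and
   latin-1 to "utf-8" and "iso-8859-1"; return S unchanged otherwise. */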
184static char *
185get_normal_name(char *s)	/* for utf-8 and latin-1 */
186{
187	char buf[13];
188	int i;
189	for (i = 0; i < 12; i++) {
190		int c = s[i];
191		if (c == '\0') break;
192		else if (c == '_') buf[i] = '-';
193		else buf[i] = tolower(c);
194	}
195	buf[i] = '\0';
196	if (strcmp(buf, "utf-8") == 0 ||
197	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
198	else if (strcmp(buf, "latin-1") == 0 ||
199		 strcmp(buf, "iso-8859-1") == 0 ||
200		 strcmp(buf, "iso-latin-1") == 0 ||
201		 strncmp(buf, "latin-1-", 8) == 0 ||
202		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
203		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
204	else return s;
205}
206
207/* Return the coding spec in S, or NULL if none is found.  */
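/* A typical spec, per PEP 263: "# -*- coding: utf-8 -*-" yields "utf-8". */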
208
209static char *
210get_coding_spec(const char *s, Py_ssize_t size)
211{
212	Py_ssize_t i;
213	/* Coding spec must be in a comment, and that comment must be
214         * the only statement on the source code line. */
215        for (i = 0; i < size - 6; i++) {
216		if (s[i] == '#')
217			break;
218		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219			return NULL;
220	}
221	for (; i < size - 6; i++) { /* XXX inefficient search */
222		const char* t = s + i;
223		if (strncmp(t, "coding", 6) == 0) {
224			const char* begin = NULL;
225			t += 6;
226			if (t[0] != ':' && t[0] != '=')
227				continue;
228			do {
229				t++;
230			} while (t[0] == '\x20' || t[0] == '\t');
231
232			begin = t;
233			while (isalnum(Py_CHARMASK(t[0])) ||
234			       t[0] == '-' || t[0] == '_' || t[0] == '.')
235				t++;
236
237			if (begin < t) {
238				char* r = new_string(begin, t - begin);
239				char* q = get_normal_name(r);
240				if (r != q) {
241					PyMem_FREE(r);
242					r = new_string(q, strlen(q));
243				}
244				return r;
245			}
246		}
247	}
248	return NULL;
249}
250
251/* Check whether the line contains a coding spec. If it does,
252   invoke the set_readline function for the new encoding.
253   This function receives the tok_state and the new encoding.
254   Return 1 on success, 0 on failure.  */
255
256static int
257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
258		  int set_readline(struct tok_state *, const char *))
259{
260	char * cs;
261	int r = 1;
262
263        if (tok->cont_line)
264		/* It's a continuation line, so it can't be a coding spec. */
265		return 1;
266	cs = get_coding_spec(line, size);
267	if (cs != NULL) {
268		tok->read_coding_spec = 1;
269		if (tok->encoding == NULL) {
270			assert(tok->decoding_state == 1); /* raw */
271			if (strcmp(cs, "utf-8") == 0 ||
272			    strcmp(cs, "iso-8859-1") == 0) {
273				tok->encoding = cs;
274			} else {
275#ifdef Py_USING_UNICODE
276				r = set_readline(tok, cs);
277				if (r) {
278					tok->encoding = cs;
279					tok->decoding_state = -1;
280				}
281				else
282					PyMem_FREE(cs);
283#else
284                                /* Without Unicode support, we cannot
285                                   process the coding spec. Since there
286                                   won't be any Unicode literals, that
287                                   won't matter. */
288				PyMem_FREE(cs);
289#endif
290			}
291	} else {	/* tok->encoding was already set (e.g. from a BOM); check that it matches cs */
292			r = (strcmp(tok->encoding, cs) == 0);
293			PyMem_FREE(cs);
294		}
295	}
296	if (!r) {
297		cs = tok->encoding;
298		if (!cs)
299			cs = "with BOM";
300		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
301	}
302	return r;
303}
304
305/* See whether the file starts with a BOM. If it does,
306   invoke the set_readline function with the new encoding.
307   Return 1 on success, 0 on failure.  */
308
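/* The UTF-8 BOM is the byte sequence 0xEF 0xBB 0xBF. */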
309static int
310check_bom(int get_char(struct tok_state *),
311	  void unget_char(int, struct tok_state *),
312	  int set_readline(struct tok_state *, const char *),
313	  struct tok_state *tok)
314{
315	int ch = get_char(tok);
316	tok->decoding_state = 1;
317	if (ch == EOF) {
318		return 1;
319	} else if (ch == 0xEF) {
320		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
321		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
322#if 0
323	/* Disable support for UTF-16 BOMs until a decision
324	   is made whether this needs to be supported.  */
325	} else if (ch == 0xFE) {
326		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
327		if (!set_readline(tok, "utf-16-be")) return 0;
328		tok->decoding_state = -1;
329	} else if (ch == 0xFF) {
330		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
331		if (!set_readline(tok, "utf-16-le")) return 0;
332		tok->decoding_state = -1;
333#endif
334	} else {
335		unget_char(ch, tok);
336		return 1;
337	}
338	if (tok->encoding != NULL)
339		PyMem_FREE(tok->encoding);
340	tok->encoding = new_string("utf-8", 5);	/* resulting string is in utf-8 */
341	return 1;
342  NON_BOM:
343	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
344	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
345	return 1;
346}
347
348/* Read a line of text from TOK into S, using the stream in TOK.
349   Return NULL on failure, else S.
350
351   On entry, tok->decoding_buffer will be one of:
352     1) NULL: need to call tok->decoding_readline to get a new line
353     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
354           stored the result in tok->decoding_buffer
355     3) PyStringObject *: previous call to fp_readl did not have enough room
356           (in the s buffer) to copy entire contents of the line read
357           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
358           In this case, fp_readl is called in a loop (with an expanded buffer)
359           until the buffer ends with a '\n' (or until the end of the file is
360           reached): see tok_nextc and its calls to decoding_fgets.
361*/
362
363static char *
364fp_readl(char *s, int size, struct tok_state *tok)
365{
366#ifndef Py_USING_UNICODE
367	/* In a non-Unicode build, this should never be called. */
368	Py_FatalError("fp_readl should not be called in this build.");
369	return NULL; /* Keep compiler happy (not reachable) */
370#else
371	PyObject* utf8 = NULL;
372	PyObject* buf = tok->decoding_buffer;
373	char *str;
374	Py_ssize_t utf8len;
375
376	/* Ask for one less byte so we can terminate it */
377	assert(size > 0);
378	size--;
379
380	if (buf == NULL) {
381		buf = PyObject_CallObject(tok->decoding_readline, NULL);
382		if (buf == NULL)
383			return error_ret(tok);
384	} else {
385		tok->decoding_buffer = NULL;
386		if (PyString_CheckExact(buf))
387			utf8 = buf;
388	}
389	if (utf8 == NULL) {
390		utf8 = PyUnicode_AsUTF8String(buf);
391		Py_DECREF(buf);
392		if (utf8 == NULL)
393			return error_ret(tok);
394	}
395	str = PyString_AsString(utf8);
396	utf8len = PyString_GET_SIZE(utf8);
397	if (utf8len > size) {
398		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
399		if (tok->decoding_buffer == NULL) {
400			Py_DECREF(utf8);
401			return error_ret(tok);
402		}
403		utf8len = size;
404	}
405	memcpy(s, str, utf8len);
406	s[utf8len] = '\0';
407	Py_DECREF(utf8);
408	if (utf8len == 0) return NULL; /* EOF */
409	return s;
410#endif
411}
412
413/* Set the readline function for TOK to a StreamReader's
414   readline function. The StreamReader is named ENC.
415
416   This function is called from check_bom and check_coding_spec.
417
418   ENC is usually identical to the future value of tok->encoding,
419   except for the (currently unsupported) case of UTF-16.
420
421   Return 1 on success, 0 on failure. */
422
423static int
424fp_setreadl(struct tok_state *tok, const char* enc)
425{
426	PyObject *reader, *stream, *readline;
427
428	/* XXX: constify filename argument. */
429	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
430	if (stream == NULL)
431		return 0;
432
433	reader = PyCodec_StreamReader(enc, stream, NULL);
434	Py_DECREF(stream);
435	if (reader == NULL)
436		return 0;
437
438	readline = PyObject_GetAttrString(reader, "readline");
439	Py_DECREF(reader);
440	if (readline == NULL)
441		return 0;
442
443	tok->decoding_readline = readline;
444	return 1;
445}
446
447/* Fetch the next byte from TOK. */
448
449static int fp_getc(struct tok_state *tok) {
450	return getc(tok->fp);
451}
452
453/* Unfetch the last byte back into TOK.  */
454
455static void fp_ungetc(int c, struct tok_state *tok) {
456	ungetc(c, tok->fp);
457}
458
459/* Read a line of input from TOK. Determine encoding
460   if necessary.  */
461
462static char *
463decoding_fgets(char *s, int size, struct tok_state *tok)
464{
465	char *line = NULL;
466	int badchar = 0;
467	for (;;) {
468		if (tok->decoding_state < 0) {
469			/* We already have a codec associated with
470			   this input. */
471			line = fp_readl(s, size, tok);
472			break;
473		} else if (tok->decoding_state > 0) {
474			/* We want a 'raw' read. */
475			line = Py_UniversalNewlineFgets(s, size,
476							tok->fp, NULL);
477			break;
478		} else {
479			/* We have not yet determined the encoding.
480			   If an encoding is found, use the file-pointer
481			   reader functions from now on. */
482			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
483				return error_ret(tok);
484			assert(tok->decoding_state != 0);
485		}
486	}
487	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
488		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
489			return error_ret(tok);
490		}
491	}
492#ifndef PGEN
493	/* The default encoding is ASCII, so make sure we don't have any
494           non-ASCII bytes in it. */
495	if (line && !tok->encoding) {
496		unsigned char *c;
497		for (c = (unsigned char *)line; *c; c++)
498			if (*c > 127) {
499				badchar = *c;
500				break;
501			}
502	}
503	if (badchar) {
504		char buf[500];
505		/* Need to add 1 to the line number, since this line
506		   has not been counted yet.  */
507		sprintf(buf,
508			"Non-ASCII character '\\x%.2x' "
509			"in file %.200s on line %i, "
510			"but no encoding declared; "
511			"see http://www.python.org/peps/pep-0263.html for details",
512			badchar, tok->filename, tok->lineno + 1);
513		PyErr_SetString(PyExc_SyntaxError, buf);
514		return error_ret(tok);
515	}
516#endif
517	return line;
518}
519
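/* Return nonzero at end of input on TOK.  When a codec is in use, peek
   ahead by reading one line and caching it in tok->decoding_buffer. */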
520static int
521decoding_feof(struct tok_state *tok)
522{
523	if (tok->decoding_state >= 0) {
524		return feof(tok->fp);
525	} else {
526		PyObject* buf = tok->decoding_buffer;
527		if (buf == NULL) {
528			buf = PyObject_CallObject(tok->decoding_readline, NULL);
529			if (buf == NULL) {
530				error_ret(tok);
531				return 1;
532			} else {
533				tok->decoding_buffer = buf;
534			}
535		}
536		return PyObject_Length(buf) == 0;
537	}
538}
539
540/* Fetch a byte from TOK, using the string buffer. */
541
542static int
543buf_getc(struct tok_state *tok) {
544	return Py_CHARMASK(*tok->str++);
545}
546
547/* Unfetch a byte from TOK, using the string buffer. */
548
549static void
550buf_ungetc(int c, struct tok_state *tok) {
551	tok->str--;
552	assert(Py_CHARMASK(*tok->str) == c);	/* tok->cur may point to read-only segment */
553}
554
555/* Set the readline function for TOK to ENC. For the string-based
556   tokenizer, this means to just record the encoding. */
557
558static int
559buf_setreadl(struct tok_state *tok, const char* enc) {
560	tok->enc = enc;
561	return 1;
562}
563
564/* Return a UTF-8 encoded Python string object from the
565   C byte string STR, which is encoded with ENC. */
566
567#ifdef Py_USING_UNICODE
568static PyObject *
569translate_into_utf8(const char* str, const char* enc) {
570	PyObject *utf8;
571	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
572	if (buf == NULL)
573		return NULL;
574	utf8 = PyUnicode_AsUTF8String(buf);
575	Py_DECREF(buf);
576	return utf8;
577}
578#endif
579
580/* Decode a byte string STR for use as the buffer of TOK.
581   Look for encoding declarations inside STR, and record them
582   inside TOK.  */
583
584static const char *
585decode_str(const char *str, struct tok_state *tok)
586{
587	PyObject* utf8 = NULL;
588	const char *s;
589	const char *newl[2] = {NULL, NULL};
590	int lineno = 0;
591	tok->enc = NULL;
592	tok->str = str;
593	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
594		return error_ret(tok);
595	str = tok->str;		/* string after BOM if any */
596	assert(str);
597#ifdef Py_USING_UNICODE
598	if (tok->enc != NULL) {
599		utf8 = translate_into_utf8(str, tok->enc);
600		if (utf8 == NULL)
601			return error_ret(tok);
602		str = PyString_AsString(utf8);
603	}
604#endif
605	for (s = str;; s++) {
606		if (*s == '\0') break;
607		else if (*s == '\n') {
608			assert(lineno < 2);
609			newl[lineno] = s;
610			lineno++;
611			if (lineno == 2) break;
612		}
613	}
614	tok->enc = NULL;
615	/* need to check line 1 and 2 separately since check_coding_spec
616	   assumes a single line as input */
617	if (newl[0]) {
618		if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
619			return error_ret(tok);
620		if (tok->enc == NULL && newl[1]) {
621			if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
622					       tok, buf_setreadl))
623				return error_ret(tok);
624		}
625	}
626#ifdef Py_USING_UNICODE
627	if (tok->enc != NULL) {
628		assert(utf8 == NULL);
629		utf8 = translate_into_utf8(str, tok->enc);
630		if (utf8 == NULL) {
631			PyErr_Format(PyExc_SyntaxError,
632				"unknown encoding: %s", tok->enc);
633			return error_ret(tok);
634		}
635		str = PyString_AsString(utf8);
636	}
637#endif
638	assert(tok->decoding_buffer == NULL);
639	tok->decoding_buffer = utf8; /* CAUTION: keeps utf8 (and hence str) alive; released in PyTokenizer_Free */
640	return str;
641}
642
643#endif /* PGEN */
644
645/* Set up tokenizer for string */
646
647struct tok_state *
648PyTokenizer_FromString(const char *str)
649{
650	struct tok_state *tok = tok_new();
651	if (tok == NULL)
652		return NULL;
653	str = (char *)decode_str(str, tok);
654	if (str == NULL) {
655		PyTokenizer_Free(tok);
656		return NULL;
657	}
658
659	/* XXX: constify members. */
660	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
661	return tok;
662}
663
664
665/* Set up tokenizer for file */
666
667struct tok_state *
668PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
669{
670	struct tok_state *tok = tok_new();
671	if (tok == NULL)
672		return NULL;
673	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
674		PyTokenizer_Free(tok);
675		return NULL;
676	}
677	tok->cur = tok->inp = tok->buf;
678	tok->end = tok->buf + BUFSIZ;
679	tok->fp = fp;
680	tok->prompt = ps1;
681	tok->nextprompt = ps2;
682	return tok;
683}
684
685
686/* Free a tok_state structure */
687
688void
689PyTokenizer_Free(struct tok_state *tok)
690{
691	if (tok->encoding != NULL)
692		PyMem_FREE(tok->encoding);
693#ifndef PGEN
694	Py_XDECREF(tok->decoding_readline);
695	Py_XDECREF(tok->decoding_buffer);
696#endif
697	if (tok->fp != NULL && tok->buf != NULL)
698		PyMem_FREE(tok->buf);
699	PyMem_FREE(tok);
700}
701
702#if !defined(PGEN) && defined(Py_USING_UNICODE)
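/* If sys.stdin has a declared encoding, re-encode the interactive input
   line *INP from that encoding to UTF-8 and record the encoding in
   tok->encoding.  Return 0 on success (or when nothing needs doing),
   -1 on memory error. */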
703static int
704tok_stdin_decode(struct tok_state *tok, char **inp)
705{
706	PyObject *enc, *sysstdin, *decoded, *utf8;
707	const char *encoding;
708	char *converted;
709
710	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
711		return 0;
712	sysstdin = PySys_GetObject("stdin");
713	if (sysstdin == NULL || !PyFile_Check(sysstdin))
714		return 0;
715
716	enc = ((PyFileObject *)sysstdin)->f_encoding;
717	if (enc == NULL || !PyString_Check(enc))
718		return 0;
719	Py_INCREF(enc);
720
721	encoding = PyString_AsString(enc);
722	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
723	if (decoded == NULL)
724		goto error_clear;
725
726	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
727	Py_DECREF(decoded);
728	if (utf8 == NULL)
729		goto error_clear;
730
731	assert(PyString_Check(utf8));
732	converted = new_string(PyString_AS_STRING(utf8),
733			       PyString_GET_SIZE(utf8));
734	Py_DECREF(utf8);
735	if (converted == NULL)
736		goto error_nomem;
737
738	PyMem_FREE(*inp);
739	*inp = converted;
740	if (tok->encoding != NULL)
741		PyMem_FREE(tok->encoding);
742	tok->encoding = new_string(encoding, strlen(encoding));
743	if (tok->encoding == NULL)
744		goto error_nomem;
745
746	Py_DECREF(enc);
747	return 0;
748
749error_nomem:
750	Py_DECREF(enc);
751	tok->done = E_NOMEM;
752	return -1;
753
754error_clear:
755	/* Fallback to iso-8859-1: for backward compatibility */
756	Py_DECREF(enc);
757	PyErr_Clear();
758	return 0;
759}
760#endif
761
762/* Get next char, updating state; error code goes into tok->done */
763
764static int
765tok_nextc(register struct tok_state *tok)
766{
767	for (;;) {
768		if (tok->cur != tok->inp) {
769			return Py_CHARMASK(*tok->cur++); /* Fast path */
770		}
771		if (tok->done != E_OK)
772			return EOF;
773		if (tok->fp == NULL) {
774			char *end = strchr(tok->inp, '\n');
775			if (end != NULL)
776				end++;
777			else {
778				end = strchr(tok->inp, '\0');
779				if (end == tok->inp) {
780					tok->done = E_EOF;
781					return EOF;
782				}
783			}
784			if (tok->start == NULL)
785				tok->buf = tok->cur;
786			tok->line_start = tok->cur;
787			tok->lineno++;
788			tok->inp = end;
789			return Py_CHARMASK(*tok->cur++);
790		}
791		if (tok->prompt != NULL) {
792			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
793			if (tok->nextprompt != NULL)
794				tok->prompt = tok->nextprompt;
795			if (newtok == NULL)
796				tok->done = E_INTR;
797			else if (*newtok == '\0') {
798				PyMem_FREE(newtok);
799				tok->done = E_EOF;
800			}
801#if !defined(PGEN) && defined(Py_USING_UNICODE)
802			else if (tok_stdin_decode(tok, &newtok) != 0)
803				PyMem_FREE(newtok);
804#endif
805			else if (tok->start != NULL) {
806				size_t start = tok->start - tok->buf;
807				size_t oldlen = tok->cur - tok->buf;
808				size_t newlen = oldlen + strlen(newtok);
809				char *buf = tok->buf;
810				buf = (char *)PyMem_REALLOC(buf, newlen+1);
811				tok->lineno++;
812				if (buf == NULL) {
813					PyMem_FREE(tok->buf);
814					tok->buf = NULL;
815					PyMem_FREE(newtok);
816					tok->done = E_NOMEM;
817					return EOF;
818				}
819				tok->buf = buf;
820				tok->cur = tok->buf + oldlen;
821				tok->line_start = tok->cur;
822				strcpy(tok->buf + oldlen, newtok);
823				PyMem_FREE(newtok);
824				tok->inp = tok->buf + newlen;
825				tok->end = tok->inp + 1;
826				tok->start = tok->buf + start;
827			}
828			else {
829				tok->lineno++;
830				if (tok->buf != NULL)
831					PyMem_FREE(tok->buf);
832				tok->buf = newtok;
833				tok->line_start = tok->buf;
834				tok->cur = tok->buf;
836				tok->inp = strchr(tok->buf, '\0');
837				tok->end = tok->inp + 1;
838			}
839		}
840		else {
841			int done = 0;
842			Py_ssize_t cur = 0;
843			char *pt;
844			if (tok->start == NULL) {
845				if (tok->buf == NULL) {
846					tok->buf = (char *)
847						PyMem_MALLOC(BUFSIZ);
848					if (tok->buf == NULL) {
849						tok->done = E_NOMEM;
850						return EOF;
851					}
852					tok->end = tok->buf + BUFSIZ;
853				}
854				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
855					  tok) == NULL) {
856					tok->done = E_EOF;
857					done = 1;
858				}
859				else {
860					tok->done = E_OK;
861					tok->inp = strchr(tok->buf, '\0');
862					done = tok->inp[-1] == '\n';
863				}
864			}
865			else {
866				cur = tok->cur - tok->buf;
867				if (decoding_feof(tok)) {
868					tok->done = E_EOF;
869					done = 1;
870				}
871				else
872					tok->done = E_OK;
873			}
874			tok->lineno++;
875			/* Read until '\n' or EOF */
876			while (!done) {
877				Py_ssize_t curstart = tok->start == NULL ? -1 :
878					          tok->start - tok->buf;
879				Py_ssize_t curvalid = tok->inp - tok->buf;
880				Py_ssize_t newsize = curvalid + BUFSIZ;
881				char *newbuf = tok->buf;
882				newbuf = (char *)PyMem_REALLOC(newbuf,
883							       newsize);
884				if (newbuf == NULL) {
885					tok->done = E_NOMEM;
886					tok->cur = tok->inp;
887					return EOF;
888				}
889				tok->buf = newbuf;
890				tok->inp = tok->buf + curvalid;
891				tok->end = tok->buf + newsize;
892				tok->start = curstart < 0 ? NULL :
893					     tok->buf + curstart;
894				if (decoding_fgets(tok->inp,
895					       (int)(tok->end - tok->inp),
896					       tok) == NULL) {
897					/* Break out early on decoding
898					   errors, as tok->buf will be NULL
899					 */
900					if (tok->decoding_erred)
901						return EOF;
902					/* Last line does not end in \n,
903					   fake one */
904					strcpy(tok->inp, "\n");
905				}
906				tok->inp = strchr(tok->inp, '\0');
907				done = tok->inp[-1] == '\n';
908			}
909			if (tok->buf != NULL) {
910				tok->cur = tok->buf + cur;
911				tok->line_start = tok->cur;
912				/* replace "\r\n" with "\n" */
913				/* For Mac leave the \r, giving a syntax error */
914				pt = tok->inp - 2;
915				if (pt >= tok->buf && *pt == '\r') {
916					*pt++ = '\n';
917					*pt = '\0';
918					tok->inp = pt;
919				}
920			}
921		}
922		if (tok->done != E_OK) {
923			if (tok->prompt != NULL)
924				PySys_WriteStderr("\n");
925			tok->cur = tok->inp;
926			return EOF;
927		}
928	}
929	/*NOTREACHED*/
930}
931
932
933/* Back-up one character */
934
935static void
936tok_backup(register struct tok_state *tok, register int c)
937{
938	if (c != EOF) {
939		if (--tok->cur < tok->buf)
940			Py_FatalError("tok_backup: begin of buffer");
941		if (*tok->cur != c)
942			*tok->cur = c;
943	}
944}
945
946
947/* Return the token corresponding to a single character */
948
949int
950PyToken_OneChar(int c)
951{
952	switch (c) {
953	case '(':	return LPAR;
954	case ')':	return RPAR;
955	case '[':	return LSQB;
956	case ']':	return RSQB;
957	case ':':	return COLON;
958	case ',':	return COMMA;
959	case ';':	return SEMI;
960	case '+':	return PLUS;
961	case '-':	return MINUS;
962	case '*':	return STAR;
963	case '/':	return SLASH;
964	case '|':	return VBAR;
965	case '&':	return AMPER;
966	case '<':	return LESS;
967	case '>':	return GREATER;
968	case '=':	return EQUAL;
969	case '.':	return DOT;
970	case '%':	return PERCENT;
971	case '`':	return BACKQUOTE;
972	case '{':	return LBRACE;
973	case '}':	return RBRACE;
974	case '^':	return CIRCUMFLEX;
975	case '~':	return TILDE;
976	case '@':       return AT;
977	default:	return OP;
978	}
979}
980
981
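/* Return the token for the two-character operator C1 C2, or OP if the
   pair is not an operator. */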
982int
983PyToken_TwoChars(int c1, int c2)
984{
985	switch (c1) {
986	case '=':
987		switch (c2) {
988		case '=':	return EQEQUAL;
989		}
990		break;
991	case '!':
992		switch (c2) {
993		case '=':	return NOTEQUAL;
994		}
995		break;
996	case '<':
997		switch (c2) {
998		case '>':	return NOTEQUAL;
999		case '=':	return LESSEQUAL;
1000		case '<':	return LEFTSHIFT;
1001		}
1002		break;
1003	case '>':
1004		switch (c2) {
1005		case '=':	return GREATEREQUAL;
1006		case '>':	return RIGHTSHIFT;
1007		}
1008		break;
1009	case '+':
1010		switch (c2) {
1011		case '=':	return PLUSEQUAL;
1012		}
1013		break;
1014	case '-':
1015		switch (c2) {
1016		case '=':	return MINEQUAL;
1017		}
1018		break;
1019	case '*':
1020		switch (c2) {
1021		case '*':	return DOUBLESTAR;
1022		case '=':	return STAREQUAL;
1023		}
1024		break;
1025	case '/':
1026		switch (c2) {
1027		case '/':	return DOUBLESLASH;
1028		case '=':	return SLASHEQUAL;
1029		}
1030		break;
1031	case '|':
1032		switch (c2) {
1033		case '=':	return VBAREQUAL;
1034		}
1035		break;
1036	case '%':
1037		switch (c2) {
1038		case '=':	return PERCENTEQUAL;
1039		}
1040		break;
1041	case '&':
1042		switch (c2) {
1043		case '=':	return AMPEREQUAL;
1044		}
1045		break;
1046	case '^':
1047		switch (c2) {
1048		case '=':	return CIRCUMFLEXEQUAL;
1049		}
1050		break;
1051	}
1052	return OP;
1053}
1054
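/* Return the token for the three-character operator C1 C2 C3, or OP if
   the triple is not an operator. */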
1055int
1056PyToken_ThreeChars(int c1, int c2, int c3)
1057{
1058	switch (c1) {
1059	case '<':
1060		switch (c2) {
1061		case '<':
1062			switch (c3) {
1063			case '=':
1064				return LEFTSHIFTEQUAL;
1065			}
1066			break;
1067		}
1068		break;
1069	case '>':
1070		switch (c2) {
1071		case '>':
1072			switch (c3) {
1073			case '=':
1074				return RIGHTSHIFTEQUAL;
1075			}
1076			break;
1077		}
1078		break;
1079	case '*':
1080		switch (c2) {
1081		case '*':
1082			switch (c3) {
1083			case '=':
1084				return DOUBLESTAREQUAL;
1085			}
1086			break;
1087		}
1088		break;
1089	case '/':
1090		switch (c2) {
1091		case '/':
1092			switch (c3) {
1093			case '=':
1094				return DOUBLESLASHEQUAL;
1095			}
1096			break;
1097		}
1098		break;
1099	}
1100	return OP;
1101}
1102
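/* Report inconsistent use of tabs and spaces in indentation.  Return 1
   (and set tok->done to E_TABSPACE) when it is treated as an error,
   otherwise emit at most one warning and return 0. */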
1103static int
1104indenterror(struct tok_state *tok)
1105{
1106	if (tok->alterror) {
1107		tok->done = E_TABSPACE;
1108		tok->cur = tok->inp;
1109		return 1;
1110	}
1111	if (tok->altwarning) {
1112		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1113                                  "in indentation\n", tok->filename);
1114		tok->altwarning = 0;
1115	}
1116	return 0;
1117}
1118
1119
1120/* Get next token, after space stripping etc. */
1121
1122static int
1123tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1124{
1125	register int c;
1126	int blankline;
1127
1128	*p_start = *p_end = NULL;
1129  nextline:
1130	tok->start = NULL;
1131	blankline = 0;
1132
1133	/* Get indentation level */
1134	if (tok->atbol) {
1135		register int col = 0;
1136		register int altcol = 0;
1137		tok->atbol = 0;
1138		for (;;) {
1139			c = tok_nextc(tok);
1140			if (c == ' ')
1141				col++, altcol++;
1142			else if (c == '\t') {
1143				col = (col/tok->tabsize + 1) * tok->tabsize;
1144				altcol = (altcol/tok->alttabsize + 1)
1145					* tok->alttabsize;
1146			}
1147			else if (c == '\014') /* Control-L (formfeed) */
1148				col = altcol = 0; /* For Emacs users */
1149			else
1150				break;
1151		}
1152		tok_backup(tok, c);
1153		if (c == '#' || c == '\n') {
1154			/* Lines with only whitespace and/or comments
1155			   shouldn't affect the indentation and are
1156			   not passed to the parser as NEWLINE tokens,
1157			   except *totally* empty lines in interactive
1158			   mode, which signal the end of a command group. */
1159			if (col == 0 && c == '\n' && tok->prompt != NULL)
1160				blankline = 0; /* Let it through */
1161			else
1162				blankline = 1; /* Ignore completely */
1163			/* We can't jump back right here since we still
1164			   may need to skip to the end of a comment */
1165		}
1166		if (!blankline && tok->level == 0) {
1167			if (col == tok->indstack[tok->indent]) {
1168				/* No change */
1169				if (altcol != tok->altindstack[tok->indent]) {
1170					if (indenterror(tok))
1171						return ERRORTOKEN;
1172				}
1173			}
1174			else if (col > tok->indstack[tok->indent]) {
1175				/* Indent -- always one */
1176				if (tok->indent+1 >= MAXINDENT) {
1177					tok->done = E_TOODEEP;
1178					tok->cur = tok->inp;
1179					return ERRORTOKEN;
1180				}
1181				if (altcol <= tok->altindstack[tok->indent]) {
1182					if (indenterror(tok))
1183						return ERRORTOKEN;
1184				}
1185				tok->pendin++;
1186				tok->indstack[++tok->indent] = col;
1187				tok->altindstack[tok->indent] = altcol;
1188			}
1189			else /* col < tok->indstack[tok->indent] */ {
1190				/* Dedent -- any number, must be consistent */
1191				while (tok->indent > 0 &&
1192					col < tok->indstack[tok->indent]) {
1193					tok->pendin--;
1194					tok->indent--;
1195				}
1196				if (col != tok->indstack[tok->indent]) {
1197					tok->done = E_DEDENT;
1198					tok->cur = tok->inp;
1199					return ERRORTOKEN;
1200				}
1201				if (altcol != tok->altindstack[tok->indent]) {
1202					if (indenterror(tok))
1203						return ERRORTOKEN;
1204				}
1205			}
1206		}
1207	}
1208
1209	tok->start = tok->cur;
1210
1211	/* Return pending indents/dedents */
1212	if (tok->pendin != 0) {
1213		if (tok->pendin < 0) {
1214			tok->pendin++;
1215			return DEDENT;
1216		}
1217		else {
1218			tok->pendin--;
1219			return INDENT;
1220		}
1221	}
1222
1223 again:
1224	tok->start = NULL;
1225	/* Skip spaces */
1226	do {
1227		c = tok_nextc(tok);
1228	} while (c == ' ' || c == '\t' || c == '\014');
1229
1230	/* Set start of current token */
1231	tok->start = tok->cur - 1;
1232
1233	/* Skip comment, while looking for tab-setting magic */
1234	if (c == '#') {
1235		static char *tabforms[] = {
1236			"tab-width:",		/* Emacs */
1237			":tabstop=",		/* vim, full form */
1238			":ts=",			/* vim, abbreviated form */
1239			"set tabsize=",		/* will vi never die? */
1240		/* more templates can be added here to support other editors */
1241		};
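		/* e.g. a comment such as "# vi:set tabsize=4:" or
		   "# -*- tab-width: 4 -*-" sets tok->tabsize to 4. */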
1242		char cbuf[80];
1243		char *tp, **cp;
1244		tp = cbuf;
1245		do {
1246			*tp++ = c = tok_nextc(tok);
1247		} while (c != EOF && c != '\n' &&
1248			 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1249		*tp = '\0';
1250		for (cp = tabforms;
1251		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1252		     cp++) {
1253			if ((tp = strstr(cbuf, *cp))) {
1254				int newsize = atoi(tp + strlen(*cp));
1255
1256				if (newsize >= 1 && newsize <= 40) {
1257					tok->tabsize = newsize;
1258					if (Py_VerboseFlag)
1259					    PySys_WriteStderr(
1260						"Tab size set to %d\n",
1261						newsize);
1262				}
1263			}
1264		}
1265		while (c != EOF && c != '\n')
1266			c = tok_nextc(tok);
1267	}
1268
1269	/* Check for EOF and errors now */
1270	if (c == EOF) {
1271		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1272	}
1273
1274	/* Identifier (most frequent token!) */
1275	if (isalpha(c) || c == '_') {
1276		/* Process the b"", r"", u"", br"" and ur"" string prefixes */
1277		switch (c) {
1278		case 'b':
1279		case 'B':
1280			c = tok_nextc(tok);
1281			if (c == 'r' || c == 'R')
1282				c = tok_nextc(tok);
1283			if (c == '"' || c == '\'')
1284				goto letter_quote;
1285			break;
1286		case 'r':
1287		case 'R':
1288			c = tok_nextc(tok);
1289			if (c == '"' || c == '\'')
1290				goto letter_quote;
1291			break;
1292		case 'u':
1293		case 'U':
1294			c = tok_nextc(tok);
1295			if (c == 'r' || c == 'R')
1296				c = tok_nextc(tok);
1297			if (c == '"' || c == '\'')
1298				goto letter_quote;
1299			break;
1300		}
1301		while (isalnum(c) || c == '_') {
1302			c = tok_nextc(tok);
1303		}
1304		tok_backup(tok, c);
1305		*p_start = tok->start;
1306		*p_end = tok->cur;
1307		return NAME;
1308	}
1309
1310	/* Newline */
1311	if (c == '\n') {
1312		tok->atbol = 1;
1313		if (blankline || tok->level > 0)
1314			goto nextline;
1315		*p_start = tok->start;
1316		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
1317                tok->cont_line = 0;
1318		return NEWLINE;
1319	}
1320
1321	/* Period or number starting with period? */
1322	if (c == '.') {
1323		c = tok_nextc(tok);
1324		if (isdigit(c)) {
1325			goto fraction;
1326		}
1327		else {
1328			tok_backup(tok, c);
1329			*p_start = tok->start;
1330			*p_end = tok->cur;
1331			return DOT;
1332		}
1333	}
1334
1335	/* Number */
1336	if (isdigit(c)) {
1337		if (c == '0') {
1338			/* Hex, octal or binary -- maybe. */
1339			c = tok_nextc(tok);
1340			if (c == '.')
1341				goto fraction;
1342#ifndef WITHOUT_COMPLEX
1343			if (c == 'j' || c == 'J')
1344				goto imaginary;
1345#endif
1346			if (c == 'x' || c == 'X') {
1347
1348				/* Hex */
1349				c = tok_nextc(tok);
1350				if (!isxdigit(c)) {
1351					tok->done = E_TOKEN;
1352					tok_backup(tok, c);
1353					return ERRORTOKEN;
1354				}
1355				do {
1356					c = tok_nextc(tok);
1357				} while (isxdigit(c));
1358			}
1359                        else if (c == 'o' || c == 'O') {
1360				/* Octal */
1361				c = tok_nextc(tok);
1362				if (c < '0' || c >= '8') {
1363					tok->done = E_TOKEN;
1364					tok_backup(tok, c);
1365					return ERRORTOKEN;
1366				}
1367				do {
1368					c = tok_nextc(tok);
1369				} while ('0' <= c && c < '8');
1370			}
1371			else if (c == 'b' || c == 'B') {
1372				/* Binary */
1373				c = tok_nextc(tok);
1374				if (c != '0' && c != '1') {
1375					tok->done = E_TOKEN;
1376					tok_backup(tok, c);
1377					return ERRORTOKEN;
1378				}
1379				do {
1380					c = tok_nextc(tok);
1381				} while (c == '0' || c == '1');
1382			}
1383			else {
1384				int found_decimal = 0;
1385				/* Octal; c is first char of it */
1386				/* There's no 'isoctdigit' macro, sigh */
1387				while ('0' <= c && c < '8') {
1388					c = tok_nextc(tok);
1389				}
1390				if (isdigit(c)) {
1391					found_decimal = 1;
1392					do {
1393						c = tok_nextc(tok);
1394					} while (isdigit(c));
1395				}
1396				if (c == '.')
1397					goto fraction;
1398				else if (c == 'e' || c == 'E')
1399					goto exponent;
1400#ifndef WITHOUT_COMPLEX
1401				else if (c == 'j' || c == 'J')
1402					goto imaginary;
1403#endif
1404				else if (found_decimal) {
1405					tok->done = E_TOKEN;
1406					tok_backup(tok, c);
1407					return ERRORTOKEN;
1408				}
1409			}
1410			if (c == 'l' || c == 'L')
1411				c = tok_nextc(tok);
1412		}
1413		else {
1414			/* Decimal */
1415			do {
1416				c = tok_nextc(tok);
1417			} while (isdigit(c));
1418			if (c == 'l' || c == 'L')
1419				c = tok_nextc(tok);
1420			else {
1421				/* Accept floating point numbers. */
1422				if (c == '.') {
1423		fraction:
1424					/* Fraction */
1425					do {
1426						c = tok_nextc(tok);
1427					} while (isdigit(c));
1428				}
1429				if (c == 'e' || c == 'E') {
1430		exponent:
1431					/* Exponent part */
1432					c = tok_nextc(tok);
1433					if (c == '+' || c == '-')
1434						c = tok_nextc(tok);
1435					if (!isdigit(c)) {
1436						tok->done = E_TOKEN;
1437						tok_backup(tok, c);
1438						return ERRORTOKEN;
1439					}
1440					do {
1441						c = tok_nextc(tok);
1442					} while (isdigit(c));
1443				}
1444#ifndef WITHOUT_COMPLEX
1445				if (c == 'j' || c == 'J')
1446					/* Imaginary part */
1447		imaginary:
1448					c = tok_nextc(tok);
1449#endif
1450			}
1451		}
1452		tok_backup(tok, c);
1453		*p_start = tok->start;
1454		*p_end = tok->cur;
1455		return NUMBER;
1456	}
1457
1458  letter_quote:
1459	/* String */
1460	if (c == '\'' || c == '"') {
1461		Py_ssize_t quote2 = tok->cur - tok->start + 1;
1462		int quote = c;
1463		int triple = 0;
1464		int tripcount = 0;
1465		for (;;) {
1466			c = tok_nextc(tok);
1467			if (c == '\n') {
1468				if (!triple) {
1469					tok->done = E_EOLS;
1470					tok_backup(tok, c);
1471					return ERRORTOKEN;
1472				}
1473				tripcount = 0;
1474                                tok->cont_line = 1; /* multiline string. */
1475			}
1476			else if (c == EOF) {
1477				if (triple)
1478					tok->done = E_EOFS;
1479				else
1480					tok->done = E_EOLS;
1481				tok->cur = tok->inp;
1482				return ERRORTOKEN;
1483			}
1484			else if (c == quote) {
1485				tripcount++;
1486				if (tok->cur - tok->start == quote2) {
1487					c = tok_nextc(tok);
1488					if (c == quote) {
1489						triple = 1;
1490						tripcount = 0;
1491						continue;
1492					}
1493					tok_backup(tok, c);
1494				}
1495				if (!triple || tripcount == 3)
1496					break;
1497			}
1498			else if (c == '\\') {
1499				tripcount = 0;
1500				c = tok_nextc(tok);
1501				if (c == EOF) {
1502					tok->done = E_EOLS;
1503					tok->cur = tok->inp;
1504					return ERRORTOKEN;
1505				}
1506			}
1507			else
1508				tripcount = 0;
1509		}
1510		*p_start = tok->start;
1511		*p_end = tok->cur;
1512		return STRING;
1513	}
1514
1515	/* Line continuation */
1516	if (c == '\\') {
1517		c = tok_nextc(tok);
1518		if (c != '\n') {
1519			tok->done = E_LINECONT;
1520			tok->cur = tok->inp;
1521			return ERRORTOKEN;
1522		}
1523                tok->cont_line = 1;
1524		goto again; /* Read next line */
1525	}
1526
1527	/* Check for two-character token */
1528	{
1529		int c2 = tok_nextc(tok);
1530		int token = PyToken_TwoChars(c, c2);
1531#ifndef PGEN
1532		if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1533			if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1534					       "<> not supported in 3.x",
1535					       tok->filename, tok->lineno,
1536					       NULL, NULL)) {
1537				return ERRORTOKEN;
1538			}
1539		}
1540#endif
1541		if (token != OP) {
1542			int c3 = tok_nextc(tok);
1543			int token3 = PyToken_ThreeChars(c, c2, c3);
1544			if (token3 != OP) {
1545				token = token3;
1546			} else {
1547				tok_backup(tok, c3);
1548			}
1549			*p_start = tok->start;
1550			*p_end = tok->cur;
1551			return token;
1552		}
1553		tok_backup(tok, c2);
1554	}
1555
1556	/* Keep track of parentheses nesting level */
1557	switch (c) {
1558	case '(':
1559	case '[':
1560	case '{':
1561		tok->level++;
1562		break;
1563	case ')':
1564	case ']':
1565	case '}':
1566		tok->level--;
1567		break;
1568	}
1569
1570	/* Punctuation character */
1571	*p_start = tok->start;
1572	*p_end = tok->cur;
1573	return PyToken_OneChar(c);
1574}
1575
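/* Public entry point: return the next token, mapping any pending
   decoding error to E_DECODE / ERRORTOKEN. */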
1576int
1577PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1578{
1579	int result = tok_get(tok, p_start, p_end);
1580	if (tok->decoding_erred) {
1581		result = ERRORTOKEN;
1582		tok->done = E_DECODE;
1583	}
1584	return result;
1585}
1586
1587/* This function is only called from parsetok. However, it cannot live
1588   there, as it must be empty for PGEN, and we can check for PGEN only
1589   in this file. */
1590
1591#if defined(PGEN) || !defined(Py_USING_UNICODE)
1592char*
1593PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1594{
1595	return NULL;
1596}
1597#else
1598#ifdef Py_USING_UNICODE
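/* Re-encode LEN bytes of UTF-8 TEXT into the encoding ENC.  Return a
   new string object, or NULL (with the error cleared) on failure. */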
1599static PyObject *
1600dec_utf8(const char *enc, const char *text, size_t len) {
1601	PyObject *ret = NULL;
1602	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1603	if (unicode_text) {
1604		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1605		Py_DECREF(unicode_text);
1606	}
1607	if (!ret) {
1608		PyErr_Clear();
1609	}
1610	return ret;
1611}
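/* Re-encode the first LEN bytes of the current line (tok->buf) back
   into the declared source encoding and adjust *OFFSET to match.
   Return the new buffer (allocated with PyObject_MALLOC), or NULL. */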
1612char *
1613PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1614{
1615	char *text = NULL;
1616	if (tok->encoding) {
1617		/* convert source back to the original encoding */
1618		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1619		if (lineobj != NULL) {
1620			int linelen = PyString_Size(lineobj);
1621			const char *line = PyString_AsString(lineobj);
1622			text = PyObject_MALLOC(linelen + 1);
1623			if (text != NULL && line != NULL) {
1624				if (linelen)
1625					strncpy(text, line, linelen);
1626				text[linelen] = '\0';
1627			}
1628			Py_DECREF(lineobj);
1629
1630			/* adjust error offset */
1631			if (*offset > 1) {
1632				PyObject *offsetobj = dec_utf8(tok->encoding,
1633							       tok->buf, *offset-1);
1634				if (offsetobj) {
1635					*offset = PyString_Size(offsetobj) + 1;
1636					Py_DECREF(offsetobj);
1637				}
1638			}
1639
1640		}
1641	}
1642	return text;
1643
1644}
1645#endif /* defined(Py_USING_UNICODE) */
1646#endif
1647
1648
1649#ifdef Py_DEBUG
1650
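/* Debug helper: print a token's name and, for NAME, NUMBER, STRING and
   OP, the token text between START and END. */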
1651void
1652tok_dump(int type, char *start, char *end)
1653{
1654	printf("%s", _PyParser_TokenNames[type]);
1655	if (type == NAME || type == NUMBER || type == STRING || type == OP)
1656		printf("(%.*s)", (int)(end - start), start);
1657}
1658
1659#endif
1660