tokenizer.c revision 42d63847c32fda10b61c1f420402a09ddbbe95eb
1
2/* Tokenizer implementation */
3
4#include "Python.h"
5#include "pgenheaders.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#include "pydebug.h"
20#endif /* PGEN */
21
22extern char *PyOS_Readline(FILE *, FILE *, char *);
23/* Return malloc'ed string including trailing \n;
24   empty malloc'ed string for EOF;
25   NULL if interrupted */
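/* A minimal sketch (not part of this file; demo names made up) of how a
   caller distinguishes the three results described above: */
#if 0
static int
readline_demo(void)
{
	char prompt[] = ">>> ";
	char *line = PyOS_Readline(stdin, stdout, prompt);
	if (line == NULL)
		return -1;		/* interrupted */
	if (*line == '\0') {
		PyMem_FREE(line);	/* empty malloc'ed string: EOF */
		return 0;
	}
	/* ... use line, which still carries its trailing '\n' ... */
	PyMem_FREE(line);
	return 1;
}
#endif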
26
27/* Don't ever change this -- it would break the portability of Python code */
28#define TABSIZE 8
29
30/* Forward */
31static struct tok_state *tok_new(void);
32static int tok_nextc(struct tok_state *tok);
33static void tok_backup(struct tok_state *tok, int c);
34
35/* Token names */
36
37char *_PyParser_TokenNames[] = {
38	"ENDMARKER",
39	"NAME",
40	"NUMBER",
41	"STRING",
42	"NEWLINE",
43	"INDENT",
44	"DEDENT",
45	"LPAR",
46	"RPAR",
47	"LSQB",
48	"RSQB",
49	"COLON",
50	"COMMA",
51	"SEMI",
52	"PLUS",
53	"MINUS",
54	"STAR",
55	"SLASH",
56	"VBAR",
57	"AMPER",
58	"LESS",
59	"GREATER",
60	"EQUAL",
61	"DOT",
62	"PERCENT",
63	"BACKQUOTE",
64	"LBRACE",
65	"RBRACE",
66	"EQEQUAL",
67	"NOTEQUAL",
68	"LESSEQUAL",
69	"GREATEREQUAL",
70	"TILDE",
71	"CIRCUMFLEX",
72	"LEFTSHIFT",
73	"RIGHTSHIFT",
74	"DOUBLESTAR",
75	"PLUSEQUAL",
76	"MINEQUAL",
77	"STAREQUAL",
78	"SLASHEQUAL",
79	"PERCENTEQUAL",
80	"AMPEREQUAL",
81	"VBAREQUAL",
82	"CIRCUMFLEXEQUAL",
83	"LEFTSHIFTEQUAL",
84	"RIGHTSHIFTEQUAL",
85	"DOUBLESTAREQUAL",
86	"DOUBLESLASH",
87	"DOUBLESLASHEQUAL",
88	"AT",
89	/* This table must match the #defines in token.h! */
90	"OP",
91	"<ERRORTOKEN>",
92	"<N_TOKENS>"
93};
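/* Illustrative note: the table is indexed by the token codes defined in
   token.h (ENDMARKER == 0, NAME == 1, ...), so for example
   _PyParser_TokenNames[NAME] is "NAME". */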
94
95
96/* Create and initialize a new tok_state structure */
97
98static struct tok_state *
99tok_new(void)
100{
101	struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102                                                sizeof(struct tok_state));
103	if (tok == NULL)
104		return NULL;
105	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
106	tok->done = E_OK;
107	tok->fp = NULL;
108	tok->input = NULL;
109	tok->tabsize = TABSIZE;
110	tok->indent = 0;
111	tok->indstack[0] = 0;
112	tok->atbol = 1;
113	tok->pendin = 0;
114	tok->prompt = tok->nextprompt = NULL;
115	tok->lineno = 0;
116	tok->level = 0;
117	tok->filename = NULL;
118	tok->altwarning = 0;
119	tok->alterror = 0;
120	tok->alttabsize = 1;
121	tok->altindstack[0] = 0;
122	tok->decoding_state = 0;
123	tok->decoding_erred = 0;
124	tok->read_coding_spec = 0;
125	tok->encoding = NULL;
126	tok->cont_line = 0;
127#ifndef PGEN
128	tok->decoding_readline = NULL;
129	tok->decoding_buffer = NULL;
130#endif
131	return tok;
132}
133
134static char *
135new_string(const char *s, Py_ssize_t len)
136{
137	char* result = (char *)PyMem_MALLOC(len + 1);
138	if (result != NULL) {
139		memcpy(result, s, len);
140		result[len] = '\0';
141	}
142	return result;
143}
144
145#ifdef PGEN
146
147static char *
148decoding_fgets(char *s, int size, struct tok_state *tok)
149{
150	return fgets(s, size, tok->fp);
151}
152
153static int
154decoding_feof(struct tok_state *tok)
155{
156	return feof(tok->fp);
157}
158
159static char *
160decode_str(const char *str, int exec_input, struct tok_state *tok)
161{
162	return new_string(str, strlen(str));
163}
164
165#else /* PGEN */
166
167static char *
168error_ret(struct tok_state *tok) /* XXX */
169{
170	tok->decoding_erred = 1;
171	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
172		PyMem_FREE(tok->buf);
173	tok->buf = NULL;
174	return NULL;		/* as if it were EOF */
175}
176
177
178static char *
179get_normal_name(char *s)	/* for utf-8 and latin-1 */
180{
181	char buf[13];
182	int i;
183	for (i = 0; i < 12; i++) {
184		int c = s[i];
185		if (c == '\0')
186			break;
187		else if (c == '_')
188			buf[i] = '-';
189		else
190			buf[i] = tolower(c);
191	}
192	buf[i] = '\0';
193	if (strcmp(buf, "utf-8") == 0 ||
194	    strncmp(buf, "utf-8-", 6) == 0)
195		return "utf-8";
196	else if (strcmp(buf, "latin-1") == 0 ||
197		 strcmp(buf, "iso-8859-1") == 0 ||
198		 strcmp(buf, "iso-latin-1") == 0 ||
199		 strncmp(buf, "latin-1-", 8) == 0 ||
200		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
201		 strncmp(buf, "iso-latin-1-", 12) == 0)
202		return "iso-8859-1";
203	else
204		return s;
205}
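/* Examples (illustrative) of the normalization above:
 *   get_normal_name("UTF_8")      -> "utf-8"
 *   get_normal_name("utf-8-sig")  -> "utf-8"
 *   get_normal_name("ISO_8859_1") -> "iso-8859-1"
 *   get_normal_name("ascii")      -> "ascii" (returned unchanged)
 */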
206
207/* Return the coding spec in S, or NULL if none is found.  */
208
209static char *
210get_coding_spec(const char *s, Py_ssize_t size)
211{
212	Py_ssize_t i;
213	/* Coding spec must be in a comment, and that comment must be
214	 * the only statement on the source code line. */
215	for (i = 0; i < size - 6; i++) {
216		if (s[i] == '#')
217			break;
218		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
219			return NULL;
220	}
221	for (; i < size - 6; i++) { /* XXX inefficient search */
222		const char* t = s + i;
223		if (strncmp(t, "coding", 6) == 0) {
224			const char* begin = NULL;
225			t += 6;
226			if (t[0] != ':' && t[0] != '=')
227				continue;
228			do {
229				t++;
230			} while (t[0] == '\x20' || t[0] == '\t');
231
232			begin = t;
233			while (isalnum(Py_CHARMASK(t[0])) ||
234			       t[0] == '-' || t[0] == '_' || t[0] == '.')
235				t++;
236
237			if (begin < t) {
238				char* r = new_string(begin, t - begin);
239				char* q = get_normal_name(r);
240				if (r != q) {
241					PyMem_FREE(r);
242					r = new_string(q, strlen(q));
243				}
244				return r;
245			}
246		}
247	}
248	return NULL;
249}
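/* Examples (illustrative) of lines recognized above (PEP 263 style):
 *   "# -*- coding: utf-8 -*-\n"   -> returns "utf-8"
 *   "  # coding=Latin-1\n"        -> returns "iso-8859-1" (normalized)
 *   "x = 1  # coding: utf-8\n"    -> returns NULL, since the comment is
 *                                    not alone on the line
 */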
250
251/* Check whether the line contains a coding spec. If it does,
252   invoke the set_readline function for the new encoding.
253   This function receives the tok_state and the new encoding.
254   Return 1 on success, 0 on failure.  */
255
256static int
257check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
258		  int set_readline(struct tok_state *, const char *))
259{
260	char * cs;
261	int r = 1;
262
263	if (tok->cont_line)
264		/* It's a continuation line, so it can't be a coding spec. */
265		return 1;
266	cs = get_coding_spec(line, size);
267	if (cs != NULL) {
268		tok->read_coding_spec = 1;
269		if (tok->encoding == NULL) {
270			assert(tok->decoding_state == 1); /* raw */
271			if (strcmp(cs, "utf-8") == 0 ||
272			    strcmp(cs, "iso-8859-1") == 0) {
273				tok->encoding = cs;
274			} else {
275#ifdef Py_USING_UNICODE
276				r = set_readline(tok, cs);
277				if (r) {
278					tok->encoding = cs;
279					tok->decoding_state = -1;
280				}
281				else
282					PyMem_FREE(cs);
283#else
284                                /* Without Unicode support, we cannot
285                                   process the coding spec. Since there
286                                   won't be any Unicode literals, that
287                                   won't matter. */
288				PyMem_FREE(cs);
289#endif
290			}
291		} else {	/* then, compare cs with BOM */
292			r = (strcmp(tok->encoding, cs) == 0);
293			PyMem_FREE(cs);
294		}
295	}
296	if (!r) {
297		cs = tok->encoding;
298		if (!cs)
299			cs = "with BOM";
300		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
301	}
302	return r;
303}
304
305/* See whether the file starts with a BOM. If it does,
306   invoke the set_readline function with the new encoding.
307   Return 1 on success, 0 on failure.  */
308
309static int
310check_bom(int get_char(struct tok_state *),
311	  void unget_char(int, struct tok_state *),
312	  int set_readline(struct tok_state *, const char *),
313	  struct tok_state *tok)
314{
315	int ch = get_char(tok);
316	tok->decoding_state = 1;
317	if (ch == EOF) {
318		return 1;
319	} else if (ch == 0xEF) {
320		ch = get_char(tok);
321		if (ch != 0xBB)
322			goto NON_BOM;
323		ch = get_char(tok);
324		if (ch != 0xBF)
325			goto NON_BOM;
326#if 0
327	/* Disable support for UTF-16 BOMs until a decision
328	   is made whether this needs to be supported.  */
329	} else if (ch == 0xFE) {
330		ch = get_char(tok);
331		if (ch != 0xFF)
332			goto NON_BOM;
333		if (!set_readline(tok, "utf-16-be"))
334			return 0;
335		tok->decoding_state = -1;
336	} else if (ch == 0xFF) {
337		ch = get_char(tok);
338		if (ch != 0xFE)
339			goto NON_BOM;
340		if (!set_readline(tok, "utf-16-le"))
341			return 0;
342		tok->decoding_state = -1;
343#endif
344	} else {
345		unget_char(ch, tok);
346		return 1;
347	}
348	if (tok->encoding != NULL)
349		PyMem_FREE(tok->encoding);
350	tok->encoding = new_string("utf-8", 5);	/* the resulting text is in utf-8 */
351	return 1;
352  NON_BOM:
353	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
354	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
355	return 1;
356}
357
358/* Read a line of text from TOK into S, using the stream in TOK.
359   Return NULL on failure, else S.
360
361   On entry, tok->decoding_buffer will be one of:
362     1) NULL: need to call tok->decoding_readline to get a new line
363     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
364           stored the result in tok->decoding_buffer
365     3) PyStringObject *: previous call to fp_readl did not have enough room
366           (in the s buffer) to copy entire contents of the line read
367           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
368           In this case, fp_readl is called in a loop (with an expanded buffer)
369           until the buffer ends with a '\n' (or until the end of the file is
370           reached): see tok_nextc and its calls to decoding_fgets.
371*/
372
373static char *
374fp_readl(char *s, int size, struct tok_state *tok)
375{
376#ifndef Py_USING_UNICODE
377	/* In a non-Unicode build, this should never be called. */
378	Py_FatalError("fp_readl should not be called in this build.");
379	return NULL; /* Keep compiler happy (not reachable) */
380#else
381	PyObject* utf8 = NULL;
382	PyObject* buf = tok->decoding_buffer;
383	char *str;
384	Py_ssize_t utf8len;
385
386	/* Ask for one less byte so we can terminate it */
387	assert(size > 0);
388	size--;
389
390	if (buf == NULL) {
391		buf = PyObject_CallObject(tok->decoding_readline, NULL);
392		if (buf == NULL)
393			return error_ret(tok);
394	} else {
395		tok->decoding_buffer = NULL;
396		if (PyString_CheckExact(buf))
397			utf8 = buf;
398	}
399	if (utf8 == NULL) {
400		utf8 = PyUnicode_AsUTF8String(buf);
401		Py_DECREF(buf);
402		if (utf8 == NULL)
403			return error_ret(tok);
404	}
405	str = PyString_AsString(utf8);
406	utf8len = PyString_GET_SIZE(utf8);
407	if (utf8len > size) {
408		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
409		if (tok->decoding_buffer == NULL) {
410			Py_DECREF(utf8);
411			return error_ret(tok);
412		}
413		utf8len = size;
414	}
415	memcpy(s, str, utf8len);
416	s[utf8len] = '\0';
417	Py_DECREF(utf8);
418	if (utf8len == 0)
419		return NULL; /* EOF */
420	return s;
421#endif
422}
423
424/* Set the readline function for TOK to a StreamReader's
425   readline function. The StreamReader is named ENC.
426
427   This function is called from check_bom and check_coding_spec.
428
429   ENC is usually identical to the future value of tok->encoding,
430   except for the (currently unsupported) case of UTF-16.
431
432   Return 1 on success, 0 on failure. */
433
434static int
435fp_setreadl(struct tok_state *tok, const char* enc)
436{
437	PyObject *reader, *stream, *readline;
438
439	/* XXX: constify filename argument. */
440	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
441	if (stream == NULL)
442		return 0;
443
444	reader = PyCodec_StreamReader(enc, stream, NULL);
445	Py_DECREF(stream);
446	if (reader == NULL)
447		return 0;
448
449	readline = PyObject_GetAttrString(reader, "readline");
450	Py_DECREF(reader);
451	if (readline == NULL)
452		return 0;
453
454	tok->decoding_readline = readline;
455	return 1;
456}
457
458/* Fetch the next byte from TOK. */
459
460static int fp_getc(struct tok_state *tok) {
461	return getc(tok->fp);
462}
463
464/* Unfetch the last byte back into TOK.  */
465
466static void fp_ungetc(int c, struct tok_state *tok) {
467	ungetc(c, tok->fp);
468}
469
470/* Read a line of input from TOK. Determine encoding
471   if necessary.  */
472
473static char *
474decoding_fgets(char *s, int size, struct tok_state *tok)
475{
476	char *line = NULL;
477	int badchar = 0;
478	for (;;) {
479		if (tok->decoding_state < 0) {
480			/* We already have a codec associated with
481			   this input. */
482			line = fp_readl(s, size, tok);
483			break;
484		} else if (tok->decoding_state > 0) {
485			/* We want a 'raw' read. */
486			line = Py_UniversalNewlineFgets(s, size,
487							tok->fp, NULL);
488			break;
489		} else {
490			/* We have not yet determined the encoding.
491			   If an encoding is found, use the file-pointer
492			   reader functions from now on. */
493			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
494				return error_ret(tok);
495			assert(tok->decoding_state != 0);
496		}
497	}
498	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
499		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
500			return error_ret(tok);
501		}
502	}
503#ifndef PGEN
504	/* The default encoding is ASCII, so make sure we don't have any
505           non-ASCII bytes in it. */
506	if (line && !tok->encoding) {
507		unsigned char *c;
508		for (c = (unsigned char *)line; *c; c++)
509			if (*c > 127) {
510				badchar = *c;
511				break;
512			}
513	}
514	if (badchar) {
515		char buf[500];
516		/* Need to add 1 to the line number, since this line
517		   has not been counted yet.  */
518		sprintf(buf,
519			"Non-ASCII character '\\x%.2x' "
520			"in file %.200s on line %i, "
521			"but no encoding declared; "
522			"see http://www.python.org/peps/pep-0263.html for details",
523			badchar, tok->filename, tok->lineno + 1);
524		PyErr_SetString(PyExc_SyntaxError, buf);
525		return error_ret(tok);
526	}
527#endif
528	return line;
529}
530
531static int
532decoding_feof(struct tok_state *tok)
533{
534	if (tok->decoding_state >= 0) {
535		return feof(tok->fp);
536	} else {
537		PyObject* buf = tok->decoding_buffer;
538		if (buf == NULL) {
539			buf = PyObject_CallObject(tok->decoding_readline, NULL);
540			if (buf == NULL) {
541				error_ret(tok);
542				return 1;
543			} else {
544				tok->decoding_buffer = buf;
545			}
546		}
547		return PyObject_Length(buf) == 0;
548	}
549}
550
551/* Fetch a byte from TOK, using the string buffer. */
552
553static int
554buf_getc(struct tok_state *tok) {
555	return Py_CHARMASK(*tok->str++);
556}
557
558/* Unfetch a byte from TOK, using the string buffer. */
559
560static void
561buf_ungetc(int c, struct tok_state *tok) {
562	tok->str--;
563	assert(Py_CHARMASK(*tok->str) == c);	/* tok->str may point to a read-only segment */
564}
565
566/* Set the readline function for TOK to ENC. For the string-based
567   tokenizer, this means to just record the encoding. */
568
569static int
570buf_setreadl(struct tok_state *tok, const char* enc) {
571	tok->enc = enc;
572	return 1;
573}
574
575/* Return a UTF-8 encoded Python string object from the
576   C byte string STR, which is encoded with ENC. */
577
578#ifdef Py_USING_UNICODE
579static PyObject *
580translate_into_utf8(const char* str, const char* enc) {
581	PyObject *utf8;
582	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
583	if (buf == NULL)
584		return NULL;
585	utf8 = PyUnicode_AsUTF8String(buf);
586	Py_DECREF(buf);
587	return utf8;
588}
589#endif
590
591
592static char *
593translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
594	int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
595	char *buf, *current;
596	char c = '\0';
597	buf = PyMem_MALLOC(needed_length);
598	if (buf == NULL) {
599		tok->done = E_NOMEM;
600		return NULL;
601	}
602	for (current = buf; *s; s++, current++) {
603		c = *s;
604		if (skip_next_lf) {
605			skip_next_lf = 0;
606			if (c == '\n') {
607				c = *++s;
608				if (!c)
609					break;
610			}
611		}
612		if (c == '\r') {
613			skip_next_lf = 1;
614			c = '\n';
615		}
616		*current = c;
617	}
618	/* If this is exec input, add a newline to the end of the string if
619	   there isn't one already. */
620	if (exec_input && c != '\n') {
621		*current = '\n';
622		current++;
623	}
624	*current = '\0';
625	final_length = current - buf + 1;
626	if (final_length < needed_length && final_length)
627		/* should never fail */
628		buf = PyMem_REALLOC(buf, final_length);
629	return buf;
630}
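/* Examples (illustrative), with exec_input true:
 *   "a = 1\r\nb = 2"  -> "a = 1\nb = 2\n"
 *   "a = 1\rb = 2\r"  -> "a = 1\nb = 2\n"
 * i.e. both "\r\n" and a lone '\r' become '\n', and a trailing newline is
 * appended when missing.
 */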
631
632/* Decode a byte string STR for use as the buffer of TOK.
633   Look for encoding declarations inside STR, and record them
634   inside TOK.  */
635
636static const char *
637decode_str(const char *input, int single, struct tok_state *tok)
638{
639	PyObject* utf8 = NULL;
640	const char *str;
641	const char *s;
642	const char *newl[2] = {NULL, NULL};
643	int lineno = 0;
644	tok->input = str = translate_newlines(input, single, tok);
645	if (str == NULL)
646		return NULL;
647	tok->enc = NULL;
648	tok->str = str;
649	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
650		return error_ret(tok);
651	str = tok->str;		/* string after BOM if any */
652	assert(str);
653#ifdef Py_USING_UNICODE
654	if (tok->enc != NULL) {
655		utf8 = translate_into_utf8(str, tok->enc);
656		if (utf8 == NULL)
657			return error_ret(tok);
658		str = PyString_AsString(utf8);
659	}
660#endif
661	for (s = str;; s++) {
662		if (*s == '\0') break;
663		else if (*s == '\n') {
664			assert(lineno < 2);
665			newl[lineno] = s;
666			lineno++;
667			if (lineno == 2) break;
668		}
669	}
670	tok->enc = NULL;
671	/* need to check line 1 and 2 separately since check_coding_spec
672	   assumes a single line as input */
673	if (newl[0]) {
674		if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
675			return error_ret(tok);
676		if (tok->enc == NULL && newl[1]) {
677			if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
678					       tok, buf_setreadl))
679				return error_ret(tok);
680		}
681	}
682#ifdef Py_USING_UNICODE
683	if (tok->enc != NULL) {
684		assert(utf8 == NULL);
685		utf8 = translate_into_utf8(str, tok->enc);
686		if (utf8 == NULL)
687			return error_ret(tok);
688		str = PyString_AsString(utf8);
689	}
690#endif
691	assert(tok->decoding_buffer == NULL);
692	tok->decoding_buffer = utf8; /* CAUTION: keep utf8 (if any) alive, since str may point into its buffer; released in PyTokenizer_Free */
693	return str;
694}
695
696#endif /* PGEN */
697
698/* Set up tokenizer for string */
699
700struct tok_state *
701PyTokenizer_FromString(const char *str, int exec_input)
702{
703	struct tok_state *tok = tok_new();
704	if (tok == NULL)
705		return NULL;
706	str = (char *)decode_str(str, exec_input, tok);
707	if (str == NULL) {
708		PyTokenizer_Free(tok);
709		return NULL;
710	}
711
712	/* XXX: constify members. */
713	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
714	return tok;
715}
716
717
718/* Set up tokenizer for file */
719
720struct tok_state *
721PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
722{
723	struct tok_state *tok = tok_new();
724	if (tok == NULL)
725		return NULL;
726	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
727		PyTokenizer_Free(tok);
728		return NULL;
729	}
730	tok->cur = tok->inp = tok->buf;
731	tok->end = tok->buf + BUFSIZ;
732	tok->fp = fp;
733	tok->prompt = ps1;
734	tok->nextprompt = ps2;
735	return tok;
736}
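/* A minimal sketch (not compiled; demo function made up) of how these
   entry points are typically driven: create a tok_state, pull tokens with
   PyTokenizer_Get() until ENDMARKER or an error, then free it. */
#if 0
static int
tokenize_demo(const char *source)
{
	struct tok_state *tok = PyTokenizer_FromString(source, 1);
	char *start, *end;
	int type;

	if (tok == NULL)
		return -1;
	for (;;) {
		type = PyTokenizer_Get(tok, &start, &end);
		if (type == ENDMARKER || type == ERRORTOKEN)
			break;
		if (start != NULL && end != NULL)
			printf("%s: %.*s\n", _PyParser_TokenNames[type],
			       (int)(end - start), start);
	}
	PyTokenizer_Free(tok);
	return type == ENDMARKER ? 0 : -1;
}
#endif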
737
738
739/* Free a tok_state structure */
740
741void
742PyTokenizer_Free(struct tok_state *tok)
743{
744	if (tok->encoding != NULL)
745		PyMem_FREE(tok->encoding);
746#ifndef PGEN
747	Py_XDECREF(tok->decoding_readline);
748	Py_XDECREF(tok->decoding_buffer);
749#endif
750	if (tok->fp != NULL && tok->buf != NULL)
751		PyMem_FREE(tok->buf);
752	if (tok->input)
753		PyMem_FREE((char *)tok->input);
754	PyMem_FREE(tok);
755}
756
757#if !defined(PGEN) && defined(Py_USING_UNICODE)
758static int
759tok_stdin_decode(struct tok_state *tok, char **inp)
760{
761	PyObject *enc, *sysstdin, *decoded, *utf8;
762	const char *encoding;
763	char *converted;
764
765	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
766		return 0;
767	sysstdin = PySys_GetObject("stdin");
768	if (sysstdin == NULL || !PyFile_Check(sysstdin))
769		return 0;
770
771	enc = ((PyFileObject *)sysstdin)->f_encoding;
772	if (enc == NULL || !PyString_Check(enc))
773		return 0;
774	Py_INCREF(enc);
775
776	encoding = PyString_AsString(enc);
777	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
778	if (decoded == NULL)
779		goto error_clear;
780
781	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
782	Py_DECREF(decoded);
783	if (utf8 == NULL)
784		goto error_clear;
785
786	assert(PyString_Check(utf8));
787	converted = new_string(PyString_AS_STRING(utf8),
788			       PyString_GET_SIZE(utf8));
789	Py_DECREF(utf8);
790	if (converted == NULL)
791		goto error_nomem;
792
793	PyMem_FREE(*inp);
794	*inp = converted;
795	if (tok->encoding != NULL)
796		PyMem_FREE(tok->encoding);
797	tok->encoding = new_string(encoding, strlen(encoding));
798	if (tok->encoding == NULL)
799		goto error_nomem;
800
801	Py_DECREF(enc);
802	return 0;
803
804error_nomem:
805	Py_DECREF(enc);
806	tok->done = E_NOMEM;
807	return -1;
808
809error_clear:
810	/* Fallback to iso-8859-1: for backward compatibility */
811	Py_DECREF(enc);
812	PyErr_Clear();
813	return 0;
814}
815#endif
816
817/* Get next char, updating state; error code goes into tok->done */
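/* tok_nextc draws from one of three sources:
 *   tok->fp == NULL      -> a string buffer prepared by decode_str()
 *   tok->prompt != NULL  -> interactive input via PyOS_Readline()
 *   otherwise            -> a file read through decoding_fgets()
 * In all cases tok->cur .. tok->inp delimits buffered input not yet
 * handed out, and the fast path simply returns *tok->cur++. */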
818
819static int
820tok_nextc(register struct tok_state *tok)
821{
822	for (;;) {
823		if (tok->cur != tok->inp) {
824			return Py_CHARMASK(*tok->cur++); /* Fast path */
825		}
826		if (tok->done != E_OK)
827			return EOF;
828		if (tok->fp == NULL) {
829			char *end = strchr(tok->inp, '\n');
830			if (end != NULL)
831				end++;
832			else {
833				end = strchr(tok->inp, '\0');
834				if (end == tok->inp) {
835					tok->done = E_EOF;
836					return EOF;
837				}
838			}
839			if (tok->start == NULL)
840				tok->buf = tok->cur;
841			tok->line_start = tok->cur;
842			tok->lineno++;
843			tok->inp = end;
844			return Py_CHARMASK(*tok->cur++);
845		}
846		if (tok->prompt != NULL) {
847			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
848			if (tok->nextprompt != NULL)
849				tok->prompt = tok->nextprompt;
850			if (newtok == NULL)
851				tok->done = E_INTR;
852			else if (*newtok == '\0') {
853				PyMem_FREE(newtok);
854				tok->done = E_EOF;
855			}
856#if !defined(PGEN) && defined(Py_USING_UNICODE)
857			else if (tok_stdin_decode(tok, &newtok) != 0)
858				PyMem_FREE(newtok);
859#endif
860			else if (tok->start != NULL) {
861				size_t start = tok->start - tok->buf;
862				size_t oldlen = tok->cur - tok->buf;
863				size_t newlen = oldlen + strlen(newtok);
864				char *buf = tok->buf;
865				buf = (char *)PyMem_REALLOC(buf, newlen+1);
866				tok->lineno++;
867				if (buf == NULL) {
868					PyMem_FREE(tok->buf);
869					tok->buf = NULL;
870					PyMem_FREE(newtok);
871					tok->done = E_NOMEM;
872					return EOF;
873				}
874				tok->buf = buf;
875				tok->cur = tok->buf + oldlen;
876				tok->line_start = tok->cur;
877				strcpy(tok->buf + oldlen, newtok);
878				PyMem_FREE(newtok);
879				tok->inp = tok->buf + newlen;
880				tok->end = tok->inp + 1;
881				tok->start = tok->buf + start;
882			}
883			else {
884				tok->lineno++;
885				if (tok->buf != NULL)
886					PyMem_FREE(tok->buf);
887				tok->buf = newtok;
888				tok->line_start = tok->buf;
889				tok->cur = tok->buf;
890				tok->line_start = tok->buf;
891				tok->inp = strchr(tok->buf, '\0');
892				tok->end = tok->inp + 1;
893			}
894		}
895		else {
896			int done = 0;
897			Py_ssize_t cur = 0;
898			char *pt;
899			if (tok->start == NULL) {
900				if (tok->buf == NULL) {
901					tok->buf = (char *)
902						PyMem_MALLOC(BUFSIZ);
903					if (tok->buf == NULL) {
904						tok->done = E_NOMEM;
905						return EOF;
906					}
907					tok->end = tok->buf + BUFSIZ;
908				}
909				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
910					  tok) == NULL) {
911					tok->done = E_EOF;
912					done = 1;
913				}
914				else {
915					tok->done = E_OK;
916					tok->inp = strchr(tok->buf, '\0');
917					done = tok->inp[-1] == '\n';
918				}
919			}
920			else {
921				cur = tok->cur - tok->buf;
922				if (decoding_feof(tok)) {
923					tok->done = E_EOF;
924					done = 1;
925				}
926				else
927					tok->done = E_OK;
928			}
929			tok->lineno++;
930			/* Read until '\n' or EOF */
931			while (!done) {
932				Py_ssize_t curstart = tok->start == NULL ? -1 :
933					          tok->start - tok->buf;
934				Py_ssize_t curvalid = tok->inp - tok->buf;
935				Py_ssize_t newsize = curvalid + BUFSIZ;
936				char *newbuf = tok->buf;
937				newbuf = (char *)PyMem_REALLOC(newbuf,
938							       newsize);
939				if (newbuf == NULL) {
940					tok->done = E_NOMEM;
941					tok->cur = tok->inp;
942					return EOF;
943				}
944				tok->buf = newbuf;
945				tok->inp = tok->buf + curvalid;
946				tok->end = tok->buf + newsize;
947				tok->start = curstart < 0 ? NULL :
948					     tok->buf + curstart;
949				if (decoding_fgets(tok->inp,
950					       (int)(tok->end - tok->inp),
951					       tok) == NULL) {
952					/* Break out early on decoding
953					   errors, as tok->buf will be NULL
954					 */
955					if (tok->decoding_erred)
956						return EOF;
957					/* Last line does not end in \n,
958					   fake one */
959					strcpy(tok->inp, "\n");
960				}
961				tok->inp = strchr(tok->inp, '\0');
962				done = tok->inp[-1] == '\n';
963			}
964			if (tok->buf != NULL) {
965				tok->cur = tok->buf + cur;
966				tok->line_start = tok->cur;
967				/* replace "\r\n" with "\n" */
968				/* For Mac leave the \r, giving a syntax error */
969				pt = tok->inp - 2;
970				if (pt >= tok->buf && *pt == '\r') {
971					*pt++ = '\n';
972					*pt = '\0';
973					tok->inp = pt;
974				}
975			}
976		}
977		if (tok->done != E_OK) {
978			if (tok->prompt != NULL)
979				PySys_WriteStderr("\n");
980			tok->cur = tok->inp;
981			return EOF;
982		}
983	}
984	/*NOTREACHED*/
985}
986
987
988/* Back-up one character */
989
990static void
991tok_backup(register struct tok_state *tok, register int c)
992{
993	if (c != EOF) {
994		if (--tok->cur < tok->buf)
995			Py_FatalError("tok_backup: beginning of buffer");
996		if (*tok->cur != c)
997			*tok->cur = c;
998	}
999}
1000
1001
1002/* Return the token corresponding to a single character */
1003
1004int
1005PyToken_OneChar(int c)
1006{
1007	switch (c) {
1008	case '(':	return LPAR;
1009	case ')':	return RPAR;
1010	case '[':	return LSQB;
1011	case ']':	return RSQB;
1012	case ':':	return COLON;
1013	case ',':	return COMMA;
1014	case ';':	return SEMI;
1015	case '+':	return PLUS;
1016	case '-':	return MINUS;
1017	case '*':	return STAR;
1018	case '/':	return SLASH;
1019	case '|':	return VBAR;
1020	case '&':	return AMPER;
1021	case '<':	return LESS;
1022	case '>':	return GREATER;
1023	case '=':	return EQUAL;
1024	case '.':	return DOT;
1025	case '%':	return PERCENT;
1026	case '`':	return BACKQUOTE;
1027	case '{':	return LBRACE;
1028	case '}':	return RBRACE;
1029	case '^':	return CIRCUMFLEX;
1030	case '~':	return TILDE;
1031	case '@':       return AT;
1032	default:	return OP;
1033	}
1034}
1035
1036
1037int
1038PyToken_TwoChars(int c1, int c2)
1039{
1040	switch (c1) {
1041	case '=':
1042		switch (c2) {
1043		case '=':	return EQEQUAL;
1044		}
1045		break;
1046	case '!':
1047		switch (c2) {
1048		case '=':	return NOTEQUAL;
1049		}
1050		break;
1051	case '<':
1052		switch (c2) {
1053		case '>':	return NOTEQUAL;
1054		case '=':	return LESSEQUAL;
1055		case '<':	return LEFTSHIFT;
1056		}
1057		break;
1058	case '>':
1059		switch (c2) {
1060		case '=':	return GREATEREQUAL;
1061		case '>':	return RIGHTSHIFT;
1062		}
1063		break;
1064	case '+':
1065		switch (c2) {
1066		case '=':	return PLUSEQUAL;
1067		}
1068		break;
1069	case '-':
1070		switch (c2) {
1071		case '=':	return MINEQUAL;
1072		}
1073		break;
1074	case '*':
1075		switch (c2) {
1076		case '*':	return DOUBLESTAR;
1077		case '=':	return STAREQUAL;
1078		}
1079		break;
1080	case '/':
1081		switch (c2) {
1082		case '/':	return DOUBLESLASH;
1083		case '=':	return SLASHEQUAL;
1084		}
1085		break;
1086	case '|':
1087		switch (c2) {
1088		case '=':	return VBAREQUAL;
1089		}
1090		break;
1091	case '%':
1092		switch (c2) {
1093		case '=':	return PERCENTEQUAL;
1094		}
1095		break;
1096	case '&':
1097		switch (c2) {
1098		case '=':	return AMPEREQUAL;
1099		}
1100		break;
1101	case '^':
1102		switch (c2) {
1103		case '=':	return CIRCUMFLEXEQUAL;
1104		}
1105		break;
1106	}
1107	return OP;
1108}
1109
1110int
1111PyToken_ThreeChars(int c1, int c2, int c3)
1112{
1113	switch (c1) {
1114	case '<':
1115		switch (c2) {
1116		case '<':
1117			switch (c3) {
1118			case '=':
1119				return LEFTSHIFTEQUAL;
1120			}
1121			break;
1122		}
1123		break;
1124	case '>':
1125		switch (c2) {
1126		case '>':
1127			switch (c3) {
1128			case '=':
1129				return RIGHTSHIFTEQUAL;
1130			}
1131			break;
1132		}
1133		break;
1134	case '*':
1135		switch (c2) {
1136		case '*':
1137			switch (c3) {
1138			case '=':
1139				return DOUBLESTAREQUAL;
1140			}
1141			break;
1142		}
1143		break;
1144	case '/':
1145		switch (c2) {
1146		case '/':
1147			switch (c3) {
1148			case '=':
1149				return DOUBLESLASHEQUAL;
1150			}
1151			break;
1152		}
1153		break;
1154	}
1155	return OP;
1156}
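/* Longest-match example (illustrative): for the input ">>=", tok_get()
 * below first gets PyToken_TwoChars('>', '>') == RIGHTSHIFT, then
 * PyToken_ThreeChars('>', '>', '=') == RIGHTSHIFTEQUAL and keeps the
 * longer token; for ">>+" the third character yields OP, so it is backed
 * up and RIGHTSHIFT is returned. */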
1157
1158static int
1159indenterror(struct tok_state *tok)
1160{
1161	if (tok->alterror) {
1162		tok->done = E_TABSPACE;
1163		tok->cur = tok->inp;
1164		return 1;
1165	}
1166	if (tok->altwarning) {
1167		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1168                                  "in indentation\n", tok->filename);
1169		tok->altwarning = 0;
1170	}
1171	return 0;
1172}
1173
1174
1175/* Get next token, after space stripping etc. */
1176
1177static int
1178tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1179{
1180	register int c;
1181	int blankline;
1182
1183	*p_start = *p_end = NULL;
1184  nextline:
1185	tok->start = NULL;
1186	blankline = 0;
1187
1188	/* Get indentation level */
1189	if (tok->atbol) {
1190		register int col = 0;
1191		register int altcol = 0;
1192		tok->atbol = 0;
1193		for (;;) {
1194			c = tok_nextc(tok);
1195			if (c == ' ')
1196				col++, altcol++;
1197			else if (c == '\t') {
1198				col = (col/tok->tabsize + 1) * tok->tabsize;
1199				altcol = (altcol/tok->alttabsize + 1)
1200					* tok->alttabsize;
1201			}
1202			else if (c == '\014') /* Control-L (formfeed) */
1203				col = altcol = 0; /* For Emacs users */
1204			else
1205				break;
1206		}
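		/* Example of the column bookkeeping above: with the default
		   tabsize of 8, a tab seen at column 3 advances col to 8,
		   while altcol (alttabsize == 1) just counts one column per
		   tab; the two are compared below to detect ambiguous
		   tab/space indentation. */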
1207		tok_backup(tok, c);
1208		if (c == '#' || c == '\n') {
1209			/* Lines with only whitespace and/or comments
1210			   shouldn't affect the indentation and are
1211			   not passed to the parser as NEWLINE tokens,
1212			   except *totally* empty lines in interactive
1213			   mode, which signal the end of a command group. */
1214			if (col == 0 && c == '\n' && tok->prompt != NULL)
1215				blankline = 0; /* Let it through */
1216			else
1217				blankline = 1; /* Ignore completely */
1218			/* We can't jump back right here since we still
1219			   may need to skip to the end of a comment */
1220		}
1221		if (!blankline && tok->level == 0) {
1222			if (col == tok->indstack[tok->indent]) {
1223				/* No change */
1224				if (altcol != tok->altindstack[tok->indent]) {
1225					if (indenterror(tok))
1226						return ERRORTOKEN;
1227				}
1228			}
1229			else if (col > tok->indstack[tok->indent]) {
1230				/* Indent -- always one */
1231				if (tok->indent+1 >= MAXINDENT) {
1232					tok->done = E_TOODEEP;
1233					tok->cur = tok->inp;
1234					return ERRORTOKEN;
1235				}
1236				if (altcol <= tok->altindstack[tok->indent]) {
1237					if (indenterror(tok))
1238						return ERRORTOKEN;
1239				}
1240				tok->pendin++;
1241				tok->indstack[++tok->indent] = col;
1242				tok->altindstack[tok->indent] = altcol;
1243			}
1244			else /* col < tok->indstack[tok->indent] */ {
1245				/* Dedent -- any number, must be consistent */
1246				while (tok->indent > 0 &&
1247					col < tok->indstack[tok->indent]) {
1248					tok->pendin--;
1249					tok->indent--;
1250				}
1251				if (col != tok->indstack[tok->indent]) {
1252					tok->done = E_DEDENT;
1253					tok->cur = tok->inp;
1254					return ERRORTOKEN;
1255				}
1256				if (altcol != tok->altindstack[tok->indent]) {
1257					if (indenterror(tok))
1258						return ERRORTOKEN;
1259				}
1260			}
1261		}
1262	}
1263
1264	tok->start = tok->cur;
1265
1266	/* Return pending indents/dedents */
1267	if (tok->pendin != 0) {
1268		if (tok->pendin < 0) {
1269			tok->pendin++;
1270			return DEDENT;
1271		}
1272		else {
1273			tok->pendin--;
1274			return INDENT;
1275		}
1276	}
1277
1278 again:
1279	tok->start = NULL;
1280	/* Skip spaces */
1281	do {
1282		c = tok_nextc(tok);
1283	} while (c == ' ' || c == '\t' || c == '\014');
1284
1285	/* Set start of current token */
1286	tok->start = tok->cur - 1;
1287
1288	/* Skip comment, while looking for tab-setting magic */
1289	if (c == '#') {
1290		static char *tabforms[] = {
1291			"tab-width:",		/* Emacs */
1292			":tabstop=",		/* vim, full form */
1293			":ts=",			/* vim, abbreviated form */
1294			"set tabsize=",		/* will vi never die? */
1295		/* more templates can be added here to support other editors */
1296		};
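		/* Illustrative matches: a comment line such as
		   "# vi: set tabsize=4:" or "# -*- tab-width: 4 -*-"
		   contains one of the templates above, so the atoi() below
		   sets tok->tabsize to 4. */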
1297		char cbuf[80];
1298		char *tp, **cp;
1299		tp = cbuf;
1300		do {
1301			*tp++ = c = tok_nextc(tok);
1302		} while (c != EOF && c != '\n' &&
1303			 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1304		*tp = '\0';
1305		for (cp = tabforms;
1306		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1307		     cp++) {
1308			if ((tp = strstr(cbuf, *cp))) {
1309				int newsize = atoi(tp + strlen(*cp));
1310
1311				if (newsize >= 1 && newsize <= 40) {
1312					tok->tabsize = newsize;
1313					if (Py_VerboseFlag)
1314					    PySys_WriteStderr(
1315						"Tab size set to %d\n",
1316						newsize);
1317				}
1318			}
1319		}
1320		while (c != EOF && c != '\n')
1321			c = tok_nextc(tok);
1322	}
1323
1324	/* Check for EOF and errors now */
1325	if (c == EOF) {
1326		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1327	}
1328
1329	/* Identifier (most frequent token!) */
1330	if (isalpha(c) || c == '_') {
1331		/* Process b"", br"", r"", u"" and ur"" string prefixes */
1332		switch (c) {
1333		case 'b':
1334		case 'B':
1335			c = tok_nextc(tok);
1336			if (c == 'r' || c == 'R')
1337				c = tok_nextc(tok);
1338			if (c == '"' || c == '\'')
1339				goto letter_quote;
1340			break;
1341		case 'r':
1342		case 'R':
1343			c = tok_nextc(tok);
1344			if (c == '"' || c == '\'')
1345				goto letter_quote;
1346			break;
1347		case 'u':
1348		case 'U':
1349			c = tok_nextc(tok);
1350			if (c == 'r' || c == 'R')
1351				c = tok_nextc(tok);
1352			if (c == '"' || c == '\'')
1353				goto letter_quote;
1354			break;
1355		}
1356		while (isalnum(c) || c == '_') {
1357			c = tok_nextc(tok);
1358		}
1359		tok_backup(tok, c);
1360		*p_start = tok->start;
1361		*p_end = tok->cur;
1362		return NAME;
1363	}
1364
1365	/* Newline */
1366	if (c == '\n') {
1367		tok->atbol = 1;
1368		if (blankline || tok->level > 0)
1369			goto nextline;
1370		*p_start = tok->start;
1371		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
1372		tok->cont_line = 0;
1373		return NEWLINE;
1374	}
1375
1376	/* Period or number starting with period? */
1377	if (c == '.') {
1378		c = tok_nextc(tok);
1379		if (isdigit(c)) {
1380			goto fraction;
1381		}
1382		else {
1383			tok_backup(tok, c);
1384			*p_start = tok->start;
1385			*p_end = tok->cur;
1386			return DOT;
1387		}
1388	}
1389
1390	/* Number */
1391	if (isdigit(c)) {
1392		if (c == '0') {
1393			/* Hex, octal or binary -- maybe. */
1394			c = tok_nextc(tok);
1395			if (c == '.')
1396				goto fraction;
1397#ifndef WITHOUT_COMPLEX
1398			if (c == 'j' || c == 'J')
1399				goto imaginary;
1400#endif
1401			if (c == 'x' || c == 'X') {
1402
1403				/* Hex */
1404				c = tok_nextc(tok);
1405				if (!isxdigit(c)) {
1406					tok->done = E_TOKEN;
1407					tok_backup(tok, c);
1408					return ERRORTOKEN;
1409				}
1410				do {
1411					c = tok_nextc(tok);
1412				} while (isxdigit(c));
1413			}
1414			else if (c == 'o' || c == 'O') {
1415				/* Octal */
1416				c = tok_nextc(tok);
1417				if (c < '0' || c >= '8') {
1418					tok->done = E_TOKEN;
1419					tok_backup(tok, c);
1420					return ERRORTOKEN;
1421				}
1422				do {
1423					c = tok_nextc(tok);
1424				} while ('0' <= c && c < '8');
1425			}
1426			else if (c == 'b' || c == 'B') {
1427				/* Binary */
1428				c = tok_nextc(tok);
1429				if (c != '0' && c != '1') {
1430					tok->done = E_TOKEN;
1431					tok_backup(tok, c);
1432					return ERRORTOKEN;
1433				}
1434				do {
1435					c = tok_nextc(tok);
1436				} while (c == '0' || c == '1');
1437			}
1438			else {
1439				int found_decimal = 0;
1440				/* Octal; c is first char of it */
1441				/* There's no 'isoctdigit' macro, sigh */
1442				while ('0' <= c && c < '8') {
1443					c = tok_nextc(tok);
1444				}
1445				if (isdigit(c)) {
1446					found_decimal = 1;
1447					do {
1448						c = tok_nextc(tok);
1449					} while (isdigit(c));
1450				}
1451				if (c == '.')
1452					goto fraction;
1453				else if (c == 'e' || c == 'E')
1454					goto exponent;
1455#ifndef WITHOUT_COMPLEX
1456				else if (c == 'j' || c == 'J')
1457					goto imaginary;
1458#endif
1459				else if (found_decimal) {
1460					tok->done = E_TOKEN;
1461					tok_backup(tok, c);
1462					return ERRORTOKEN;
1463				}
1464			}
1465			if (c == 'l' || c == 'L')
1466				c = tok_nextc(tok);
1467		}
1468		else {
1469			/* Decimal */
1470			do {
1471				c = tok_nextc(tok);
1472			} while (isdigit(c));
1473			if (c == 'l' || c == 'L')
1474				c = tok_nextc(tok);
1475			else {
1476				/* Accept floating point numbers. */
1477				if (c == '.') {
1478		fraction:
1479					/* Fraction */
1480					do {
1481						c = tok_nextc(tok);
1482					} while (isdigit(c));
1483				}
1484				if (c == 'e' || c == 'E') {
1485		exponent:
1486					/* Exponent part */
1487					c = tok_nextc(tok);
1488					if (c == '+' || c == '-')
1489						c = tok_nextc(tok);
1490					if (!isdigit(c)) {
1491						tok->done = E_TOKEN;
1492						tok_backup(tok, c);
1493						return ERRORTOKEN;
1494					}
1495					do {
1496						c = tok_nextc(tok);
1497					} while (isdigit(c));
1498				}
1499#ifndef WITHOUT_COMPLEX
1500				if (c == 'j' || c == 'J')
1501					/* Imaginary part */
1502		imaginary:
1503					c = tok_nextc(tok);
1504#endif
1505			}
1506		}
1507		tok_backup(tok, c);
1508		*p_start = tok->start;
1509		*p_end = tok->cur;
1510		return NUMBER;
1511	}
1512
1513  letter_quote:
1514	/* String */
1515	if (c == '\'' || c == '"') {
1516		Py_ssize_t quote2 = tok->cur - tok->start + 1;
1517		int quote = c;
1518		int triple = 0;
1519		int tripcount = 0;
1520		for (;;) {
1521			c = tok_nextc(tok);
1522			if (c == '\n') {
1523				if (!triple) {
1524					tok->done = E_EOLS;
1525					tok_backup(tok, c);
1526					return ERRORTOKEN;
1527				}
1528				tripcount = 0;
1529				tok->cont_line = 1; /* multiline string. */
1530			}
1531			else if (c == EOF) {
1532				if (triple)
1533					tok->done = E_EOFS;
1534				else
1535					tok->done = E_EOLS;
1536				tok->cur = tok->inp;
1537				return ERRORTOKEN;
1538			}
1539			else if (c == quote) {
1540				tripcount++;
1541				if (tok->cur - tok->start == quote2) {
1542					c = tok_nextc(tok);
1543					if (c == quote) {
1544						triple = 1;
1545						tripcount = 0;
1546						continue;
1547					}
1548					tok_backup(tok, c);
1549				}
1550				if (!triple || tripcount == 3)
1551					break;
1552			}
1553			else if (c == '\\') {
1554				tripcount = 0;
1555				c = tok_nextc(tok);
1556				if (c == EOF) {
1557					tok->done = E_EOLS;
1558					tok->cur = tok->inp;
1559					return ERRORTOKEN;
1560				}
1561			}
1562			else
1563				tripcount = 0;
1564		}
1565		*p_start = tok->start;
1566		*p_end = tok->cur;
1567		return STRING;
1568	}
1569
1570	/* Line continuation */
1571	if (c == '\\') {
1572		c = tok_nextc(tok);
1573		if (c != '\n') {
1574			tok->done = E_LINECONT;
1575			tok->cur = tok->inp;
1576			return ERRORTOKEN;
1577		}
1578		tok->cont_line = 1;
1579		goto again; /* Read next line */
1580	}
1581
1582	/* Check for two-character token */
1583	{
1584		int c2 = tok_nextc(tok);
1585		int token = PyToken_TwoChars(c, c2);
1586#ifndef PGEN
1587		if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1588			if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1589					       "<> not supported in 3.x; use !=",
1590					       tok->filename, tok->lineno,
1591					       NULL, NULL)) {
1592				return ERRORTOKEN;
1593			}
1594		}
1595#endif
1596		if (token != OP) {
1597			int c3 = tok_nextc(tok);
1598			int token3 = PyToken_ThreeChars(c, c2, c3);
1599			if (token3 != OP) {
1600				token = token3;
1601			} else {
1602				tok_backup(tok, c3);
1603			}
1604			*p_start = tok->start;
1605			*p_end = tok->cur;
1606			return token;
1607		}
1608		tok_backup(tok, c2);
1609	}
1610
1611	/* Keep track of parentheses nesting level */
1612	switch (c) {
1613	case '(':
1614	case '[':
1615	case '{':
1616		tok->level++;
1617		break;
1618	case ')':
1619	case ']':
1620	case '}':
1621		tok->level--;
1622		break;
1623	}
1624
1625	/* Punctuation character */
1626	*p_start = tok->start;
1627	*p_end = tok->cur;
1628	return PyToken_OneChar(c);
1629}
1630
1631int
1632PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1633{
1634	int result = tok_get(tok, p_start, p_end);
1635	if (tok->decoding_erred) {
1636		result = ERRORTOKEN;
1637		tok->done = E_DECODE;
1638	}
1639	return result;
1640}
1641
1642/* This function is only called from parsetok. However, it cannot live
1643   there, as it must be empty for PGEN, and we can check for PGEN only
1644   in this file. */
1645
1646#if defined(PGEN) || !defined(Py_USING_UNICODE)
1647char*
1648PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1649{
1650	return NULL;
1651}
1652#else
1653#ifdef Py_USING_UNICODE
1654static PyObject *
1655dec_utf8(const char *enc, const char *text, size_t len) {
1656	PyObject *ret = NULL;
1657	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1658	if (unicode_text) {
1659		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1660		Py_DECREF(unicode_text);
1661	}
1662	if (!ret) {
1663		PyErr_Clear();
1664	}
1665	return ret;
1666}
1667char *
1668PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1669{
1670	char *text = NULL;
1671	if (tok->encoding) {
1672		/* convert source to original encoding */
1673		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1674		if (lineobj != NULL) {
1675			int linelen = PyString_Size(lineobj);
1676			const char *line = PyString_AsString(lineobj);
1677			text = PyObject_MALLOC(linelen + 1);
1678			if (text != NULL && line != NULL) {
1679				if (linelen)
1680					strncpy(text, line, linelen);
1681				text[linelen] = '\0';
1682			}
1683			Py_DECREF(lineobj);
1684
1685			/* adjust error offset */
1686			if (*offset > 1) {
1687				PyObject *offsetobj = dec_utf8(tok->encoding,
1688							       tok->buf, *offset-1);
1689				if (offsetobj) {
1690					*offset = PyString_Size(offsetobj) + 1;
1691					Py_DECREF(offsetobj);
1692				}
1693			}
1694
1695		}
1696	}
1697	return text;
1698
1699}
1700#endif /* defined(Py_USING_UNICODE) */
1701#endif
1702
1703
1704#ifdef Py_DEBUG
1705
1706void
1707tok_dump(int type, char *start, char *end)
1708{
1709	printf("%s", _PyParser_TokenNames[type]);
1710	if (type == NAME || type == NUMBER || type == STRING || type == OP)
1711		printf("(%.*s)", (int)(end - start), start);
1712}
1713
1714#endif
1715