tokenizer.c revision 49c5da1d88f605248167f4d95b1dfe08c1f703c7
1
2/* Tokenizer implementation */
3
4#include "Python.h"
5#include "pgenheaders.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
21extern char *PyOS_Readline(FILE *, FILE *, char *);
22/* Return malloc'ed string including trailing \n;
23   empty malloc'ed string for EOF;
24   NULL if interrupted */
25
26/* Don't ever change this -- it would break the portability of Python code */
27#define TABSIZE 8
28
29/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c)		(c)
33#else
34#define Py_CHARMASK(c)		((c) & 0xff)
35#endif
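/* Illustrative example: on a platform where plain char is signed, the byte
   0xE9 stored in a char reads back as -23, so

       char c = '\xE9';
       int n = Py_CHARMASK(c);

   leaves n == 0xE9 (233) regardless of the signedness of char, whereas a
   plain "int n = c;" would give -23. */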
36
37/* Forward */
38static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
41
42/* Token names */
43
44char *_PyParser_TokenNames[] = {
45	"ENDMARKER",
46	"NAME",
47	"NUMBER",
48	"STRING",
49	"NEWLINE",
50	"INDENT",
51	"DEDENT",
52	"LPAR",
53	"RPAR",
54	"LSQB",
55	"RSQB",
56	"COLON",
57	"COMMA",
58	"SEMI",
59	"PLUS",
60	"MINUS",
61	"STAR",
62	"SLASH",
63	"VBAR",
64	"AMPER",
65	"LESS",
66	"GREATER",
67	"EQUAL",
68	"DOT",
69	"PERCENT",
70	"BACKQUOTE",
71	"LBRACE",
72	"RBRACE",
73	"EQEQUAL",
74	"NOTEQUAL",
75	"LESSEQUAL",
76	"GREATEREQUAL",
77	"TILDE",
78	"CIRCUMFLEX",
79	"LEFTSHIFT",
80	"RIGHTSHIFT",
81	"DOUBLESTAR",
82	"PLUSEQUAL",
83	"MINEQUAL",
84	"STAREQUAL",
85	"SLASHEQUAL",
86	"PERCENTEQUAL",
87	"AMPEREQUAL",
88	"VBAREQUAL",
89	"CIRCUMFLEXEQUAL",
90	"LEFTSHIFTEQUAL",
91	"RIGHTSHIFTEQUAL",
92	"DOUBLESTAREQUAL",
93	"DOUBLESLASH",
94	"DOUBLESLASHEQUAL",
95	"AT",
96	/* This table must match the #defines in token.h! */
97	"OP",
98	"<ERRORTOKEN>",
99	"<N_TOKENS>"
100};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
106tok_new(void)
107{
108	struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
109	if (tok == NULL)
110		return NULL;
111	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
112	tok->done = E_OK;
113	tok->fp = NULL;
114	tok->tabsize = TABSIZE;
115	tok->indent = 0;
116	tok->indstack[0] = 0;
117	tok->atbol = 1;
118	tok->pendin = 0;
119	tok->prompt = tok->nextprompt = NULL;
120	tok->lineno = 0;
121	tok->level = 0;
122	tok->filename = NULL;
123	tok->altwarning = 0;
124	tok->alterror = 0;
125	tok->alttabsize = 1;
126	tok->altindstack[0] = 0;
127	tok->decoding_state = 0;
128	tok->decoding_erred = 0;
129	tok->read_coding_spec = 0;
130	tok->encoding = NULL;
131	tok->cont_line = 0;
132#ifndef PGEN
133	tok->decoding_readline = NULL;
134	tok->decoding_buffer = NULL;
135#endif
136	return tok;
137}
138
139#ifdef PGEN
140
141static char *
142decoding_fgets(char *s, int size, struct tok_state *tok)
143{
144	return fgets(s, size, tok->fp);
145}
146
147static int
148decoding_feof(struct tok_state *tok)
149{
150	return feof(tok->fp);
151}
152
153static const char *
154decode_str(const char *str, struct tok_state *tok)
155{
156	return str;
157}
158
159#else /* PGEN */
160
161static char *
162error_ret(struct tok_state *tok) /* XXX */
163{
164	tok->decoding_erred = 1;
165	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
166		PyMem_DEL(tok->buf);
167	tok->buf = NULL;
168	return NULL;		/* as if it were EOF */
169}
170
171static char *
172new_string(const char *s, Py_ssize_t len)
173{
174	char* result = PyMem_NEW(char, len + 1);
175	if (result != NULL) {
176		memcpy(result, s, len);
177		result[len] = '\0';
178	}
179	return result;
180}
181
182static char *
183get_normal_name(char *s)	/* for utf-8 and latin-1 */
184{
185	char buf[13];
186	int i;
187	for (i = 0; i < 12; i++) {
188		int c = s[i];
189		if (c == '\0') break;
190		else if (c == '_') buf[i] = '-';
191		else buf[i] = tolower(c);
192	}
193	buf[i] = '\0';
194	if (strcmp(buf, "utf-8") == 0 ||
195	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
196	else if (strcmp(buf, "latin-1") == 0 ||
197		 strcmp(buf, "iso-8859-1") == 0 ||
198		 strcmp(buf, "iso-latin-1") == 0 ||
199		 strncmp(buf, "latin-1-", 8) == 0 ||
200		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
201		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
202	else return s;
203}
204
205/* Return the coding spec in S, or NULL if none is found.  */
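/* Illustrative examples of lines this recognizes (see PEP 263); the '#'
   must be preceded only by whitespace, and the encoding name may contain
   letters, digits, '-', '_' and '.':

       # -*- coding: utf-8 -*-
       # vim: set fileencoding=iso-8859-1 :
       # This Python file uses the following encoding: utf-8

   get_normal_name() (above) canonicalizes the common utf-8 and latin-1
   spellings before the result is returned. */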
206
207static char *
208get_coding_spec(const char *s, Py_ssize_t size)
209{
210	Py_ssize_t i;
211	/* Coding spec must be in a comment, and that comment must be
212	 * the only statement on the source code line. */
213	for (i = 0; i < size - 6; i++) {
214		if (s[i] == '#')
215			break;
216		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
217			return NULL;
218	}
219	for (; i < size - 6; i++) { /* XXX inefficient search */
220		const char* t = s + i;
221		if (strncmp(t, "coding", 6) == 0) {
222			const char* begin = NULL;
223			t += 6;
224			if (t[0] != ':' && t[0] != '=')
225				continue;
226			do {
227				t++;
228			} while (t[0] == '\x20' || t[0] == '\t');
229
230			begin = t;
231			while (isalnum(Py_CHARMASK(t[0])) ||
232			       t[0] == '-' || t[0] == '_' || t[0] == '.')
233				t++;
234
235			if (begin < t) {
236				char* r = new_string(begin, t - begin);
237				char* q = get_normal_name(r);
238				if (r != q) {
239					PyMem_DEL(r);
240					r = new_string(q, strlen(q));
241				}
242				return r;
243			}
244		}
245	}
246	return NULL;
247}
248
249/* Check whether the line contains a coding spec. If it does,
250   invoke the set_readline function for the new encoding.
251   This function receives the tok_state and the new encoding.
252   Return 1 on success, 0 on failure.  */
253
254static int
255check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
256		  int set_readline(struct tok_state *, const char *))
257{
258	char * cs;
259	int r = 1;
260
261	if (tok->cont_line)
262		/* It's a continuation line, so it can't be a coding spec. */
263		return 1;
264	cs = get_coding_spec(line, size);
265	if (cs != NULL) {
266		tok->read_coding_spec = 1;
267		if (tok->encoding == NULL) {
268			assert(tok->decoding_state == 1); /* raw */
269			if (strcmp(cs, "utf-8") == 0 ||
270			    strcmp(cs, "iso-8859-1") == 0) {
271				tok->encoding = cs;
272			} else {
273#ifdef Py_USING_UNICODE
274				r = set_readline(tok, cs);
275				if (r) {
276					tok->encoding = cs;
277					tok->decoding_state = -1;
278				}
279				else
280					PyMem_DEL(cs);
281#else
282                                /* Without Unicode support, we cannot
283                                   process the coding spec. Since there
284                                   won't be any Unicode literals, that
285                                   won't matter. */
286				PyMem_DEL(cs);
287#endif
288			}
289	} else {	/* an encoding was already set (e.g. from the BOM); cs must agree with it */
290			r = (strcmp(tok->encoding, cs) == 0);
291			PyMem_DEL(cs);
292		}
293	}
294	if (!r) {
295		cs = tok->encoding;
296		if (!cs)
297			cs = "with BOM";
298		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
299	}
300	return r;
301}
302
303/* See whether the file starts with a BOM. If it does,
304   invoke the set_readline function with the new encoding.
305   Return 1 on success, 0 on failure.  */
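/* Only the UTF-8 BOM (the byte sequence 0xEF 0xBB 0xBF) is acted upon at
   present; the UTF-16 cases are compiled out below.  When a BOM is found,
   tok->encoding is set to "utf-8", and a coding-spec comment encountered
   later merely has to agree with it (see check_coding_spec above). */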
306
307static int
308check_bom(int get_char(struct tok_state *),
309	  void unget_char(int, struct tok_state *),
310	  int set_readline(struct tok_state *, const char *),
311	  struct tok_state *tok)
312{
313	int ch = get_char(tok);
314	tok->decoding_state = 1;
315	if (ch == EOF) {
316		return 1;
317	} else if (ch == 0xEF) {
318		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
319		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
320#if 0
321	/* Disable support for UTF-16 BOMs until a decision
322	   is made whether this needs to be supported.  */
323	} else if (ch == 0xFE) {
324		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
325		if (!set_readline(tok, "utf-16-be")) return 0;
326		tok->decoding_state = -1;
327	} else if (ch == 0xFF) {
328		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
329		if (!set_readline(tok, "utf-16-le")) return 0;
330		tok->decoding_state = -1;
331#endif
332	} else {
333		unget_char(ch, tok);
334		return 1;
335	}
336	if (tok->encoding != NULL)
337		PyMem_DEL(tok->encoding);
338	tok->encoding = new_string("utf-8", 5);	/* resulting string is in utf-8 */
339	return 1;
340  NON_BOM:
341	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
342	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
343	return 1;
344}
345
346/* Read a line of text from TOK into S, using the stream in TOK.
347   Return NULL on failure, else S.
348
349   On entry, tok->decoding_buffer will be one of:
350     1) NULL: need to call tok->decoding_readline to get a new line
351     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
352           stored the result in tok->decoding_buffer
353     3) PyStringObject *: previous call to fp_readl did not have enough room
354           (in the s buffer) to copy entire contents of the line read
355           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
356           In this case, fp_readl is called in a loop (with an expanded buffer)
357           until the buffer ends with a '\n' (or until the end of the file is
358           reached): see tok_nextc and its calls to decoding_fgets.
359*/
360
361static char *
362fp_readl(char *s, int size, struct tok_state *tok)
363{
364#ifndef Py_USING_UNICODE
365	/* In a non-Unicode build, this should never be called. */
366	Py_FatalError("fp_readl should not be called in this build.");
367	return NULL; /* Keep compiler happy (not reachable) */
368#else
369	PyObject* utf8 = NULL;
370	PyObject* buf = tok->decoding_buffer;
371	char *str;
372	Py_ssize_t utf8len;
373
374	/* Ask for one less byte so we can terminate it */
375	assert(size > 0);
376	size--;
377
378	if (buf == NULL) {
379		buf = PyObject_CallObject(tok->decoding_readline, NULL);
380		if (buf == NULL)
381			return error_ret(tok);
382	} else {
383		tok->decoding_buffer = NULL;
384		if (PyString_CheckExact(buf))
385			utf8 = buf;
386	}
387	if (utf8 == NULL) {
388		utf8 = PyUnicode_AsUTF8String(buf);
389		Py_DECREF(buf);
390		if (utf8 == NULL)
391			return error_ret(tok);
392	}
393	str = PyString_AsString(utf8);
394	utf8len = PyString_GET_SIZE(utf8);
395	if (utf8len > size) {
396		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
397		if (tok->decoding_buffer == NULL) {
398			Py_DECREF(utf8);
399			return error_ret(tok);
400		}
401		utf8len = size;
402	}
403	memcpy(s, str, utf8len);
404	s[utf8len] = '\0';
405	Py_DECREF(utf8);
406	if (utf8len == 0) return NULL; /* EOF */
407	return s;
408#endif
409}
410
411/* Set the readline function for TOK to a StreamReader's
412   readline function. The StreamReader is created for the codec named ENC.
413
414   This function is called from check_bom and check_coding_spec.
415
416   ENC is usually identical to the future value of tok->encoding,
417   except for the (currently unsupported) case of UTF-16.
418
419   Return 1 on success, 0 on failure. */
420
421static int
422fp_setreadl(struct tok_state *tok, const char* enc)
423{
424	PyObject *reader, *stream, *readline;
425
426	/* XXX: constify filename argument. */
427	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
428	if (stream == NULL)
429		return 0;
430
431	reader = PyCodec_StreamReader(enc, stream, NULL);
432	Py_DECREF(stream);
433	if (reader == NULL)
434		return 0;
435
436	readline = PyObject_GetAttrString(reader, "readline");
437	Py_DECREF(reader);
438	if (readline == NULL)
439		return 0;
440
441	tok->decoding_readline = readline;
442	return 1;
443}
444
445/* Fetch the next byte from TOK. */
446
447static int fp_getc(struct tok_state *tok) {
448	return getc(tok->fp);
449}
450
451/* Unfetch the last byte back into TOK.  */
452
453static void fp_ungetc(int c, struct tok_state *tok) {
454	ungetc(c, tok->fp);
455}
456
457/* Read a line of input from TOK. Determine encoding
458   if necessary.  */
459
460static char *
461decoding_fgets(char *s, int size, struct tok_state *tok)
462{
463	char *line = NULL;
464	int badchar = 0;
465	for (;;) {
466		if (tok->decoding_state < 0) {
467			/* We already have a codec associated with
468			   this input. */
469			line = fp_readl(s, size, tok);
470			break;
471		} else if (tok->decoding_state > 0) {
472			/* We want a 'raw' read. */
473			line = Py_UniversalNewlineFgets(s, size,
474							tok->fp, NULL);
475			break;
476		} else {
477			/* We have not yet determined the encoding.
478			   If an encoding is found, use the file-pointer
479			   reader functions from now on. */
480			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
481				return error_ret(tok);
482			assert(tok->decoding_state != 0);
483		}
484	}
485	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
486		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
487			return error_ret(tok);
488		}
489	}
490#ifndef PGEN
491	/* The default encoding is ASCII, so make sure we don't have any
492           non-ASCII bytes in it. */
493	if (line && !tok->encoding) {
494		unsigned char *c;
495		for (c = (unsigned char *)line; *c; c++)
496			if (*c > 127) {
497				badchar = *c;
498				break;
499			}
500	}
501	if (badchar) {
502		char buf[500];
503		/* Need to add 1 to the line number, since this line
504		   has not been counted yet.  */
505		sprintf(buf,
506			"Non-ASCII character '\\x%.2x' "
507			"in file %.200s on line %i, "
508			"but no encoding declared; "
509			"see http://www.python.org/peps/pep-0263.html for details",
510			badchar, tok->filename, tok->lineno + 1);
511		PyErr_SetString(PyExc_SyntaxError, buf);
512		return error_ret(tok);
513	}
514#endif
515	return line;
516}
517
518static int
519decoding_feof(struct tok_state *tok)
520{
521	if (tok->decoding_state >= 0) {
522		return feof(tok->fp);
523	} else {
524		PyObject* buf = tok->decoding_buffer;
525		if (buf == NULL) {
526			buf = PyObject_CallObject(tok->decoding_readline, NULL);
527			if (buf == NULL) {
528				error_ret(tok);
529				return 1;
530			} else {
531				tok->decoding_buffer = buf;
532			}
533		}
534		return PyObject_Length(buf) == 0;
535	}
536}
537
538/* Fetch a byte from TOK, using the string buffer. */
539
540static int buf_getc(struct tok_state *tok) {
541	return Py_CHARMASK(*tok->str++);
542}
543
544/* Unfetch a byte from TOK, using the string buffer. */
545
546static void buf_ungetc(int c, struct tok_state *tok) {
547	tok->str--;
548	assert(Py_CHARMASK(*tok->str) == c);	/* tok->str may point to read-only segment */
549}
550
551/* Set the readline function for TOK to ENC. For the string-based
552   tokenizer, this means to just record the encoding. */
553
554static int buf_setreadl(struct tok_state *tok, const char* enc) {
555	tok->enc = enc;
556	return 1;
557}
558
559/* Return a UTF-8 encoded Python string object from the
560   C byte string STR, which is encoded with ENC. */
561
562#ifdef Py_USING_UNICODE
563static PyObject *
564translate_into_utf8(const char* str, const char* enc) {
565	PyObject *utf8;
566	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
567	if (buf == NULL)
568		return NULL;
569	utf8 = PyUnicode_AsUTF8String(buf);
570	Py_DECREF(buf);
571	return utf8;
572}
573#endif
574
575/* Decode a byte string STR for use as the buffer of TOK.
576   Look for encoding declarations inside STR, and record them
577   inside TOK.  */
578
579static const char *
580decode_str(const char *str, struct tok_state *tok)
581{
582	PyObject* utf8 = NULL;
583	const char *s;
584	int lineno = 0;
585	tok->enc = NULL;
586	tok->str = str;
587	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
588		return error_ret(tok);
589	str = tok->str;		/* string after BOM if any */
590	assert(str);
591#ifdef Py_USING_UNICODE
592	if (tok->enc != NULL) {
593		utf8 = translate_into_utf8(str, tok->enc);
594		if (utf8 == NULL)
595			return error_ret(tok);
596		str = PyString_AsString(utf8);
597	}
598#endif
599	for (s = str;; s++) {
600		if (*s == '\0') break;
601		else if (*s == '\n') {
602			lineno++;
603			if (lineno == 2) break;
604		}
605	}
606	tok->enc = NULL;
607	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
608		return error_ret(tok);
609#ifdef Py_USING_UNICODE
610	if (tok->enc != NULL) {
611		assert(utf8 == NULL);
612		utf8 = translate_into_utf8(str, tok->enc);
613		if (utf8 == NULL) {
614			PyErr_Format(PyExc_SyntaxError,
615				"unknown encoding: %s", tok->enc);
616			return error_ret(tok);
617		}
618		str = PyString_AsString(utf8);
619	}
620#endif
621	assert(tok->decoding_buffer == NULL);
622	tok->decoding_buffer = utf8; /* CAUTION */
623	return str;
624}
625
626#endif /* PGEN */
627
628/* Set up tokenizer for string */
629
630struct tok_state *
631PyTokenizer_FromString(const char *str)
632{
633	struct tok_state *tok = tok_new();
634	if (tok == NULL)
635		return NULL;
636	str = (char *)decode_str(str, tok);
637	if (str == NULL) {
638		PyTokenizer_Free(tok);
639		return NULL;
640	}
641
642	/* XXX: constify members. */
643	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
644	return tok;
645}
646
647
648/* Set up tokenizer for file */
649
650struct tok_state *
651PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
652{
653	struct tok_state *tok = tok_new();
654	if (tok == NULL)
655		return NULL;
656	if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
657		PyTokenizer_Free(tok);
658		return NULL;
659	}
660	tok->cur = tok->inp = tok->buf;
661	tok->end = tok->buf + BUFSIZ;
662	tok->fp = fp;
663	tok->prompt = ps1;
664	tok->nextprompt = ps2;
665	return tok;
666}
667
668
669/* Free a tok_state structure */
670
671void
672PyTokenizer_Free(struct tok_state *tok)
673{
674	if (tok->encoding != NULL)
675		PyMem_DEL(tok->encoding);
676#ifndef PGEN
677	Py_XDECREF(tok->decoding_readline);
678	Py_XDECREF(tok->decoding_buffer);
679#endif
680	if (tok->fp != NULL && tok->buf != NULL)
681		PyMem_DEL(tok->buf);
682	PyMem_DEL(tok);
683}
684
685#if !defined(PGEN) && defined(Py_USING_UNICODE)
686static int
687tok_stdin_decode(struct tok_state *tok, char **inp)
688{
689	PyObject *enc, *sysstdin, *decoded, *utf8;
690	const char *encoding;
691	char *converted;
692
693	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
694		return 0;
695	sysstdin = PySys_GetObject("stdin");
696	if (sysstdin == NULL || !PyFile_Check(sysstdin))
697		return 0;
698
699	enc = ((PyFileObject *)sysstdin)->f_encoding;
700	if (enc == NULL || !PyString_Check(enc))
701		return 0;
702	Py_INCREF(enc);
703
704	encoding = PyString_AsString(enc);
705	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
706	if (decoded == NULL)
707		goto error_clear;
708
709	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
710	Py_DECREF(decoded);
711	if (utf8 == NULL)
712		goto error_clear;
713
714	converted = new_string(PyString_AsString(utf8), PyString_Size(utf8));
715	Py_DECREF(utf8);
716	if (converted == NULL)
717		goto error_nomem;
718
719	PyMem_FREE(*inp);
720	*inp = converted;
721	if (tok->encoding != NULL)
722		PyMem_DEL(tok->encoding);
723	tok->encoding = new_string(encoding, strlen(encoding));
724	if (tok->encoding == NULL)
725		goto error_nomem;
726
727	Py_DECREF(enc);
728	return 0;
729
730error_nomem:
731	Py_DECREF(enc);
732	tok->done = E_NOMEM;
733	return -1;
734
735error_clear:
736	/* Fallback to iso-8859-1: for backward compatibility */
737	Py_DECREF(enc);
738	PyErr_Clear();
739	return 0;
740}
741#endif
742
743/* Get next char, updating state; error code goes into tok->done */
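/* tok_nextc refills its buffer from one of three sources once the current
   buffer is exhausted:

     - tok->fp == NULL:      tokenizing an in-memory string; the buffer is
                             the (already decoded) string itself.
     - tok->prompt != NULL:  interactive input; the next line comes from
                             PyOS_Readline and is appended to the buffer
                             when a token is still in progress.
     - otherwise:            file input; lines are read, and decoded if an
                             encoding was determined, via decoding_fgets. */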
744
745static int
746tok_nextc(register struct tok_state *tok)
747{
748	for (;;) {
749		if (tok->cur != tok->inp) {
750			return Py_CHARMASK(*tok->cur++); /* Fast path */
751		}
752		if (tok->done != E_OK)
753			return EOF;
754		if (tok->fp == NULL) {
755			char *end = strchr(tok->inp, '\n');
756			if (end != NULL)
757				end++;
758			else {
759				end = strchr(tok->inp, '\0');
760				if (end == tok->inp) {
761					tok->done = E_EOF;
762					return EOF;
763				}
764			}
765			if (tok->start == NULL)
766				tok->buf = tok->cur;
767			tok->line_start = tok->cur;
768			tok->lineno++;
769			tok->inp = end;
770			return Py_CHARMASK(*tok->cur++);
771		}
772		if (tok->prompt != NULL) {
773			char *new = PyOS_Readline(stdin, stdout, tok->prompt);
774			if (tok->nextprompt != NULL)
775				tok->prompt = tok->nextprompt;
776			if (new == NULL)
777				tok->done = E_INTR;
778			else if (*new == '\0') {
779				PyMem_FREE(new);
780				tok->done = E_EOF;
781			}
782#if !defined(PGEN) && defined(Py_USING_UNICODE)
783			else if (tok_stdin_decode(tok, &new) != 0)
784				PyMem_FREE(new);
785#endif
786			else if (tok->start != NULL) {
787				size_t start = tok->start - tok->buf;
788				size_t oldlen = tok->cur - tok->buf;
789				size_t newlen = oldlen + strlen(new);
790				char *buf = tok->buf;
791				PyMem_RESIZE(buf, char, newlen+1);
792				tok->lineno++;
793				if (buf == NULL) {
794					PyMem_DEL(tok->buf);
795					tok->buf = NULL;
796					PyMem_FREE(new);
797					tok->done = E_NOMEM;
798					return EOF;
799				}
800				tok->buf = buf;
801				tok->cur = tok->buf + oldlen;
802				tok->line_start = tok->cur;
803				strcpy(tok->buf + oldlen, new);
804				PyMem_FREE(new);
805				tok->inp = tok->buf + newlen;
806				tok->end = tok->inp + 1;
807				tok->start = tok->buf + start;
808			}
809			else {
810				tok->lineno++;
811				if (tok->buf != NULL)
812					PyMem_DEL(tok->buf);
813				tok->buf = new;
814				tok->line_start = tok->buf;
815				tok->cur = tok->buf;
816				tok->line_start = tok->buf;
817				tok->inp = strchr(tok->buf, '\0');
818				tok->end = tok->inp + 1;
819			}
820		}
821		else {
822			int done = 0;
823			Py_ssize_t cur = 0;
824			char *pt;
825			if (tok->start == NULL) {
826				if (tok->buf == NULL) {
827					tok->buf = PyMem_NEW(char, BUFSIZ);
828					if (tok->buf == NULL) {
829						tok->done = E_NOMEM;
830						return EOF;
831					}
832					tok->end = tok->buf + BUFSIZ;
833				}
834				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
835					  tok) == NULL) {
836					tok->done = E_EOF;
837					done = 1;
838				}
839				else {
840					tok->done = E_OK;
841					tok->inp = strchr(tok->buf, '\0');
842					done = tok->inp[-1] == '\n';
843				}
844			}
845			else {
846				cur = tok->cur - tok->buf;
847				if (decoding_feof(tok)) {
848					tok->done = E_EOF;
849					done = 1;
850				}
851				else
852					tok->done = E_OK;
853			}
854			tok->lineno++;
855			/* Read until '\n' or EOF */
856			while (!done) {
857				Py_ssize_t curstart = tok->start == NULL ? -1 :
858					          tok->start - tok->buf;
859				Py_ssize_t curvalid = tok->inp - tok->buf;
860				Py_ssize_t newsize = curvalid + BUFSIZ;
861				char *newbuf = tok->buf;
862				PyMem_RESIZE(newbuf, char, newsize);
863				if (newbuf == NULL) {
864					tok->done = E_NOMEM;
865					tok->cur = tok->inp;
866					return EOF;
867				}
868				tok->buf = newbuf;
869				tok->inp = tok->buf + curvalid;
870				tok->end = tok->buf + newsize;
871				tok->start = curstart < 0 ? NULL :
872					     tok->buf + curstart;
873				if (decoding_fgets(tok->inp,
874					       (int)(tok->end - tok->inp),
875					       tok) == NULL) {
876					/* Last line does not end in \n,
877					   fake one */
878					strcpy(tok->inp, "\n");
879				}
880				tok->inp = strchr(tok->inp, '\0');
881				done = tok->inp[-1] == '\n';
882			}
883			tok->cur = tok->buf + cur;
884			tok->line_start = tok->cur;
885			/* replace "\r\n" with "\n" */
886			/* For Mac we leave the \r, giving a syntax error */
887			pt = tok->inp - 2;
888			if (pt >= tok->buf && *pt == '\r') {
889				*pt++ = '\n';
890				*pt = '\0';
891				tok->inp = pt;
892			}
893		}
894		if (tok->done != E_OK) {
895			if (tok->prompt != NULL)
896				PySys_WriteStderr("\n");
897			tok->cur = tok->inp;
898			return EOF;
899		}
900	}
901	/*NOTREACHED*/
902}
903
904
905/* Back-up one character */
906
907static void
908tok_backup(register struct tok_state *tok, register int c)
909{
910	if (c != EOF) {
911		if (--tok->cur < tok->buf)
912			Py_FatalError("tok_backup: begin of buffer");
913		if (*tok->cur != c)
914			*tok->cur = c;
915	}
916}
917
918
919/* Return the token corresponding to a single character */
920
921int
922PyToken_OneChar(int c)
923{
924	switch (c) {
925	case '(':	return LPAR;
926	case ')':	return RPAR;
927	case '[':	return LSQB;
928	case ']':	return RSQB;
929	case ':':	return COLON;
930	case ',':	return COMMA;
931	case ';':	return SEMI;
932	case '+':	return PLUS;
933	case '-':	return MINUS;
934	case '*':	return STAR;
935	case '/':	return SLASH;
936	case '|':	return VBAR;
937	case '&':	return AMPER;
938	case '<':	return LESS;
939	case '>':	return GREATER;
940	case '=':	return EQUAL;
941	case '.':	return DOT;
942	case '%':	return PERCENT;
943	case '`':	return BACKQUOTE;
944	case '{':	return LBRACE;
945	case '}':	return RBRACE;
946	case '^':	return CIRCUMFLEX;
947	case '~':	return TILDE;
948	case '@':       return AT;
949	default:	return OP;
950	}
951}
952
953
954int
955PyToken_TwoChars(int c1, int c2)
956{
957	switch (c1) {
958	case '=':
959		switch (c2) {
960		case '=':	return EQEQUAL;
961		}
962		break;
963	case '!':
964		switch (c2) {
965		case '=':	return NOTEQUAL;
966		}
967		break;
968	case '<':
969		switch (c2) {
970		case '>':	return NOTEQUAL;
971		case '=':	return LESSEQUAL;
972		case '<':	return LEFTSHIFT;
973		}
974		break;
975	case '>':
976		switch (c2) {
977		case '=':	return GREATEREQUAL;
978		case '>':	return RIGHTSHIFT;
979		}
980		break;
981	case '+':
982		switch (c2) {
983		case '=':	return PLUSEQUAL;
984		}
985		break;
986	case '-':
987		switch (c2) {
988		case '=':	return MINEQUAL;
989		}
990		break;
991	case '*':
992		switch (c2) {
993		case '*':	return DOUBLESTAR;
994		case '=':	return STAREQUAL;
995		}
996		break;
997	case '/':
998		switch (c2) {
999		case '/':	return DOUBLESLASH;
1000		case '=':	return SLASHEQUAL;
1001		}
1002		break;
1003	case '|':
1004		switch (c2) {
1005		case '=':	return VBAREQUAL;
1006		}
1007		break;
1008	case '%':
1009		switch (c2) {
1010		case '=':	return PERCENTEQUAL;
1011		}
1012		break;
1013	case '&':
1014		switch (c2) {
1015		case '=':	return AMPEREQUAL;
1016		}
1017		break;
1018	case '^':
1019		switch (c2) {
1020		case '=':	return CIRCUMFLEXEQUAL;
1021		}
1022		break;
1023	}
1024	return OP;
1025}
1026
1027int
1028PyToken_ThreeChars(int c1, int c2, int c3)
1029{
1030	switch (c1) {
1031	case '<':
1032		switch (c2) {
1033		case '<':
1034			switch (c3) {
1035			case '=':
1036				return LEFTSHIFTEQUAL;
1037			}
1038			break;
1039		}
1040		break;
1041	case '>':
1042		switch (c2) {
1043		case '>':
1044			switch (c3) {
1045			case '=':
1046				return RIGHTSHIFTEQUAL;
1047			}
1048			break;
1049		}
1050		break;
1051	case '*':
1052		switch (c2) {
1053		case '*':
1054			switch (c3) {
1055			case '=':
1056				return DOUBLESTAREQUAL;
1057			}
1058			break;
1059		}
1060		break;
1061	case '/':
1062		switch (c2) {
1063		case '/':
1064			switch (c3) {
1065			case '=':
1066				return DOUBLESLASHEQUAL;
1067			}
1068			break;
1069		}
1070		break;
1071	}
1072	return OP;
1073}
1074
1075static int
1076indenterror(struct tok_state *tok)
1077{
1078	if (tok->alterror) {
1079		tok->done = E_TABSPACE;
1080		tok->cur = tok->inp;
1081		return 1;
1082	}
1083	if (tok->altwarning) {
1084		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1085                                  "in indentation\n", tok->filename);
1086		tok->altwarning = 0;
1087	}
1088	return 0;
1089}
1090
1091
1092/* Get next token, after space stripping etc. */
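/* In outline, each call below proceeds as follows:

     1. At the start of a line, measure the indentation and queue up any
        INDENT/DEDENT tokens, which are then returned one per call.
     2. Skip spaces, tabs and formfeeds, then comments (watching for
        editor tab-size directives such as "tab-width:").
     3. Dispatch on the first significant character: identifier, newline,
        '.' or digit (number), quote (string), backslash continuation,
        two- and three-character operators, and finally single-character
        tokens via PyToken_OneChar. */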
1093
1094static int
1095tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1096{
1097	register int c;
1098	int blankline;
1099
1100	*p_start = *p_end = NULL;
1101  nextline:
1102	tok->start = NULL;
1103	blankline = 0;
1104
1105	/* Get indentation level */
1106	if (tok->atbol) {
1107		register int col = 0;
1108		register int altcol = 0;
1109		tok->atbol = 0;
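		/* col measures the indentation with tabs expanded to
		   tok->tabsize (8 by default); altcol measures it with the
		   alternate tab size of 1.  If two lines compare differently
		   under the two measures, the indentation depends on the tab
		   size and indenterror() is invoked below. */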
1110		for (;;) {
1111			c = tok_nextc(tok);
1112			if (c == ' ')
1113				col++, altcol++;
1114			else if (c == '\t') {
1115				col = (col/tok->tabsize + 1) * tok->tabsize;
1116				altcol = (altcol/tok->alttabsize + 1)
1117					* tok->alttabsize;
1118			}
1119			else if (c == '\014') /* Control-L (formfeed) */
1120				col = altcol = 0; /* For Emacs users */
1121			else
1122				break;
1123		}
1124		tok_backup(tok, c);
1125		if (c == '#' || c == '\n') {
1126			/* Lines with only whitespace and/or comments
1127			   shouldn't affect the indentation and are
1128			   not passed to the parser as NEWLINE tokens,
1129			   except *totally* empty lines in interactive
1130			   mode, which signal the end of a command group. */
1131			if (col == 0 && c == '\n' && tok->prompt != NULL)
1132				blankline = 0; /* Let it through */
1133			else
1134				blankline = 1; /* Ignore completely */
1135			/* We can't jump back right here since we still
1136			   may need to skip to the end of a comment */
1137		}
1138		if (!blankline && tok->level == 0) {
1139			if (col == tok->indstack[tok->indent]) {
1140				/* No change */
1141				if (altcol != tok->altindstack[tok->indent]) {
1142					if (indenterror(tok))
1143						return ERRORTOKEN;
1144				}
1145			}
1146			else if (col > tok->indstack[tok->indent]) {
1147				/* Indent -- always one */
1148				if (tok->indent+1 >= MAXINDENT) {
1149					tok->done = E_TOODEEP;
1150					tok->cur = tok->inp;
1151					return ERRORTOKEN;
1152				}
1153				if (altcol <= tok->altindstack[tok->indent]) {
1154					if (indenterror(tok))
1155						return ERRORTOKEN;
1156				}
1157				tok->pendin++;
1158				tok->indstack[++tok->indent] = col;
1159				tok->altindstack[tok->indent] = altcol;
1160			}
1161			else /* col < tok->indstack[tok->indent] */ {
1162				/* Dedent -- any number, must be consistent */
1163				while (tok->indent > 0 &&
1164					col < tok->indstack[tok->indent]) {
1165					tok->pendin--;
1166					tok->indent--;
1167				}
1168				if (col != tok->indstack[tok->indent]) {
1169					tok->done = E_DEDENT;
1170					tok->cur = tok->inp;
1171					return ERRORTOKEN;
1172				}
1173				if (altcol != tok->altindstack[tok->indent]) {
1174					if (indenterror(tok))
1175						return ERRORTOKEN;
1176				}
1177			}
1178		}
1179	}
1180
1181	tok->start = tok->cur;
1182
1183	/* Return pending indents/dedents */
1184	if (tok->pendin != 0) {
1185		if (tok->pendin < 0) {
1186			tok->pendin++;
1187			return DEDENT;
1188		}
1189		else {
1190			tok->pendin--;
1191			return INDENT;
1192		}
1193	}
1194
1195 again:
1196	tok->start = NULL;
1197	/* Skip spaces */
1198	do {
1199		c = tok_nextc(tok);
1200	} while (c == ' ' || c == '\t' || c == '\014');
1201
1202	/* Set start of current token */
1203	tok->start = tok->cur - 1;
1204
1205	/* Skip comment, while looking for tab-setting magic */
1206	if (c == '#') {
1207		static char *tabforms[] = {
1208			"tab-width:",		/* Emacs */
1209			":tabstop=",		/* vim, full form */
1210			":ts=",			/* vim, abbreviated form */
1211			"set tabsize=",		/* will vi never die? */
1212		/* more templates can be added here to support other editors */
1213		};
1214		char cbuf[80];
1215		char *tp, **cp;
1216		tp = cbuf;
1217		do {
1218			*tp++ = c = tok_nextc(tok);
1219		} while (c != EOF && c != '\n' &&
1220			 tp - cbuf + 1 < sizeof(cbuf));
1221		*tp = '\0';
1222		for (cp = tabforms;
1223		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1224		     cp++) {
1225			if ((tp = strstr(cbuf, *cp))) {
1226				int newsize = atoi(tp + strlen(*cp));
1227
1228				if (newsize >= 1 && newsize <= 40) {
1229					tok->tabsize = newsize;
1230					if (Py_VerboseFlag)
1231					    PySys_WriteStderr(
1232						"Tab size set to %d\n",
1233						newsize);
1234				}
1235			}
1236		}
1237		while (c != EOF && c != '\n')
1238			c = tok_nextc(tok);
1239	}
1240
1241	/* Check for EOF and errors now */
1242	if (c == EOF) {
1243		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1244	}
1245
1246	/* Identifier (most frequent token!) */
1247	if (isalpha(c) || c == '_') {
1248		/* Process r"", u"" and ur"" */
1249		switch (c) {
1250		case 'r':
1251		case 'R':
1252			c = tok_nextc(tok);
1253			if (c == '"' || c == '\'')
1254				goto letter_quote;
1255			break;
1256		case 'u':
1257		case 'U':
1258			c = tok_nextc(tok);
1259			if (c == 'r' || c == 'R')
1260				c = tok_nextc(tok);
1261			if (c == '"' || c == '\'')
1262				goto letter_quote;
1263			break;
1264		}
1265		while (isalnum(c) || c == '_') {
1266			c = tok_nextc(tok);
1267		}
1268		tok_backup(tok, c);
1269		*p_start = tok->start;
1270		*p_end = tok->cur;
1271		return NAME;
1272	}
1273
1274	/* Newline */
1275	if (c == '\n') {
1276		tok->atbol = 1;
1277		if (blankline || tok->level > 0)
1278			goto nextline;
1279		*p_start = tok->start;
1280		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
1281		tok->cont_line = 0;
1282		return NEWLINE;
1283	}
1284
1285	/* Period or number starting with period? */
1286	if (c == '.') {
1287		c = tok_nextc(tok);
1288		if (isdigit(c)) {
1289			goto fraction;
1290		}
1291		else {
1292			tok_backup(tok, c);
1293			*p_start = tok->start;
1294			*p_end = tok->cur;
1295			return DOT;
1296		}
1297	}
1298
1299	/* Number */
1300	if (isdigit(c)) {
1301		if (c == '0') {
1302			/* Hex or octal -- maybe. */
1303			c = tok_nextc(tok);
1304			if (c == '.')
1305				goto fraction;
1306#ifndef WITHOUT_COMPLEX
1307			if (c == 'j' || c == 'J')
1308				goto imaginary;
1309#endif
1310			if (c == 'x' || c == 'X') {
1311				/* Hex */
1312				do {
1313					c = tok_nextc(tok);
1314				} while (isxdigit(c));
1315			}
1316			else {
1317				int found_decimal = 0;
1318				/* Octal; c is first char of it */
1319				/* There's no 'isoctdigit' macro, sigh */
1320				while ('0' <= c && c < '8') {
1321					c = tok_nextc(tok);
1322				}
1323				if (isdigit(c)) {
1324					found_decimal = 1;
1325					do {
1326						c = tok_nextc(tok);
1327					} while (isdigit(c));
1328				}
1329				if (c == '.')
1330					goto fraction;
1331				else if (c == 'e' || c == 'E')
1332					goto exponent;
1333#ifndef WITHOUT_COMPLEX
1334				else if (c == 'j' || c == 'J')
1335					goto imaginary;
1336#endif
1337				else if (found_decimal) {
1338					tok->done = E_TOKEN;
1339					tok_backup(tok, c);
1340					return ERRORTOKEN;
1341				}
1342			}
1343			if (c == 'l' || c == 'L')
1344				c = tok_nextc(tok);
1345		}
1346		else {
1347			/* Decimal */
1348			do {
1349				c = tok_nextc(tok);
1350			} while (isdigit(c));
1351			if (c == 'l' || c == 'L')
1352				c = tok_nextc(tok);
1353			else {
1354				/* Accept floating point numbers. */
1355				if (c == '.') {
1356		fraction:
1357					/* Fraction */
1358					do {
1359						c = tok_nextc(tok);
1360					} while (isdigit(c));
1361				}
1362				if (c == 'e' || c == 'E') {
1363		exponent:
1364					/* Exponent part */
1365					c = tok_nextc(tok);
1366					if (c == '+' || c == '-')
1367						c = tok_nextc(tok);
1368					if (!isdigit(c)) {
1369						tok->done = E_TOKEN;
1370						tok_backup(tok, c);
1371						return ERRORTOKEN;
1372					}
1373					do {
1374						c = tok_nextc(tok);
1375					} while (isdigit(c));
1376				}
1377#ifndef WITHOUT_COMPLEX
1378				if (c == 'j' || c == 'J')
1379					/* Imaginary part */
1380		imaginary:
1381					c = tok_nextc(tok);
1382#endif
1383			}
1384		}
1385		tok_backup(tok, c);
1386		*p_start = tok->start;
1387		*p_end = tok->cur;
1388		return NUMBER;
1389	}
1390
1391  letter_quote:
1392	/* String */
1393	if (c == '\'' || c == '"') {
1394		Py_ssize_t quote2 = tok->cur - tok->start + 1;
1395		int quote = c;
1396		int triple = 0;
1397		int tripcount = 0;
1398		for (;;) {
1399			c = tok_nextc(tok);
1400			if (c == '\n') {
1401				if (!triple) {
1402					tok->done = E_EOLS;
1403					tok_backup(tok, c);
1404					return ERRORTOKEN;
1405				}
1406				tripcount = 0;
1407				tok->cont_line = 1; /* multiline string. */
1408			}
1409			else if (c == EOF) {
1410				if (triple)
1411					tok->done = E_EOFS;
1412				else
1413					tok->done = E_EOLS;
1414				tok->cur = tok->inp;
1415				return ERRORTOKEN;
1416			}
1417			else if (c == quote) {
1418				tripcount++;
1419				if (tok->cur - tok->start == quote2) {
1420					c = tok_nextc(tok);
1421					if (c == quote) {
1422						triple = 1;
1423						tripcount = 0;
1424						continue;
1425					}
1426					tok_backup(tok, c);
1427				}
1428				if (!triple || tripcount == 3)
1429					break;
1430			}
1431			else if (c == '\\') {
1432				tripcount = 0;
1433				c = tok_nextc(tok);
1434				if (c == EOF) {
1435					tok->done = E_EOLS;
1436					tok->cur = tok->inp;
1437					return ERRORTOKEN;
1438				}
1439			}
1440			else
1441				tripcount = 0;
1442		}
1443		*p_start = tok->start;
1444		*p_end = tok->cur;
1445		return STRING;
1446	}
1447
1448	/* Line continuation */
1449	if (c == '\\') {
1450		c = tok_nextc(tok);
1451		if (c != '\n') {
1452			tok->done = E_LINECONT;
1453			tok->cur = tok->inp;
1454			return ERRORTOKEN;
1455		}
1456		tok->cont_line = 1;
1457		goto again; /* Read next line */
1458	}
1459
1460	/* Check for two-character token */
1461	{
1462		int c2 = tok_nextc(tok);
1463		int token = PyToken_TwoChars(c, c2);
1464		if (token != OP) {
1465			int c3 = tok_nextc(tok);
1466			int token3 = PyToken_ThreeChars(c, c2, c3);
1467			if (token3 != OP) {
1468				token = token3;
1469			} else {
1470				tok_backup(tok, c3);
1471			}
1472			*p_start = tok->start;
1473			*p_end = tok->cur;
1474			return token;
1475		}
1476		tok_backup(tok, c2);
1477	}
1478
1479	/* Keep track of parentheses nesting level */
1480	switch (c) {
1481	case '(':
1482	case '[':
1483	case '{':
1484		tok->level++;
1485		break;
1486	case ')':
1487	case ']':
1488	case '}':
1489		tok->level--;
1490		break;
1491	}
1492
1493	/* Punctuation character */
1494	*p_start = tok->start;
1495	*p_end = tok->cur;
1496	return PyToken_OneChar(c);
1497}
1498
1499int
1500PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1501{
1502	int result = tok_get(tok, p_start, p_end);
1503	if (tok->decoding_erred) {
1504		result = ERRORTOKEN;
1505		tok->done = E_DECODE;
1506	}
1507	return result;
1508}
1509
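/* A minimal sketch of how a caller might drive this interface (illustrative
   only; the real drivers are in Parser/parsetok.c and the pgen build):

       char *start, *end;
       struct tok_state *tok = PyTokenizer_FromString("x = 1\n");
       if (tok == NULL)
           return;
       for (;;) {
           int type = PyTokenizer_Get(tok, &start, &end);
           if (type == ENDMARKER || type == ERRORTOKEN)
               break;
           ... use type and the bytes in [start, end) ...
       }
       PyTokenizer_Free(tok);

   The returned type is a token number from token.h; start and end point
   into the tokenizer's internal buffer, so callers normally copy the text
   before the next call. */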
1510#ifdef Py_DEBUG
1511
1512void
1513tok_dump(int type, char *start, char *end)
1514{
1515	printf("%s", _PyParser_TokenNames[type]);
1516	if (type == NAME || type == NUMBER || type == STRING || type == OP)
1517		printf("(%.*s)", (int)(end - start), start);
1518}
1519
1520#endif
1521