tokenizer.c revision 9fc1b96a19ef821174f5ce37d007b68a55b9ba67
1
2/* Tokenizer implementation */
3
4#include "Python.h"
5#include "pgenheaders.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
21extern char *PyOS_Readline(FILE *, FILE *, char *);
22/* Return malloc'ed string including trailing \n;
23   empty malloc'ed string for EOF;
24   NULL if interrupted */
25
26/* Don't ever change this -- it would break the portability of Python code */
27#define TABSIZE 8
28
29/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c)		(c)
33#else
34#define Py_CHARMASK(c)		((c) & 0xff)
35#endif
36
37/* Forward */
38static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
41
42/* Token names */
43
44char *_PyParser_TokenNames[] = {
45	"ENDMARKER",
46	"NAME",
47	"NUMBER",
48	"STRING",
49	"NEWLINE",
50	"INDENT",
51	"DEDENT",
52	"LPAR",
53	"RPAR",
54	"LSQB",
55	"RSQB",
56	"COLON",
57	"COMMA",
58	"SEMI",
59	"PLUS",
60	"MINUS",
61	"STAR",
62	"SLASH",
63	"VBAR",
64	"AMPER",
65	"LESS",
66	"GREATER",
67	"EQUAL",
68	"DOT",
69	"PERCENT",
70	"BACKQUOTE",
71	"LBRACE",
72	"RBRACE",
73	"EQEQUAL",
74	"NOTEQUAL",
75	"LESSEQUAL",
76	"GREATEREQUAL",
77	"TILDE",
78	"CIRCUMFLEX",
79	"LEFTSHIFT",
80	"RIGHTSHIFT",
81	"DOUBLESTAR",
82	"PLUSEQUAL",
83	"MINEQUAL",
84	"STAREQUAL",
85	"SLASHEQUAL",
86	"PERCENTEQUAL",
87	"AMPEREQUAL",
88	"VBAREQUAL",
89	"CIRCUMFLEXEQUAL",
90	"LEFTSHIFTEQUAL",
91	"RIGHTSHIFTEQUAL",
92	"DOUBLESTAREQUAL",
93	"DOUBLESLASH",
94	"DOUBLESLASHEQUAL",
95	"AT",
96	/* This table must match the #defines in token.h! */
97	"OP",
98	"<ERRORTOKEN>",
99	"<N_TOKENS>"
100};
101
102
103/* Create and initialize a new tok_state structure */
104
105static struct tok_state *
106tok_new(void)
107{
108	struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
109                                                sizeof(struct tok_state));
110	if (tok == NULL)
111		return NULL;
112	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
113	tok->done = E_OK;
114	tok->fp = NULL;
115	tok->tabsize = TABSIZE;
116	tok->indent = 0;
117	tok->indstack[0] = 0;
118	tok->atbol = 1;
119	tok->pendin = 0;
120	tok->prompt = tok->nextprompt = NULL;
121	tok->lineno = 0;
122	tok->level = 0;
123	tok->filename = NULL;
124	tok->altwarning = 0;
125	tok->alterror = 0;
126	tok->alttabsize = 1;
127	tok->altindstack[0] = 0;
128	tok->decoding_state = 0;
129	tok->decoding_erred = 0;
130	tok->read_coding_spec = 0;
131	tok->encoding = NULL;
132	tok->cont_line = 0;
133#ifndef PGEN
134	tok->decoding_readline = NULL;
135	tok->decoding_buffer = NULL;
136#endif
137	return tok;
138}
139
140#ifdef PGEN
141
142static char *
143decoding_fgets(char *s, int size, struct tok_state *tok)
144{
145	return fgets(s, size, tok->fp);
146}
147
148static int
149decoding_feof(struct tok_state *tok)
150{
151	return feof(tok->fp);
152}
153
154static const char *
155decode_str(const char *str, struct tok_state *tok)
156{
157	return str;
158}
159
160#else /* PGEN */
161
162static char *
163error_ret(struct tok_state *tok) /* XXX */
164{
165	tok->decoding_erred = 1;
166	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
167		PyMem_FREE(tok->buf);
168	tok->buf = NULL;
169	return NULL;		/* as if it were EOF */
170}
171
172static char *
173new_string(const char *s, Py_ssize_t len)
174{
175	char* result = (char *)PyMem_MALLOC(len + 1);
176	if (result != NULL) {
177		memcpy(result, s, len);
178		result[len] = '\0';
179	}
180	return result;
181}
182
183static char *
184get_normal_name(char *s)	/* for utf-8 and latin-1 */
185{
186	char buf[13];
187	int i;
188	for (i = 0; i < 12; i++) {
189		int c = s[i];
190		if (c == '\0') break;
191		else if (c == '_') buf[i] = '-';
192		else buf[i] = tolower(c);
193	}
194	buf[i] = '\0';
195	if (strcmp(buf, "utf-8") == 0 ||
196	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
197	else if (strcmp(buf, "latin-1") == 0 ||
198		 strcmp(buf, "iso-8859-1") == 0 ||
199		 strcmp(buf, "iso-latin-1") == 0 ||
200		 strncmp(buf, "latin-1-", 8) == 0 ||
201		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
202		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
203	else return s;
204}
205
206/* Return the coding spec in S, or NULL if none is found.  */
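/* For example, a line like "# -*- coding: utf-8 -*-" yields "utf-8". */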
207
208static char *
209get_coding_spec(const char *s, Py_ssize_t size)
210{
211	Py_ssize_t i;
212	/* Coding spec must be in a comment, and that comment must be
213	   the only statement on the source code line. */
214	for (i = 0; i < size - 6; i++) {
215		if (s[i] == '#')
216			break;
217		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218			return NULL;
219	}
220	for (; i < size - 6; i++) { /* XXX inefficient search */
221		const char* t = s + i;
222		if (strncmp(t, "coding", 6) == 0) {
223			const char* begin = NULL;
224			t += 6;
225			if (t[0] != ':' && t[0] != '=')
226				continue;
227			do {
228				t++;
229			} while (t[0] == '\x20' || t[0] == '\t');
230
231			begin = t;
232			while (isalnum(Py_CHARMASK(t[0])) ||
233			       t[0] == '-' || t[0] == '_' || t[0] == '.')
234				t++;
235
236			if (begin < t) {
237				char* r = new_string(begin, t - begin);
238				char* q = get_normal_name(r);
239				if (r != q) {
240					PyMem_FREE(r);
241					r = new_string(q, strlen(q));
242				}
243				return r;
244			}
245		}
246	}
247	return NULL;
248}
249
250/* Check whether the line contains a coding spec. If it does,
251   invoke the set_readline function for the new encoding.
252   This function receives the tok_state and the new encoding.
253   Return 1 on success, 0 on failure.  */
254
255static int
256check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
257		  int set_readline(struct tok_state *, const char *))
258{
259	char * cs;
260	int r = 1;
261
262	if (tok->cont_line)
263		/* It's a continuation line, so it can't be a coding spec. */
264		return 1;
265	cs = get_coding_spec(line, size);
266	if (cs != NULL) {
267		tok->read_coding_spec = 1;
268		if (tok->encoding == NULL) {
269			assert(tok->decoding_state == 1); /* raw */
270			if (strcmp(cs, "utf-8") == 0 ||
271			    strcmp(cs, "iso-8859-1") == 0) {
272				tok->encoding = cs;
273			} else {
274#ifdef Py_USING_UNICODE
275				r = set_readline(tok, cs);
276				if (r) {
277					tok->encoding = cs;
278					tok->decoding_state = -1;
279				}
280				else
281					PyMem_FREE(cs);
282#else
283                                /* Without Unicode support, we cannot
284                                   process the coding spec. Since there
285                                   won't be any Unicode literals, that
286                                   won't matter. */
287				PyMem_FREE(cs);
288#endif
289			}
290		} else {	/* then, compare cs with BOM */
291			r = (strcmp(tok->encoding, cs) == 0);
292			PyMem_FREE(cs);
293		}
294	}
295	if (!r) {
296		cs = tok->encoding;
297		if (!cs)
298			cs = "with BOM";
299		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300	}
301	return r;
302}
303
304/* See whether the file starts with a BOM. If it does,
305   invoke the set_readline function with the new encoding.
306   Return 1 on success, 0 on failure.  */
307
308static int
309check_bom(int get_char(struct tok_state *),
310	  void unget_char(int, struct tok_state *),
311	  int set_readline(struct tok_state *, const char *),
312	  struct tok_state *tok)
313{
314	int ch = get_char(tok);
315	tok->decoding_state = 1;
316	if (ch == EOF) {
317		return 1;
318	} else if (ch == 0xEF) {
319		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
320		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
321#if 0
322	/* Disable support for UTF-16 BOMs until a decision
323	   is made whether this needs to be supported.  */
324	} else if (ch == 0xFE) {
325		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
326		if (!set_readline(tok, "utf-16-be")) return 0;
327		tok->decoding_state = -1;
328	} else if (ch == 0xFF) {
329		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
330		if (!set_readline(tok, "utf-16-le")) return 0;
331		tok->decoding_state = -1;
332#endif
333	} else {
334		unget_char(ch, tok);
335		return 1;
336	}
337	if (tok->encoding != NULL)
338		PyMem_FREE(tok->encoding);
339	tok->encoding = new_string("utf-8", 5);	/* the result is in utf-8 */
340	return 1;
341  NON_BOM:
342	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
343	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
344	return 1;
345}
346
347/* Read a line of text from TOK into S, using the stream in TOK.
348   Return NULL on failure, else S.
349
350   On entry, tok->decoding_buffer will be one of:
351     1) NULL: need to call tok->decoding_readline to get a new line
352     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
353           stored the result in tok->decoding_buffer
354     3) PyStringObject *: previous call to fp_readl did not have enough room
355           (in the s buffer) to copy entire contents of the line read
356           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
357           In this case, fp_readl is called in a loop (with an expanded buffer)
358           until the buffer ends with a '\n' (or until the end of the file is
359           reached): see tok_nextc and its calls to decoding_fgets.
360*/
361
362static char *
363fp_readl(char *s, int size, struct tok_state *tok)
364{
365#ifndef Py_USING_UNICODE
366	/* In a non-Unicode build, this should never be called. */
367	Py_FatalError("fp_readl should not be called in this build.");
368	return NULL; /* Keep compiler happy (not reachable) */
369#else
370	PyObject* utf8 = NULL;
371	PyObject* buf = tok->decoding_buffer;
372	char *str;
373	Py_ssize_t utf8len;
374
375	/* Ask for one less byte so we can terminate it */
376	assert(size > 0);
377	size--;
378
379	if (buf == NULL) {
380		buf = PyObject_CallObject(tok->decoding_readline, NULL);
381		if (buf == NULL)
382			return error_ret(tok);
383	} else {
384		tok->decoding_buffer = NULL;
385		if (PyString_CheckExact(buf))
386			utf8 = buf;
387	}
388	if (utf8 == NULL) {
389		utf8 = PyUnicode_AsUTF8String(buf);
390		Py_DECREF(buf);
391		if (utf8 == NULL)
392			return error_ret(tok);
393	}
394	str = PyString_AsString(utf8);
395	utf8len = PyString_GET_SIZE(utf8);
396	if (utf8len > size) {
397		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
398		if (tok->decoding_buffer == NULL) {
399			Py_DECREF(utf8);
400			return error_ret(tok);
401		}
402		utf8len = size;
403	}
404	memcpy(s, str, utf8len);
405	s[utf8len] = '\0';
406	Py_DECREF(utf8);
407	if (utf8len == 0) return NULL; /* EOF */
408	return s;
409#endif
410}
411
412/* Set the readline function for TOK to a StreamReader's
413   readline function. The StreamReader is created for the encoding ENC.
414
415   This function is called from check_bom and check_coding_spec.
416
417   ENC is usually identical to the future value of tok->encoding,
418   except for the (currently unsupported) case of UTF-16.
419
420   Return 1 on success, 0 on failure. */
421
422static int
423fp_setreadl(struct tok_state *tok, const char* enc)
424{
425	PyObject *reader, *stream, *readline;
426
427	/* XXX: constify filename argument. */
428	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
429	if (stream == NULL)
430		return 0;
431
432	reader = PyCodec_StreamReader(enc, stream, NULL);
433	Py_DECREF(stream);
434	if (reader == NULL)
435		return 0;
436
437	readline = PyObject_GetAttrString(reader, "readline");
438	Py_DECREF(reader);
439	if (readline == NULL)
440		return 0;
441
442	tok->decoding_readline = readline;
443	return 1;
444}
445
446/* Fetch the next byte from TOK. */
447
448static int fp_getc(struct tok_state *tok) {
449	return getc(tok->fp);
450}
451
452/* Unfetch the last byte back into TOK.  */
453
454static void fp_ungetc(int c, struct tok_state *tok) {
455	ungetc(c, tok->fp);
456}
457
458/* Read a line of input from TOK. Determine encoding
459   if necessary.  */
460
461static char *
462decoding_fgets(char *s, int size, struct tok_state *tok)
463{
464	char *line = NULL;
465	int badchar = 0;
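	/* tok->decoding_state: 0 means the encoding has not been determined
	   yet, > 0 means raw reads via fgets, and < 0 means a codec readline
	   function has been installed (see check_bom and check_coding_spec). */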
466	for (;;) {
467		if (tok->decoding_state < 0) {
468			/* We already have a codec associated with
469			   this input. */
470			line = fp_readl(s, size, tok);
471			break;
472		} else if (tok->decoding_state > 0) {
473			/* We want a 'raw' read. */
474			line = Py_UniversalNewlineFgets(s, size,
475							tok->fp, NULL);
476			break;
477		} else {
478			/* We have not yet determined the encoding.
479			   If an encoding is found, use the file-pointer
480			   reader functions from now on. */
481			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
482				return error_ret(tok);
483			assert(tok->decoding_state != 0);
484		}
485	}
486	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
487		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
488			return error_ret(tok);
489		}
490	}
491#ifndef PGEN
492	/* The default encoding is ASCII, so make sure we don't have any
493           non-ASCII bytes in it. */
494	if (line && !tok->encoding) {
495		unsigned char *c;
496		for (c = (unsigned char *)line; *c; c++)
497			if (*c > 127) {
498				badchar = *c;
499				break;
500			}
501	}
502	if (badchar) {
503		char buf[500];
504		/* Need to add 1 to the line number, since this line
505		   has not been counted yet.  */
506		sprintf(buf,
507			"Non-ASCII character '\\x%.2x' "
508			"in file %.200s on line %i, "
509			"but no encoding declared; "
510			"see http://www.python.org/peps/pep-0263.html for details",
511			badchar, tok->filename, tok->lineno + 1);
512		PyErr_SetString(PyExc_SyntaxError, buf);
513		return error_ret(tok);
514	}
515#endif
516	return line;
517}
518
519static int
520decoding_feof(struct tok_state *tok)
521{
522	if (tok->decoding_state >= 0) {
523		return feof(tok->fp);
524	} else {
525		PyObject* buf = tok->decoding_buffer;
526		if (buf == NULL) {
527			buf = PyObject_CallObject(tok->decoding_readline, NULL);
528			if (buf == NULL) {
529				error_ret(tok);
530				return 1;
531			} else {
532				tok->decoding_buffer = buf;
533			}
534		}
535		return PyObject_Length(buf) == 0;
536	}
537}
538
539/* Fetch a byte from TOK, using the string buffer. */
540
541static int
542buf_getc(struct tok_state *tok) {
543	return Py_CHARMASK(*tok->str++);
544}
545
546/* Unfetch a byte from TOK, using the string buffer. */
547
548static void
549buf_ungetc(int c, struct tok_state *tok) {
550	tok->str--;
551	assert(Py_CHARMASK(*tok->str) == c);	/* tok->cur may point to read-only segment */
552}
553
554/* Set the readline function for TOK to ENC. For the string-based
555   tokenizer, this just means recording the encoding. */
556
557static int
558buf_setreadl(struct tok_state *tok, const char* enc) {
559	tok->enc = enc;
560	return 1;
561}
562
563/* Return a UTF-8 encoded Python string object from the
564   C byte string STR, which is encoded with ENC. */
565
566#ifdef Py_USING_UNICODE
567static PyObject *
568translate_into_utf8(const char* str, const char* enc) {
569	PyObject *utf8;
570	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
571	if (buf == NULL)
572		return NULL;
573	utf8 = PyUnicode_AsUTF8String(buf);
574	Py_DECREF(buf);
575	return utf8;
576}
577#endif
578
579/* Decode a byte string STR for use as the buffer of TOK.
580   Look for encoding declarations inside STR, and record them
581   inside TOK.  */
582
583static const char *
584decode_str(const char *str, struct tok_state *tok)
585{
586	PyObject* utf8 = NULL;
587	const char *s;
588	int lineno = 0;
589	tok->enc = NULL;
590	tok->str = str;
591	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
592		return error_ret(tok);
593	str = tok->str;		/* string after BOM if any */
594	assert(str);
595#ifdef Py_USING_UNICODE
596	if (tok->enc != NULL) {
597		utf8 = translate_into_utf8(str, tok->enc);
598		if (utf8 == NULL)
599			return error_ret(tok);
600		str = PyString_AsString(utf8);
601	}
602#endif
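	/* Per PEP 263, an encoding declaration may only appear on the first
	   or second line, so limit the scan below to two lines. */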
603	for (s = str;; s++) {
604		if (*s == '\0') break;
605		else if (*s == '\n') {
606			lineno++;
607			if (lineno == 2) break;
608		}
609	}
610	tok->enc = NULL;
611	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
612		return error_ret(tok);
613#ifdef Py_USING_UNICODE
614	if (tok->enc != NULL) {
615		assert(utf8 == NULL);
616		utf8 = translate_into_utf8(str, tok->enc);
617		if (utf8 == NULL) {
618			PyErr_Format(PyExc_SyntaxError,
619				"unknown encoding: %s", tok->enc);
620			return error_ret(tok);
621		}
622		str = PyString_AsString(utf8);
623	}
624#endif
625	assert(tok->decoding_buffer == NULL);
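	/* When utf8 is non-NULL, the string returned below points into its
	   internal buffer; stashing utf8 in decoding_buffer keeps it alive
	   until the tokenizer is freed (see PyTokenizer_Free). */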
626	tok->decoding_buffer = utf8; /* CAUTION */
627	return str;
628}
629
630#endif /* PGEN */
631
632/* Set up tokenizer for string */
633
634struct tok_state *
635PyTokenizer_FromString(const char *str)
636{
637	struct tok_state *tok = tok_new();
638	if (tok == NULL)
639		return NULL;
640	str = (char *)decode_str(str, tok);
641	if (str == NULL) {
642		PyTokenizer_Free(tok);
643		return NULL;
644	}
645
646	/* XXX: constify members. */
647	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
648	return tok;
649}
650
651
652/* Set up tokenizer for file */
653
654struct tok_state *
655PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
656{
657	struct tok_state *tok = tok_new();
658	if (tok == NULL)
659		return NULL;
660	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
661		PyTokenizer_Free(tok);
662		return NULL;
663	}
664	tok->cur = tok->inp = tok->buf;
665	tok->end = tok->buf + BUFSIZ;
666	tok->fp = fp;
667	tok->prompt = ps1;
668	tok->nextprompt = ps2;
669	return tok;
670}
671
672
673/* Free a tok_state structure */
674
675void
676PyTokenizer_Free(struct tok_state *tok)
677{
678	if (tok->encoding != NULL)
679		PyMem_FREE(tok->encoding);
680#ifndef PGEN
681	Py_XDECREF(tok->decoding_readline);
682	Py_XDECREF(tok->decoding_buffer);
683#endif
684	if (tok->fp != NULL && tok->buf != NULL)
685		PyMem_FREE(tok->buf);
686	PyMem_FREE(tok);
687}
688
689#if !defined(PGEN) && defined(Py_USING_UNICODE)
690static int
691tok_stdin_decode(struct tok_state *tok, char **inp)
692{
693	PyObject *enc, *sysstdin, *decoded, *utf8;
694	const char *encoding;
695	char *converted;
696
697	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
698		return 0;
699	sysstdin = PySys_GetObject("stdin");
700	if (sysstdin == NULL || !PyFile_Check(sysstdin))
701		return 0;
702
703	enc = ((PyFileObject *)sysstdin)->f_encoding;
704	if (enc == NULL || !PyString_Check(enc))
705		return 0;
706	Py_INCREF(enc);
707
708	encoding = PyString_AsString(enc);
709	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
710	if (decoded == NULL)
711		goto error_clear;
712
713	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
714	Py_DECREF(decoded);
715	if (utf8 == NULL)
716		goto error_clear;
717
718	assert(PyString_Check(utf8));
719	converted = new_string(PyString_AS_STRING(utf8),
720			       PyString_GET_SIZE(utf8));
721	Py_DECREF(utf8);
722	if (converted == NULL)
723		goto error_nomem;
724
725	PyMem_FREE(*inp);
726	*inp = converted;
727	if (tok->encoding != NULL)
728		PyMem_FREE(tok->encoding);
729	tok->encoding = new_string(encoding, strlen(encoding));
730	if (tok->encoding == NULL)
731		goto error_nomem;
732
733	Py_DECREF(enc);
734	return 0;
735
736error_nomem:
737	Py_DECREF(enc);
738	tok->done = E_NOMEM;
739	return -1;
740
741error_clear:
742	/* Fall back to iso-8859-1 for backward compatibility */
743	Py_DECREF(enc);
744	PyErr_Clear();
745	return 0;
746}
747#endif
748
749/* Get next char, updating state; error code goes into tok->done */
750
751static int
752tok_nextc(register struct tok_state *tok)
753{
754	for (;;) {
755		if (tok->cur != tok->inp) {
756			return Py_CHARMASK(*tok->cur++); /* Fast path */
757		}
758		if (tok->done != E_OK)
759			return EOF;
760		if (tok->fp == NULL) {
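			/* Tokenizing from an in-memory string: advance tok->inp
			   to the end of the next line (or to end of input). */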
761			char *end = strchr(tok->inp, '\n');
762			if (end != NULL)
763				end++;
764			else {
765				end = strchr(tok->inp, '\0');
766				if (end == tok->inp) {
767					tok->done = E_EOF;
768					return EOF;
769				}
770			}
771			if (tok->start == NULL)
772				tok->buf = tok->cur;
773			tok->line_start = tok->cur;
774			tok->lineno++;
775			tok->inp = end;
776			return Py_CHARMASK(*tok->cur++);
777		}
778		if (tok->prompt != NULL) {
779			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
780			if (tok->nextprompt != NULL)
781				tok->prompt = tok->nextprompt;
782			if (newtok == NULL)
783				tok->done = E_INTR;
784			else if (*newtok == '\0') {
785				PyMem_FREE(newtok);
786				tok->done = E_EOF;
787			}
788#if !defined(PGEN) && defined(Py_USING_UNICODE)
789			else if (tok_stdin_decode(tok, &newtok) != 0)
790				PyMem_FREE(newtok);
791#endif
792			else if (tok->start != NULL) {
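				/* A token is already in progress (e.g. inside a
				   multi-line string), so append the new line to the
				   existing buffer to keep the token contiguous. */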
793				size_t start = tok->start - tok->buf;
794				size_t oldlen = tok->cur - tok->buf;
795				size_t newlen = oldlen + strlen(newtok);
796				char *buf = tok->buf;
797				buf = (char *)PyMem_REALLOC(buf, newlen+1);
798				tok->lineno++;
799				if (buf == NULL) {
800					PyMem_FREE(tok->buf);
801					tok->buf = NULL;
802					PyMem_FREE(newtok);
803					tok->done = E_NOMEM;
804					return EOF;
805				}
806				tok->buf = buf;
807				tok->cur = tok->buf + oldlen;
808				tok->line_start = tok->cur;
809				strcpy(tok->buf + oldlen, newtok);
810				PyMem_FREE(newtok);
811				tok->inp = tok->buf + newlen;
812				tok->end = tok->inp + 1;
813				tok->start = tok->buf + start;
814			}
815			else {
816				tok->lineno++;
817				if (tok->buf != NULL)
818					PyMem_FREE(tok->buf);
819				tok->buf = newtok;
820				tok->line_start = tok->buf;
821				tok->cur = tok->buf;
822				tok->line_start = tok->buf;
823				tok->inp = strchr(tok->buf, '\0');
824				tok->end = tok->inp + 1;
825			}
826		}
827		else {
828			int done = 0;
829			Py_ssize_t cur = 0;
830			char *pt;
831			if (tok->start == NULL) {
832				if (tok->buf == NULL) {
833					tok->buf = (char *)
834						PyMem_MALLOC(BUFSIZ);
835					if (tok->buf == NULL) {
836						tok->done = E_NOMEM;
837						return EOF;
838					}
839					tok->end = tok->buf + BUFSIZ;
840				}
841				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
842					  tok) == NULL) {
843					tok->done = E_EOF;
844					done = 1;
845				}
846				else {
847					tok->done = E_OK;
848					tok->inp = strchr(tok->buf, '\0');
849					done = tok->inp[-1] == '\n';
850				}
851			}
852			else {
853				cur = tok->cur - tok->buf;
854				if (decoding_feof(tok)) {
855					tok->done = E_EOF;
856					done = 1;
857				}
858				else
859					tok->done = E_OK;
860			}
861			tok->lineno++;
862			/* Read until '\n' or EOF */
863			while (!done) {
864				Py_ssize_t curstart = tok->start == NULL ? -1 :
865					          tok->start - tok->buf;
866				Py_ssize_t curvalid = tok->inp - tok->buf;
867				Py_ssize_t newsize = curvalid + BUFSIZ;
868				char *newbuf = tok->buf;
869				newbuf = (char *)PyMem_REALLOC(newbuf,
870							       newsize);
871				if (newbuf == NULL) {
872					tok->done = E_NOMEM;
873					tok->cur = tok->inp;
874					return EOF;
875				}
876				tok->buf = newbuf;
877				tok->inp = tok->buf + curvalid;
878				tok->end = tok->buf + newsize;
879				tok->start = curstart < 0 ? NULL :
880					     tok->buf + curstart;
881				if (decoding_fgets(tok->inp,
882					       (int)(tok->end - tok->inp),
883					       tok) == NULL) {
884					/* Break out early on decoding
885					   errors, as tok->buf will be NULL
886					 */
887					if (tok->decoding_erred)
888						return EOF;
889					/* Last line does not end in \n,
890					   fake one */
891					strcpy(tok->inp, "\n");
892				}
893				tok->inp = strchr(tok->inp, '\0');
894				done = tok->inp[-1] == '\n';
895			}
896			if (tok->buf != NULL) {
897				tok->cur = tok->buf + cur;
898				tok->line_start = tok->cur;
899				/* replace "\r\n" with "\n" */
900				/* For Mac leave the \r, giving a syntax error */
901				pt = tok->inp - 2;
902				if (pt >= tok->buf && *pt == '\r') {
903					*pt++ = '\n';
904					*pt = '\0';
905					tok->inp = pt;
906				}
907			}
908		}
909		if (tok->done != E_OK) {
910			if (tok->prompt != NULL)
911				PySys_WriteStderr("\n");
912			tok->cur = tok->inp;
913			return EOF;
914		}
915	}
916	/*NOTREACHED*/
917}
918
919
920/* Back-up one character */
921
922static void
923tok_backup(register struct tok_state *tok, register int c)
924{
925	if (c != EOF) {
926		if (--tok->cur < tok->buf)
927			Py_FatalError("tok_backup: beginning of buffer");
928		if (*tok->cur != c)
929			*tok->cur = c;
930	}
931}
932
933
934/* Return the token corresponding to a single character */
935
936int
937PyToken_OneChar(int c)
938{
939	switch (c) {
940	case '(':	return LPAR;
941	case ')':	return RPAR;
942	case '[':	return LSQB;
943	case ']':	return RSQB;
944	case ':':	return COLON;
945	case ',':	return COMMA;
946	case ';':	return SEMI;
947	case '+':	return PLUS;
948	case '-':	return MINUS;
949	case '*':	return STAR;
950	case '/':	return SLASH;
951	case '|':	return VBAR;
952	case '&':	return AMPER;
953	case '<':	return LESS;
954	case '>':	return GREATER;
955	case '=':	return EQUAL;
956	case '.':	return DOT;
957	case '%':	return PERCENT;
958	case '`':	return BACKQUOTE;
959	case '{':	return LBRACE;
960	case '}':	return RBRACE;
961	case '^':	return CIRCUMFLEX;
962	case '~':	return TILDE;
963	case '@':       return AT;
964	default:	return OP;
965	}
966}
967
968
969int
970PyToken_TwoChars(int c1, int c2)
971{
972	switch (c1) {
973	case '=':
974		switch (c2) {
975		case '=':	return EQEQUAL;
976		}
977		break;
978	case '!':
979		switch (c2) {
980		case '=':	return NOTEQUAL;
981		}
982		break;
983	case '<':
984		switch (c2) {
985		case '>':	return NOTEQUAL;
986		case '=':	return LESSEQUAL;
987		case '<':	return LEFTSHIFT;
988		}
989		break;
990	case '>':
991		switch (c2) {
992		case '=':	return GREATEREQUAL;
993		case '>':	return RIGHTSHIFT;
994		}
995		break;
996	case '+':
997		switch (c2) {
998		case '=':	return PLUSEQUAL;
999		}
1000		break;
1001	case '-':
1002		switch (c2) {
1003		case '=':	return MINEQUAL;
1004		}
1005		break;
1006	case '*':
1007		switch (c2) {
1008		case '*':	return DOUBLESTAR;
1009		case '=':	return STAREQUAL;
1010		}
1011		break;
1012	case '/':
1013		switch (c2) {
1014		case '/':	return DOUBLESLASH;
1015		case '=':	return SLASHEQUAL;
1016		}
1017		break;
1018	case '|':
1019		switch (c2) {
1020		case '=':	return VBAREQUAL;
1021		}
1022		break;
1023	case '%':
1024		switch (c2) {
1025		case '=':	return PERCENTEQUAL;
1026		}
1027		break;
1028	case '&':
1029		switch (c2) {
1030		case '=':	return AMPEREQUAL;
1031		}
1032		break;
1033	case '^':
1034		switch (c2) {
1035		case '=':	return CIRCUMFLEXEQUAL;
1036		}
1037		break;
1038	}
1039	return OP;
1040}
1041
1042int
1043PyToken_ThreeChars(int c1, int c2, int c3)
1044{
1045	switch (c1) {
1046	case '<':
1047		switch (c2) {
1048		case '<':
1049			switch (c3) {
1050			case '=':
1051				return LEFTSHIFTEQUAL;
1052			}
1053			break;
1054		}
1055		break;
1056	case '>':
1057		switch (c2) {
1058		case '>':
1059			switch (c3) {
1060			case '=':
1061				return RIGHTSHIFTEQUAL;
1062			}
1063			break;
1064		}
1065		break;
1066	case '*':
1067		switch (c2) {
1068		case '*':
1069			switch (c3) {
1070			case '=':
1071				return DOUBLESTAREQUAL;
1072			}
1073			break;
1074		}
1075		break;
1076	case '/':
1077		switch (c2) {
1078		case '/':
1079			switch (c3) {
1080			case '=':
1081				return DOUBLESLASHEQUAL;
1082			}
1083			break;
1084		}
1085		break;
1086	}
1087	return OP;
1088}
1089
1090static int
1091indenterror(struct tok_state *tok)
1092{
1093	if (tok->alterror) {
1094		tok->done = E_TABSPACE;
1095		tok->cur = tok->inp;
1096		return 1;
1097	}
1098	if (tok->altwarning) {
1099		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1100                                  "in indentation\n", tok->filename);
1101		tok->altwarning = 0;
1102	}
1103	return 0;
1104}
1105
1106
1107/* Get next token, after space stripping etc. */
1108
1109static int
1110tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1111{
1112	register int c;
1113	int blankline;
1114
1115	*p_start = *p_end = NULL;
1116  nextline:
1117	tok->start = NULL;
1118	blankline = 0;
1119
1120	/* Get indentation level */
1121	if (tok->atbol) {
1122		register int col = 0;
1123		register int altcol = 0;
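		/* col is computed with the configured tab size (default 8);
		   altcol treats a tab as a single column.  A mismatch between
		   the two indent stacks means the indentation depends on the
		   tab size, and indenterror() is reported. */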
1124		tok->atbol = 0;
1125		for (;;) {
1126			c = tok_nextc(tok);
1127			if (c == ' ')
1128				col++, altcol++;
1129			else if (c == '\t') {
1130				col = (col/tok->tabsize + 1) * tok->tabsize;
1131				altcol = (altcol/tok->alttabsize + 1)
1132					* tok->alttabsize;
1133			}
1134			else if (c == '\014') /* Control-L (formfeed) */
1135				col = altcol = 0; /* For Emacs users */
1136			else
1137				break;
1138		}
1139		tok_backup(tok, c);
1140		if (c == '#' || c == '\n') {
1141			/* Lines with only whitespace and/or comments
1142			   shouldn't affect the indentation and are
1143			   not passed to the parser as NEWLINE tokens,
1144			   except *totally* empty lines in interactive
1145			   mode, which signal the end of a command group. */
1146			if (col == 0 && c == '\n' && tok->prompt != NULL)
1147				blankline = 0; /* Let it through */
1148			else
1149				blankline = 1; /* Ignore completely */
1150			/* We can't jump back right here since we still
1151			   may need to skip to the end of a comment */
1152		}
1153		if (!blankline && tok->level == 0) {
1154			if (col == tok->indstack[tok->indent]) {
1155				/* No change */
1156				if (altcol != tok->altindstack[tok->indent]) {
1157					if (indenterror(tok))
1158						return ERRORTOKEN;
1159				}
1160			}
1161			else if (col > tok->indstack[tok->indent]) {
1162				/* Indent -- always one */
1163				if (tok->indent+1 >= MAXINDENT) {
1164					tok->done = E_TOODEEP;
1165					tok->cur = tok->inp;
1166					return ERRORTOKEN;
1167				}
1168				if (altcol <= tok->altindstack[tok->indent]) {
1169					if (indenterror(tok))
1170						return ERRORTOKEN;
1171				}
1172				tok->pendin++;
1173				tok->indstack[++tok->indent] = col;
1174				tok->altindstack[tok->indent] = altcol;
1175			}
1176			else /* col < tok->indstack[tok->indent] */ {
1177				/* Dedent -- any number, must be consistent */
1178				while (tok->indent > 0 &&
1179					col < tok->indstack[tok->indent]) {
1180					tok->pendin--;
1181					tok->indent--;
1182				}
1183				if (col != tok->indstack[tok->indent]) {
1184					tok->done = E_DEDENT;
1185					tok->cur = tok->inp;
1186					return ERRORTOKEN;
1187				}
1188				if (altcol != tok->altindstack[tok->indent]) {
1189					if (indenterror(tok))
1190						return ERRORTOKEN;
1191				}
1192			}
1193		}
1194	}
1195
1196	tok->start = tok->cur;
1197
1198	/* Return pending indents/dedents */
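	/* tok->pendin > 0 means INDENT tokens are still owed to the parser;
	   tok->pendin < 0 means DEDENT tokens are owed. */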
1199	if (tok->pendin != 0) {
1200		if (tok->pendin < 0) {
1201			tok->pendin++;
1202			return DEDENT;
1203		}
1204		else {
1205			tok->pendin--;
1206			return INDENT;
1207		}
1208	}
1209
1210 again:
1211	tok->start = NULL;
1212	/* Skip spaces */
1213	do {
1214		c = tok_nextc(tok);
1215	} while (c == ' ' || c == '\t' || c == '\014');
1216
1217	/* Set start of current token */
1218	tok->start = tok->cur - 1;
1219
1220	/* Skip comment, while looking for tab-setting magic */
1221	if (c == '#') {
1222		static char *tabforms[] = {
1223			"tab-width:",		/* Emacs */
1224			":tabstop=",		/* vim, full form */
1225			":ts=",			/* vim, abbreviated form */
1226			"set tabsize=",		/* will vi never die? */
1227		/* more templates can be added here to support other editors */
1228		};
1229		char cbuf[80];
1230		char *tp, **cp;
1231		tp = cbuf;
1232		do {
1233			*tp++ = c = tok_nextc(tok);
1234		} while (c != EOF && c != '\n' &&
1235			 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1236		*tp = '\0';
1237		for (cp = tabforms;
1238		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1239		     cp++) {
1240			if ((tp = strstr(cbuf, *cp))) {
1241				int newsize = atoi(tp + strlen(*cp));
1242
1243				if (newsize >= 1 && newsize <= 40) {
1244					tok->tabsize = newsize;
1245					if (Py_VerboseFlag)
1246					    PySys_WriteStderr(
1247						"Tab size set to %d\n",
1248						newsize);
1249				}
1250			}
1251		}
1252		while (c != EOF && c != '\n')
1253			c = tok_nextc(tok);
1254	}
1255
1256	/* Check for EOF and errors now */
1257	if (c == EOF) {
1258		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1259	}
1260
1261	/* Identifier (most frequent token!) */
1262	if (isalpha(c) || c == '_') {
1263		/* Process r"", u"" and ur"" */
1264		switch (c) {
1265		case 'r':
1266		case 'R':
1267			c = tok_nextc(tok);
1268			if (c == '"' || c == '\'')
1269				goto letter_quote;
1270			break;
1271		case 'u':
1272		case 'U':
1273			c = tok_nextc(tok);
1274			if (c == 'r' || c == 'R')
1275				c = tok_nextc(tok);
1276			if (c == '"' || c == '\'')
1277				goto letter_quote;
1278			break;
1279		}
1280		while (isalnum(c) || c == '_') {
1281			c = tok_nextc(tok);
1282		}
1283		tok_backup(tok, c);
1284		*p_start = tok->start;
1285		*p_end = tok->cur;
1286		return NAME;
1287	}
1288
1289	/* Newline */
1290	if (c == '\n') {
1291		tok->atbol = 1;
1292		if (blankline || tok->level > 0)
1293			goto nextline;
1294		*p_start = tok->start;
1295		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
1296		tok->cont_line = 0;
1297		return NEWLINE;
1298	}
1299
1300	/* Period or number starting with period? */
1301	if (c == '.') {
1302		c = tok_nextc(tok);
1303		if (isdigit(c)) {
1304			goto fraction;
1305		}
1306		else {
1307			tok_backup(tok, c);
1308			*p_start = tok->start;
1309			*p_end = tok->cur;
1310			return DOT;
1311		}
1312	}
1313
1314	/* Number */
1315	if (isdigit(c)) {
1316		if (c == '0') {
1317			/* Hex or octal -- maybe. */
1318			c = tok_nextc(tok);
1319			if (c == '.')
1320				goto fraction;
1321#ifndef WITHOUT_COMPLEX
1322			if (c == 'j' || c == 'J')
1323				goto imaginary;
1324#endif
1325			if (c == 'x' || c == 'X') {
1326				/* Hex */
1327				do {
1328					c = tok_nextc(tok);
1329				} while (isxdigit(c));
1330			}
1331			else {
1332				int found_decimal = 0;
1333				/* Octal; c is first char of it */
1334				/* There's no 'isoctdigit' macro, sigh */
1335				while ('0' <= c && c < '8') {
1336					c = tok_nextc(tok);
1337				}
1338				if (isdigit(c)) {
1339					found_decimal = 1;
1340					do {
1341						c = tok_nextc(tok);
1342					} while (isdigit(c));
1343				}
1344				if (c == '.')
1345					goto fraction;
1346				else if (c == 'e' || c == 'E')
1347					goto exponent;
1348#ifndef WITHOUT_COMPLEX
1349				else if (c == 'j' || c == 'J')
1350					goto imaginary;
1351#endif
1352				else if (found_decimal) {
1353					tok->done = E_TOKEN;
1354					tok_backup(tok, c);
1355					return ERRORTOKEN;
1356				}
1357			}
1358			if (c == 'l' || c == 'L')
1359				c = tok_nextc(tok);
1360		}
1361		else {
1362			/* Decimal */
1363			do {
1364				c = tok_nextc(tok);
1365			} while (isdigit(c));
1366			if (c == 'l' || c == 'L')
1367				c = tok_nextc(tok);
1368			else {
1369				/* Accept floating point numbers. */
1370				if (c == '.') {
1371		fraction:
1372					/* Fraction */
1373					do {
1374						c = tok_nextc(tok);
1375					} while (isdigit(c));
1376				}
1377				if (c == 'e' || c == 'E') {
1378		exponent:
1379					/* Exponent part */
1380					c = tok_nextc(tok);
1381					if (c == '+' || c == '-')
1382						c = tok_nextc(tok);
1383					if (!isdigit(c)) {
1384						tok->done = E_TOKEN;
1385						tok_backup(tok, c);
1386						return ERRORTOKEN;
1387					}
1388					do {
1389						c = tok_nextc(tok);
1390					} while (isdigit(c));
1391				}
1392#ifndef WITHOUT_COMPLEX
1393				if (c == 'j' || c == 'J')
1394					/* Imaginary part */
1395		imaginary:
1396					c = tok_nextc(tok);
1397#endif
1398			}
1399		}
1400		tok_backup(tok, c);
1401		*p_start = tok->start;
1402		*p_end = tok->cur;
1403		return NUMBER;
1404	}
1405
1406  letter_quote:
1407	/* String */
1408	if (c == '\'' || c == '"') {
1409		Py_ssize_t quote2 = tok->cur - tok->start + 1;
1410		int quote = c;
1411		int triple = 0;
1412		int tripcount = 0;
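		/* quote2 is the token length at which a second quote character
		   immediately follows the opening quote: reaching it means the
		   literal is either empty or the start of a triple-quoted string. */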
1413		for (;;) {
1414			c = tok_nextc(tok);
1415			if (c == '\n') {
1416				if (!triple) {
1417					tok->done = E_EOLS;
1418					tok_backup(tok, c);
1419					return ERRORTOKEN;
1420				}
1421				tripcount = 0;
1422				tok->cont_line = 1; /* multiline string. */
1423			}
1424			else if (c == EOF) {
1425				if (triple)
1426					tok->done = E_EOFS;
1427				else
1428					tok->done = E_EOLS;
1429				tok->cur = tok->inp;
1430				return ERRORTOKEN;
1431			}
1432			else if (c == quote) {
1433				tripcount++;
1434				if (tok->cur - tok->start == quote2) {
1435					c = tok_nextc(tok);
1436					if (c == quote) {
1437						triple = 1;
1438						tripcount = 0;
1439						continue;
1440					}
1441					tok_backup(tok, c);
1442				}
1443				if (!triple || tripcount == 3)
1444					break;
1445			}
1446			else if (c == '\\') {
1447				tripcount = 0;
1448				c = tok_nextc(tok);
1449				if (c == EOF) {
1450					tok->done = E_EOLS;
1451					tok->cur = tok->inp;
1452					return ERRORTOKEN;
1453				}
1454			}
1455			else
1456				tripcount = 0;
1457		}
1458		*p_start = tok->start;
1459		*p_end = tok->cur;
1460		return STRING;
1461	}
1462
1463	/* Line continuation */
1464	if (c == '\\') {
1465		c = tok_nextc(tok);
1466		if (c != '\n') {
1467			tok->done = E_LINECONT;
1468			tok->cur = tok->inp;
1469			return ERRORTOKEN;
1470		}
1471		tok->cont_line = 1;
1472		goto again; /* Read next line */
1473	}
1474
1475	/* Check for two-character token */
1476	{
1477		int c2 = tok_nextc(tok);
1478		int token = PyToken_TwoChars(c, c2);
1479		if (token != OP) {
1480			int c3 = tok_nextc(tok);
1481			int token3 = PyToken_ThreeChars(c, c2, c3);
1482			if (token3 != OP) {
1483				token = token3;
1484			} else {
1485				tok_backup(tok, c3);
1486			}
1487			*p_start = tok->start;
1488			*p_end = tok->cur;
1489			return token;
1490		}
1491		tok_backup(tok, c2);
1492	}
1493
1494	/* Keep track of parentheses nesting level */
1495	switch (c) {
1496	case '(':
1497	case '[':
1498	case '{':
1499		tok->level++;
1500		break;
1501	case ')':
1502	case ']':
1503	case '}':
1504		tok->level--;
1505		break;
1506	}
1507
1508	/* Punctuation character */
1509	*p_start = tok->start;
1510	*p_end = tok->cur;
1511	return PyToken_OneChar(c);
1512}
1513
1514int
1515PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1516{
1517	int result = tok_get(tok, p_start, p_end);
1518	if (tok->decoding_erred) {
1519		result = ERRORTOKEN;
1520		tok->done = E_DECODE;
1521	}
1522	return result;
1523}
1524
1525/* This function is only called from parsetok. However, it cannot live
1526   there, as it must be empty for PGEN, and we can check for PGEN only
1527   in this file. */
1528
1529#ifdef PGEN
1530char*
1531PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1532{
1533	return NULL;
1534}
1535#else
1536static PyObject *
1537dec_utf8(const char *enc, const char *text, size_t len) {
1538	PyObject *ret = NULL;
1539	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1540	if (unicode_text) {
1541		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1542		Py_DECREF(unicode_text);
1543	}
1544	if (!ret) {
1545		PyErr_Clear();
1546	}
1547	return ret;
1548}
1549
1550char *
1551PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1552{
1553	char *text = NULL;
1554	if (tok->encoding) {
1555		/* convert source to original encoding */
1556		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1557		if (lineobj != NULL) {
1558			int linelen = PyString_Size(lineobj);
1559			const char *line = PyString_AsString(lineobj);
1560			text = PyObject_MALLOC(linelen + 1);
1561			if (text != NULL && line != NULL) {
1562				if (linelen)
1563					strncpy(text, line, linelen);
1564				text[linelen] = '\0';
1565			}
1566			Py_DECREF(lineobj);
1567
1568			/* adjust error offset */
1569			if (*offset > 1) {
1570				PyObject *offsetobj = dec_utf8(tok->encoding,
1571							       tok->buf, *offset-1);
1572				if (offsetobj) {
1573					*offset = PyString_Size(offsetobj) + 1;
1574					Py_DECREF(offsetobj);
1575				}
1576			}
1577
1578		}
1579	}
1580	return text;
1581
1582}
1583#endif
1584
1585
1586
1587#ifdef Py_DEBUG
1588
1589void
1590tok_dump(int type, char *start, char *end)
1591{
1592	printf("%s", _PyParser_TokenNames[type]);
1593	if (type == NAME || type == NUMBER || type == STRING || type == OP)
1594		printf("(%.*s)", (int)(end - start), start);
1595}
1596
1597#endif
1598