tokenizer.c revision 118ec70ea27000db428ba3e3a757f4b423670db6
1
2/* Tokenizer implementation */
3
4#include "Python.h"
5#include "pgenheaders.h"
6
7#include <ctype.h>
8#include <assert.h>
9
10#include "tokenizer.h"
11#include "errcode.h"
12
13#ifndef PGEN
14#include "unicodeobject.h"
15#include "stringobject.h"
16#include "fileobject.h"
17#include "codecs.h"
18#include "abstract.h"
19#endif /* PGEN */
20
21extern char *PyOS_Readline(char *);
22/* Return malloc'ed string including trailing \n;
23   empty malloc'ed string for EOF;
24   NULL if interrupted */
25
26/* Don't ever change this -- it would break the portability of Python code */
27#define TABSIZE 8
28
29/* Convert a possibly signed character to a nonnegative int */
30/* XXX This assumes characters are 8 bits wide */
31#ifdef __CHAR_UNSIGNED__
32#define Py_CHARMASK(c)		(c)
33#else
34#define Py_CHARMASK(c)		((c) & 0xff)
35#endif
36
37/* Forward */
38static struct tok_state *tok_new(void);
39static int tok_nextc(struct tok_state *tok);
40static void tok_backup(struct tok_state *tok, int c);
41
42/* Token names */
43
/* Printable token names, indexed by token number (see token.h). */
char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
100
101
102/* Create and initialize a new tok_state structure */
103
static struct tok_state *
tok_new(void)
{
	struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
	if (tok == NULL)
		return NULL;
	/* No input buffer yet: all cursors start out NULL. */
	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
	tok->done = E_OK;
	tok->fp = NULL;
	tok->tabsize = TABSIZE;
	tok->indent = 0;		/* indentation stack is empty */
	tok->indstack[0] = 0;
	tok->atbol = 1;			/* at beginning of line */
	tok->pendin = 0;		/* no pending INDENT/DEDENT tokens */
	tok->prompt = tok->nextprompt = NULL;
	tok->lineno = 0;
	tok->level = 0;			/* parenthesis nesting level */
	tok->filename = NULL;
	tok->altwarning = 0;
	tok->alterror = 0;
	/* Alternate tab size used to detect ambiguous tab/space mixes. */
	tok->alttabsize = 1;
	tok->altindstack[0] = 0;
	/* decoding_state: 0 = undecided, >0 = raw read, <0 = via codec
	   (see decoding_fgets). */
	tok->decoding_state = 0;
	tok->decoding_erred = 0;
	tok->read_coding_spec = 0;
	tok->issued_encoding_warning = 0;
	tok->encoding = NULL;
#ifndef PGEN
	tok->decoding_readline = NULL;
	tok->decoding_buffer = NULL;
#endif
	return tok;
}
137
138#ifdef PGEN
139
/* PGEN build: no codecs available, so read raw bytes with fgets(). */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	return fgets(s, size, tok->fp);
}
145
/* PGEN build: plain end-of-file test on the underlying stream. */
static int
decoding_feof(struct tok_state *tok)
{
	return feof(tok->fp);
}
151
/* PGEN build: no decoding is performed; the input is used as-is. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	return str;
}
157
158#else /* PGEN */
159
/* Record that a decoding error occurred and release the line buffer;
   callers propagate the NULL return as if EOF had been reached. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
	tok->decoding_erred = 1;
	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
		PyMem_DEL(tok->buf);
	tok->buf = NULL;
	return NULL;		/* as if it were EOF */
}
169
170static char *
171new_string(const char *s, int len)
172{
173	char* result = PyMem_NEW(char, len + 1);
174	if (result != NULL) {
175		memcpy(result, s, len);
176		result[len] = '\0';
177	}
178	return result;
179}
180
/* Return the canonical encoding name for S: the utf-8 and latin-1
   families are normalized to "utf-8" / "iso-8859-1"; anything else is
   returned unchanged.  Only the first 12 characters are examined,
   lowercased and with '_' mapped to '-'. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0') break;
		else if (c == '_') buf[i] = '-';
		/* Cast to unsigned char: passing a negative plain char
		   to tolower() is undefined behavior (CERT STR37-C). */
		else buf[i] = tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
	else return s;
}
203
204/* Return the coding spec in S, or NULL if none is found.  */
205
/* Return the coding spec found in the first SIZE bytes of S as a
   malloc'ed string, or NULL if none is found (or memory runs out). */
static char *
get_coding_spec(const char *s, int size)
{
	int i;
	for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
		const char* t = s + i;
		if (strncmp(t, "coding", 6) == 0) {
			const char* begin = NULL;
			t += 6;
			if (t[0] != ':' && t[0] != '=')
				continue;
			/* Skip blanks and tabs after "coding:"/"coding=". */
			do {
				t++;
			} while (t[0] == '\x20' || t[0] == '\t');

			begin = t;
			while (isalnum(t[0]) || t[0] == '-' || t[0] == '_' ||
			       t[0] == '.')
				t++;

			if (begin < t) {
				char* r = new_string(begin, t - begin);
				char* q;
				/* Bug fix: guard against allocation failure;
				   get_normal_name() would otherwise
				   dereference a NULL pointer. */
				if (r == NULL)
					return NULL;
				q = get_normal_name(r);
				if (r != q) {
					PyMem_DEL(r);
					r = new_string(q, strlen(q));
				}
				return r;
			}
		}
	}
	return NULL;
}
239
240/* Check whether the line contains a coding spec. If it does,
241   invoke the set_readline function for the new encoding.
242   This function receives the tok_state and the new encoding.
243   Return 1 on success, 0 on failure.  */
244
static int
check_coding_spec(const char* line, int size, struct tok_state *tok,
		  int set_readline(struct tok_state *, const char *))
{
	int r = 1;
	char* cs = get_coding_spec(line, size);
	if (cs != NULL) {
		tok->read_coding_spec = 1;
		if (tok->encoding == NULL) {
			assert(tok->decoding_state == 1); /* raw */
			/* utf-8 and iso-8859-1 pass through the raw reader
			   unchanged, so no codec needs to be installed. */
			if (strcmp(cs, "utf-8") == 0 ||
			    strcmp(cs, "iso-8859-1") == 0) {
				tok->encoding = cs;
			} else {
#ifdef Py_USING_UNICODE
				/* NOTE(review): if set_readline fails,
				   cs is neither stored nor freed — a
				   small leak on the error path. */
				r = set_readline(tok, cs);
				if (r) {
					tok->encoding = cs;
					tok->decoding_state = -1;
				}
#else
                                /* Without Unicode support, we cannot
                                   process the coding spec. Since there
                                   won't be any Unicode literals, that
                                   won't matter. */
#endif
			}
		} else {	/* then, compare cs with BOM */
			/* A BOM already fixed the encoding; the declared
			   spec must agree with it. */
			r = (strcmp(tok->encoding, cs) == 0);
			PyMem_DEL(cs);
		}
	}
	return r;
}
279
280/* See whether the file starts with a BOM. If it does,
281   invoke the set_readline function with the new encoding.
282   Return 1 on success, 0 on failure.  */
283
static int
check_bom(int get_char(struct tok_state *),
	  void unget_char(int, struct tok_state *),
	  int set_readline(struct tok_state *, const char *),
	  struct tok_state *tok)
{
	int ch = get_char(tok);
	tok->decoding_state = 1;	/* assume raw until a codec is set */
	if (ch == EOF) {
		return 1;
	} else if (ch == 0xEF) {
		/* Possible UTF-8 BOM: EF BB BF. */
		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
	/* Disable support for UTF-16 BOMs until a decision
	   is made whether this needs to be supported.  */
	} else if (ch == 0xFE) {
		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
		if (!set_readline(tok, "utf-16-be")) return 0;
		tok->decoding_state = -1;
	} else if (ch == 0xFF) {
		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
		if (!set_readline(tok, "utf-16-le")) return 0;
		tok->decoding_state = -1;
#endif
	} else {
		/* Not a BOM: push the byte back and tokenize normally. */
		unget_char(ch, tok);
		return 1;
	}
	/* NOTE(review): new_string result is not checked for NULL here. */
	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
	return 1;
  NON_BOM:
	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
	/* NOTE(review): ungetting 0xFF over a different byte trips the
	   assert in buf_ungetc() in a debug build — verify. */
	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
	return 1;
}
320
321/* Read a line of text from TOK into S, using the stream in TOK.
322   Return NULL on failure, else S.  */
323
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL;
#else
	PyObject* utf8;
	PyObject* buf = tok->decoding_buffer;
	/* decoding_buffer may hold a line pre-read by decoding_feof(). */
	if (buf == NULL) {
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		tok->decoding_buffer = NULL;
	}
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	if (utf8 == NULL)
		return error_ret(tok);
	else {
		const char* str = PyString_AsString(utf8);
		/* NOTE(review): assumes the decoded line fits in the
		   caller's buffer; an overlong line is only caught by
		   this assert in debug builds. */
		assert(strlen(str) < (size_t)size); /* XXX */
		strcpy(s, str);
		Py_DECREF(utf8);
		if (s[0] == '\0') return NULL; /* EOF */
		return s;
	}
#endif
}
355
356/* Set the readline function for TOK to a StreamReader's
357   readline function. The StreamReader is named ENC.
358
359   This function is called from check_bom and check_coding_spec.
360
361   ENC is usually identical to the future value of tok->encoding,
362   except for the (currently unsupported) case of UTF-16.
363
364   Return 1 on success, 0 on failure. */
365
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
	PyObject *reader, *stream, *readline;

	/* Wrap the already-open FILE* in a Python file object so a
	   codec StreamReader can read from it. */
	stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
	if (stream == NULL)
		return 0;

	reader = PyCodec_StreamReader(enc, stream, NULL);
	Py_DECREF(stream);
	if (reader == NULL)
		return 0;

	readline = PyObject_GetAttrString(reader, "readline");
	Py_DECREF(reader);
	if (readline == NULL)
		return 0;

	/* Owned reference; released in PyTokenizer_Free. */
	tok->decoding_readline = readline;
	return 1;
}
388
/* Fetch the next byte from TOK (as an int, so EOF is distinguishable). */

static int fp_getc(struct tok_state *tok) {
	return getc(tok->fp);
}
394
/* Unfetch the last byte back into TOK's stdio stream. */

static void fp_ungetc(int c, struct tok_state *tok) {
	ungetc(c, tok->fp);
}
400
401/* Read a line of input from TOK. Determine encoding
402   if necessary.  */
403
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	char *line = NULL;
	int warn = 0, badchar = 0;
	for (;;) {
		if (tok->decoding_state < 0) {
			/* We already have a codec associated with
			   this input. */
			line = fp_readl(s, size, tok);
			break;
		} else if (tok->decoding_state > 0) {
			/* We want a 'raw' read. */
			line = Py_UniversalNewlineFgets(s, size,
							tok->fp, NULL);
			warn = 1;	/* may need a non-ASCII warning */
			break;
		} else {
			/* We have not yet determined the encoding.
			   If an encoding is found, use the file-pointer
			   reader functions from now on. */
			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				return error_ret(tok);
			assert(tok->decoding_state != 0);
		}
	}
	/* A coding spec may only appear on the first two lines. */
	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
			return error_ret(tok);
		}
	}
#ifndef PGEN
	/* (Always true here — this whole section is in the !PGEN branch.) */
	if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
		unsigned char *c;
		/* Raw read with no declared encoding: look for the first
		   non-ASCII byte. */
		for (c = (unsigned char *)line; *c; c++)
			if (*c > 127) {
				badchar = *c;
				break;
			}
	}
	if (badchar) {
		char buf[200];
		sprintf(buf, "Non-ASCII character '\\x%.2x', "
			"but no declared encoding", badchar);
		/* Need to add 1 to the line number, since this line
		   has not been counted, yet.  */
		PyErr_WarnExplicit(PyExc_DeprecationWarning,
				   buf, tok->filename, tok->lineno + 1,
				   NULL, NULL);
		tok->issued_encoding_warning = 1;	/* warn only once */
	}
#endif
	return line;
}
458
static int
decoding_feof(struct tok_state *tok)
{
	if (tok->decoding_state >= 0) {
		/* Raw (or still undecided) mode: ask stdio. */
		return feof(tok->fp);
	} else {
		/* Codec mode: the only way to detect EOF is to try to
		   read a line; stash the result for fp_readl() to use. */
		PyObject* buf = tok->decoding_buffer;
		if (buf == NULL) {
			buf = PyObject_CallObject(tok->decoding_readline, NULL);
			if (buf == NULL) {
				error_ret(tok);
				return 1;
			} else {
				tok->decoding_buffer = buf;
			}
		}
		return PyObject_Length(buf) == 0;
	}
}
478
479/* Fetch a byte from TOK, using the string buffer. */
480
481static int buf_getc(struct tok_state *tok) {
482	return *tok->str++;
483}
484
/* Unfetch a byte from TOK, using the string buffer. */

static void buf_ungetc(int c, struct tok_state *tok) {
	tok->str--;
	/* NOTE(review): check_bom's NON_BOM path ungets 0xFF over a
	   different byte, which would fire this assert in a debug
	   build — verify. */
	assert(*tok->str == c);	/* tok->cur may point to read-only segment */
}
491
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding; decode_str()
   performs the actual recoding later. */

static int buf_setreadl(struct tok_state *tok, const char* enc) {
	tok->enc = enc;
	return 1;
}
499
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *utf8;
	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
	if (buf == NULL)
		return NULL;
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	return utf8;	/* new reference; NULL if the conversion failed */
}
#endif
515
516/* Decode a byte string STR for use as the buffer of TOK.
517   Look for encoding declarations inside STR, and record them
518   inside TOK.  */
519
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	PyObject* utf8 = NULL;
	const char *s;
	int lineno = 0;
	tok->enc = NULL;
	tok->str = str;
	/* Strip/record a UTF-8 BOM, if any (reads through tok->str). */
	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
		return NULL;
	str = tok->str;		/* string after BOM if any */
	assert(str);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		/* BOM found: recode the whole buffer to UTF-8. */
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL)
			return NULL;
		str = PyString_AsString(utf8);
	}
#endif
	/* A coding spec may only appear on the first two lines; find
	   where line two ends (or where the string ends). */
	for (s = str;; s++) {
		if (*s == '\0') break;
		else if (*s == '\n') {
			lineno++;
			if (lineno == 2) break;
		}
	}
	tok->enc = NULL;
	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
		return NULL;
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		/* A coding spec (and no BOM): recode to UTF-8 now. */
		assert(utf8 == NULL);
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL)
			return NULL;
		str = PyString_AsString(utf8);
	}
#endif
	assert(tok->decoding_buffer == NULL);
	/* Keep the utf8 object alive: the returned str points into its
	   internal buffer.  Released in PyTokenizer_Free. */
	tok->decoding_buffer = utf8; /* CAUTION */
	return str;
}
563
564#endif /* PGEN */
565
566/* Set up tokenizer for string */
567
568struct tok_state *
569PyTokenizer_FromString(char *str)
570{
571	struct tok_state *tok = tok_new();
572	if (tok == NULL)
573		return NULL;
574	str = (char *)decode_str(str, tok);
575	if (str == NULL)
576		return NULL;
577	tok->buf = tok->cur = tok->end = tok->inp = str;
578	return tok;
579}
580
581
582/* Set up tokenizer for file */
583
struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	/* Allocate an initial line buffer; tok_nextc grows it on demand. */
	if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
		PyMem_DEL(tok);
		return NULL;
	}
	tok->cur = tok->inp = tok->buf;
	tok->end = tok->buf + BUFSIZ;
	tok->fp = fp;
	tok->prompt = ps1;	/* primary prompt (interactive mode) or NULL */
	tok->nextprompt = ps2;	/* secondary prompt for continuation lines */
	return tok;
}
601
602
603/* Free a tok_state structure */
604
void
PyTokenizer_Free(struct tok_state *tok)
{
	if (tok->encoding != NULL)
		PyMem_DEL(tok->encoding);
#ifndef PGEN
	Py_XDECREF(tok->decoding_readline);
	Py_XDECREF(tok->decoding_buffer);
#endif
	/* The buffer is owned by the tokenizer only for file input;
	   for string input it belongs to the caller (see
	   PyTokenizer_FromString). */
	if (tok->fp != NULL && tok->buf != NULL)
		PyMem_DEL(tok->buf);
	PyMem_DEL(tok);
}
618
619
620/* Get next char, updating state; error code goes into tok->done */
621
/* Return the next character from the input, refilling the buffer as
   needed.  On failure, set tok->done to the error code and return EOF. */
static int
tok_nextc(register struct tok_state *tok)
{
	for (;;) {
		/* Fast path: unread characters remain in the buffer. */
		if (tok->cur != tok->inp) {
			return Py_CHARMASK(*tok->cur++); /* Fast path */
		}
		if (tok->done != E_OK)
			return EOF;
		if (tok->fp == NULL) {
			/* String input: advance inp past the next '\n'
			   (or to the terminating NUL). */
			char *end = strchr(tok->inp, '\n');
			if (end != NULL)
				end++;
			else {
				end = strchr(tok->inp, '\0');
				if (end == tok->inp) {
					tok->done = E_EOF;
					return EOF;
				}
			}
			if (tok->start == NULL)
				tok->buf = tok->cur;
			tok->lineno++;
			tok->inp = end;
			return Py_CHARMASK(*tok->cur++);
		}
		if (tok->prompt != NULL) {
			/* Interactive input: read one line from the user. */
			char *new = PyOS_Readline(tok->prompt);
			if (tok->nextprompt != NULL)
				tok->prompt = tok->nextprompt;
			if (new == NULL)
				tok->done = E_INTR;
			else if (*new == '\0') {
				PyMem_FREE(new);
				tok->done = E_EOF;
			}
			else if (tok->start != NULL) {
				/* A token is in progress (continuation
				   line): append the new line to the
				   existing buffer, keeping start/cur
				   valid across the realloc. */
				size_t start = tok->start - tok->buf;
				size_t oldlen = tok->cur - tok->buf;
				size_t newlen = oldlen + strlen(new);
				char *buf = tok->buf;
				PyMem_RESIZE(buf, char, newlen+1);
				tok->lineno++;
				if (buf == NULL) {
					PyMem_DEL(tok->buf);
					tok->buf = NULL;
					PyMem_FREE(new);
					tok->done = E_NOMEM;
					return EOF;
				}
				tok->buf = buf;
				tok->cur = tok->buf + oldlen;
				strcpy(tok->buf + oldlen, new);
				PyMem_FREE(new);
				tok->inp = tok->buf + newlen;
				tok->end = tok->inp + 1;
				tok->start = tok->buf + start;
			}
			else {
				/* No token in progress: the new line
				   simply replaces the old buffer. */
				tok->lineno++;
				if (tok->buf != NULL)
					PyMem_DEL(tok->buf);
				tok->buf = new;
				tok->cur = tok->buf;
				tok->inp = strchr(tok->buf, '\0');
				tok->end = tok->inp + 1;
			}
		}
		else {
			/* File input (possibly via a decoding readline):
			   read one whole line, growing the buffer until a
			   '\n' or EOF is seen. */
			int done = 0;
			int cur = 0;
			char *pt;
			if (tok->start == NULL) {
				if (tok->buf == NULL) {
					tok->buf = PyMem_NEW(char, BUFSIZ);
					if (tok->buf == NULL) {
						tok->done = E_NOMEM;
						return EOF;
					}
					tok->end = tok->buf + BUFSIZ;
				}
				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
					  tok) == NULL) {
					tok->done = E_EOF;
					done = 1;
				}
				else {
					tok->done = E_OK;
					tok->inp = strchr(tok->buf, '\0');
					done = tok->inp[-1] == '\n';
				}
			}
			else {
				/* Token in progress: keep existing data,
				   remember cur as an offset across the
				   reallocations below. */
				cur = tok->cur - tok->buf;
				if (decoding_feof(tok)) {
					tok->done = E_EOF;
					done = 1;
				}
				else
					tok->done = E_OK;
			}
			tok->lineno++;
			/* Read until '\n' or EOF */
			while (!done) {
				int curstart = tok->start == NULL ? -1 :
					       tok->start - tok->buf;
				int curvalid = tok->inp - tok->buf;
				int newsize = curvalid + BUFSIZ;
				char *newbuf = tok->buf;
				PyMem_RESIZE(newbuf, char, newsize);
				if (newbuf == NULL) {
					tok->done = E_NOMEM;
					tok->cur = tok->inp;
					return EOF;
				}
				tok->buf = newbuf;
				tok->inp = tok->buf + curvalid;
				tok->end = tok->buf + newsize;
				tok->start = curstart < 0 ? NULL :
					     tok->buf + curstart;
				if (decoding_fgets(tok->inp,
					       (int)(tok->end - tok->inp),
					       tok) == NULL) {
					/* Last line does not end in \n,
					   fake one */
					strcpy(tok->inp, "\n");
				}
				tok->inp = strchr(tok->inp, '\0');
				done = tok->inp[-1] == '\n';
			}
			tok->cur = tok->buf + cur;
#ifndef macintosh
			/* replace "\r\n" with "\n" */
			/* For Mac we leave the \r, giving a syntax error */
			pt = tok->inp - 2;
			if (pt >= tok->buf && *pt == '\r') {
				*pt++ = '\n';
				*pt = '\0';
				tok->inp = pt;
			}
#endif
		}
		if (tok->done != E_OK) {
			if (tok->prompt != NULL)
				PySys_WriteStderr("\n");
			tok->cur = tok->inp;
			return EOF;
		}
	}
	/*NOTREACHED*/
}
773
774
775/* Back-up one character */
776
static void
tok_backup(register struct tok_state *tok, register int c)
{
	if (c != EOF) {
		if (--tok->cur < tok->buf)
			Py_FatalError("tok_backup: begin of buffer");
		/* Store only when the byte differs: for string input the
		   buffer may live in read-only memory (see buf_ungetc). */
		if (*tok->cur != c)
			*tok->cur = c;
	}
}
787
788
789/* Return the token corresponding to a single character */
790
791int
792PyToken_OneChar(int c)
793{
794	switch (c) {
795	case '(':	return LPAR;
796	case ')':	return RPAR;
797	case '[':	return LSQB;
798	case ']':	return RSQB;
799	case ':':	return COLON;
800	case ',':	return COMMA;
801	case ';':	return SEMI;
802	case '+':	return PLUS;
803	case '-':	return MINUS;
804	case '*':	return STAR;
805	case '/':	return SLASH;
806	case '|':	return VBAR;
807	case '&':	return AMPER;
808	case '<':	return LESS;
809	case '>':	return GREATER;
810	case '=':	return EQUAL;
811	case '.':	return DOT;
812	case '%':	return PERCENT;
813	case '`':	return BACKQUOTE;
814	case '{':	return LBRACE;
815	case '}':	return RBRACE;
816	case '^':	return CIRCUMFLEX;
817	case '~':	return TILDE;
818	default:	return OP;
819	}
820}
821
822
823int
824PyToken_TwoChars(int c1, int c2)
825{
826	switch (c1) {
827	case '=':
828		switch (c2) {
829		case '=':	return EQEQUAL;
830		}
831		break;
832	case '!':
833		switch (c2) {
834		case '=':	return NOTEQUAL;
835		}
836		break;
837	case '<':
838		switch (c2) {
839		case '>':	return NOTEQUAL;
840		case '=':	return LESSEQUAL;
841		case '<':	return LEFTSHIFT;
842		}
843		break;
844	case '>':
845		switch (c2) {
846		case '=':	return GREATEREQUAL;
847		case '>':	return RIGHTSHIFT;
848		}
849		break;
850	case '+':
851		switch (c2) {
852		case '=':	return PLUSEQUAL;
853		}
854		break;
855	case '-':
856		switch (c2) {
857		case '=':	return MINEQUAL;
858		}
859		break;
860	case '*':
861		switch (c2) {
862		case '*':	return DOUBLESTAR;
863		case '=':	return STAREQUAL;
864		}
865		break;
866	case '/':
867		switch (c2) {
868		case '/':	return DOUBLESLASH;
869		case '=':	return SLASHEQUAL;
870		}
871		break;
872	case '|':
873		switch (c2) {
874		case '=':	return VBAREQUAL;
875		}
876		break;
877	case '%':
878		switch (c2) {
879		case '=':	return PERCENTEQUAL;
880		}
881		break;
882	case '&':
883		switch (c2) {
884		case '=':	return AMPEREQUAL;
885		}
886		break;
887	case '^':
888		switch (c2) {
889		case '=':	return CIRCUMFLEXEQUAL;
890		}
891		break;
892	}
893	return OP;
894}
895
896int
897PyToken_ThreeChars(int c1, int c2, int c3)
898{
899	switch (c1) {
900	case '<':
901		switch (c2) {
902		case '<':
903			switch (c3) {
904			case '=':
905				return LEFTSHIFTEQUAL;
906			}
907			break;
908		}
909		break;
910	case '>':
911		switch (c2) {
912		case '>':
913			switch (c3) {
914			case '=':
915				return RIGHTSHIFTEQUAL;
916			}
917			break;
918		}
919		break;
920	case '*':
921		switch (c2) {
922		case '*':
923			switch (c3) {
924			case '=':
925				return DOUBLESTAREQUAL;
926			}
927			break;
928		}
929		break;
930	case '/':
931		switch (c2) {
932		case '/':
933			switch (c3) {
934			case '=':
935				return DOUBLESLASHEQUAL;
936			}
937			break;
938		}
939		break;
940	}
941	return OP;
942}
943
/* Report inconsistent use of tabs and spaces in indentation.
   Return 1 (with tok->done set) when configured as a hard error,
   0 when only a warning (printed at most once) is wanted. */
static int
indenterror(struct tok_state *tok)
{
	if (tok->alterror) {
		tok->done = E_TABSPACE;
		tok->cur = tok->inp;
		return 1;
	}
	if (tok->altwarning) {
		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                                  "in indentation\n", tok->filename);
		tok->altwarning = 0;	/* warn only once per file */
	}
	return 0;
}
959
960
961/* Get next token, after space stripping etc. */
962
/* Scan and return the next token.  On return, *p_start and *p_end
   delimit the token's text inside tok's buffer (left NULL for INDENT,
   DEDENT and most error returns).  Errors are signalled by returning
   ERRORTOKEN with the reason in tok->done. */
static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
	register int c;
	int blankline;

	*p_start = *p_end = NULL;
  nextline:
	tok->start = NULL;
	blankline = 0;

	/* Get indentation level */
	if (tok->atbol) {
		register int col = 0;
		register int altcol = 0;
		tok->atbol = 0;
		for (;;) {
			c = tok_nextc(tok);
			if (c == ' ')
				col++, altcol++;
			else if (c == '\t') {
				col = (col/tok->tabsize + 1) * tok->tabsize;
				/* altcol uses the alternate tab size, to
				   detect ambiguous tab/space mixtures. */
				altcol = (altcol/tok->alttabsize + 1)
					* tok->alttabsize;
			}
			else if (c == '\014') /* Control-L (formfeed) */
				col = altcol = 0; /* For Emacs users */
			else
				break;
		}
		tok_backup(tok, c);
		if (c == '#' || c == '\n') {
			/* Lines with only whitespace and/or comments
			   shouldn't affect the indentation and are
			   not passed to the parser as NEWLINE tokens,
			   except *totally* empty lines in interactive
			   mode, which signal the end of a command group. */
			if (col == 0 && c == '\n' && tok->prompt != NULL)
				blankline = 0; /* Let it through */
			else
				blankline = 1; /* Ignore completely */
			/* We can't jump back right here since we still
			   may need to skip to the end of a comment */
		}
		if (!blankline && tok->level == 0) {
			if (col == tok->indstack[tok->indent]) {
				/* No change */
				if (altcol != tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
			}
			else if (col > tok->indstack[tok->indent]) {
				/* Indent -- always one */
				if (tok->indent+1 >= MAXINDENT) {
					tok->done = E_TOODEEP;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
				if (altcol <= tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
				tok->pendin++;
				tok->indstack[++tok->indent] = col;
				tok->altindstack[tok->indent] = altcol;
			}
			else /* col < tok->indstack[tok->indent] */ {
				/* Dedent -- any number, must be consistent */
				while (tok->indent > 0 &&
					col < tok->indstack[tok->indent]) {
					tok->pendin--;
					tok->indent--;
				}
				if (col != tok->indstack[tok->indent]) {
					tok->done = E_DEDENT;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
				if (altcol != tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
			}
		}
	}

	tok->start = tok->cur;

	/* Return pending indents/dedents */
	if (tok->pendin != 0) {
		if (tok->pendin < 0) {
			tok->pendin++;
			return DEDENT;
		}
		else {
			tok->pendin--;
			return INDENT;
		}
	}

 again:
	tok->start = NULL;
	/* Skip spaces */
	do {
		c = tok_nextc(tok);
	} while (c == ' ' || c == '\t' || c == '\014');

	/* Set start of current token */
	tok->start = tok->cur - 1;

	/* Skip comment, while looking for tab-setting magic */
	if (c == '#') {
		static char *tabforms[] = {
			"tab-width:",		/* Emacs */
			":tabstop=",		/* vim, full form */
			":ts=",			/* vim, abbreviated form */
			"set tabsize=",		/* will vi never die? */
		/* more templates can be added here to support other editors */
		};
		char cbuf[80];
		char *tp, **cp;
		tp = cbuf;
		/* Copy (up to 79 chars of) the comment into cbuf for the
		   editor-directive scan below. */
		do {
			*tp++ = c = tok_nextc(tok);
		} while (c != EOF && c != '\n' &&
			 tp - cbuf + 1 < sizeof(cbuf));
		*tp = '\0';
		for (cp = tabforms;
		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
		     cp++) {
			if ((tp = strstr(cbuf, *cp))) {
				int newsize = atoi(tp + strlen(*cp));

				if (newsize >= 1 && newsize <= 40) {
					tok->tabsize = newsize;
					if (Py_VerboseFlag)
					    PySys_WriteStderr(
						"Tab size set to %d\n",
						newsize);
				}
			}
		}
		/* Consume the rest of an over-long comment, if any. */
		while (c != EOF && c != '\n')
			c = tok_nextc(tok);
	}

	/* Check for EOF and errors now */
	if (c == EOF) {
		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
	}

	/* Identifier (most frequent token!) */
	if (isalpha(c) || c == '_') {
		/* Process r"", u"" and ur"" */
		switch (c) {
		case 'r':
		case 'R':
			c = tok_nextc(tok);
			if (c == '"' || c == '\'')
				goto letter_quote;
			break;
		case 'u':
		case 'U':
			c = tok_nextc(tok);
			if (c == 'r' || c == 'R')
				c = tok_nextc(tok);
			if (c == '"' || c == '\'')
				goto letter_quote;
			break;
		}
		while (isalnum(c) || c == '_') {
			c = tok_nextc(tok);
		}
		tok_backup(tok, c);
		*p_start = tok->start;
		*p_end = tok->cur;
		return NAME;
	}

	/* Newline */
	if (c == '\n') {
		tok->atbol = 1;
		/* Inside brackets, or on a blank line, newlines are not
		   tokens: restart the scan on the next line. */
		if (blankline || tok->level > 0)
			goto nextline;
		*p_start = tok->start;
		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
		return NEWLINE;
	}

#ifdef macintosh
	if (c == '\r') {
		PySys_WriteStderr(
		  "File contains \\r characters (incorrect line endings?)\n");
		tok->done = E_TOKEN;
		tok->cur = tok->inp;
		return ERRORTOKEN;
	}
#endif
	/* Period or number starting with period? */
	if (c == '.') {
		c = tok_nextc(tok);
		if (isdigit(c)) {
			goto fraction;
		}
		else {
			tok_backup(tok, c);
			*p_start = tok->start;
			*p_end = tok->cur;
			return DOT;
		}
	}

	/* Number */
	if (isdigit(c)) {
		if (c == '0') {
			/* Hex or octal -- maybe. */
			c = tok_nextc(tok);
			if (c == '.')
				goto fraction;
#ifndef WITHOUT_COMPLEX
			if (c == 'j' || c == 'J')
				goto imaginary;
#endif
			if (c == 'x' || c == 'X') {
				/* Hex */
				do {
					c = tok_nextc(tok);
				} while (isxdigit(c));
			}
			else {
				int found_decimal = 0;
				/* Octal; c is first char of it */
				/* There's no 'isoctdigit' macro, sigh */
				while ('0' <= c && c < '8') {
					c = tok_nextc(tok);
				}
				/* Digits 8/9 after a leading 0 are only
				   legal when the literal turns out to be
				   a float (., e/E or j/J follows). */
				if (isdigit(c)) {
					found_decimal = 1;
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
				if (c == '.')
					goto fraction;
				else if (c == 'e' || c == 'E')
					goto exponent;
#ifndef WITHOUT_COMPLEX
				else if (c == 'j' || c == 'J')
					goto imaginary;
#endif
				else if (found_decimal) {
					tok->done = E_TOKEN;
					tok_backup(tok, c);
					return ERRORTOKEN;
				}
			}
			if (c == 'l' || c == 'L')
				c = tok_nextc(tok);
		}
		else {
			/* Decimal */
			do {
				c = tok_nextc(tok);
			} while (isdigit(c));
			if (c == 'l' || c == 'L')
				c = tok_nextc(tok);
			else {
				/* Accept floating point numbers. */
				if (c == '.') {
		fraction:
					/* Fraction */
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
				if (c == 'e' || c == 'E') {
		exponent:
					/* Exponent part */
					c = tok_nextc(tok);
					if (c == '+' || c == '-')
						c = tok_nextc(tok);
					if (!isdigit(c)) {
						tok->done = E_TOKEN;
						tok_backup(tok, c);
						return ERRORTOKEN;
					}
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
#ifndef WITHOUT_COMPLEX
				if (c == 'j' || c == 'J')
					/* Imaginary part */
		imaginary:
					c = tok_nextc(tok);
#endif
			}
		}
		tok_backup(tok, c);
		*p_start = tok->start;
		*p_end = tok->cur;
		return NUMBER;
	}

  letter_quote:
	/* String */
	if (c == '\'' || c == '"') {
		/* quote2: offset (from the token start) of the character
		   after the opening quote; accounts for r/u prefixes. */
		int quote2 = tok->cur - tok->start + 1;
		int quote = c;
		int triple = 0;
		int tripcount = 0;
		for (;;) {
			c = tok_nextc(tok);
			if (c == '\n') {
				if (!triple) {
					tok->done = E_EOLS;
					tok_backup(tok, c);
					return ERRORTOKEN;
				}
				tripcount = 0;
			}
			else if (c == EOF) {
				if (triple)
					tok->done = E_EOFS;
				else
					tok->done = E_EOLS;
				tok->cur = tok->inp;
				return ERRORTOKEN;
			}
			else if (c == quote) {
				tripcount++;
				if (tok->cur - tok->start == quote2) {
					/* A second quote right after the
					   opening one: either an empty
					   string or the start of a
					   triple-quoted string. */
					c = tok_nextc(tok);
					if (c == quote) {
						triple = 1;
						tripcount = 0;
						continue;
					}
					tok_backup(tok, c);
				}
				if (!triple || tripcount == 3)
					break;
			}
			else if (c == '\\') {
				/* Backslash escapes the next character,
				   which therefore can't close a quote. */
				tripcount = 0;
				c = tok_nextc(tok);
				if (c == EOF) {
					tok->done = E_EOLS;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
			}
			else
				tripcount = 0;
		}
		*p_start = tok->start;
		*p_end = tok->cur;
		return STRING;
	}

	/* Line continuation */
	if (c == '\\') {
		c = tok_nextc(tok);
		if (c != '\n') {
			tok->done = E_TOKEN;
			tok->cur = tok->inp;
			return ERRORTOKEN;
		}
		goto again; /* Read next line */
	}

	/* Check for two-character token */
	{
		int c2 = tok_nextc(tok);
		int token = PyToken_TwoChars(c, c2);
		if (token != OP) {
			/* A two-char token may extend to three chars. */
			int c3 = tok_nextc(tok);
			int token3 = PyToken_ThreeChars(c, c2, c3);
			if (token3 != OP) {
				token = token3;
			} else {
				tok_backup(tok, c3);
			}
			*p_start = tok->start;
			*p_end = tok->cur;
			return token;
		}
		tok_backup(tok, c2);
	}

	/* Keep track of parentheses nesting level */
	switch (c) {
	case '(':
	case '[':
	case '{':
		tok->level++;
		break;
	case ')':
	case ']':
	case '}':
		tok->level--;
		break;
	}

	/* Punctuation character */
	*p_start = tok->start;
	*p_end = tok->cur;
	return PyToken_OneChar(c);
}
1373
1374int
1375PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1376{
1377	int result = tok_get(tok, p_start, p_end);
1378	if (tok->decoding_erred) {
1379		result = ERRORTOKEN;
1380		tok->done = E_DECODE;
1381	}
1382	return result;
1383}
1384
1385#ifdef Py_DEBUG
1386
/* Debugging aid: print token TYPE to stdout; for literal-bearing
   tokens also print the text between START and END. */
void
tok_dump(int type, char *start, char *end)
{
	printf("%s", _PyParser_TokenNames[type]);
	if (type == NAME || type == NUMBER || type == STRING || type == OP)
		printf("(%.*s)", (int)(end - start), start);
}
1394
1395#endif
1396