tokenizer.c revision 926f13a0819eb3d40a0d0fd38ff25ef0c7d489b3
1/***********************************************************
2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3The Netherlands.
4
5                        All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI or Corporation for National Research Initiatives or
13CNRI not be used in advertising or publicity pertaining to
14distribution of the software without specific, written prior
15permission.
16
17While CWI is the initial source for this software, a modified version
18is made available by the Corporation for National Research Initiatives
19(CNRI) at the Internet address ftp://ftp.python.org.
20
21STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28PERFORMANCE OF THIS SOFTWARE.
29
30******************************************************************/
31
32/* Tokenizer implementation */
33
34#include "pgenheaders.h"
35
36#include <ctype.h>
37
38#include "tokenizer.h"
39#include "errcode.h"
40
/* Line reader used for interactive input; it is called below with the
   current prompt string.
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline Py_PROTO((char *));

/* Default tab stop width used when turning leading whitespace into an
   indentation column. */
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		(c)
#else
#define Py_CHARMASK(c)		((c) & 0xff)
#endif

/* Forward declarations of the file-local helpers defined below */
static struct tok_state *tok_new Py_PROTO((void));
static int tok_nextc Py_PROTO((struct tok_state *tok));
static void tok_backup Py_PROTO((struct tok_state *tok, int c));
61
62/* Token names */
63
/* Printable names of the token types, indexed by token number.
   The leading comment on each row is the index of its first entry. */
char *_PyParser_TokenNames[] = {
	/*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
	/*  4 */ "NEWLINE", "INDENT", "DEDENT",
	/*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
	/* 11 */ "COLON", "COMMA", "SEMI",
	/* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
	/* 18 */ "VBAR", "AMPER", "LESS", "GREATER",
	/* 22 */ "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	/* 26 */ "LBRACE", "RBRACE",
	/* 28 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
	/* 32 */ "TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",
	/* 36 */ "DOUBLESTAR",
	/* This table must match the #defines in token.h! */
	/* 37 */ "OP",
	/* 38 */ "<ERRORTOKEN>",
	/* 39 */ "<N_TOKENS>"
};
107
108
109/* Create and initialize a new tok_state structure */
110
111static struct tok_state *
112tok_new()
113{
114	struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
115	if (tok == NULL)
116		return NULL;
117	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
118	tok->done = E_OK;
119	tok->fp = NULL;
120	tok->tabsize = TABSIZE;
121	tok->indent = 0;
122	tok->indstack[0] = 0;
123	tok->atbol = 1;
124	tok->pendin = 0;
125	tok->prompt = tok->nextprompt = NULL;
126	tok->lineno = 0;
127	tok->level = 0;
128	tok->filename = NULL;
129	tok->altwarning = 0;
130	tok->alterror = 0;
131	tok->alttabsize = 1;
132	tok->altindstack[0] = 0;
133	return tok;
134}
135
136
137/* Set up tokenizer for string */
138
139struct tok_state *
140PyTokenizer_FromString(str)
141	char *str;
142{
143	struct tok_state *tok = tok_new();
144	if (tok == NULL)
145		return NULL;
146	tok->buf = tok->cur = tok->end = tok->inp = str;
147	return tok;
148}
149
150
151/* Set up tokenizer for file */
152
153struct tok_state *
154PyTokenizer_FromFile(fp, ps1, ps2)
155	FILE *fp;
156	char *ps1, *ps2;
157{
158	struct tok_state *tok = tok_new();
159	if (tok == NULL)
160		return NULL;
161	if ((tok->buf = PyMem_NEW(char, BUFSIZ)) == NULL) {
162		PyMem_DEL(tok);
163		return NULL;
164	}
165	tok->cur = tok->inp = tok->buf;
166	tok->end = tok->buf + BUFSIZ;
167	tok->fp = fp;
168	tok->prompt = ps1;
169	tok->nextprompt = ps2;
170	return tok;
171}
172
173
174/* Free a tok_state structure */
175
176void
177PyTokenizer_Free(tok)
178	struct tok_state *tok;
179{
180	if (tok->fp != NULL && tok->buf != NULL)
181		PyMem_DEL(tok->buf);
182	PyMem_DEL(tok);
183}
184
185
186/* Get next char, updating state; error code goes into tok->done */
187
188static int
189tok_nextc(tok)
190	register struct tok_state *tok;
191{
192	for (;;) {
193		if (tok->cur != tok->inp) {
194			return Py_CHARMASK(*tok->cur++); /* Fast path */
195		}
196		if (tok->done != E_OK)
197			return EOF;
198		if (tok->fp == NULL) {
199			char *end = strchr(tok->inp, '\n');
200			if (end != NULL)
201				end++;
202			else {
203				end = strchr(tok->inp, '\0');
204				if (end == tok->inp) {
205					tok->done = E_EOF;
206					return EOF;
207				}
208			}
209			if (tok->start == NULL)
210				tok->buf = tok->cur;
211			tok->lineno++;
212			tok->inp = end;
213			return Py_CHARMASK(*tok->cur++);
214		}
215		if (tok->prompt != NULL) {
216			char *new = PyOS_Readline(tok->prompt);
217			if (tok->nextprompt != NULL)
218				tok->prompt = tok->nextprompt;
219			if (new == NULL)
220				tok->done = E_INTR;
221			else if (*new == '\0') {
222				free(new);
223				tok->done = E_EOF;
224			}
225			else if (tok->start != NULL) {
226				int start = tok->start - tok->buf;
227				int oldlen = tok->cur - tok->buf;
228				int newlen = oldlen + strlen(new);
229				char *buf = realloc(tok->buf, newlen+1);
230				tok->lineno++;
231				if (buf == NULL) {
232					free(tok->buf);
233					tok->buf = NULL;
234					free(new);
235					tok->done = E_NOMEM;
236					return EOF;
237				}
238				tok->buf = buf;
239				tok->cur = tok->buf + oldlen;
240				strcpy(tok->buf + oldlen, new);
241				free(new);
242				tok->inp = tok->buf + newlen;
243				tok->end = tok->inp + 1;
244				tok->start = tok->buf + start;
245			}
246			else {
247				tok->lineno++;
248				if (tok->buf != NULL)
249					free(tok->buf);
250				tok->buf = new;
251				tok->cur = tok->buf;
252				tok->inp = strchr(tok->buf, '\0');
253				tok->end = tok->inp + 1;
254			}
255		}
256		else {
257			int done = 0;
258			int cur = 0;
259			char *pt;
260			if (tok->start == NULL) {
261				if (tok->buf == NULL) {
262					tok->buf = PyMem_NEW(char, BUFSIZ);
263					if (tok->buf == NULL) {
264						tok->done = E_NOMEM;
265						return EOF;
266					}
267					tok->end = tok->buf + BUFSIZ;
268				}
269				if (fgets(tok->buf, (int)(tok->end - tok->buf),
270					  tok->fp) == NULL) {
271					tok->done = E_EOF;
272					done = 1;
273				}
274				else {
275					tok->done = E_OK;
276					tok->inp = strchr(tok->buf, '\0');
277					done = tok->inp[-1] == '\n';
278				}
279			}
280			else {
281				cur = tok->cur - tok->buf;
282				if (feof(tok->fp)) {
283					tok->done = E_EOF;
284					done = 1;
285				}
286				else
287					tok->done = E_OK;
288			}
289			tok->lineno++;
290			/* Read until '\n' or EOF */
291			while (!done) {
292				int curstart = tok->start == NULL ? -1 :
293					       tok->start - tok->buf;
294				int curvalid = tok->inp - tok->buf;
295				int newsize = curvalid + BUFSIZ;
296				char *newbuf = tok->buf;
297				PyMem_RESIZE(newbuf, char, newsize);
298				if (newbuf == NULL) {
299					tok->done = E_NOMEM;
300					tok->cur = tok->inp;
301					return EOF;
302				}
303				tok->buf = newbuf;
304				tok->inp = tok->buf + curvalid;
305				tok->end = tok->buf + newsize;
306				tok->start = curstart < 0 ? NULL :
307					     tok->buf + curstart;
308				if (fgets(tok->inp,
309					       (int)(tok->end - tok->inp),
310					       tok->fp) == NULL) {
311					/* Last line does not end in \n,
312					   fake one */
313					strcpy(tok->inp, "\n");
314				}
315				tok->inp = strchr(tok->inp, '\0');
316				done = tok->inp[-1] == '\n';
317			}
318			tok->cur = tok->buf + cur;
319#ifndef macintosh
320			/* replace "\r\n" with "\n" */
321			/* For Mac we leave the \r, giving a syntax error */
322			pt = tok->inp - 2;
323			if (pt >= tok->buf && *pt == '\r') {
324				*pt++ = '\n';
325				*pt = '\0';
326				tok->inp = pt;
327			}
328#endif
329		}
330		if (tok->done != E_OK) {
331			if (tok->prompt != NULL)
332				fprintf(stderr, "\n");
333			tok->cur = tok->inp;
334			return EOF;
335		}
336	}
337	/*NOTREACHED*/
338}
339
340
341/* Back-up one character */
342
343static void
344tok_backup(tok, c)
345	register struct tok_state *tok;
346	register int c;
347{
348	if (c != EOF) {
349		if (--tok->cur < tok->buf)
350			Py_FatalError("tok_backup: begin of buffer");
351		if (*tok->cur != c)
352			*tok->cur = c;
353	}
354}
355
356
357/* Return the token corresponding to a single character */
358
359int
360PyToken_OneChar(c)
361	int c;
362{
363	switch (c) {
364	case '(':	return LPAR;
365	case ')':	return RPAR;
366	case '[':	return LSQB;
367	case ']':	return RSQB;
368	case ':':	return COLON;
369	case ',':	return COMMA;
370	case ';':	return SEMI;
371	case '+':	return PLUS;
372	case '-':	return MINUS;
373	case '*':	return STAR;
374	case '/':	return SLASH;
375	case '|':	return VBAR;
376	case '&':	return AMPER;
377	case '<':	return LESS;
378	case '>':	return GREATER;
379	case '=':	return EQUAL;
380	case '.':	return DOT;
381	case '%':	return PERCENT;
382	case '`':	return BACKQUOTE;
383	case '{':	return LBRACE;
384	case '}':	return RBRACE;
385	case '^':	return CIRCUMFLEX;
386	case '~':	return TILDE;
387	default:	return OP;
388	}
389}
390
391
392int
393PyToken_TwoChars(c1, c2)
394	int c1, c2;
395{
396	switch (c1) {
397	case '=':
398		switch (c2) {
399		case '=':	return EQEQUAL;
400		}
401		break;
402	case '!':
403		switch (c2) {
404		case '=':	return NOTEQUAL;
405		}
406		break;
407	case '<':
408		switch (c2) {
409		case '>':	return NOTEQUAL;
410		case '=':	return LESSEQUAL;
411		case '<':	return LEFTSHIFT;
412		}
413		break;
414	case '>':
415		switch (c2) {
416		case '=':	return GREATEREQUAL;
417		case '>':	return RIGHTSHIFT;
418		}
419		break;
420	case '*':
421		switch (c2) {
422		case '*':	return DOUBLESTAR;
423		}
424		break;
425	}
426	return OP;
427}
428
429
430static int
431indenterror(tok)
432	struct tok_state *tok;
433{
434	if (tok->alterror) {
435		tok->done = E_INDENT;
436		tok->cur = tok->inp;
437		return 1;
438	}
439	if (tok->altwarning) {
440		fprintf(stderr, "%s: inconsistent tab/space usage\n",
441			tok->filename);
442		tok->altwarning = 0;
443	}
444	return 0;
445}
446
447
448/* Get next token, after space stripping etc. */
449
450int
451PyTokenizer_Get(tok, p_start, p_end)
452	register struct tok_state *tok; /* In/out: tokenizer state */
453	char **p_start, **p_end; /* Out: point to start/end of token */
454{
455	register int c;
456	int blankline;
457
458	*p_start = *p_end = NULL;
459  nextline:
460	tok->start = NULL;
461	blankline = 0;
462
463	/* Get indentation level */
464	if (tok->atbol) {
465		register int col = 0;
466		register int altcol = 0;
467		tok->atbol = 0;
468		for (;;) {
469			c = tok_nextc(tok);
470			if (c == ' ')
471				col++, altcol++;
472			else if (c == '\t') {
473				col = (col/tok->tabsize + 1) * tok->tabsize;
474				altcol = (altcol/tok->alttabsize + 1)
475					* tok->alttabsize;
476			}
477			else if (c == '\014') /* Control-L (formfeed) */
478				col = altcol = 0; /* For Emacs users */
479			else
480				break;
481		}
482		tok_backup(tok, c);
483		if (c == '#' || c == '\n') {
484			/* Lines with only whitespace and/or comments
485			   shouldn't affect the indentation and are
486			   not passed to the parser as NEWLINE tokens,
487			   except *totally* empty lines in interactive
488			   mode, which signal the end of a command group. */
489			if (col == 0 && c == '\n' && tok->prompt != NULL)
490				blankline = 0; /* Let it through */
491			else
492				blankline = 1; /* Ignore completely */
493			/* We can't jump back right here since we still
494			   may need to skip to the end of a comment */
495		}
496		if (!blankline && tok->level == 0) {
497			if (col == tok->indstack[tok->indent]) {
498				/* No change */
499				if (altcol != tok->altindstack[tok->indent]) {
500					if (indenterror(tok))
501						return ERRORTOKEN;
502				}
503			}
504			else if (col > tok->indstack[tok->indent]) {
505				/* Indent -- always one */
506				if (tok->indent+1 >= MAXINDENT) {
507					fprintf(stderr, "excessive indent\n");
508					tok->done = E_TOKEN;
509					tok->cur = tok->inp;
510					return ERRORTOKEN;
511				}
512				if (altcol <= tok->altindstack[tok->indent]) {
513					if (indenterror(tok))
514						return ERRORTOKEN;
515				}
516				tok->pendin++;
517				tok->indstack[++tok->indent] = col;
518				tok->altindstack[tok->indent] = altcol;
519			}
520			else /* col < tok->indstack[tok->indent] */ {
521				/* Dedent -- any number, must be consistent */
522				while (tok->indent > 0 &&
523					col < tok->indstack[tok->indent]) {
524					tok->pendin--;
525					tok->indent--;
526				}
527				if (col != tok->indstack[tok->indent]) {
528					fprintf(stderr,
529						"inconsistent dedent\n");
530					tok->done = E_TOKEN;
531					tok->cur = tok->inp;
532					return ERRORTOKEN;
533				}
534				if (altcol != tok->altindstack[tok->indent]) {
535					if (indenterror(tok))
536						return ERRORTOKEN;
537				}
538			}
539		}
540	}
541
542	tok->start = tok->cur;
543
544	/* Return pending indents/dedents */
545	if (tok->pendin != 0) {
546		if (tok->pendin < 0) {
547			tok->pendin++;
548			return DEDENT;
549		}
550		else {
551			tok->pendin--;
552			return INDENT;
553		}
554	}
555
556 again:
557	tok->start = NULL;
558	/* Skip spaces */
559	do {
560		c = tok_nextc(tok);
561	} while (c == ' ' || c == '\t' || c == '\014');
562
563	/* Set start of current token */
564	tok->start = tok->cur - 1;
565
566	/* Skip comment */
567	if (c == '#') {
568		/* Hack to allow overriding the tabsize in the file.
569		   This is also recognized by vi, when it occurs near the
570		   beginning or end of the file.  (Will vi never die...?)
571		   For Python it must be at the beginning of the file! */
572		/* XXX The real vi syntax is actually different :-( */
573		/* XXX Should recognize Emacs syntax, too */
574		int x;
575		if (sscanf(tok->cur,
576				" vi:set tabsize=%d:", &x) == 1 &&
577						x >= 1 && x <= 40) {
578			/* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
579			tok->tabsize = x;
580		}
581		do {
582			c = tok_nextc(tok);
583		} while (c != EOF && c != '\n');
584	}
585
586	/* Check for EOF and errors now */
587	if (c == EOF) {
588		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
589	}
590
591	/* Identifier (most frequent token!) */
592	if (isalpha(c) || c == '_') {
593		switch (c) {
594		case 'r':
595		case 'R':
596			c = tok_nextc(tok);
597			if (c == '"' || c == '\'')
598				goto letter_quote;
599		}
600		while (isalnum(c) || c == '_') {
601			c = tok_nextc(tok);
602		}
603		tok_backup(tok, c);
604		*p_start = tok->start;
605		*p_end = tok->cur;
606		return NAME;
607	}
608
609	/* Newline */
610	if (c == '\n') {
611		tok->atbol = 1;
612		if (blankline || tok->level > 0)
613			goto nextline;
614		*p_start = tok->start;
615		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
616		return NEWLINE;
617	}
618
619#ifdef macintosh
620	if (c == '\r') {
621		fprintf(stderr,
622		  "File contains \\r characters (incorrect line endings?)\n");
623		tok->done = E_TOKEN;
624		tok->cur = tok->inp;
625		return ERRORTOKEN;
626	}
627#endif
628	/* Period or number starting with period? */
629	if (c == '.') {
630		c = tok_nextc(tok);
631		if (isdigit(c)) {
632			goto fraction;
633		}
634		else {
635			tok_backup(tok, c);
636			*p_start = tok->start;
637			*p_end = tok->cur;
638			return DOT;
639		}
640	}
641
642	/* Number */
643	if (isdigit(c)) {
644		if (c == '0') {
645			/* Hex or octal */
646			c = tok_nextc(tok);
647			if (c == '.')
648				goto fraction;
649#ifndef WITHOUT_COMPLEX
650			if (c == 'j' || c == 'J')
651				goto imaginary;
652#endif
653			if (c == 'x' || c == 'X') {
654				/* Hex */
655				do {
656					c = tok_nextc(tok);
657				} while (isxdigit(c));
658			}
659			else {
660				/* XXX This is broken!  E.g.,
661				   09.9 should be accepted as float! */
662				/* Octal; c is first char of it */
663				/* There's no 'isoctdigit' macro, sigh */
664				while ('0' <= c && c < '8') {
665					c = tok_nextc(tok);
666				}
667			}
668			if (c == 'l' || c == 'L')
669				c = tok_nextc(tok);
670		}
671		else {
672			/* Decimal */
673			do {
674				c = tok_nextc(tok);
675			} while (isdigit(c));
676			if (c == 'l' || c == 'L')
677				c = tok_nextc(tok);
678			else {
679				/* Accept floating point numbers.
680				   XXX This accepts incomplete things like
681				   XXX 12e or 1e+; worry run-time */
682				if (c == '.') {
683		fraction:
684					/* Fraction */
685					do {
686						c = tok_nextc(tok);
687					} while (isdigit(c));
688				}
689				if (c == 'e' || c == 'E') {
690					/* Exponent part */
691					c = tok_nextc(tok);
692					if (c == '+' || c == '-')
693						c = tok_nextc(tok);
694					while (isdigit(c)) {
695						c = tok_nextc(tok);
696					}
697				}
698#ifndef WITHOUT_COMPLEX
699				if (c == 'j' || c == 'J')
700					/* Imaginary part */
701		imaginary:
702					c = tok_nextc(tok);
703#endif
704			}
705		}
706		tok_backup(tok, c);
707		*p_start = tok->start;
708		*p_end = tok->cur;
709		return NUMBER;
710	}
711
712  letter_quote:
713	/* String */
714	if (c == '\'' || c == '"') {
715		int quote2 = tok->cur - tok->start + 1;
716		int quote = c;
717		int triple = 0;
718		int tripcount = 0;
719		for (;;) {
720			c = tok_nextc(tok);
721			if (c == '\n') {
722				if (!triple) {
723					tok->done = E_TOKEN;
724					tok_backup(tok, c);
725					return ERRORTOKEN;
726				}
727				tripcount = 0;
728			}
729			else if (c == EOF) {
730				tok->done = E_TOKEN;
731				tok->cur = tok->inp;
732				return ERRORTOKEN;
733			}
734			else if (c == quote) {
735				tripcount++;
736				if (tok->cur - tok->start == quote2) {
737					c = tok_nextc(tok);
738					if (c == quote) {
739						triple = 1;
740						tripcount = 0;
741						continue;
742					}
743					tok_backup(tok, c);
744				}
745				if (!triple || tripcount == 3)
746					break;
747			}
748			else if (c == '\\') {
749				tripcount = 0;
750				c = tok_nextc(tok);
751				if (c == EOF) {
752					tok->done = E_TOKEN;
753					tok->cur = tok->inp;
754					return ERRORTOKEN;
755				}
756			}
757			else
758				tripcount = 0;
759		}
760		*p_start = tok->start;
761		*p_end = tok->cur;
762		return STRING;
763	}
764
765	/* Line continuation */
766	if (c == '\\') {
767		c = tok_nextc(tok);
768		if (c != '\n') {
769			tok->done = E_TOKEN;
770			tok->cur = tok->inp;
771			return ERRORTOKEN;
772		}
773		goto again; /* Read next line */
774	}
775
776	/* Check for two-character token */
777	{
778		int c2 = tok_nextc(tok);
779		int token = PyToken_TwoChars(c, c2);
780		if (token != OP) {
781			*p_start = tok->start;
782			*p_end = tok->cur;
783			return token;
784		}
785		tok_backup(tok, c2);
786	}
787
788	/* Keep track of parentheses nesting level */
789	switch (c) {
790	case '(':
791	case '[':
792	case '{':
793		tok->level++;
794		break;
795	case ')':
796	case ']':
797	case '}':
798		tok->level--;
799		break;
800	}
801
802	/* Punctuation character */
803	*p_start = tok->start;
804	*p_end = tok->cur;
805	return PyToken_OneChar(c);
806}
807
808
809#ifdef Py_DEBUG
810
811void
812tok_dump(type, start, end)
813	int type;
814	char *start, *end;
815{
816	printf("%s", _PyParser_TokenNames[type]);
817	if (type == NAME || type == NUMBER || type == STRING || type == OP)
818		printf("(%.*s)", (int)(end - start), start);
819}
820
821#endif
822