/* tokenizer.c revision fd8a393086fbf43597965d5e55bec158a094a466 */
/***********************************************************
Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
The Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI or Corporation for National Research Initiatives or
CNRI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

While CWI is the initial source for this software, a modified version
is made available by the Corporation for National Research Initiatives
(CNRI) at the Internet address ftp://ftp.python.org.

STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.

******************************************************************/

/* Tokenizer implementation */
33
34#include "pgenheaders.h"
35
36#include <ctype.h>
37
38#include "tokenizer.h"
39#include "errcode.h"
40
41extern char *my_readline PROTO((char *));
42/* Return malloc'ed string including trailing \n;
43   empty malloc'ed string for EOF;
44   NULL if interrupted */
45
46/* Don't ever change this -- it would break the portability of Python code */
47#define TABSIZE 8
48
49/* Forward */
50static struct tok_state *tok_new PROTO((void));
51static int tok_nextc PROTO((struct tok_state *tok));
52static void tok_backup PROTO((struct tok_state *tok, int c));
53
/* Token names, indexed by token number.
   This table must match the #defines in token.h!
   The number in front of each row is the index of its first entry. */

char *tok_name[] = {
	/*  0 */ "ENDMARKER", "NAME", "NUMBER", "STRING",
	/*  4 */ "NEWLINE", "INDENT", "DEDENT",
	/*  7 */ "LPAR", "RPAR", "LSQB", "RSQB",
	/* 11 */ "COLON", "COMMA", "SEMI",
	/* 14 */ "PLUS", "MINUS", "STAR", "SLASH",
	/* 18 */ "VBAR", "AMPER",
	/* 20 */ "LESS", "GREATER", "EQUAL",
	/* 23 */ "DOT", "PERCENT", "BACKQUOTE",
	/* 26 */ "LBRACE", "RBRACE",
	/* 28 */ "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
	/* 32 */ "TILDE", "CIRCUMFLEX",
	/* 34 */ "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
	/* 37 */ "OP",
	/* 38 */ "<ERRORTOKEN>",
	/* 39 */ "<N_TOKENS>"
};
99
100
101/* Create and initialize a new tok_state structure */
102
103static struct tok_state *
104tok_new()
105{
106	struct tok_state *tok = NEW(struct tok_state, 1);
107	if (tok == NULL)
108		return NULL;
109	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
110	tok->done = E_OK;
111	tok->fp = NULL;
112	tok->tabsize = TABSIZE;
113	tok->indent = 0;
114	tok->indstack[0] = 0;
115	tok->atbol = 1;
116	tok->pendin = 0;
117	tok->prompt = tok->nextprompt = NULL;
118	tok->lineno = 0;
119	tok->level = 0;
120	return tok;
121}
122
123
124/* Set up tokenizer for string */
125
126struct tok_state *
127tok_setups(str)
128	char *str;
129{
130	struct tok_state *tok = tok_new();
131	if (tok == NULL)
132		return NULL;
133	tok->buf = tok->cur = tok->end = tok->inp = str;
134	return tok;
135}
136
137
138/* Set up tokenizer for file */
139
140struct tok_state *
141tok_setupf(fp, ps1, ps2)
142	FILE *fp;
143	char *ps1, *ps2;
144{
145	struct tok_state *tok = tok_new();
146	if (tok == NULL)
147		return NULL;
148	if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
149		DEL(tok);
150		return NULL;
151	}
152	tok->cur = tok->inp = tok->buf;
153	tok->end = tok->buf + BUFSIZ;
154	tok->fp = fp;
155	tok->prompt = ps1;
156	tok->nextprompt = ps2;
157	return tok;
158}
159
160
161/* Free a tok_state structure */
162
163void
164tok_free(tok)
165	struct tok_state *tok;
166{
167	if (tok->fp != NULL && tok->buf != NULL)
168		DEL(tok->buf);
169	DEL(tok);
170}
171
172
173/* Get next char, updating state; error code goes into tok->done */
174
175static int
176tok_nextc(tok)
177	register struct tok_state *tok;
178{
179	for (;;) {
180		if (tok->cur != tok->inp) {
181			return *tok->cur++; /* Fast path */
182		}
183		if (tok->done != E_OK)
184			return EOF;
185		if (tok->fp == NULL) {
186			char *end = strchr(tok->inp, '\n');
187			if (end != NULL)
188				end++;
189			else {
190				end = strchr(tok->inp, '\0');
191				if (end == tok->inp) {
192					tok->done = E_EOF;
193					return EOF;
194				}
195			}
196			if (tok->start == NULL)
197				tok->buf = tok->cur;
198			tok->lineno++;
199			tok->inp = end;
200			return *tok->cur++;
201		}
202		if (tok->prompt != NULL) {
203			char *new = my_readline(tok->prompt);
204			if (tok->nextprompt != NULL)
205				tok->prompt = tok->nextprompt;
206			if (new == NULL)
207				tok->done = E_INTR;
208			else if (*new == '\0') {
209				free(new);
210				tok->done = E_EOF;
211			}
212			else if (tok->start != NULL) {
213				int start = tok->start - tok->buf;
214				int oldlen = tok->cur - tok->buf;
215				int newlen = oldlen + strlen(new);
216				char *buf = realloc(tok->buf, newlen+1);
217				tok->lineno++;
218				if (buf == NULL) {
219					free(tok->buf);
220					tok->buf = NULL;
221					free(new);
222					tok->done = E_NOMEM;
223					return EOF;
224				}
225				tok->buf = buf;
226				tok->cur = tok->buf + oldlen;
227				strcpy(tok->buf + oldlen, new);
228				free(new);
229				tok->inp = tok->buf + newlen;
230				tok->end = tok->inp + 1;
231				tok->start = tok->buf + start;
232			}
233			else {
234				tok->lineno++;
235				if (tok->buf != NULL)
236					free(tok->buf);
237				tok->buf = new;
238				tok->cur = tok->buf;
239				tok->inp = strchr(tok->buf, '\0');
240				tok->end = tok->inp + 1;
241			}
242		}
243		else {
244			int done = 0;
245			int cur = 0;
246			char *pt;
247			if (tok->start == NULL) {
248				if (tok->buf == NULL) {
249					tok->buf = NEW(char, BUFSIZ);
250					if (tok->buf == NULL) {
251						tok->done = E_NOMEM;
252						return EOF;
253					}
254					tok->end = tok->buf + BUFSIZ;
255				}
256				if (fgets(tok->buf, (int)(tok->end - tok->buf),
257					  tok->fp) == NULL) {
258					tok->done = E_EOF;
259					done = 1;
260				}
261				else {
262					tok->done = E_OK;
263					tok->inp = strchr(tok->buf, '\0');
264					done = tok->inp[-1] == '\n';
265				}
266			}
267			else {
268				cur = tok->cur - tok->buf;
269				if (feof(tok->fp)) {
270					tok->done = E_EOF;
271					done = 1;
272				}
273				else
274					tok->done = E_OK;
275			}
276			tok->lineno++;
277			/* Read until '\n' or EOF */
278			while (!done) {
279				int curstart = tok->start == NULL ? -1 :
280					       tok->start - tok->buf;
281				int curvalid = tok->inp - tok->buf;
282				int newsize = curvalid + BUFSIZ;
283				char *newbuf = tok->buf;
284				RESIZE(newbuf, char, newsize);
285				if (newbuf == NULL) {
286					tok->done = E_NOMEM;
287					tok->cur = tok->inp;
288					return EOF;
289				}
290				tok->buf = newbuf;
291				tok->inp = tok->buf + curvalid;
292				tok->end = tok->buf + newsize;
293				tok->start = curstart < 0 ? NULL :
294					     tok->buf + curstart;
295				if (fgets(tok->inp,
296					       (int)(tok->end - tok->inp),
297					       tok->fp) == NULL) {
298					/* Last line does not end in \n,
299					   fake one */
300					strcpy(tok->inp, "\n");
301				}
302				tok->inp = strchr(tok->inp, '\0');
303				done = tok->inp[-1] == '\n';
304			}
305			tok->cur = tok->buf + cur;
306			/* replace "\r\n" with "\n" */
307			pt = tok->inp - 2;
308			if (pt >= tok->buf && *pt == '\r') {
309				*pt++ = '\n';
310				*pt = '\0';
311				tok->inp = pt;
312			}
313		}
314		if (tok->done != E_OK) {
315			if (tok->prompt != NULL)
316				fprintf(stderr, "\n");
317			tok->cur = tok->inp;
318			return EOF;
319		}
320	}
321	/*NOTREACHED*/
322}
323
324
325/* Back-up one character */
326
327static void
328tok_backup(tok, c)
329	register struct tok_state *tok;
330	register int c;
331{
332	if (c != EOF) {
333		if (--tok->cur < tok->buf)
334			fatal("tok_backup: begin of buffer");
335		if (*tok->cur != c)
336			*tok->cur = c;
337	}
338}
339
340
341/* Return the token corresponding to a single character */
342
343int
344tok_1char(c)
345	int c;
346{
347	switch (c) {
348	case '(':	return LPAR;
349	case ')':	return RPAR;
350	case '[':	return LSQB;
351	case ']':	return RSQB;
352	case ':':	return COLON;
353	case ',':	return COMMA;
354	case ';':	return SEMI;
355	case '+':	return PLUS;
356	case '-':	return MINUS;
357	case '*':	return STAR;
358	case '/':	return SLASH;
359	case '|':	return VBAR;
360	case '&':	return AMPER;
361	case '<':	return LESS;
362	case '>':	return GREATER;
363	case '=':	return EQUAL;
364	case '.':	return DOT;
365	case '%':	return PERCENT;
366	case '`':	return BACKQUOTE;
367	case '{':	return LBRACE;
368	case '}':	return RBRACE;
369	case '^':	return CIRCUMFLEX;
370	case '~':	return TILDE;
371	default:	return OP;
372	}
373}
374
375
376int
377tok_2char(c1, c2)
378	int c1, c2;
379{
380	switch (c1) {
381	case '=':
382		switch (c2) {
383		case '=':	return EQEQUAL;
384		}
385		break;
386	case '!':
387		switch (c2) {
388		case '=':	return NOTEQUAL;
389		}
390		break;
391	case '<':
392		switch (c2) {
393		case '>':	return NOTEQUAL;
394		case '=':	return LESSEQUAL;
395		case '<':	return LEFTSHIFT;
396		}
397		break;
398	case '>':
399		switch (c2) {
400		case '=':	return GREATEREQUAL;
401		case '>':	return RIGHTSHIFT;
402		}
403		break;
404	case '*':
405		switch (c2) {
406		case '*':	return DOUBLESTAR;
407		}
408		break;
409	}
410	return OP;
411}
412
413
414/* Get next token, after space stripping etc. */
415
416int
417tok_get(tok, p_start, p_end)
418	register struct tok_state *tok; /* In/out: tokenizer state */
419	char **p_start, **p_end; /* Out: point to start/end of token */
420{
421	register int c;
422	int blankline;
423
424	*p_start = *p_end = NULL;
425  nextline:
426	tok->start = NULL;
427	blankline = 0;
428
429	/* Get indentation level */
430	if (tok->atbol) {
431		register int col = 0;
432		tok->atbol = 0;
433		for (;;) {
434			c = tok_nextc(tok);
435			if (c == ' ')
436				col++;
437			else if (c == '\t')
438				col = (col/tok->tabsize + 1) * tok->tabsize;
439			else if (c == '\014') /* Control-L (formfeed) */
440				col = 0; /* For Emacs users */
441			else
442				break;
443		}
444		tok_backup(tok, c);
445		if (c == '#' || c == '\n') {
446			/* Lines with only whitespace and/or comments
447			   shouldn't affect the indentation and are
448			   not passed to the parser as NEWLINE tokens,
449			   except *totally* empty lines in interactive
450			   mode, which signal the end of a command group. */
451			if (col == 0 && c == '\n' && tok->prompt != NULL)
452				blankline = 0; /* Let it through */
453			else
454				blankline = 1; /* Ignore completely */
455			/* We can't jump back right here since we still
456			   may need to skip to the end of a comment */
457		}
458		if (!blankline && tok->level == 0) {
459			if (col == tok->indstack[tok->indent]) {
460				/* No change */
461			}
462			else if (col > tok->indstack[tok->indent]) {
463				/* Indent -- always one */
464				if (tok->indent+1 >= MAXINDENT) {
465					fprintf(stderr, "excessive indent\n");
466					tok->done = E_TOKEN;
467					tok->cur = tok->inp;
468					return ERRORTOKEN;
469				}
470				tok->pendin++;
471				tok->indstack[++tok->indent] = col;
472			}
473			else /* col < tok->indstack[tok->indent] */ {
474				/* Dedent -- any number, must be consistent */
475				while (tok->indent > 0 &&
476					col < tok->indstack[tok->indent]) {
477					tok->indent--;
478					tok->pendin--;
479				}
480				if (col != tok->indstack[tok->indent]) {
481					fprintf(stderr, "inconsistent dedent\n");
482					tok->done = E_TOKEN;
483					tok->cur = tok->inp;
484					return ERRORTOKEN;
485				}
486			}
487		}
488	}
489
490	tok->start = tok->cur;
491
492	/* Return pending indents/dedents */
493	if (tok->pendin != 0) {
494		if (tok->pendin < 0) {
495			tok->pendin++;
496			return DEDENT;
497		}
498		else {
499			tok->pendin--;
500			return INDENT;
501		}
502	}
503
504 again:
505	tok->start = NULL;
506	/* Skip spaces */
507	do {
508		c = tok_nextc(tok);
509	} while (c == ' ' || c == '\t' || c == '\014');
510
511	/* Set start of current token */
512	tok->start = tok->cur - 1;
513
514	/* Skip comment */
515	if (c == '#') {
516		/* Hack to allow overriding the tabsize in the file.
517		   This is also recognized by vi, when it occurs near the
518		   beginning or end of the file.  (Will vi never die...?)
519		   For Python it must be at the beginning of the file! */
520		/* XXX The real vi syntax is actually different :-( */
521		/* XXX Should recognize Emacs syntax, too */
522		int x;
523		if (sscanf(tok->cur,
524				" vi:set tabsize=%d:", &x) == 1 &&
525						x >= 1 && x <= 40) {
526			/* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
527			tok->tabsize = x;
528		}
529		do {
530			c = tok_nextc(tok);
531		} while (c != EOF && c != '\n');
532	}
533
534	/* Check for EOF and errors now */
535	if (c == EOF) {
536		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
537	}
538
539	/* Identifier (most frequent token!) */
540	if (isalpha(c) || c == '_') {
541		do {
542			c = tok_nextc(tok);
543		} while (isalnum(c) || c == '_');
544		tok_backup(tok, c);
545		*p_start = tok->start;
546		*p_end = tok->cur;
547		return NAME;
548	}
549
550	/* Newline */
551	if (c == '\n') {
552		tok->atbol = 1;
553		if (blankline || tok->level > 0)
554			goto nextline;
555		*p_start = tok->start;
556		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
557		return NEWLINE;
558	}
559
560	/* Period or number starting with period? */
561	if (c == '.') {
562		c = tok_nextc(tok);
563		if (isdigit(c)) {
564			goto fraction;
565		}
566		else {
567			tok_backup(tok, c);
568			*p_start = tok->start;
569			*p_end = tok->cur;
570			return DOT;
571		}
572	}
573
574	/* Number */
575	if (isdigit(c)) {
576		if (c == '0') {
577			/* Hex or octal */
578			c = tok_nextc(tok);
579			if (c == '.')
580				goto fraction;
581#ifndef WITHOUT_COMPLEX
582			if (c == 'j' || c == 'J')
583				goto imaginary;
584#endif
585			if (c == 'x' || c == 'X') {
586				/* Hex */
587				do {
588					c = tok_nextc(tok);
589				} while (isxdigit(c));
590			}
591			else {
592				/* XXX This is broken!  E.g.,
593				   09.9 should be accepted as float! */
594				/* Octal; c is first char of it */
595				/* There's no 'isoctdigit' macro, sigh */
596				while ('0' <= c && c < '8') {
597					c = tok_nextc(tok);
598				}
599			}
600			if (c == 'l' || c == 'L')
601				c = tok_nextc(tok);
602		}
603		else {
604			/* Decimal */
605			do {
606				c = tok_nextc(tok);
607			} while (isdigit(c));
608			if (c == 'l' || c == 'L')
609				c = tok_nextc(tok);
610			else {
611				/* Accept floating point numbers.
612				   XXX This accepts incomplete things like
613				   XXX 12e or 1e+; worry run-time */
614				if (c == '.') {
615		fraction:
616					/* Fraction */
617					do {
618						c = tok_nextc(tok);
619					} while (isdigit(c));
620				}
621				if (c == 'e' || c == 'E') {
622					/* Exponent part */
623					c = tok_nextc(tok);
624					if (c == '+' || c == '-')
625						c = tok_nextc(tok);
626					while (isdigit(c)) {
627						c = tok_nextc(tok);
628					}
629				}
630#ifndef WITHOUT_COMPLEX
631				if (c == 'j' || c == 'J')
632					/* Imaginary part */
633		imaginary:
634					c = tok_nextc(tok);
635#endif
636			}
637		}
638		tok_backup(tok, c);
639		*p_start = tok->start;
640		*p_end = tok->cur;
641		return NUMBER;
642	}
643
644	/* String */
645	if (c == '\'' || c == '"') {
646		int quote = c;
647		int triple = 0;
648		int tripcount = 0;
649		for (;;) {
650			c = tok_nextc(tok);
651			if (c == '\n') {
652				if (!triple) {
653					tok->done = E_TOKEN;
654					tok_backup(tok, c);
655					return ERRORTOKEN;
656				}
657				tripcount = 0;
658			}
659			else if (c == EOF) {
660				tok->done = E_TOKEN;
661				tok->cur = tok->inp;
662				return ERRORTOKEN;
663			}
664			else if (c == quote) {
665				tripcount++;
666				if (tok->cur == tok->start+2) {
667					c = tok_nextc(tok);
668					if (c == quote) {
669						triple = 1;
670						tripcount = 0;
671						continue;
672					}
673					tok_backup(tok, c);
674				}
675				if (!triple || tripcount == 3)
676					break;
677			}
678			else if (c == '\\') {
679				tripcount = 0;
680				c = tok_nextc(tok);
681				if (c == EOF) {
682					tok->done = E_TOKEN;
683					tok->cur = tok->inp;
684					return ERRORTOKEN;
685				}
686			}
687			else
688				tripcount = 0;
689		}
690		*p_start = tok->start;
691		*p_end = tok->cur;
692		return STRING;
693	}
694
695	/* Line continuation */
696	if (c == '\\') {
697		c = tok_nextc(tok);
698		if (c != '\n') {
699			tok->done = E_TOKEN;
700			tok->cur = tok->inp;
701			return ERRORTOKEN;
702		}
703		goto again; /* Read next line */
704	}
705
706	/* Check for two-character token */
707	{
708		int c2 = tok_nextc(tok);
709		int token = tok_2char(c, c2);
710		if (token != OP) {
711			*p_start = tok->start;
712			*p_end = tok->cur;
713			return token;
714		}
715		tok_backup(tok, c2);
716	}
717
718	/* Keep track of parentheses nesting level */
719	switch (c) {
720	case '(':
721	case '[':
722	case '{':
723		tok->level++;
724		break;
725	case ')':
726	case ']':
727	case '}':
728		tok->level--;
729		break;
730	}
731
732	/* Punctuation character */
733	*p_start = tok->start;
734	*p_end = tok->cur;
735	return tok_1char(c);
736}
737
738
739#ifdef DEBUG
740
741void
742tok_dump(type, start, end)
743	int type;
744	char *start, *end;
745{
746	printf("%s", tok_name[type]);
747	if (type == NAME || type == NUMBER || type == STRING || type == OP)
748		printf("(%.*s)", (int)(end - start), start);
749}
750
751#endif
752