tokenizer.c revision 78c0535a224697e1c7a1a4e68462d3d204e38942
1/***********************************************************
2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3The Netherlands.
4
5                        All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI not be used in advertising or publicity pertaining to
13distribution of the software without specific, written prior permission.
14
15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
23******************************************************************/
24
25/* Tokenizer implementation */
26
27#include "pgenheaders.h"
28
29#include <ctype.h>
30
31#include "tokenizer.h"
32#include "errcode.h"
33
34extern char *my_readline PROTO((char *));
35/* Return malloc'ed string including trailing \n;
36   empty malloc'ed string for EOF;
37   NULL if interrupted */
38
39/* Don't ever change this -- it would break the portability of Python code */
40#define TABSIZE 8
41
42/* Forward */
43static struct tok_state *tok_new PROTO((void));
44static int tok_nextc PROTO((struct tok_state *tok));
45static void tok_backup PROTO((struct tok_state *tok, int c));
46
/* Printable token names, one per token type.
   The position of a name in this array is the number of the token it
   describes, so the order here is significant.
   This table must match the #defines in token.h! */

char *tok_name[] = {
	/* End of input, atoms and layout tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",

	/* Brackets and separators */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",

	/* Single-character operators */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",

	/* Two-character comparison operators */
	"EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",

	/* Remaining operators */
	"TILDE", "CIRCUMFLEX", "LEFTSHIFT", "RIGHTSHIFT",

	/* Catch-all operator, error marker and table-size sentinel */
	"OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
91
92
93/* Create and initialize a new tok_state structure */
94
95static struct tok_state *
96tok_new()
97{
98	struct tok_state *tok = NEW(struct tok_state, 1);
99	if (tok == NULL)
100		return NULL;
101	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
102	tok->done = E_OK;
103	tok->fp = NULL;
104	tok->tabsize = TABSIZE;
105	tok->indent = 0;
106	tok->indstack[0] = 0;
107	tok->atbol = 1;
108	tok->pendin = 0;
109	tok->prompt = tok->nextprompt = NULL;
110	tok->lineno = 0;
111	tok->level = 0;
112	return tok;
113}
114
115
116/* Set up tokenizer for string */
117
118struct tok_state *
119tok_setups(str)
120	char *str;
121{
122	struct tok_state *tok = tok_new();
123	if (tok == NULL)
124		return NULL;
125	tok->buf = tok->cur = tok->end = tok->inp = str;
126	return tok;
127}
128
129
130/* Set up tokenizer for file */
131
132struct tok_state *
133tok_setupf(fp, ps1, ps2)
134	FILE *fp;
135	char *ps1, *ps2;
136{
137	struct tok_state *tok = tok_new();
138	if (tok == NULL)
139		return NULL;
140	if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
141		DEL(tok);
142		return NULL;
143	}
144	tok->cur = tok->inp = tok->buf;
145	tok->end = tok->buf + BUFSIZ;
146	tok->fp = fp;
147	tok->prompt = ps1;
148	tok->nextprompt = ps2;
149	return tok;
150}
151
152
153/* Free a tok_state structure */
154
155void
156tok_free(tok)
157	struct tok_state *tok;
158{
159	if (tok->fp != NULL && tok->buf != NULL)
160		DEL(tok->buf);
161	DEL(tok);
162}
163
164
165/* Get next char, updating state; error code goes into tok->done */
166
167static int
168tok_nextc(tok)
169	register struct tok_state *tok;
170{
171	for (;;) {
172		if (tok->cur != tok->inp) {
173			return *tok->cur++; /* Fast path */
174		}
175		if (tok->done != E_OK)
176			return EOF;
177		if (tok->fp == NULL) {
178			char *end = strchr(tok->inp, '\n');
179			if (end != NULL)
180				end++;
181			else {
182				end = strchr(tok->inp, '\0');
183				if (end == tok->inp) {
184					tok->done = E_EOF;
185					return EOF;
186				}
187			}
188			if (tok->start == NULL)
189				tok->buf = tok->cur;
190			tok->lineno++;
191			tok->inp = end;
192			return *tok->cur++;
193		}
194		if (tok->prompt != NULL) {
195			char *new = my_readline(tok->prompt);
196			if (tok->nextprompt != NULL)
197				tok->prompt = tok->nextprompt;
198			if (new == NULL)
199				tok->done = E_INTR;
200			else if (*new == '\0') {
201				free(new);
202				tok->done = E_EOF;
203			}
204			else if (tok->start != NULL) {
205				int start = tok->start - tok->buf;
206				int oldlen = tok->cur - tok->buf;
207				int newlen = oldlen + strlen(new);
208				char *buf = realloc(tok->buf, newlen+1);
209				tok->lineno++;
210				if (buf == NULL) {
211					free(tok->buf);
212					tok->buf = NULL;
213					free(new);
214					tok->done = E_NOMEM;
215					return EOF;
216				}
217				tok->buf = buf;
218				tok->cur = tok->buf + oldlen;
219				strcpy(tok->buf + oldlen, new);
220				free(new);
221				tok->inp = tok->buf + newlen;
222				tok->end = tok->inp + 1;
223				tok->start = tok->buf + start;
224			}
225			else {
226				tok->lineno++;
227				if (tok->buf != NULL)
228					free(tok->buf);
229				tok->buf = new;
230				tok->cur = tok->buf;
231				tok->inp = strchr(tok->buf, '\0');
232				tok->end = tok->inp + 1;
233			}
234		}
235		else {
236			int done = 0;
237			int cur = 0;
238			if (tok->start == NULL) {
239				if (tok->buf == NULL) {
240					tok->buf = NEW(char, BUFSIZ);
241					if (tok->buf == NULL) {
242						tok->done = E_NOMEM;
243						return EOF;
244					}
245					tok->end = tok->buf + BUFSIZ;
246				}
247				if (fgets(tok->buf, (int)(tok->end - tok->buf),
248					  tok->fp) == NULL) {
249					tok->done = E_EOF;
250					done = 1;
251				}
252				else {
253					tok->done = E_OK;
254					tok->inp = strchr(tok->buf, '\0');
255					done = tok->inp[-1] == '\n';
256				}
257			}
258			else {
259				cur = tok->cur - tok->buf;
260				if (feof(tok->fp)) {
261					tok->done = E_EOF;
262					done = 1;
263				}
264				else
265					tok->done = E_OK;
266			}
267			tok->lineno++;
268			/* Read until '\n' or EOF */
269			while (!done) {
270				int curstart = tok->start == NULL ? -1 :
271					       tok->start - tok->buf;
272				int curvalid = tok->inp - tok->buf;
273				int cursize = tok->end - tok->buf;
274				int newsize = cursize + BUFSIZ;
275				char *newbuf = tok->buf;
276				RESIZE(newbuf, char, newsize);
277				if (newbuf == NULL) {
278					tok->done = E_NOMEM;
279					tok->cur = tok->inp;
280					return EOF;
281				}
282				tok->buf = newbuf;
283				tok->inp = tok->buf + curvalid;
284				tok->end = tok->buf + newsize;
285				tok->start = curstart < 0 ? NULL :
286					     tok->buf + curstart;
287				if (fgets(tok->inp,
288					       (int)(tok->end - tok->inp),
289					       tok->fp) == NULL) {
290					/* Last line does not end in \n,
291					   fake one */
292					strcpy(tok->inp, "\n");
293				}
294				tok->inp = strchr(tok->inp, '\0');
295				done = tok->inp[-1] == '\n';
296			}
297			tok->cur = tok->buf + cur;
298		}
299		if (tok->done != E_OK) {
300			if (tok->prompt != NULL)
301				fprintf(stderr, "\n");
302			tok->cur = tok->inp;
303			return EOF;
304		}
305	}
306	/*NOTREACHED*/
307}
308
309
310/* Back-up one character */
311
312static void
313tok_backup(tok, c)
314	register struct tok_state *tok;
315	register int c;
316{
317	if (c != EOF) {
318		if (--tok->cur < tok->buf)
319			fatal("tok_backup: begin of buffer");
320		if (*tok->cur != c)
321			*tok->cur = c;
322	}
323}
324
325
326/* Return the token corresponding to a single character */
327
328int
329tok_1char(c)
330	int c;
331{
332	switch (c) {
333	case '(':	return LPAR;
334	case ')':	return RPAR;
335	case '[':	return LSQB;
336	case ']':	return RSQB;
337	case ':':	return COLON;
338	case ',':	return COMMA;
339	case ';':	return SEMI;
340	case '+':	return PLUS;
341	case '-':	return MINUS;
342	case '*':	return STAR;
343	case '/':	return SLASH;
344	case '|':	return VBAR;
345	case '&':	return AMPER;
346	case '<':	return LESS;
347	case '>':	return GREATER;
348	case '=':	return EQUAL;
349	case '.':	return DOT;
350	case '%':	return PERCENT;
351	case '`':	return BACKQUOTE;
352	case '{':	return LBRACE;
353	case '}':	return RBRACE;
354	case '^':	return CIRCUMFLEX;
355	case '~':	return TILDE;
356	default:	return OP;
357	}
358}
359
360
361int
362tok_2char(c1, c2)
363	int c1, c2;
364{
365	switch (c1) {
366	case '=':
367		switch (c2) {
368		case '=':	return EQEQUAL;
369		}
370		break;
371	case '!':
372		switch (c2) {
373		case '=':	return NOTEQUAL;
374		}
375		break;
376	case '<':
377		switch (c2) {
378		case '>':	return NOTEQUAL;
379		case '=':	return LESSEQUAL;
380		case '<':	return LEFTSHIFT;
381		}
382		break;
383	case '>':
384		switch (c2) {
385		case '=':	return GREATEREQUAL;
386		case '>':	return RIGHTSHIFT;
387		}
388		break;
389	}
390	return OP;
391}
392
393
394/* Get next token, after space stripping etc. */
395
396int
397tok_get(tok, p_start, p_end)
398	register struct tok_state *tok; /* In/out: tokenizer state */
399	char **p_start, **p_end; /* Out: point to start/end of token */
400{
401	register int c;
402	int blankline;
403
404	*p_start = *p_end = NULL;
405  nextline:
406	tok->start = NULL;
407	blankline = 0;
408
409	/* Get indentation level */
410	if (tok->atbol) {
411		register int col = 0;
412		tok->atbol = 0;
413		for (;;) {
414			c = tok_nextc(tok);
415			if (c == ' ')
416				col++;
417			else if (c == '\t')
418				col = (col/tok->tabsize + 1) * tok->tabsize;
419			else
420				break;
421		}
422		tok_backup(tok, c);
423		if (c == '#' || c == '\n') {
424			/* Lines with only whitespace and/or comments
425			   shouldn't affect the indentation and are
426			   not passed to the parser as NEWLINE tokens,
427			   except *totally* empty lines in interactive
428			   mode, which signal the end of a command group. */
429			if (col == 0 && c == '\n' && tok->prompt != NULL)
430				blankline = 0; /* Let it through */
431			else
432				blankline = 1; /* Ignore completely */
433			/* We can't jump back right here since we still
434			   may need to skip to the end of a comment */
435		}
436		if (!blankline && tok->level == 0) {
437			if (col == tok->indstack[tok->indent]) {
438				/* No change */
439			}
440			else if (col > tok->indstack[tok->indent]) {
441				/* Indent -- always one */
442				if (tok->indent+1 >= MAXINDENT) {
443					fprintf(stderr, "excessive indent\n");
444					tok->done = E_TOKEN;
445					tok->cur = tok->inp;
446					return ERRORTOKEN;
447				}
448				tok->pendin++;
449				tok->indstack[++tok->indent] = col;
450			}
451			else /* col < tok->indstack[tok->indent] */ {
452				/* Dedent -- any number, must be consistent */
453				while (tok->indent > 0 &&
454					col < tok->indstack[tok->indent]) {
455					tok->indent--;
456					tok->pendin--;
457				}
458				if (col != tok->indstack[tok->indent]) {
459					fprintf(stderr, "inconsistent dedent\n");
460					tok->done = E_TOKEN;
461					tok->cur = tok->inp;
462					return ERRORTOKEN;
463				}
464			}
465		}
466	}
467
468	tok->start = tok->cur;
469
470	/* Return pending indents/dedents */
471	if (tok->pendin != 0) {
472		if (tok->pendin < 0) {
473			tok->pendin++;
474			return DEDENT;
475		}
476		else {
477			tok->pendin--;
478			return INDENT;
479		}
480	}
481
482 again:
483	tok->start = NULL;
484	/* Skip spaces */
485	do {
486		c = tok_nextc(tok);
487	} while (c == ' ' || c == '\t');
488
489	/* Set start of current token */
490	tok->start = tok->cur - 1;
491
492	/* Skip comment */
493	if (c == '#') {
494		/* Hack to allow overriding the tabsize in the file.
495		   This is also recognized by vi, when it occurs near the
496		   beginning or end of the file.  (Will vi never die...?)
497		   For Python it must be at the beginning of the file! */
498		/* XXX The real vi syntax is actually different :-( */
499		/* XXX Should recognize Emacs syntax, too */
500		int x;
501		if (sscanf(tok->cur,
502				" vi:set tabsize=%d:", &x) == 1 &&
503						x >= 1 && x <= 40) {
504			/* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
505			tok->tabsize = x;
506		}
507		do {
508			c = tok_nextc(tok);
509		} while (c != EOF && c != '\n');
510	}
511
512	/* Check for EOF and errors now */
513	if (c == EOF) {
514		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
515	}
516
517	/* Identifier (most frequent token!) */
518	if (isalpha(c) || c == '_') {
519		do {
520			c = tok_nextc(tok);
521		} while (isalnum(c) || c == '_');
522		tok_backup(tok, c);
523		*p_start = tok->start;
524		*p_end = tok->cur;
525		return NAME;
526	}
527
528	/* Newline */
529	if (c == '\n') {
530		tok->atbol = 1;
531		if (blankline || tok->level > 0)
532			goto nextline;
533		*p_start = tok->start;
534		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
535		return NEWLINE;
536	}
537
538	/* Period or number starting with period? */
539	if (c == '.') {
540		c = tok_nextc(tok);
541		if (isdigit(c)) {
542			goto fraction;
543		}
544		else {
545			tok_backup(tok, c);
546			*p_start = tok->start;
547			*p_end = tok->cur;
548			return DOT;
549		}
550	}
551
552	/* Number */
553	if (isdigit(c)) {
554		if (c == '0') {
555			/* Hex or octal */
556			c = tok_nextc(tok);
557			if (c == '.')
558				goto fraction;
559			if (c == 'x' || c == 'X') {
560				/* Hex */
561				do {
562					c = tok_nextc(tok);
563				} while (isxdigit(c));
564			}
565			else {
566				/* XXX This is broken!  E.g.,
567				   09.9 should be accepted as float! */
568				/* Octal; c is first char of it */
569				/* There's no 'isoctdigit' macro, sigh */
570				while ('0' <= c && c < '8') {
571					c = tok_nextc(tok);
572				}
573			}
574			if (c == 'l' || c == 'L')
575				c = tok_nextc(tok);
576		}
577		else {
578			/* Decimal */
579			do {
580				c = tok_nextc(tok);
581			} while (isdigit(c));
582			if (c == 'l' || c == 'L')
583				c = tok_nextc(tok);
584			else {
585				/* Accept floating point numbers.
586				   XXX This accepts incomplete things like
587				   XXX 12e or 1e+; worry run-time */
588				if (c == '.') {
589		fraction:
590					/* Fraction */
591					do {
592						c = tok_nextc(tok);
593					} while (isdigit(c));
594				}
595				if (c == 'e' || c == 'E') {
596					/* Exponent part */
597					c = tok_nextc(tok);
598					if (c == '+' || c == '-')
599						c = tok_nextc(tok);
600					while (isdigit(c)) {
601						c = tok_nextc(tok);
602					}
603				}
604			}
605		}
606		tok_backup(tok, c);
607		*p_start = tok->start;
608		*p_end = tok->cur;
609		return NUMBER;
610	}
611
612	/* String */
613	if (c == '\'' || c == '"') {
614		int quote = c;
615		int triple = 0;
616		int tripcount = 0;
617		for (;;) {
618			c = tok_nextc(tok);
619			if (c == '\n') {
620				if (!triple) {
621					tok->done = E_TOKEN;
622					tok_backup(tok, c);
623					return ERRORTOKEN;
624				}
625				tripcount = 0;
626			}
627			else if (c == EOF) {
628				tok->done = E_TOKEN;
629				tok->cur = tok->inp;
630				return ERRORTOKEN;
631			}
632			else if (c == quote) {
633				tripcount++;
634				if (tok->cur == tok->start+2) {
635					c = tok_nextc(tok);
636					if (c == quote) {
637						triple = 1;
638						tripcount = 0;
639						continue;
640					}
641					tok_backup(tok, c);
642				}
643				if (!triple || tripcount == 3)
644					break;
645			}
646			else if (c == '\\') {
647				tripcount = 0;
648				c = tok_nextc(tok);
649				if (c == EOF) {
650					tok->done = E_TOKEN;
651					tok->cur = tok->inp;
652					return ERRORTOKEN;
653				}
654			}
655			else
656				tripcount = 0;
657		}
658		*p_start = tok->start;
659		*p_end = tok->cur;
660		return STRING;
661	}
662
663	/* Line continuation */
664	if (c == '\\') {
665		c = tok_nextc(tok);
666		if (c != '\n') {
667			tok->done = E_TOKEN;
668			tok->cur = tok->inp;
669			return ERRORTOKEN;
670		}
671		goto again; /* Read next line */
672	}
673
674	/* Check for two-character token */
675	{
676		int c2 = tok_nextc(tok);
677		int token = tok_2char(c, c2);
678		if (token != OP) {
679			*p_start = tok->start;
680			*p_end = tok->cur;
681			return token;
682		}
683		tok_backup(tok, c2);
684	}
685
686	/* Keep track of parentheses nesting level */
687	switch (c) {
688	case '(':
689	case '[':
690	case '{':
691		tok->level++;
692		break;
693	case ')':
694	case ']':
695	case '}':
696		tok->level--;
697		break;
698	}
699
700	/* Punctuation character */
701	*p_start = tok->start;
702	*p_end = tok->cur;
703	return tok_1char(c);
704}
705
706
707#ifdef DEBUG
708
709void
710tok_dump(type, start, end)
711	int type;
712	char *start, *end;
713{
714	printf("%s", tok_name[type]);
715	if (type == NAME || type == NUMBER || type == STRING || type == OP)
716		printf("(%.*s)", (int)(end - start), start);
717}
718
719#endif
720