tokenizer.c revision fbab905ae1fe6320268301193d953d25d2acb5c1
1/***********************************************************
2Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
3Netherlands.
4
5                        All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI not be used in advertising or publicity pertaining to
13distribution of the software without specific, written prior permission.
14
15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
23******************************************************************/
24
25/* Tokenizer implementation */
26
27/* XXX This is rather old, should be restructured perhaps */
28/* XXX Need a better interface to report errors than writing to stderr */
29/* XXX Should use editor resource to fetch true tab size on Macintosh */
30
31#include "pgenheaders.h"
32
33#include <ctype.h>
34#include "string.h"
35
36#include "fgetsintr.h"
37#include "tokenizer.h"
38#include "errcode.h"
39
/* Default tab width, in columns, stored into tok->tabsize by tok_new().
   Macintosh builds get 4.  Any other build keeps a TABSIZE that was
   already defined (e.g. on the compiler command line) and otherwise
   falls back to 8. */
#ifdef macintosh
#define TABSIZE 4
#endif

#ifndef TABSIZE
#define TABSIZE 8
#endif
47
/* Forward declarations of the file-local helpers defined further down */
static struct tok_state *tok_new PROTO((void)); /* allocate + default-initialize a state */
static int tok_nextc PROTO((struct tok_state *tok)); /* next input character, or EOF */
static void tok_backup PROTO((struct tok_state *tok, int c)); /* un-read one character */
52
/* Printable names of the token codes, indexed by token number.
   This table must match the #defines in token.h!
   The number in each trailing comment is the index of the first
   entry on that line. */

char *tok_name[] = {
	/* Structural tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",	/*  0 */
	"NEWLINE", "INDENT", "DEDENT",			/*  4 */

	/* Brackets and separators */
	"LPAR", "RPAR", "LSQB", "RSQB",			/*  7 */
	"COLON", "COMMA", "SEMI",			/* 11 */

	/* Single-character operators */
	"PLUS", "MINUS", "STAR", "SLASH",		/* 14 */
	"VBAR", "AMPER", "LESS", "GREATER",		/* 18 */
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",		/* 22 */
	"LBRACE", "RBRACE",				/* 26 */

	/* Two-character operators */
	"EQEQUAL", "NOTEQUAL",				/* 28 */
	"LESSEQUAL", "GREATEREQUAL",			/* 30 */

	/* Catch-all operator, error marker, and table size sentinel */
	"OP",						/* 32 */
	"<ERRORTOKEN>",					/* 33 */
	"<N_TOKENS>"					/* 34 */
};
93
94
95/* Create and initialize a new tok_state structure */
96
97static struct tok_state *
98tok_new()
99{
100	struct tok_state *tok = NEW(struct tok_state, 1);
101	if (tok == NULL)
102		return NULL;
103	tok->buf = tok->cur = tok->end = tok->inp = NULL;
104	tok->done = E_OK;
105	tok->fp = NULL;
106	tok->tabsize = TABSIZE;
107	tok->indent = 0;
108	tok->indstack[0] = 0;
109	tok->atbol = 1;
110	tok->pendin = 0;
111	tok->prompt = tok->nextprompt = NULL;
112	tok->lineno = 0;
113	return tok;
114}
115
116
117/* Set up tokenizer for string */
118
119struct tok_state *
120tok_setups(str)
121	char *str;
122{
123	struct tok_state *tok = tok_new();
124	if (tok == NULL)
125		return NULL;
126	tok->buf = tok->cur = str;
127	tok->end = tok->inp = strchr(str, '\0');
128	return tok;
129}
130
131
132/* Set up tokenizer for file */
133
134struct tok_state *
135tok_setupf(fp, ps1, ps2)
136	FILE *fp;
137	char *ps1, *ps2;
138{
139	struct tok_state *tok = tok_new();
140	if (tok == NULL)
141		return NULL;
142	if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
143		DEL(tok);
144		return NULL;
145	}
146	tok->cur = tok->inp = tok->buf;
147	tok->end = tok->buf + BUFSIZ;
148	tok->fp = fp;
149	tok->prompt = ps1;
150	tok->nextprompt = ps2;
151	return tok;
152}
153
154
155/* Free a tok_state structure */
156
157void
158tok_free(tok)
159	struct tok_state *tok;
160{
161	/* XXX really need a separate flag to say 'my buffer' */
162	if (tok->fp != NULL && tok->buf != NULL)
163		DEL(tok->buf);
164	DEL(tok);
165}
166
167
168/* Get next char, updating state; error code goes into tok->done */
169
170static int
171tok_nextc(tok)
172	register struct tok_state *tok;
173{
174	if (tok->done != E_OK)
175		return EOF;
176
177	for (;;) {
178		if (tok->cur < tok->inp)
179			return *tok->cur++;
180		if (tok->fp == NULL) {
181			tok->done = E_EOF;
182			return EOF;
183		}
184		if (tok->inp > tok->buf && tok->inp[-1] == '\n')
185			tok->inp = tok->buf;
186		if (tok->inp == tok->end) {
187			int n = tok->end - tok->buf;
188			char *new = tok->buf;
189			RESIZE(new, char, n+n);
190			if (new == NULL) {
191				fprintf(stderr, "tokenizer out of mem\n");
192				tok->done = E_NOMEM;
193				return EOF;
194			}
195			tok->buf = new;
196			tok->inp = tok->buf + n;
197			tok->end = tok->inp + n;
198		}
199#ifdef USE_READLINE
200		if (tok->prompt != NULL) {
201			extern char *readline PROTO((char *prompt));
202			static int been_here;
203			if (!been_here) {
204				/* Force rebind of TAB to insert-tab */
205				extern int rl_insert();
206				rl_bind_key('\t', rl_insert);
207				been_here++;
208			}
209			if (tok->buf != NULL)
210				free(tok->buf);
211			tok->buf = readline(tok->prompt);
212			(void) intrcheck(); /* Clear pending interrupt */
213			if (tok->nextprompt != NULL)
214				tok->prompt = tok->nextprompt;
215				/* XXX different semantics w/o readline()! */
216			if (tok->buf == NULL) {
217				tok->done = E_EOF;
218			}
219			else {
220				unsigned int n = strlen(tok->buf);
221				if (n > 0)
222					add_history(tok->buf);
223				/* Append the '\n' that readline()
224				   doesn't give us, for the tokenizer... */
225				tok->buf = realloc(tok->buf, n+2);
226				if (tok->buf == NULL)
227					tok->done = E_NOMEM;
228				else {
229					tok->end = tok->buf + n;
230					*tok->end++ = '\n';
231					*tok->end = '\0';
232					tok->inp = tok->end;
233					tok->cur = tok->buf;
234				}
235			}
236		}
237		else
238#endif
239		{
240			tok->cur = tok->inp;
241			if (tok->prompt != NULL && tok->inp == tok->buf) {
242				fprintf(stderr, "%s", tok->prompt);
243				tok->prompt = tok->nextprompt;
244			}
245			tok->done = fgets_intr(tok->inp,
246				(int)(tok->end - tok->inp), tok->fp);
247		}
248		if (tok->done != E_OK) {
249			if (tok->prompt != NULL)
250				fprintf(stderr, "\n");
251			return EOF;
252		}
253		tok->inp = strchr(tok->inp, '\0');
254	}
255}
256
257
258/* Back-up one character */
259
260static void
261tok_backup(tok, c)
262	register struct tok_state *tok;
263	register int c;
264{
265	if (c != EOF) {
266		if (--tok->cur < tok->buf) {
267			fprintf(stderr, "tok_backup: begin of buffer\n");
268			abort();
269		}
270		if (*tok->cur != c)
271			*tok->cur = c;
272	}
273}
274
275
276/* Return the token corresponding to a single character */
277
278int
279tok_1char(c)
280	int c;
281{
282	switch (c) {
283	case '(':	return LPAR;
284	case ')':	return RPAR;
285	case '[':	return LSQB;
286	case ']':	return RSQB;
287	case ':':	return COLON;
288	case ',':	return COMMA;
289	case ';':	return SEMI;
290	case '+':	return PLUS;
291	case '-':	return MINUS;
292	case '*':	return STAR;
293	case '/':	return SLASH;
294	case '|':	return VBAR;
295	case '&':	return AMPER;
296	case '<':	return LESS;
297	case '>':	return GREATER;
298	case '=':	return EQUAL;
299	case '.':	return DOT;
300	case '%':	return PERCENT;
301	case '`':	return BACKQUOTE;
302	case '{':	return LBRACE;
303	case '}':	return RBRACE;
304	default:	return OP;
305	}
306}
307
308
309int
310tok_2char(c1, c2)
311	int c1, c2;
312{
313	switch (c1) {
314	case '=':
315		switch (c2) {
316		case '=':	return EQEQUAL;
317		}
318		break;
319	case '!':
320		switch (c2) {
321		case '=':	return NOTEQUAL;
322		}
323		break;
324	case '<':
325		switch (c2) {
326		case '>':	return NOTEQUAL;
327		case '=':	return LESSEQUAL;
328		}
329		break;
330	case '>':
331		switch (c2) {
332		case '=':	return GREATEREQUAL;
333		}
334		break;
335	}
336	return OP;
337}
338
339
340/* Get next token, after space stripping etc. */
341
342int
343tok_get(tok, p_start, p_end)
344	register struct tok_state *tok; /* In/out: tokenizer state */
345	char **p_start, **p_end; /* Out: point to start/end of token */
346{
347	register int c;
348	int blankline;
349
350  nextline:
351	blankline = 0;
352
353	/* Get indentation level */
354	if (tok->atbol) {
355		register int col = 0;
356		tok->atbol = 0;
357		tok->lineno++;
358		for (;;) {
359			c = tok_nextc(tok);
360			if (c == ' ')
361				col++;
362			else if (c == '\t')
363				col = (col/tok->tabsize + 1) * tok->tabsize;
364			else
365				break;
366		}
367		tok_backup(tok, c);
368		if (c == '#' || c == '\n') {
369			/* Lines with only whitespace and/or comments
370			   shouldn't affect the indentation and are
371			   not passed to the parser as NEWLINE tokens,
372			   except *totally* empty lines in interactive
373			   mode, which signal the end of a command group. */
374			if (col == 0 && c == '\n' && tok->prompt != NULL)
375				blankline = 0; /* Let it through */
376			else
377				blankline = 1; /* Ignore completely */
378			/* We can't jump back right here since we still
379			   may need to skip to the end of a comment */
380		}
381		if (!blankline) {
382			if (col == tok->indstack[tok->indent]) {
383				/* No change */
384			}
385			else if (col > tok->indstack[tok->indent]) {
386				/* Indent -- always one */
387				if (tok->indent+1 >= MAXINDENT) {
388					fprintf(stderr, "excessive indent\n");
389					tok->done = E_TOKEN;
390					return ERRORTOKEN;
391				}
392				tok->pendin++;
393				tok->indstack[++tok->indent] = col;
394			}
395			else /* col < tok->indstack[tok->indent] */ {
396				/* Dedent -- any number, must be consistent */
397				while (tok->indent > 0 &&
398					col < tok->indstack[tok->indent]) {
399					tok->indent--;
400					tok->pendin--;
401				}
402				if (col != tok->indstack[tok->indent]) {
403					fprintf(stderr, "inconsistent dedent\n");
404					tok->done = E_TOKEN;
405					return ERRORTOKEN;
406				}
407			}
408		}
409	}
410
411	*p_start = *p_end = tok->cur;
412
413	/* Return pending indents/dedents */
414	if (tok->pendin != 0) {
415		if (tok->pendin < 0) {
416			tok->pendin++;
417			return DEDENT;
418		}
419		else {
420			tok->pendin--;
421			return INDENT;
422		}
423	}
424
425 again:
426	/* Skip spaces */
427	do {
428		c = tok_nextc(tok);
429	} while (c == ' ' || c == '\t');
430
431	/* Set start of current token */
432	*p_start = tok->cur - 1;
433
434	/* Skip comment */
435	if (c == '#') {
436		/* Hack to allow overriding the tabsize in the file.
437		   This is also recognized by vi, when it occurs near the
438		   beginning or end of the file.  (Will vi never die...?)
439		   For Python it must be at the beginning of the file! */
440		int x;
441		/* XXX The case to (unsigned char *) is needed by THINK C 3.0 */
442		if (sscanf(/*(unsigned char *)*/tok->cur,
443				" vi:set tabsize=%d:", &x) == 1 &&
444						x >= 1 && x <= 40) {
445			/* fprintf(stderr, "# vi:set tabsize=%d:\n", x); */
446			tok->tabsize = x;
447		}
448		do {
449			c = tok_nextc(tok);
450		} while (c != EOF && c != '\n');
451	}
452
453	/* Check for EOF and errors now */
454	if (c == EOF)
455		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
456
457	/* Identifier (most frequent token!) */
458	if (isalpha(c) || c == '_') {
459		do {
460			c = tok_nextc(tok);
461		} while (isalnum(c) || c == '_');
462		tok_backup(tok, c);
463		*p_end = tok->cur;
464		return NAME;
465	}
466
467	/* Newline */
468	if (c == '\n') {
469		tok->atbol = 1;
470		if (blankline)
471			goto nextline;
472		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
473		return NEWLINE;
474	}
475
476	/* Number */
477	if (isdigit(c)) {
478		if (c == '0') {
479			/* Hex or octal */
480			c = tok_nextc(tok);
481			if (c == '.')
482				goto fraction;
483			if (c == 'x' || c == 'X') {
484				/* Hex */
485				do {
486					c = tok_nextc(tok);
487				} while (isxdigit(c));
488			}
489			else {
490				/* Octal; c is first char of it */
491				/* There's no 'isoctdigit' macro, sigh */
492				while ('0' <= c && c < '8') {
493					c = tok_nextc(tok);
494				}
495			}
496			if (c == 'l' || c == 'L')
497				c = tok_nextc(tok);
498		}
499		else {
500			/* Decimal */
501			do {
502				c = tok_nextc(tok);
503			} while (isdigit(c));
504			if (c == 'l' || c == 'L')
505				c = tok_nextc(tok);
506			else {
507				/* Accept floating point numbers.
508				   XXX This accepts incomplete things like
509				   XXX 12e or 1e+; worry run-time.
510				   XXX Doesn't accept numbers
511				   XXX starting with a dot */
512				if (c == '.') {
513		fraction:
514					/* Fraction */
515					do {
516						c = tok_nextc(tok);
517					} while (isdigit(c));
518				}
519				if (c == 'e' || c == 'E') {
520					/* Exponent part */
521					c = tok_nextc(tok);
522					if (c == '+' || c == '-')
523						c = tok_nextc(tok);
524					while (isdigit(c)) {
525						c = tok_nextc(tok);
526					}
527				}
528			}
529		}
530		tok_backup(tok, c);
531		*p_end = tok->cur;
532		return NUMBER;
533	}
534
535	/* String */
536	if (c == '\'') {
537		for (;;) {
538			c = tok_nextc(tok);
539			if (c == '\n' || c == EOF) {
540				tok->done = E_TOKEN;
541				return ERRORTOKEN;
542			}
543			if (c == '\\') {
544				c = tok_nextc(tok);
545				*p_end = tok->cur;
546				if (c == '\n' || c == EOF) {
547					tok->done = E_TOKEN;
548					return ERRORTOKEN;
549				}
550				continue;
551			}
552			if (c == '\'')
553				break;
554		}
555		*p_end = tok->cur;
556		return STRING;
557	}
558
559	/* Line continuation */
560	if (c == '\\') {
561		c = tok_nextc(tok);
562		if (c != '\n') {
563			tok->done = E_TOKEN;
564			return ERRORTOKEN;
565		}
566		tok->lineno++;
567		goto again; /* Read next line */
568	}
569
570	/* Check for two-character token */
571	{
572		int c2 = tok_nextc(tok);
573		int token = tok_2char(c, c2);
574		if (token != OP) {
575			*p_end = tok->cur;
576			return token;
577		}
578		tok_backup(tok, c2);
579	}
580
581	/* Punctuation character */
582	*p_end = tok->cur;
583	return tok_1char(c);
584}
585
586
#ifdef DEBUG

/* Print a token to stdout for debugging: the token's name, followed
   by its text in parentheses for NAME, NUMBER, STRING and OP tokens.
   'start' and 'end' delimit the token text (end exclusive). */

void
tok_dump(type, start, end)
	int type;
	char *start, *end;
{
	printf("%s", tok_name[type]);
	switch (type) {
	case NAME:
	case NUMBER:
	case STRING:
	case OP:
		printf("(%.*s)", (int)(end - start), start);
		break;
	default:
		/* Other tokens are fully described by their name */
		break;
	}
}

#endif
600