tokenizer.c revision d6a15ada727e586dc7b2cce8115e65d0abb0d1aa
1/***********************************************************
2Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
3Netherlands.
4
5                        All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI not be used in advertising or publicity pertaining to
13distribution of the software without specific, written prior permission.
14
15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
23******************************************************************/
24
25/* Tokenizer implementation */
26
27/* XXX This is rather old, should be restructured perhaps */
28/* XXX Need a better interface to report errors than writing to stderr */
29/* XXX Should use editor resource to fetch true tab size on Macintosh */
30
31#include "pgenheaders.h"
32
33#include <ctype.h>
34#include "string.h"
35
36#include "fgetsintr.h"
37#include "tokenizer.h"
38#include "errcode.h"
39
40#ifdef macintosh
41#define TABSIZE 4
42#endif
43
44#ifndef TABSIZE
45#define TABSIZE 8
46#endif
47
48/* Forward */
49static struct tok_state *tok_new PROTO((void));
50static int tok_nextc PROTO((struct tok_state *tok));
51static void tok_backup PROTO((struct tok_state *tok, int c));
52
/* Printable token names.
   tok_dump() indexes this array with the token type, so the entries
   are presumably listed in token-number order; confirm against the
   token definitions before reordering or inserting anything. */

char *tok_name[] = {
	/* Structural tokens */
	"ENDMARKER", "NAME", "NUMBER", "STRING",
	"NEWLINE", "INDENT", "DEDENT",

	/* Brackets and separators */
	"LPAR", "RPAR", "LSQB", "RSQB",
	"COLON", "COMMA", "SEMI",

	/* Operators */
	"PLUS", "MINUS", "STAR", "SLASH",
	"VBAR", "AMPER", "LESS", "GREATER",
	"EQUAL", "DOT", "PERCENT", "BACKQUOTE",
	"LBRACE", "RBRACE",

	/* Catch-all operator, error marker and table-size sentinel */
	"OP", "<ERRORTOKEN>", "<N_TOKENS>"
};
88
89
90/* Create and initialize a new tok_state structure */
91
92static struct tok_state *
93tok_new()
94{
95	struct tok_state *tok = NEW(struct tok_state, 1);
96	if (tok == NULL)
97		return NULL;
98	tok->buf = tok->cur = tok->end = tok->inp = NULL;
99	tok->done = E_OK;
100	tok->fp = NULL;
101	tok->tabsize = TABSIZE;
102	tok->indent = 0;
103	tok->indstack[0] = 0;
104	tok->atbol = 1;
105	tok->pendin = 0;
106	tok->prompt = tok->nextprompt = NULL;
107	tok->lineno = 0;
108	return tok;
109}
110
111
112/* Set up tokenizer for string */
113
114struct tok_state *
115tok_setups(str)
116	char *str;
117{
118	struct tok_state *tok = tok_new();
119	if (tok == NULL)
120		return NULL;
121	tok->buf = tok->cur = str;
122	tok->end = tok->inp = strchr(str, '\0');
123	return tok;
124}
125
126
127/* Set up tokenizer for string */
128
129struct tok_state *
130tok_setupf(fp, ps1, ps2)
131	FILE *fp;
132	char *ps1, *ps2;
133{
134	struct tok_state *tok = tok_new();
135	if (tok == NULL)
136		return NULL;
137	if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
138		DEL(tok);
139		return NULL;
140	}
141	tok->cur = tok->inp = tok->buf;
142	tok->end = tok->buf + BUFSIZ;
143	tok->fp = fp;
144	tok->prompt = ps1;
145	tok->nextprompt = ps2;
146	return tok;
147}
148
149
150/* Free a tok_state structure */
151
152void
153tok_free(tok)
154	struct tok_state *tok;
155{
156	/* XXX really need a separate flag to say 'my buffer' */
157	if (tok->fp != NULL && tok->buf != NULL)
158		DEL(tok->buf);
159	DEL(tok);
160}
161
162
163/* Get next char, updating state; error code goes into tok->done */
164
165static int
166tok_nextc(tok)
167	register struct tok_state *tok;
168{
169	if (tok->done != E_OK)
170		return EOF;
171
172	for (;;) {
173		if (tok->cur < tok->inp)
174			return *tok->cur++;
175		if (tok->fp == NULL) {
176			tok->done = E_EOF;
177			return EOF;
178		}
179		if (tok->inp > tok->buf && tok->inp[-1] == '\n')
180			tok->inp = tok->buf;
181		if (tok->inp == tok->end) {
182			int n = tok->end - tok->buf;
183			char *new = tok->buf;
184			RESIZE(new, char, n+n);
185			if (new == NULL) {
186				fprintf(stderr, "tokenizer out of mem\n");
187				tok->done = E_NOMEM;
188				return EOF;
189			}
190			tok->buf = new;
191			tok->inp = tok->buf + n;
192			tok->end = tok->inp + n;
193		}
194#ifdef USE_READLINE
195		if (tok->prompt != NULL) {
196			extern char *readline PROTO((char *prompt));
197			static int been_here;
198			if (!been_here) {
199				/* Force rebind of TAB to insert-tab */
200				extern int rl_insert();
201				rl_bind_key('\t', rl_insert);
202				been_here++;
203			}
204			if (tok->buf != NULL)
205				free(tok->buf);
206			tok->buf = readline(tok->prompt);
207			(void) intrcheck(); /* Clear pending interrupt */
208			if (tok->nextprompt != NULL)
209				tok->prompt = tok->nextprompt;
210				/* XXX different semantics w/o readline()! */
211			if (tok->buf == NULL) {
212				tok->done = E_EOF;
213			}
214			else {
215				unsigned int n = strlen(tok->buf);
216				if (n > 0)
217					add_history(tok->buf);
218				/* Append the '\n' that readline()
219				   doesn't give us, for the tokenizer... */
220				tok->buf = realloc(tok->buf, n+2);
221				if (tok->buf == NULL)
222					tok->done = E_NOMEM;
223				else {
224					tok->end = tok->buf + n;
225					*tok->end++ = '\n';
226					*tok->end = '\0';
227					tok->inp = tok->end;
228					tok->cur = tok->buf;
229				}
230			}
231		}
232		else
233#endif
234		{
235			tok->cur = tok->inp;
236			if (tok->prompt != NULL && tok->inp == tok->buf) {
237				fprintf(stderr, "%s", tok->prompt);
238				tok->prompt = tok->nextprompt;
239			}
240			tok->done = fgets_intr(tok->inp,
241				(int)(tok->end - tok->inp), tok->fp);
242		}
243		if (tok->done != E_OK) {
244			if (tok->prompt != NULL)
245				fprintf(stderr, "\n");
246			return EOF;
247		}
248		tok->inp = strchr(tok->inp, '\0');
249	}
250}
251
252
253/* Back-up one character */
254
255static void
256tok_backup(tok, c)
257	register struct tok_state *tok;
258	register int c;
259{
260	if (c != EOF) {
261		if (--tok->cur < tok->buf) {
262			fprintf(stderr, "tok_backup: begin of buffer\n");
263			abort();
264		}
265		if (*tok->cur != c)
266			*tok->cur = c;
267	}
268}
269
270
271/* Return the token corresponding to a single character */
272
273int
274tok_1char(c)
275	int c;
276{
277	switch (c) {
278	case '(':	return LPAR;
279	case ')':	return RPAR;
280	case '[':	return LSQB;
281	case ']':	return RSQB;
282	case ':':	return COLON;
283	case ',':	return COMMA;
284	case ';':	return SEMI;
285	case '+':	return PLUS;
286	case '-':	return MINUS;
287	case '*':	return STAR;
288	case '/':	return SLASH;
289	case '|':	return VBAR;
290	case '&':	return AMPER;
291	case '<':	return LESS;
292	case '>':	return GREATER;
293	case '=':	return EQUAL;
294	case '.':	return DOT;
295	case '%':	return PERCENT;
296	case '`':	return BACKQUOTE;
297	case '{':	return LBRACE;
298	case '}':	return RBRACE;
299	default:	return OP;
300	}
301}
302
303
304/* Get next token, after space stripping etc. */
305
306int
307tok_get(tok, p_start, p_end)
308	register struct tok_state *tok; /* In/out: tokenizer state */
309	char **p_start, **p_end; /* Out: point to start/end of token */
310{
311	register int c;
312
313	/* Get indentation level */
314	if (tok->atbol) {
315		register int col = 0;
316		tok->atbol = 0;
317		tok->lineno++;
318		for (;;) {
319			c = tok_nextc(tok);
320			if (c == ' ')
321				col++;
322			else if (c == '\t')
323				col = (col/tok->tabsize + 1) * tok->tabsize;
324			else
325				break;
326		}
327		tok_backup(tok, c);
328		if (col == tok->indstack[tok->indent]) {
329			/* No change */
330		}
331		else if (col > tok->indstack[tok->indent]) {
332			/* Indent -- always one */
333			if (tok->indent+1 >= MAXINDENT) {
334				fprintf(stderr, "excessive indent\n");
335				tok->done = E_TOKEN;
336				return ERRORTOKEN;
337			}
338			tok->pendin++;
339			tok->indstack[++tok->indent] = col;
340		}
341		else /* col < tok->indstack[tok->indent] */ {
342			/* Dedent -- any number, must be consistent */
343			while (tok->indent > 0 &&
344				col < tok->indstack[tok->indent]) {
345				tok->indent--;
346				tok->pendin--;
347			}
348			if (col != tok->indstack[tok->indent]) {
349				fprintf(stderr, "inconsistent dedent\n");
350				tok->done = E_TOKEN;
351				return ERRORTOKEN;
352			}
353		}
354	}
355
356	*p_start = *p_end = tok->cur;
357
358	/* Return pending indents/dedents */
359	if (tok->pendin != 0) {
360		if (tok->pendin < 0) {
361			tok->pendin++;
362			return DEDENT;
363		}
364		else {
365			tok->pendin--;
366			return INDENT;
367		}
368	}
369
370 again:
371	/* Skip spaces */
372	do {
373		c = tok_nextc(tok);
374	} while (c == ' ' || c == '\t');
375
376	/* Set start of current token */
377	*p_start = tok->cur - 1;
378
379	/* Skip comment */
380	if (c == '#') {
381		/* Hack to allow overriding the tabsize in the file.
382		   This is also recognized by vi, when it occurs near the
383		   beginning or end of the file.  (Will vi never die...?) */
384		int x;
385		/* XXX The case to (unsigned char *) is needed by THINK C 3.0 */
386		if (sscanf(/*(unsigned char *)*/tok->cur,
387				" vi:set tabsize=%d:", &x) == 1 &&
388						x >= 1 && x <= 40) {
389			fprintf(stderr, "# vi:set tabsize=%d:\n", x);
390			tok->tabsize = x;
391		}
392		do {
393			c = tok_nextc(tok);
394		} while (c != EOF && c != '\n');
395	}
396
397	/* Check for EOF and errors now */
398	if (c == EOF)
399		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
400
401	/* Identifier (most frequent token!) */
402	if (isalpha(c) || c == '_') {
403		do {
404			c = tok_nextc(tok);
405		} while (isalnum(c) || c == '_');
406		tok_backup(tok, c);
407		*p_end = tok->cur;
408		return NAME;
409	}
410
411	/* Newline */
412	if (c == '\n') {
413		tok->atbol = 1;
414		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
415		return NEWLINE;
416	}
417
418	/* Number */
419	if (isdigit(c)) {
420		if (c == '0') {
421			/* Hex or octal */
422			c = tok_nextc(tok);
423			if (c == '.')
424				goto fraction;
425			if (c == 'x' || c == 'X') {
426				/* Hex */
427				do {
428					c = tok_nextc(tok);
429				} while (isxdigit(c));
430			}
431			else {
432				/* Octal; c is first char of it */
433				/* There's no 'isoctdigit' macro, sigh */
434				while ('0' <= c && c < '8') {
435					c = tok_nextc(tok);
436				}
437			}
438			if (c == 'l' || c == 'L')
439				c = tok_nextc(tok);
440		}
441		else {
442			/* Decimal */
443			do {
444				c = tok_nextc(tok);
445			} while (isdigit(c));
446			if (c == 'l' || c == 'L')
447				c = tok_nextc(tok);
448			else {
449				/* Accept floating point numbers.
450				   XXX This accepts incomplete things like
451				   XXX 12e or 1e+; worry run-time.
452				   XXX Doesn't accept numbers
453				   XXX starting with a dot */
454				if (c == '.') {
455		fraction:
456					/* Fraction */
457					do {
458						c = tok_nextc(tok);
459					} while (isdigit(c));
460				}
461				if (c == 'e' || c == 'E') {
462					/* Exponent part */
463					c = tok_nextc(tok);
464					if (c == '+' || c == '-')
465						c = tok_nextc(tok);
466					while (isdigit(c)) {
467						c = tok_nextc(tok);
468					}
469				}
470			}
471		}
472		tok_backup(tok, c);
473		*p_end = tok->cur;
474		return NUMBER;
475	}
476
477	/* String */
478	if (c == '\'') {
479		for (;;) {
480			c = tok_nextc(tok);
481			if (c == '\n' || c == EOF) {
482				tok->done = E_TOKEN;
483				return ERRORTOKEN;
484			}
485			if (c == '\\') {
486				c = tok_nextc(tok);
487				*p_end = tok->cur;
488				if (c == '\n' || c == EOF) {
489					tok->done = E_TOKEN;
490					return ERRORTOKEN;
491				}
492				continue;
493			}
494			if (c == '\'')
495				break;
496		}
497		*p_end = tok->cur;
498		return STRING;
499	}
500
501	/* Line continuation */
502	if (c == '\\') {
503		c = tok_nextc(tok);
504		if (c != '\n') {
505			tok->done = E_TOKEN;
506			return ERRORTOKEN;
507		}
508		tok->lineno++;
509		goto again; /* Read next line */
510	}
511
512	/* Punctuation character */
513	*p_end = tok->cur;
514	return tok_1char(c);
515}
516
517
518#ifdef DEBUG
519
520void
521tok_dump(type, start, end)
522	int type;
523	char *start, *end;
524{
525	printf("%s", tok_name[type]);
526	if (type == NAME || type == NUMBER || type == STRING || type == OP)
527		printf("(%.*s)", (int)(end - start), start);
528}
529
530#endif
531