pgen.c revision 85a5fbbdfea617f6cc8fae82c9e8c2b5c424436d
1/* Parser generator */
2
3/* For a description, see the comments at end of this file */
4
5#include <stdio.h>
6#include "assert.h"
7
8#include "PROTO.h"
9#include "malloc.h"
10#include "token.h"
11#include "node.h"
12#include "grammar.h"
13#include "metagrammar.h"
14#include "pgen.h"
15
16extern int debugging;
17
18
19/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
20
21typedef struct _nfaarc {
22	int	ar_label;
23	int	ar_arrow;
24} nfaarc;
25
26typedef struct _nfastate {
27	int	st_narcs;
28	nfaarc	*st_arc;
29} nfastate;
30
31typedef struct _nfa {
32	int		nf_type;
33	char		*nf_name;
34	int		nf_nstates;
35	nfastate	*nf_state;
36	int		nf_start, nf_finish;
37} nfa;
38
39static int
40addnfastate(nf)
41	nfa *nf;
42{
43	nfastate *st;
44
45	RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
46	if (nf->nf_state == NULL)
47		fatal("out of mem");
48	st = &nf->nf_state[nf->nf_nstates++];
49	st->st_narcs = 0;
50	st->st_arc = NULL;
51	return st - nf->nf_state;
52}
53
54static void
55addnfaarc(nf, from, to, lbl)
56	nfa *nf;
57	int from, to, lbl;
58{
59	nfastate *st;
60	nfaarc *ar;
61
62	st = &nf->nf_state[from];
63	RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
64	if (st->st_arc == NULL)
65		fatal("out of mem");
66	ar = &st->st_arc[st->st_narcs++];
67	ar->ar_label = lbl;
68	ar->ar_arrow = to;
69}
70
71static nfa *
72newnfa(name)
73	char *name;
74{
75	nfa *nf;
76	static type = NT_OFFSET; /* All types will be disjunct */
77
78	nf = NEW(nfa, 1);
79	if (nf == NULL)
80		fatal("no mem for new nfa");
81	nf->nf_type = type++;
82	nf->nf_name = name; /* XXX strdup(name) ??? */
83	nf->nf_nstates = 0;
84	nf->nf_state = NULL;
85	nf->nf_start = nf->nf_finish = -1;
86	return nf;
87}
88
89typedef struct _nfagrammar {
90	int		gr_nnfas;
91	nfa		**gr_nfa;
92	labellist	gr_ll;
93} nfagrammar;
94
95static nfagrammar *
96newnfagrammar()
97{
98	nfagrammar *gr;
99
100	gr = NEW(nfagrammar, 1);
101	if (gr == NULL)
102		fatal("no mem for new nfa grammar");
103	gr->gr_nnfas = 0;
104	gr->gr_nfa = NULL;
105	gr->gr_ll.ll_nlabels = 0;
106	gr->gr_ll.ll_label = NULL;
107	addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
108	return gr;
109}
110
111static nfa *
112addnfa(gr, name)
113	nfagrammar *gr;
114	char *name;
115{
116	nfa *nf;
117
118	nf = newnfa(name);
119	RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
120	if (gr->gr_nfa == NULL)
121		fatal("out of mem");
122	gr->gr_nfa[gr->gr_nnfas++] = nf;
123	addlabel(&gr->gr_ll, NAME, nf->nf_name);
124	return nf;
125}
126
127#ifdef DEBUG
128
129static char REQNFMT[] = "metacompile: less than %d children\n";
130
131#define REQN(i, count) \
132 	if (i < count) { \
133		fprintf(stderr, REQNFMT, count); \
134		abort(); \
135	} else
136
137#else
138#define REQN(i, count)	/* empty */
139#endif
140
141static nfagrammar *
142metacompile(n)
143	node *n;
144{
145	nfagrammar *gr;
146	int i;
147
148	printf("Compiling (meta-) parse tree into NFA grammar\n");
149	gr = newnfagrammar();
150	REQ(n, MSTART);
151	i = n->n_nchildren - 1; /* Last child is ENDMARKER */
152	n = n->n_child;
153	for (; --i >= 0; n++) {
154		if (n->n_type != NEWLINE)
155			compile_rule(gr, n);
156	}
157	return gr;
158}
159
160static
161compile_rule(gr, n)
162	nfagrammar *gr;
163	node *n;
164{
165	nfa *nf;
166
167	REQ(n, RULE);
168	REQN(n->n_nchildren, 4);
169	n = n->n_child;
170	REQ(n, NAME);
171	nf = addnfa(gr, n->n_str);
172	n++;
173	REQ(n, COLON);
174	n++;
175	REQ(n, RHS);
176	compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
177	n++;
178	REQ(n, NEWLINE);
179}
180
181static
182compile_rhs(ll, nf, n, pa, pb)
183	labellist *ll;
184	nfa *nf;
185	node *n;
186	int *pa, *pb;
187{
188	int i;
189	int a, b;
190
191	REQ(n, RHS);
192	i = n->n_nchildren;
193	REQN(i, 1);
194	n = n->n_child;
195	REQ(n, ALT);
196	compile_alt(ll, nf, n, pa, pb);
197	if (--i <= 0)
198		return;
199	n++;
200	a = *pa;
201	b = *pb;
202	*pa = addnfastate(nf);
203	*pb = addnfastate(nf);
204	addnfaarc(nf, *pa, a, EMPTY);
205	addnfaarc(nf, b, *pb, EMPTY);
206	for (; --i >= 0; n++) {
207		REQ(n, VBAR);
208		REQN(i, 1);
209		--i;
210		n++;
211		REQ(n, ALT);
212		compile_alt(ll, nf, n, &a, &b);
213		addnfaarc(nf, *pa, a, EMPTY);
214		addnfaarc(nf, b, *pb, EMPTY);
215	}
216}
217
218static
219compile_alt(ll, nf, n, pa, pb)
220	labellist *ll;
221	nfa *nf;
222	node *n;
223	int *pa, *pb;
224{
225	int i;
226	int a, b;
227
228	REQ(n, ALT);
229	i = n->n_nchildren;
230	REQN(i, 1);
231	n = n->n_child;
232	REQ(n, ITEM);
233	compile_item(ll, nf, n, pa, pb);
234	--i;
235	n++;
236	for (; --i >= 0; n++) {
237		if (n->n_type == COMMA) { /* XXX Temporary */
238			REQN(i, 1);
239			--i;
240			n++;
241		}
242		REQ(n, ITEM);
243		compile_item(ll, nf, n, &a, &b);
244		addnfaarc(nf, *pb, a, EMPTY);
245		*pb = b;
246	}
247}
248
249static
250compile_item(ll, nf, n, pa, pb)
251	labellist *ll;
252	nfa *nf;
253	node *n;
254	int *pa, *pb;
255{
256	int i;
257	int a, b;
258
259	REQ(n, ITEM);
260	i = n->n_nchildren;
261	REQN(i, 1);
262	n = n->n_child;
263	if (n->n_type == LSQB) {
264		REQN(i, 3);
265		n++;
266		REQ(n, RHS);
267		*pa = addnfastate(nf);
268		*pb = addnfastate(nf);
269		addnfaarc(nf, *pa, *pb, EMPTY);
270		compile_rhs(ll, nf, n, &a, &b);
271		addnfaarc(nf, *pa, a, EMPTY);
272		addnfaarc(nf, b, *pb, EMPTY);
273		REQN(i, 1);
274		n++;
275		REQ(n, RSQB);
276	}
277	else {
278		compile_atom(ll, nf, n, pa, pb);
279		if (--i <= 0)
280			return;
281		n++;
282		addnfaarc(nf, *pb, *pa, EMPTY);
283		if (n->n_type == STAR)
284			*pb = *pa;
285		else
286			REQ(n, PLUS);
287	}
288}
289
290static
291compile_atom(ll, nf, n, pa, pb)
292	labellist *ll;
293	nfa *nf;
294	node *n;
295	int *pa, *pb;
296{
297	int i;
298
299	REQ(n, ATOM);
300	i = n->n_nchildren;
301	REQN(i, 1);
302	n = n->n_child;
303	if (n->n_type == LPAR) {
304		REQN(i, 3);
305		n++;
306		REQ(n, RHS);
307		compile_rhs(ll, nf, n, pa, pb);
308		n++;
309		REQ(n, RPAR);
310	}
311	else if (n->n_type == NAME || n->n_type == STRING) {
312		*pa = addnfastate(nf);
313		*pb = addnfastate(nf);
314		addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
315	}
316	else
317		REQ(n, NAME);
318}
319
320static void
321dumpstate(ll, nf, istate)
322	labellist *ll;
323	nfa *nf;
324	int istate;
325{
326	nfastate *st;
327	int i;
328	nfaarc *ar;
329
330	printf("%c%2d%c",
331		istate == nf->nf_start ? '*' : ' ',
332		istate,
333		istate == nf->nf_finish ? '.' : ' ');
334	st = &nf->nf_state[istate];
335	ar = st->st_arc;
336	for (i = 0; i < st->st_narcs; i++) {
337		if (i > 0)
338			printf("\n    ");
339		printf("-> %2d  %s", ar->ar_arrow,
340			labelrepr(&ll->ll_label[ar->ar_label]));
341		ar++;
342	}
343	printf("\n");
344}
345
346static void
347dumpnfa(ll, nf)
348	labellist *ll;
349	nfa *nf;
350{
351	int i;
352
353	printf("NFA '%s' has %d states; start %d, finish %d\n",
354		nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
355	for (i = 0; i < nf->nf_nstates; i++)
356		dumpstate(ll, nf, i);
357}
358
359
360/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
361
362static int
363addclosure(ss, nf, istate)
364	bitset ss;
365	nfa *nf;
366	int istate;
367{
368	if (addbit(ss, istate)) {
369		nfastate *st = &nf->nf_state[istate];
370		nfaarc *ar = st->st_arc;
371		int i;
372
373		for (i = st->st_narcs; --i >= 0; ) {
374			if (ar->ar_label == EMPTY)
375				addclosure(ss, nf, ar->ar_arrow);
376			ar++;
377		}
378	}
379}
380
381typedef struct _ss_arc {
382	bitset	sa_bitset;
383	int	sa_arrow;
384	int	sa_label;
385} ss_arc;
386
387typedef struct _ss_state {
388	bitset	ss_ss;
389	int	ss_narcs;
390	ss_arc	*ss_arc;
391	int	ss_deleted;
392	int	ss_finish;
393	int	ss_rename;
394} ss_state;
395
396typedef struct _ss_dfa {
397	int	sd_nstates;
398	ss_state *sd_state;
399} ss_dfa;
400
401static
402makedfa(gr, nf, d)
403	nfagrammar *gr;
404	nfa *nf;
405	dfa *d;
406{
407	int nbits = nf->nf_nstates;
408	bitset ss;
409	int xx_nstates;
410	ss_state *xx_state, *yy;
411	ss_arc *zz;
412	int istate, jstate, iarc, jarc, ibit;
413	nfastate *st;
414	nfaarc *ar;
415
416	ss = newbitset(nbits);
417	addclosure(ss, nf, nf->nf_start);
418	xx_state = NEW(ss_state, 1);
419	if (xx_state == NULL)
420		fatal("no mem for xx_state in makedfa");
421	xx_nstates = 1;
422	yy = &xx_state[0];
423	yy->ss_ss = ss;
424	yy->ss_narcs = 0;
425	yy->ss_arc = NULL;
426	yy->ss_deleted = 0;
427	yy->ss_finish = testbit(ss, nf->nf_finish);
428	if (yy->ss_finish)
429		printf("Error: nonterminal '%s' may produce empty.\n",
430			nf->nf_name);
431
432	/* This algorithm is from a book written before
433	   the invention of structured programming... */
434
435	/* For each unmarked state... */
436	for (istate = 0; istate < xx_nstates; ++istate) {
437		yy = &xx_state[istate];
438		ss = yy->ss_ss;
439		/* For all its states... */
440		for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
441			if (!testbit(ss, ibit))
442				continue;
443			st = &nf->nf_state[ibit];
444			/* For all non-empty arcs from this state... */
445			for (iarc = 0; iarc < st->st_narcs; iarc++) {
446				ar = &st->st_arc[iarc];
447				if (ar->ar_label == EMPTY)
448					continue;
449				/* Look up in list of arcs from this state */
450				for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
451					zz = &yy->ss_arc[jarc];
452					if (ar->ar_label == zz->sa_label)
453						goto found;
454				}
455				/* Add new arc for this state */
456				RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1);
457				if (yy->ss_arc == NULL)
458					fatal("out of mem");
459				zz = &yy->ss_arc[yy->ss_narcs++];
460				zz->sa_label = ar->ar_label;
461				zz->sa_bitset = newbitset(nbits);
462				zz->sa_arrow = -1;
463			 found:	;
464				/* Add destination */
465				addclosure(zz->sa_bitset, nf, ar->ar_arrow);
466			}
467		}
468		/* Now look up all the arrow states */
469		for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
470			zz = &xx_state[istate].ss_arc[jarc];
471			for (jstate = 0; jstate < xx_nstates; jstate++) {
472				if (samebitset(zz->sa_bitset,
473					xx_state[jstate].ss_ss, nbits)) {
474					zz->sa_arrow = jstate;
475					goto done;
476				}
477			}
478			RESIZE(xx_state, ss_state, xx_nstates + 1);
479			if (xx_state == NULL)
480				fatal("out of mem");
481			zz->sa_arrow = xx_nstates;
482			yy = &xx_state[xx_nstates++];
483			yy->ss_ss = zz->sa_bitset;
484			yy->ss_narcs = 0;
485			yy->ss_arc = NULL;
486			yy->ss_deleted = 0;
487			yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
488		 done:	;
489		}
490	}
491
492	if (debugging)
493		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
494						"before minimizing");
495
496	simplify(xx_nstates, xx_state);
497
498	if (debugging)
499		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
500						"after minimizing");
501
502	convert(d, xx_nstates, xx_state);
503
504	/* XXX cleanup */
505}
506
507static
508printssdfa(xx_nstates, xx_state, nbits, ll, msg)
509	int xx_nstates;
510	ss_state *xx_state;
511	int nbits;
512	labellist *ll;
513	char *msg;
514{
515	int i, ibit, iarc;
516	ss_state *yy;
517	ss_arc *zz;
518
519	printf("Subset DFA %s\n", msg);
520	for (i = 0; i < xx_nstates; i++) {
521		yy = &xx_state[i];
522		if (yy->ss_deleted)
523			continue;
524		printf(" Subset %d", i);
525		if (yy->ss_finish)
526			printf(" (finish)");
527		printf(" { ");
528		for (ibit = 0; ibit < nbits; ibit++) {
529			if (testbit(yy->ss_ss, ibit))
530				printf("%d ", ibit);
531		}
532		printf("}\n");
533		for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
534			zz = &yy->ss_arc[iarc];
535			printf("  Arc to state %d, label %s\n",
536				zz->sa_arrow,
537				labelrepr(&ll->ll_label[zz->sa_label]));
538		}
539	}
540}
541
542
543/* PART THREE -- SIMPLIFY DFA */
544
545/* Simplify the DFA by repeatedly eliminating states that are
546   equivalent to another oner.  This is NOT Algorithm 3.3 from
547   [Aho&Ullman 77].  It does not always finds the minimal DFA,
548   but it does usually make a much smaller one...  (For an example
549   of sub-optimal behaviour, try S: x a b+ | y a b+.)
550*/
551
552static int
553samestate(s1, s2)
554	ss_state *s1, *s2;
555{
556	int i;
557
558	if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
559		return 0;
560	for (i = 0; i < s1->ss_narcs; i++) {
561		if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
562			s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
563			return 0;
564	}
565	return 1;
566}
567
568static void
569renamestates(xx_nstates, xx_state, from, to)
570	int xx_nstates;
571	ss_state *xx_state;
572	int from, to;
573{
574	int i, j;
575
576	if (debugging)
577		printf("Rename state %d to %d.\n", from, to);
578	for (i = 0; i < xx_nstates; i++) {
579		if (xx_state[i].ss_deleted)
580			continue;
581		for (j = 0; j < xx_state[i].ss_narcs; j++) {
582			if (xx_state[i].ss_arc[j].sa_arrow == from)
583				xx_state[i].ss_arc[j].sa_arrow = to;
584		}
585	}
586}
587
588static
589simplify(xx_nstates, xx_state)
590	int xx_nstates;
591	ss_state *xx_state;
592{
593	int changes;
594	int i, j, k;
595
596	do {
597		changes = 0;
598		for (i = 1; i < xx_nstates; i++) {
599			if (xx_state[i].ss_deleted)
600				continue;
601			for (j = 0; j < i; j++) {
602				if (xx_state[j].ss_deleted)
603					continue;
604				if (samestate(&xx_state[i], &xx_state[j])) {
605					xx_state[i].ss_deleted++;
606					renamestates(xx_nstates, xx_state, i, j);
607					changes++;
608					break;
609				}
610			}
611		}
612	} while (changes);
613}
614
615
616/* PART FOUR -- GENERATE PARSING TABLES */
617
618/* Convert the DFA into a grammar that can be used by our parser */
619
620static
621convert(d, xx_nstates, xx_state)
622	dfa *d;
623	int xx_nstates;
624	ss_state *xx_state;
625{
626	int i, j;
627	ss_state *yy;
628	ss_arc *zz;
629
630	for (i = 0; i < xx_nstates; i++) {
631		yy = &xx_state[i];
632		if (yy->ss_deleted)
633			continue;
634		yy->ss_rename = addstate(d);
635	}
636
637	for (i = 0; i < xx_nstates; i++) {
638		yy = &xx_state[i];
639		if (yy->ss_deleted)
640			continue;
641		for (j = 0; j < yy->ss_narcs; j++) {
642			zz = &yy->ss_arc[j];
643			addarc(d, yy->ss_rename,
644				xx_state[zz->sa_arrow].ss_rename,
645				zz->sa_label);
646		}
647		if (yy->ss_finish)
648			addarc(d, yy->ss_rename, yy->ss_rename, 0);
649	}
650
651	d->d_initial = 0;
652}
653
654
655/* PART FIVE -- GLUE IT ALL TOGETHER */
656
657static grammar *
658maketables(gr)
659	nfagrammar *gr;
660{
661	int i;
662	nfa *nf;
663	dfa *d;
664	grammar *g;
665
666	if (gr->gr_nnfas == 0)
667		return NULL;
668	g = newgrammar(gr->gr_nfa[0]->nf_type);
669			/* XXX first rule must be start rule */
670	g->g_ll = gr->gr_ll;
671
672	for (i = 0; i < gr->gr_nnfas; i++) {
673		nf = gr->gr_nfa[i];
674		if (debugging) {
675			printf("Dump of NFA for '%s' ...\n", nf->nf_name);
676			dumpnfa(&gr->gr_ll, nf);
677		}
678		printf("Making DFA for '%s' ...\n", nf->nf_name);
679		d = adddfa(g, nf->nf_type, nf->nf_name);
680		makedfa(gr, gr->gr_nfa[i], d);
681	}
682
683	return g;
684}
685
686grammar *
687pgen(n)
688	node *n;
689{
690	nfagrammar *gr;
691	grammar *g;
692
693	gr = metacompile(n);
694	g = maketables(gr);
695	translatelabels(g);
696	addfirstsets(g);
697	return g;
698}
699
700
701/*
702
703Description
704-----------
705
706Input is a grammar in extended BNF (using * for repetition, + for
707at-least-once repetition, [] for optional parts, | for alternatives and
708() for grouping).  This has already been parsed and turned into a parse
709tree.
710
711Each rule is considered as a regular expression in its own right.
712It is turned into a Non-deterministic Finite Automaton (NFA), which
713is then turned into a Deterministic Finite Automaton (DFA), which is then
714optimized to reduce the number of states.  See [Aho&Ullman 77] chapter 3,
715or similar compiler books (this technique is more often used for lexical
716analyzers).
717
718The DFA's are used by the parser as parsing tables in a special way
719that's probably unique.  Before they are usable, the FIRST sets of all
720non-terminals are computed.
721
722Reference
723---------
724
725[Aho&Ullman 77]
726	Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
727	(first edition)
728
729*/
730