pgen.c revision 9bfef44d97d1ae24e03717e3d59024b44626a9de
1/***********************************************************
2Copyright 1991, 1992, 1993 by Stichting Mathematisch Centrum,
3Amsterdam, The Netherlands.
4
5                        All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI not be used in advertising or publicity pertaining to
13distribution of the software without specific, written prior permission.
14
15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
23******************************************************************/
24
25/* Parser generator */
26/* XXX This file is not yet fully PROTOized */
27
28/* For a description, see the comments at end of this file */
29
30#include "pgenheaders.h"
31#include "assert.h"
32#include "token.h"
33#include "node.h"
34#include "grammar.h"
35#include "metagrammar.h"
36#include "pgen.h"
37
38extern int debugging;
39
40
41/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
42
43typedef struct _nfaarc {
44	int	ar_label;
45	int	ar_arrow;
46} nfaarc;
47
48typedef struct _nfastate {
49	int	st_narcs;
50	nfaarc	*st_arc;
51} nfastate;
52
53typedef struct _nfa {
54	int		nf_type;
55	char		*nf_name;
56	int		nf_nstates;
57	nfastate	*nf_state;
58	int		nf_start, nf_finish;
59} nfa;
60
61/* Forward */
62static compile_rhs PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
63static compile_alt PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
64static compile_item PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
65static compile_atom PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb));
66
67static int
68addnfastate(nf)
69	nfa *nf;
70{
71	nfastate *st;
72
73	RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
74	if (nf->nf_state == NULL)
75		fatal("out of mem");
76	st = &nf->nf_state[nf->nf_nstates++];
77	st->st_narcs = 0;
78	st->st_arc = NULL;
79	return st - nf->nf_state;
80}
81
82static void
83addnfaarc(nf, from, to, lbl)
84	nfa *nf;
85	int from, to, lbl;
86{
87	nfastate *st;
88	nfaarc *ar;
89
90	st = &nf->nf_state[from];
91	RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
92	if (st->st_arc == NULL)
93		fatal("out of mem");
94	ar = &st->st_arc[st->st_narcs++];
95	ar->ar_label = lbl;
96	ar->ar_arrow = to;
97}
98
99static nfa *
100newnfa(name)
101	char *name;
102{
103	nfa *nf;
104	static type = NT_OFFSET; /* All types will be disjunct */
105
106	nf = NEW(nfa, 1);
107	if (nf == NULL)
108		fatal("no mem for new nfa");
109	nf->nf_type = type++;
110	nf->nf_name = name; /* XXX strdup(name) ??? */
111	nf->nf_nstates = 0;
112	nf->nf_state = NULL;
113	nf->nf_start = nf->nf_finish = -1;
114	return nf;
115}
116
117typedef struct _nfagrammar {
118	int		gr_nnfas;
119	nfa		**gr_nfa;
120	labellist	gr_ll;
121} nfagrammar;
122
123/* Forward */
124static compile_rule PROTO((nfagrammar *gr, node *n));
125
126static nfagrammar *
127newnfagrammar()
128{
129	nfagrammar *gr;
130
131	gr = NEW(nfagrammar, 1);
132	if (gr == NULL)
133		fatal("no mem for new nfa grammar");
134	gr->gr_nnfas = 0;
135	gr->gr_nfa = NULL;
136	gr->gr_ll.ll_nlabels = 0;
137	gr->gr_ll.ll_label = NULL;
138	addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
139	return gr;
140}
141
142static nfa *
143addnfa(gr, name)
144	nfagrammar *gr;
145	char *name;
146{
147	nfa *nf;
148
149	nf = newnfa(name);
150	RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
151	if (gr->gr_nfa == NULL)
152		fatal("out of mem");
153	gr->gr_nfa[gr->gr_nnfas++] = nf;
154	addlabel(&gr->gr_ll, NAME, nf->nf_name);
155	return nf;
156}
157
158#ifdef DEBUG
159
160static char REQNFMT[] = "metacompile: less than %d children\n";
161
162#define REQN(i, count) \
163 	if (i < count) { \
164		fprintf(stderr, REQNFMT, count); \
165		abort(); \
166	} else
167
168#else
169#define REQN(i, count)	/* empty */
170#endif
171
172static nfagrammar *
173metacompile(n)
174	node *n;
175{
176	nfagrammar *gr;
177	int i;
178
179	printf("Compiling (meta-) parse tree into NFA grammar\n");
180	gr = newnfagrammar();
181	REQ(n, MSTART);
182	i = n->n_nchildren - 1; /* Last child is ENDMARKER */
183	n = n->n_child;
184	for (; --i >= 0; n++) {
185		if (n->n_type != NEWLINE)
186			compile_rule(gr, n);
187	}
188	return gr;
189}
190
191static
192compile_rule(gr, n)
193	nfagrammar *gr;
194	node *n;
195{
196	nfa *nf;
197
198	REQ(n, RULE);
199	REQN(n->n_nchildren, 4);
200	n = n->n_child;
201	REQ(n, NAME);
202	nf = addnfa(gr, n->n_str);
203	n++;
204	REQ(n, COLON);
205	n++;
206	REQ(n, RHS);
207	compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
208	n++;
209	REQ(n, NEWLINE);
210}
211
212static
213compile_rhs(ll, nf, n, pa, pb)
214	labellist *ll;
215	nfa *nf;
216	node *n;
217	int *pa, *pb;
218{
219	int i;
220	int a, b;
221
222	REQ(n, RHS);
223	i = n->n_nchildren;
224	REQN(i, 1);
225	n = n->n_child;
226	REQ(n, ALT);
227	compile_alt(ll, nf, n, pa, pb);
228	if (--i <= 0)
229		return;
230	n++;
231	a = *pa;
232	b = *pb;
233	*pa = addnfastate(nf);
234	*pb = addnfastate(nf);
235	addnfaarc(nf, *pa, a, EMPTY);
236	addnfaarc(nf, b, *pb, EMPTY);
237	for (; --i >= 0; n++) {
238		REQ(n, VBAR);
239		REQN(i, 1);
240		--i;
241		n++;
242		REQ(n, ALT);
243		compile_alt(ll, nf, n, &a, &b);
244		addnfaarc(nf, *pa, a, EMPTY);
245		addnfaarc(nf, b, *pb, EMPTY);
246	}
247}
248
249static
250compile_alt(ll, nf, n, pa, pb)
251	labellist *ll;
252	nfa *nf;
253	node *n;
254	int *pa, *pb;
255{
256	int i;
257	int a, b;
258
259	REQ(n, ALT);
260	i = n->n_nchildren;
261	REQN(i, 1);
262	n = n->n_child;
263	REQ(n, ITEM);
264	compile_item(ll, nf, n, pa, pb);
265	--i;
266	n++;
267	for (; --i >= 0; n++) {
268		if (n->n_type == COMMA) { /* XXX Temporary */
269			REQN(i, 1);
270			--i;
271			n++;
272		}
273		REQ(n, ITEM);
274		compile_item(ll, nf, n, &a, &b);
275		addnfaarc(nf, *pb, a, EMPTY);
276		*pb = b;
277	}
278}
279
280static
281compile_item(ll, nf, n, pa, pb)
282	labellist *ll;
283	nfa *nf;
284	node *n;
285	int *pa, *pb;
286{
287	int i;
288	int a, b;
289
290	REQ(n, ITEM);
291	i = n->n_nchildren;
292	REQN(i, 1);
293	n = n->n_child;
294	if (n->n_type == LSQB) {
295		REQN(i, 3);
296		n++;
297		REQ(n, RHS);
298		*pa = addnfastate(nf);
299		*pb = addnfastate(nf);
300		addnfaarc(nf, *pa, *pb, EMPTY);
301		compile_rhs(ll, nf, n, &a, &b);
302		addnfaarc(nf, *pa, a, EMPTY);
303		addnfaarc(nf, b, *pb, EMPTY);
304		REQN(i, 1);
305		n++;
306		REQ(n, RSQB);
307	}
308	else {
309		compile_atom(ll, nf, n, pa, pb);
310		if (--i <= 0)
311			return;
312		n++;
313		addnfaarc(nf, *pb, *pa, EMPTY);
314		if (n->n_type == STAR)
315			*pb = *pa;
316		else
317			REQ(n, PLUS);
318	}
319}
320
321static
322compile_atom(ll, nf, n, pa, pb)
323	labellist *ll;
324	nfa *nf;
325	node *n;
326	int *pa, *pb;
327{
328	int i;
329
330	REQ(n, ATOM);
331	i = n->n_nchildren;
332	REQN(i, 1);
333	n = n->n_child;
334	if (n->n_type == LPAR) {
335		REQN(i, 3);
336		n++;
337		REQ(n, RHS);
338		compile_rhs(ll, nf, n, pa, pb);
339		n++;
340		REQ(n, RPAR);
341	}
342	else if (n->n_type == NAME || n->n_type == STRING) {
343		*pa = addnfastate(nf);
344		*pb = addnfastate(nf);
345		addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
346	}
347	else
348		REQ(n, NAME);
349}
350
351static void
352dumpstate(ll, nf, istate)
353	labellist *ll;
354	nfa *nf;
355	int istate;
356{
357	nfastate *st;
358	int i;
359	nfaarc *ar;
360
361	printf("%c%2d%c",
362		istate == nf->nf_start ? '*' : ' ',
363		istate,
364		istate == nf->nf_finish ? '.' : ' ');
365	st = &nf->nf_state[istate];
366	ar = st->st_arc;
367	for (i = 0; i < st->st_narcs; i++) {
368		if (i > 0)
369			printf("\n    ");
370		printf("-> %2d  %s", ar->ar_arrow,
371			labelrepr(&ll->ll_label[ar->ar_label]));
372		ar++;
373	}
374	printf("\n");
375}
376
377static void
378dumpnfa(ll, nf)
379	labellist *ll;
380	nfa *nf;
381{
382	int i;
383
384	printf("NFA '%s' has %d states; start %d, finish %d\n",
385		nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
386	for (i = 0; i < nf->nf_nstates; i++)
387		dumpstate(ll, nf, i);
388}
389
390
391/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
392
393static int
394addclosure(ss, nf, istate)
395	bitset ss;
396	nfa *nf;
397	int istate;
398{
399	if (addbit(ss, istate)) {
400		nfastate *st = &nf->nf_state[istate];
401		nfaarc *ar = st->st_arc;
402		int i;
403
404		for (i = st->st_narcs; --i >= 0; ) {
405			if (ar->ar_label == EMPTY)
406				addclosure(ss, nf, ar->ar_arrow);
407			ar++;
408		}
409	}
410}
411
412typedef struct _ss_arc {
413	bitset	sa_bitset;
414	int	sa_arrow;
415	int	sa_label;
416} ss_arc;
417
418typedef struct _ss_state {
419	bitset	ss_ss;
420	int	ss_narcs;
421	ss_arc	*ss_arc;
422	int	ss_deleted;
423	int	ss_finish;
424	int	ss_rename;
425} ss_state;
426
427typedef struct _ss_dfa {
428	int	sd_nstates;
429	ss_state *sd_state;
430} ss_dfa;
431
432/* Forward */
433static printssdfa PROTO((int xx_nstates, ss_state *xx_state, int nbits,
434			 labellist *ll, char *msg));
435static simplify PROTO((int xx_nstates, ss_state *xx_state));
436static convert PROTO((dfa *d, int xx_nstates, ss_state *xx_state));
437
438static
439makedfa(gr, nf, d)
440	nfagrammar *gr;
441	nfa *nf;
442	dfa *d;
443{
444	int nbits = nf->nf_nstates;
445	bitset ss;
446	int xx_nstates;
447	ss_state *xx_state, *yy;
448	ss_arc *zz;
449	int istate, jstate, iarc, jarc, ibit;
450	nfastate *st;
451	nfaarc *ar;
452
453	ss = newbitset(nbits);
454	addclosure(ss, nf, nf->nf_start);
455	xx_state = NEW(ss_state, 1);
456	if (xx_state == NULL)
457		fatal("no mem for xx_state in makedfa");
458	xx_nstates = 1;
459	yy = &xx_state[0];
460	yy->ss_ss = ss;
461	yy->ss_narcs = 0;
462	yy->ss_arc = NULL;
463	yy->ss_deleted = 0;
464	yy->ss_finish = testbit(ss, nf->nf_finish);
465	if (yy->ss_finish)
466		printf("Error: nonterminal '%s' may produce empty.\n",
467			nf->nf_name);
468
469	/* This algorithm is from a book written before
470	   the invention of structured programming... */
471
472	/* For each unmarked state... */
473	for (istate = 0; istate < xx_nstates; ++istate) {
474		yy = &xx_state[istate];
475		ss = yy->ss_ss;
476		/* For all its states... */
477		for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
478			if (!testbit(ss, ibit))
479				continue;
480			st = &nf->nf_state[ibit];
481			/* For all non-empty arcs from this state... */
482			for (iarc = 0; iarc < st->st_narcs; iarc++) {
483				ar = &st->st_arc[iarc];
484				if (ar->ar_label == EMPTY)
485					continue;
486				/* Look up in list of arcs from this state */
487				for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
488					zz = &yy->ss_arc[jarc];
489					if (ar->ar_label == zz->sa_label)
490						goto found;
491				}
492				/* Add new arc for this state */
493				RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1);
494				if (yy->ss_arc == NULL)
495					fatal("out of mem");
496				zz = &yy->ss_arc[yy->ss_narcs++];
497				zz->sa_label = ar->ar_label;
498				zz->sa_bitset = newbitset(nbits);
499				zz->sa_arrow = -1;
500			 found:	;
501				/* Add destination */
502				addclosure(zz->sa_bitset, nf, ar->ar_arrow);
503			}
504		}
505		/* Now look up all the arrow states */
506		for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
507			zz = &xx_state[istate].ss_arc[jarc];
508			for (jstate = 0; jstate < xx_nstates; jstate++) {
509				if (samebitset(zz->sa_bitset,
510					xx_state[jstate].ss_ss, nbits)) {
511					zz->sa_arrow = jstate;
512					goto done;
513				}
514			}
515			RESIZE(xx_state, ss_state, xx_nstates + 1);
516			if (xx_state == NULL)
517				fatal("out of mem");
518			zz->sa_arrow = xx_nstates;
519			yy = &xx_state[xx_nstates++];
520			yy->ss_ss = zz->sa_bitset;
521			yy->ss_narcs = 0;
522			yy->ss_arc = NULL;
523			yy->ss_deleted = 0;
524			yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
525		 done:	;
526		}
527	}
528
529	if (debugging)
530		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
531						"before minimizing");
532
533	simplify(xx_nstates, xx_state);
534
535	if (debugging)
536		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
537						"after minimizing");
538
539	convert(d, xx_nstates, xx_state);
540
541	/* XXX cleanup */
542}
543
544static
545printssdfa(xx_nstates, xx_state, nbits, ll, msg)
546	int xx_nstates;
547	ss_state *xx_state;
548	int nbits;
549	labellist *ll;
550	char *msg;
551{
552	int i, ibit, iarc;
553	ss_state *yy;
554	ss_arc *zz;
555
556	printf("Subset DFA %s\n", msg);
557	for (i = 0; i < xx_nstates; i++) {
558		yy = &xx_state[i];
559		if (yy->ss_deleted)
560			continue;
561		printf(" Subset %d", i);
562		if (yy->ss_finish)
563			printf(" (finish)");
564		printf(" { ");
565		for (ibit = 0; ibit < nbits; ibit++) {
566			if (testbit(yy->ss_ss, ibit))
567				printf("%d ", ibit);
568		}
569		printf("}\n");
570		for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
571			zz = &yy->ss_arc[iarc];
572			printf("  Arc to state %d, label %s\n",
573				zz->sa_arrow,
574				labelrepr(&ll->ll_label[zz->sa_label]));
575		}
576	}
577}
578
579
580/* PART THREE -- SIMPLIFY DFA */
581
582/* Simplify the DFA by repeatedly eliminating states that are
583   equivalent to another oner.  This is NOT Algorithm 3.3 from
584   [Aho&Ullman 77].  It does not always finds the minimal DFA,
585   but it does usually make a much smaller one...  (For an example
586   of sub-optimal behaviour, try S: x a b+ | y a b+.)
587*/
588
589static int
590samestate(s1, s2)
591	ss_state *s1, *s2;
592{
593	int i;
594
595	if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
596		return 0;
597	for (i = 0; i < s1->ss_narcs; i++) {
598		if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
599			s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
600			return 0;
601	}
602	return 1;
603}
604
605static void
606renamestates(xx_nstates, xx_state, from, to)
607	int xx_nstates;
608	ss_state *xx_state;
609	int from, to;
610{
611	int i, j;
612
613	if (debugging)
614		printf("Rename state %d to %d.\n", from, to);
615	for (i = 0; i < xx_nstates; i++) {
616		if (xx_state[i].ss_deleted)
617			continue;
618		for (j = 0; j < xx_state[i].ss_narcs; j++) {
619			if (xx_state[i].ss_arc[j].sa_arrow == from)
620				xx_state[i].ss_arc[j].sa_arrow = to;
621		}
622	}
623}
624
625static
626simplify(xx_nstates, xx_state)
627	int xx_nstates;
628	ss_state *xx_state;
629{
630	int changes;
631	int i, j, k;
632
633	do {
634		changes = 0;
635		for (i = 1; i < xx_nstates; i++) {
636			if (xx_state[i].ss_deleted)
637				continue;
638			for (j = 0; j < i; j++) {
639				if (xx_state[j].ss_deleted)
640					continue;
641				if (samestate(&xx_state[i], &xx_state[j])) {
642					xx_state[i].ss_deleted++;
643					renamestates(xx_nstates, xx_state, i, j);
644					changes++;
645					break;
646				}
647			}
648		}
649	} while (changes);
650}
651
652
653/* PART FOUR -- GENERATE PARSING TABLES */
654
655/* Convert the DFA into a grammar that can be used by our parser */
656
657static
658convert(d, xx_nstates, xx_state)
659	dfa *d;
660	int xx_nstates;
661	ss_state *xx_state;
662{
663	int i, j;
664	ss_state *yy;
665	ss_arc *zz;
666
667	for (i = 0; i < xx_nstates; i++) {
668		yy = &xx_state[i];
669		if (yy->ss_deleted)
670			continue;
671		yy->ss_rename = addstate(d);
672	}
673
674	for (i = 0; i < xx_nstates; i++) {
675		yy = &xx_state[i];
676		if (yy->ss_deleted)
677			continue;
678		for (j = 0; j < yy->ss_narcs; j++) {
679			zz = &yy->ss_arc[j];
680			addarc(d, yy->ss_rename,
681				xx_state[zz->sa_arrow].ss_rename,
682				zz->sa_label);
683		}
684		if (yy->ss_finish)
685			addarc(d, yy->ss_rename, yy->ss_rename, 0);
686	}
687
688	d->d_initial = 0;
689}
690
691
692/* PART FIVE -- GLUE IT ALL TOGETHER */
693
694static grammar *
695maketables(gr)
696	nfagrammar *gr;
697{
698	int i;
699	nfa *nf;
700	dfa *d;
701	grammar *g;
702
703	if (gr->gr_nnfas == 0)
704		return NULL;
705	g = newgrammar(gr->gr_nfa[0]->nf_type);
706			/* XXX first rule must be start rule */
707	g->g_ll = gr->gr_ll;
708
709	for (i = 0; i < gr->gr_nnfas; i++) {
710		nf = gr->gr_nfa[i];
711		if (debugging) {
712			printf("Dump of NFA for '%s' ...\n", nf->nf_name);
713			dumpnfa(&gr->gr_ll, nf);
714		}
715		printf("Making DFA for '%s' ...\n", nf->nf_name);
716		d = adddfa(g, nf->nf_type, nf->nf_name);
717		makedfa(gr, gr->gr_nfa[i], d);
718	}
719
720	return g;
721}
722
723grammar *
724pgen(n)
725	node *n;
726{
727	nfagrammar *gr;
728	grammar *g;
729
730	gr = metacompile(n);
731	g = maketables(gr);
732	translatelabels(g);
733	addfirstsets(g);
734	return g;
735}
736
737
738/*
739
740Description
741-----------
742
743Input is a grammar in extended BNF (using * for repetition, + for
744at-least-once repetition, [] for optional parts, | for alternatives and
745() for grouping).  This has already been parsed and turned into a parse
746tree.
747
748Each rule is considered as a regular expression in its own right.
749It is turned into a Non-deterministic Finite Automaton (NFA), which
750is then turned into a Deterministic Finite Automaton (DFA), which is then
751optimized to reduce the number of states.  See [Aho&Ullman 77] chapter 3,
752or similar compiler books (this technique is more often used for lexical
753analyzers).
754
755The DFA's are used by the parser as parsing tables in a special way
756that's probably unique.  Before they are usable, the FIRST sets of all
757non-terminals are computed.
758
759Reference
760---------
761
762[Aho&Ullman 77]
763	Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
764	(first edition)
765
766*/
767