pgen.c revision f70e43a073b36c6f6e9894c01025243a77a452d4
1/***********************************************************
2Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The
3Netherlands.
4
5                        All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI not be used in advertising or publicity pertaining to
13distribution of the software without specific, written prior permission.
14
15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22
23******************************************************************/
24
25/* Parser generator */
26/* XXX This file is not yet fully PROTOized */
27
28/* For a description, see the comments at end of this file */
29
30#include "pgenheaders.h"
31#include "assert.h"
32#include "token.h"
33#include "node.h"
34#include "grammar.h"
35#include "metagrammar.h"
36#include "pgen.h"
37
38extern int debugging;
39
40
41/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
42
43typedef struct _nfaarc {
44	int	ar_label;
45	int	ar_arrow;
46} nfaarc;
47
48typedef struct _nfastate {
49	int	st_narcs;
50	nfaarc	*st_arc;
51} nfastate;
52
53typedef struct _nfa {
54	int		nf_type;
55	char		*nf_name;
56	int		nf_nstates;
57	nfastate	*nf_state;
58	int		nf_start, nf_finish;
59} nfa;
60
61static int
62addnfastate(nf)
63	nfa *nf;
64{
65	nfastate *st;
66
67	RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
68	if (nf->nf_state == NULL)
69		fatal("out of mem");
70	st = &nf->nf_state[nf->nf_nstates++];
71	st->st_narcs = 0;
72	st->st_arc = NULL;
73	return st - nf->nf_state;
74}
75
76static void
77addnfaarc(nf, from, to, lbl)
78	nfa *nf;
79	int from, to, lbl;
80{
81	nfastate *st;
82	nfaarc *ar;
83
84	st = &nf->nf_state[from];
85	RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
86	if (st->st_arc == NULL)
87		fatal("out of mem");
88	ar = &st->st_arc[st->st_narcs++];
89	ar->ar_label = lbl;
90	ar->ar_arrow = to;
91}
92
93static nfa *
94newnfa(name)
95	char *name;
96{
97	nfa *nf;
98	static type = NT_OFFSET; /* All types will be disjunct */
99
100	nf = NEW(nfa, 1);
101	if (nf == NULL)
102		fatal("no mem for new nfa");
103	nf->nf_type = type++;
104	nf->nf_name = name; /* XXX strdup(name) ??? */
105	nf->nf_nstates = 0;
106	nf->nf_state = NULL;
107	nf->nf_start = nf->nf_finish = -1;
108	return nf;
109}
110
111typedef struct _nfagrammar {
112	int		gr_nnfas;
113	nfa		**gr_nfa;
114	labellist	gr_ll;
115} nfagrammar;
116
117static nfagrammar *
118newnfagrammar()
119{
120	nfagrammar *gr;
121
122	gr = NEW(nfagrammar, 1);
123	if (gr == NULL)
124		fatal("no mem for new nfa grammar");
125	gr->gr_nnfas = 0;
126	gr->gr_nfa = NULL;
127	gr->gr_ll.ll_nlabels = 0;
128	gr->gr_ll.ll_label = NULL;
129	addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
130	return gr;
131}
132
133static nfa *
134addnfa(gr, name)
135	nfagrammar *gr;
136	char *name;
137{
138	nfa *nf;
139
140	nf = newnfa(name);
141	RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
142	if (gr->gr_nfa == NULL)
143		fatal("out of mem");
144	gr->gr_nfa[gr->gr_nnfas++] = nf;
145	addlabel(&gr->gr_ll, NAME, nf->nf_name);
146	return nf;
147}
148
149#ifdef DEBUG
150
151static char REQNFMT[] = "metacompile: less than %d children\n";
152
153#define REQN(i, count) \
154 	if (i < count) { \
155		fprintf(stderr, REQNFMT, count); \
156		abort(); \
157	} else
158
159#else
160#define REQN(i, count)	/* empty */
161#endif
162
163static nfagrammar *
164metacompile(n)
165	node *n;
166{
167	nfagrammar *gr;
168	int i;
169
170	printf("Compiling (meta-) parse tree into NFA grammar\n");
171	gr = newnfagrammar();
172	REQ(n, MSTART);
173	i = n->n_nchildren - 1; /* Last child is ENDMARKER */
174	n = n->n_child;
175	for (; --i >= 0; n++) {
176		if (n->n_type != NEWLINE)
177			compile_rule(gr, n);
178	}
179	return gr;
180}
181
182static
183compile_rule(gr, n)
184	nfagrammar *gr;
185	node *n;
186{
187	nfa *nf;
188
189	REQ(n, RULE);
190	REQN(n->n_nchildren, 4);
191	n = n->n_child;
192	REQ(n, NAME);
193	nf = addnfa(gr, n->n_str);
194	n++;
195	REQ(n, COLON);
196	n++;
197	REQ(n, RHS);
198	compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
199	n++;
200	REQ(n, NEWLINE);
201}
202
203static
204compile_rhs(ll, nf, n, pa, pb)
205	labellist *ll;
206	nfa *nf;
207	node *n;
208	int *pa, *pb;
209{
210	int i;
211	int a, b;
212
213	REQ(n, RHS);
214	i = n->n_nchildren;
215	REQN(i, 1);
216	n = n->n_child;
217	REQ(n, ALT);
218	compile_alt(ll, nf, n, pa, pb);
219	if (--i <= 0)
220		return;
221	n++;
222	a = *pa;
223	b = *pb;
224	*pa = addnfastate(nf);
225	*pb = addnfastate(nf);
226	addnfaarc(nf, *pa, a, EMPTY);
227	addnfaarc(nf, b, *pb, EMPTY);
228	for (; --i >= 0; n++) {
229		REQ(n, VBAR);
230		REQN(i, 1);
231		--i;
232		n++;
233		REQ(n, ALT);
234		compile_alt(ll, nf, n, &a, &b);
235		addnfaarc(nf, *pa, a, EMPTY);
236		addnfaarc(nf, b, *pb, EMPTY);
237	}
238}
239
240static
241compile_alt(ll, nf, n, pa, pb)
242	labellist *ll;
243	nfa *nf;
244	node *n;
245	int *pa, *pb;
246{
247	int i;
248	int a, b;
249
250	REQ(n, ALT);
251	i = n->n_nchildren;
252	REQN(i, 1);
253	n = n->n_child;
254	REQ(n, ITEM);
255	compile_item(ll, nf, n, pa, pb);
256	--i;
257	n++;
258	for (; --i >= 0; n++) {
259		if (n->n_type == COMMA) { /* XXX Temporary */
260			REQN(i, 1);
261			--i;
262			n++;
263		}
264		REQ(n, ITEM);
265		compile_item(ll, nf, n, &a, &b);
266		addnfaarc(nf, *pb, a, EMPTY);
267		*pb = b;
268	}
269}
270
271static
272compile_item(ll, nf, n, pa, pb)
273	labellist *ll;
274	nfa *nf;
275	node *n;
276	int *pa, *pb;
277{
278	int i;
279	int a, b;
280
281	REQ(n, ITEM);
282	i = n->n_nchildren;
283	REQN(i, 1);
284	n = n->n_child;
285	if (n->n_type == LSQB) {
286		REQN(i, 3);
287		n++;
288		REQ(n, RHS);
289		*pa = addnfastate(nf);
290		*pb = addnfastate(nf);
291		addnfaarc(nf, *pa, *pb, EMPTY);
292		compile_rhs(ll, nf, n, &a, &b);
293		addnfaarc(nf, *pa, a, EMPTY);
294		addnfaarc(nf, b, *pb, EMPTY);
295		REQN(i, 1);
296		n++;
297		REQ(n, RSQB);
298	}
299	else {
300		compile_atom(ll, nf, n, pa, pb);
301		if (--i <= 0)
302			return;
303		n++;
304		addnfaarc(nf, *pb, *pa, EMPTY);
305		if (n->n_type == STAR)
306			*pb = *pa;
307		else
308			REQ(n, PLUS);
309	}
310}
311
312static
313compile_atom(ll, nf, n, pa, pb)
314	labellist *ll;
315	nfa *nf;
316	node *n;
317	int *pa, *pb;
318{
319	int i;
320
321	REQ(n, ATOM);
322	i = n->n_nchildren;
323	REQN(i, 1);
324	n = n->n_child;
325	if (n->n_type == LPAR) {
326		REQN(i, 3);
327		n++;
328		REQ(n, RHS);
329		compile_rhs(ll, nf, n, pa, pb);
330		n++;
331		REQ(n, RPAR);
332	}
333	else if (n->n_type == NAME || n->n_type == STRING) {
334		*pa = addnfastate(nf);
335		*pb = addnfastate(nf);
336		addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
337	}
338	else
339		REQ(n, NAME);
340}
341
342static void
343dumpstate(ll, nf, istate)
344	labellist *ll;
345	nfa *nf;
346	int istate;
347{
348	nfastate *st;
349	int i;
350	nfaarc *ar;
351
352	printf("%c%2d%c",
353		istate == nf->nf_start ? '*' : ' ',
354		istate,
355		istate == nf->nf_finish ? '.' : ' ');
356	st = &nf->nf_state[istate];
357	ar = st->st_arc;
358	for (i = 0; i < st->st_narcs; i++) {
359		if (i > 0)
360			printf("\n    ");
361		printf("-> %2d  %s", ar->ar_arrow,
362			labelrepr(&ll->ll_label[ar->ar_label]));
363		ar++;
364	}
365	printf("\n");
366}
367
368static void
369dumpnfa(ll, nf)
370	labellist *ll;
371	nfa *nf;
372{
373	int i;
374
375	printf("NFA '%s' has %d states; start %d, finish %d\n",
376		nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
377	for (i = 0; i < nf->nf_nstates; i++)
378		dumpstate(ll, nf, i);
379}
380
381
382/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
383
384static int
385addclosure(ss, nf, istate)
386	bitset ss;
387	nfa *nf;
388	int istate;
389{
390	if (addbit(ss, istate)) {
391		nfastate *st = &nf->nf_state[istate];
392		nfaarc *ar = st->st_arc;
393		int i;
394
395		for (i = st->st_narcs; --i >= 0; ) {
396			if (ar->ar_label == EMPTY)
397				addclosure(ss, nf, ar->ar_arrow);
398			ar++;
399		}
400	}
401}
402
403typedef struct _ss_arc {
404	bitset	sa_bitset;
405	int	sa_arrow;
406	int	sa_label;
407} ss_arc;
408
409typedef struct _ss_state {
410	bitset	ss_ss;
411	int	ss_narcs;
412	ss_arc	*ss_arc;
413	int	ss_deleted;
414	int	ss_finish;
415	int	ss_rename;
416} ss_state;
417
418typedef struct _ss_dfa {
419	int	sd_nstates;
420	ss_state *sd_state;
421} ss_dfa;
422
423static
424makedfa(gr, nf, d)
425	nfagrammar *gr;
426	nfa *nf;
427	dfa *d;
428{
429	int nbits = nf->nf_nstates;
430	bitset ss;
431	int xx_nstates;
432	ss_state *xx_state, *yy;
433	ss_arc *zz;
434	int istate, jstate, iarc, jarc, ibit;
435	nfastate *st;
436	nfaarc *ar;
437
438	ss = newbitset(nbits);
439	addclosure(ss, nf, nf->nf_start);
440	xx_state = NEW(ss_state, 1);
441	if (xx_state == NULL)
442		fatal("no mem for xx_state in makedfa");
443	xx_nstates = 1;
444	yy = &xx_state[0];
445	yy->ss_ss = ss;
446	yy->ss_narcs = 0;
447	yy->ss_arc = NULL;
448	yy->ss_deleted = 0;
449	yy->ss_finish = testbit(ss, nf->nf_finish);
450	if (yy->ss_finish)
451		printf("Error: nonterminal '%s' may produce empty.\n",
452			nf->nf_name);
453
454	/* This algorithm is from a book written before
455	   the invention of structured programming... */
456
457	/* For each unmarked state... */
458	for (istate = 0; istate < xx_nstates; ++istate) {
459		yy = &xx_state[istate];
460		ss = yy->ss_ss;
461		/* For all its states... */
462		for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
463			if (!testbit(ss, ibit))
464				continue;
465			st = &nf->nf_state[ibit];
466			/* For all non-empty arcs from this state... */
467			for (iarc = 0; iarc < st->st_narcs; iarc++) {
468				ar = &st->st_arc[iarc];
469				if (ar->ar_label == EMPTY)
470					continue;
471				/* Look up in list of arcs from this state */
472				for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
473					zz = &yy->ss_arc[jarc];
474					if (ar->ar_label == zz->sa_label)
475						goto found;
476				}
477				/* Add new arc for this state */
478				RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1);
479				if (yy->ss_arc == NULL)
480					fatal("out of mem");
481				zz = &yy->ss_arc[yy->ss_narcs++];
482				zz->sa_label = ar->ar_label;
483				zz->sa_bitset = newbitset(nbits);
484				zz->sa_arrow = -1;
485			 found:	;
486				/* Add destination */
487				addclosure(zz->sa_bitset, nf, ar->ar_arrow);
488			}
489		}
490		/* Now look up all the arrow states */
491		for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
492			zz = &xx_state[istate].ss_arc[jarc];
493			for (jstate = 0; jstate < xx_nstates; jstate++) {
494				if (samebitset(zz->sa_bitset,
495					xx_state[jstate].ss_ss, nbits)) {
496					zz->sa_arrow = jstate;
497					goto done;
498				}
499			}
500			RESIZE(xx_state, ss_state, xx_nstates + 1);
501			if (xx_state == NULL)
502				fatal("out of mem");
503			zz->sa_arrow = xx_nstates;
504			yy = &xx_state[xx_nstates++];
505			yy->ss_ss = zz->sa_bitset;
506			yy->ss_narcs = 0;
507			yy->ss_arc = NULL;
508			yy->ss_deleted = 0;
509			yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
510		 done:	;
511		}
512	}
513
514	if (debugging)
515		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
516						"before minimizing");
517
518	simplify(xx_nstates, xx_state);
519
520	if (debugging)
521		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
522						"after minimizing");
523
524	convert(d, xx_nstates, xx_state);
525
526	/* XXX cleanup */
527}
528
529static
530printssdfa(xx_nstates, xx_state, nbits, ll, msg)
531	int xx_nstates;
532	ss_state *xx_state;
533	int nbits;
534	labellist *ll;
535	char *msg;
536{
537	int i, ibit, iarc;
538	ss_state *yy;
539	ss_arc *zz;
540
541	printf("Subset DFA %s\n", msg);
542	for (i = 0; i < xx_nstates; i++) {
543		yy = &xx_state[i];
544		if (yy->ss_deleted)
545			continue;
546		printf(" Subset %d", i);
547		if (yy->ss_finish)
548			printf(" (finish)");
549		printf(" { ");
550		for (ibit = 0; ibit < nbits; ibit++) {
551			if (testbit(yy->ss_ss, ibit))
552				printf("%d ", ibit);
553		}
554		printf("}\n");
555		for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
556			zz = &yy->ss_arc[iarc];
557			printf("  Arc to state %d, label %s\n",
558				zz->sa_arrow,
559				labelrepr(&ll->ll_label[zz->sa_label]));
560		}
561	}
562}
563
564
565/* PART THREE -- SIMPLIFY DFA */
566
567/* Simplify the DFA by repeatedly eliminating states that are
568   equivalent to another oner.  This is NOT Algorithm 3.3 from
569   [Aho&Ullman 77].  It does not always finds the minimal DFA,
570   but it does usually make a much smaller one...  (For an example
571   of sub-optimal behaviour, try S: x a b+ | y a b+.)
572*/
573
574static int
575samestate(s1, s2)
576	ss_state *s1, *s2;
577{
578	int i;
579
580	if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
581		return 0;
582	for (i = 0; i < s1->ss_narcs; i++) {
583		if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
584			s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
585			return 0;
586	}
587	return 1;
588}
589
590static void
591renamestates(xx_nstates, xx_state, from, to)
592	int xx_nstates;
593	ss_state *xx_state;
594	int from, to;
595{
596	int i, j;
597
598	if (debugging)
599		printf("Rename state %d to %d.\n", from, to);
600	for (i = 0; i < xx_nstates; i++) {
601		if (xx_state[i].ss_deleted)
602			continue;
603		for (j = 0; j < xx_state[i].ss_narcs; j++) {
604			if (xx_state[i].ss_arc[j].sa_arrow == from)
605				xx_state[i].ss_arc[j].sa_arrow = to;
606		}
607	}
608}
609
610static
611simplify(xx_nstates, xx_state)
612	int xx_nstates;
613	ss_state *xx_state;
614{
615	int changes;
616	int i, j, k;
617
618	do {
619		changes = 0;
620		for (i = 1; i < xx_nstates; i++) {
621			if (xx_state[i].ss_deleted)
622				continue;
623			for (j = 0; j < i; j++) {
624				if (xx_state[j].ss_deleted)
625					continue;
626				if (samestate(&xx_state[i], &xx_state[j])) {
627					xx_state[i].ss_deleted++;
628					renamestates(xx_nstates, xx_state, i, j);
629					changes++;
630					break;
631				}
632			}
633		}
634	} while (changes);
635}
636
637
638/* PART FOUR -- GENERATE PARSING TABLES */
639
640/* Convert the DFA into a grammar that can be used by our parser */
641
642static
643convert(d, xx_nstates, xx_state)
644	dfa *d;
645	int xx_nstates;
646	ss_state *xx_state;
647{
648	int i, j;
649	ss_state *yy;
650	ss_arc *zz;
651
652	for (i = 0; i < xx_nstates; i++) {
653		yy = &xx_state[i];
654		if (yy->ss_deleted)
655			continue;
656		yy->ss_rename = addstate(d);
657	}
658
659	for (i = 0; i < xx_nstates; i++) {
660		yy = &xx_state[i];
661		if (yy->ss_deleted)
662			continue;
663		for (j = 0; j < yy->ss_narcs; j++) {
664			zz = &yy->ss_arc[j];
665			addarc(d, yy->ss_rename,
666				xx_state[zz->sa_arrow].ss_rename,
667				zz->sa_label);
668		}
669		if (yy->ss_finish)
670			addarc(d, yy->ss_rename, yy->ss_rename, 0);
671	}
672
673	d->d_initial = 0;
674}
675
676
677/* PART FIVE -- GLUE IT ALL TOGETHER */
678
679static grammar *
680maketables(gr)
681	nfagrammar *gr;
682{
683	int i;
684	nfa *nf;
685	dfa *d;
686	grammar *g;
687
688	if (gr->gr_nnfas == 0)
689		return NULL;
690	g = newgrammar(gr->gr_nfa[0]->nf_type);
691			/* XXX first rule must be start rule */
692	g->g_ll = gr->gr_ll;
693
694	for (i = 0; i < gr->gr_nnfas; i++) {
695		nf = gr->gr_nfa[i];
696		if (debugging) {
697			printf("Dump of NFA for '%s' ...\n", nf->nf_name);
698			dumpnfa(&gr->gr_ll, nf);
699		}
700		printf("Making DFA for '%s' ...\n", nf->nf_name);
701		d = adddfa(g, nf->nf_type, nf->nf_name);
702		makedfa(gr, gr->gr_nfa[i], d);
703	}
704
705	return g;
706}
707
708grammar *
709pgen(n)
710	node *n;
711{
712	nfagrammar *gr;
713	grammar *g;
714
715	gr = metacompile(n);
716	g = maketables(gr);
717	translatelabels(g);
718	addfirstsets(g);
719	return g;
720}
721
722
723/*
724
725Description
726-----------
727
728Input is a grammar in extended BNF (using * for repetition, + for
729at-least-once repetition, [] for optional parts, | for alternatives and
730() for grouping).  This has already been parsed and turned into a parse
731tree.
732
733Each rule is considered as a regular expression in its own right.
734It is turned into a Non-deterministic Finite Automaton (NFA), which
735is then turned into a Deterministic Finite Automaton (DFA), which is then
736optimized to reduce the number of states.  See [Aho&Ullman 77] chapter 3,
737or similar compiler books (this technique is more often used for lexical
738analyzers).
739
740The DFA's are used by the parser as parsing tables in a special way
741that's probably unique.  Before they are usable, the FIRST sets of all
742non-terminals are computed.
743
744Reference
745---------
746
747[Aho&Ullman 77]
748	Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
749	(first edition)
750
751*/
752