pgen.c revision 408027ea4635de5c892b28c4fdf41840eb9089a4
1/***********************************************************
2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3The Netherlands.
4
5                        All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI or Corporation for National Research Initiatives or
13CNRI not be used in advertising or publicity pertaining to
14distribution of the software without specific, written prior
15permission.
16
17While CWI is the initial source for this software, a modified version
18is made available by the Corporation for National Research Initiatives
19(CNRI) at the Internet address ftp://ftp.python.org.
20
21STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28PERFORMANCE OF THIS SOFTWARE.
29
30******************************************************************/
31
32/* Parser generator */
33/* XXX This file is not yet fully PROTOized */
34
35/* For a description, see the comments at end of this file */
36
37#include "pgenheaders.h"
38#include "assert.h"
39#include "token.h"
40#include "node.h"
41#include "grammar.h"
42#include "metagrammar.h"
43#include "pgen.h"
44
45extern int debugging;
46
47
48/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
49
50typedef struct _nfaarc {
51	int	ar_label;
52	int	ar_arrow;
53} nfaarc;
54
55typedef struct _nfastate {
56	int	st_narcs;
57	nfaarc	*st_arc;
58} nfastate;
59
60typedef struct _nfa {
61	int		nf_type;
62	char		*nf_name;
63	int		nf_nstates;
64	nfastate	*nf_state;
65	int		nf_start, nf_finish;
66} nfa;
67
68/* Forward */
69static void compile_rhs PROTO((labellist *ll,
70			       nfa *nf, node *n, int *pa, int *pb));
71static void compile_alt PROTO((labellist *ll,
72			       nfa *nf, node *n, int *pa, int *pb));
73static void compile_item PROTO((labellist *ll,
74				nfa *nf, node *n, int *pa, int *pb));
75static void compile_atom PROTO((labellist *ll,
76				nfa *nf, node *n, int *pa, int *pb));
77
78static int
79addnfastate(nf)
80	nfa *nf;
81{
82	nfastate *st;
83
84	RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
85	if (nf->nf_state == NULL)
86		fatal("out of mem");
87	st = &nf->nf_state[nf->nf_nstates++];
88	st->st_narcs = 0;
89	st->st_arc = NULL;
90	return st - nf->nf_state;
91}
92
93static void
94addnfaarc(nf, from, to, lbl)
95	nfa *nf;
96	int from, to, lbl;
97{
98	nfastate *st;
99	nfaarc *ar;
100
101	st = &nf->nf_state[from];
102	RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
103	if (st->st_arc == NULL)
104		fatal("out of mem");
105	ar = &st->st_arc[st->st_narcs++];
106	ar->ar_label = lbl;
107	ar->ar_arrow = to;
108}
109
110static nfa *
111newnfa(name)
112	char *name;
113{
114	nfa *nf;
115	static type = NT_OFFSET; /* All types will be disjunct */
116
117	nf = NEW(nfa, 1);
118	if (nf == NULL)
119		fatal("no mem for new nfa");
120	nf->nf_type = type++;
121	nf->nf_name = name; /* XXX strdup(name) ??? */
122	nf->nf_nstates = 0;
123	nf->nf_state = NULL;
124	nf->nf_start = nf->nf_finish = -1;
125	return nf;
126}
127
128typedef struct _nfagrammar {
129	int		gr_nnfas;
130	nfa		**gr_nfa;
131	labellist	gr_ll;
132} nfagrammar;
133
134/* Forward */
135static void compile_rule PROTO((nfagrammar *gr, node *n));
136
137static nfagrammar *
138newnfagrammar()
139{
140	nfagrammar *gr;
141
142	gr = NEW(nfagrammar, 1);
143	if (gr == NULL)
144		fatal("no mem for new nfa grammar");
145	gr->gr_nnfas = 0;
146	gr->gr_nfa = NULL;
147	gr->gr_ll.ll_nlabels = 0;
148	gr->gr_ll.ll_label = NULL;
149	addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
150	return gr;
151}
152
153static nfa *
154addnfa(gr, name)
155	nfagrammar *gr;
156	char *name;
157{
158	nfa *nf;
159
160	nf = newnfa(name);
161	RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
162	if (gr->gr_nfa == NULL)
163		fatal("out of mem");
164	gr->gr_nfa[gr->gr_nnfas++] = nf;
165	addlabel(&gr->gr_ll, NAME, nf->nf_name);
166	return nf;
167}
168
169#ifdef Py_DEBUG
170
171static char REQNFMT[] = "metacompile: less than %d children\n";
172
173#define REQN(i, count) \
174 	if (i < count) { \
175		fprintf(stderr, REQNFMT, count); \
176		fatal("REQN"); \
177	} else
178
179#else
180#define REQN(i, count)	/* empty */
181#endif
182
183static nfagrammar *
184metacompile(n)
185	node *n;
186{
187	nfagrammar *gr;
188	int i;
189
190	printf("Compiling (meta-) parse tree into NFA grammar\n");
191	gr = newnfagrammar();
192	REQ(n, MSTART);
193	i = n->n_nchildren - 1; /* Last child is ENDMARKER */
194	n = n->n_child;
195	for (; --i >= 0; n++) {
196		if (n->n_type != NEWLINE)
197			compile_rule(gr, n);
198	}
199	return gr;
200}
201
202static void
203compile_rule(gr, n)
204	nfagrammar *gr;
205	node *n;
206{
207	nfa *nf;
208
209	REQ(n, RULE);
210	REQN(n->n_nchildren, 4);
211	n = n->n_child;
212	REQ(n, NAME);
213	nf = addnfa(gr, n->n_str);
214	n++;
215	REQ(n, COLON);
216	n++;
217	REQ(n, RHS);
218	compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
219	n++;
220	REQ(n, NEWLINE);
221}
222
223static void
224compile_rhs(ll, nf, n, pa, pb)
225	labellist *ll;
226	nfa *nf;
227	node *n;
228	int *pa, *pb;
229{
230	int i;
231	int a, b;
232
233	REQ(n, RHS);
234	i = n->n_nchildren;
235	REQN(i, 1);
236	n = n->n_child;
237	REQ(n, ALT);
238	compile_alt(ll, nf, n, pa, pb);
239	if (--i <= 0)
240		return;
241	n++;
242	a = *pa;
243	b = *pb;
244	*pa = addnfastate(nf);
245	*pb = addnfastate(nf);
246	addnfaarc(nf, *pa, a, EMPTY);
247	addnfaarc(nf, b, *pb, EMPTY);
248	for (; --i >= 0; n++) {
249		REQ(n, VBAR);
250		REQN(i, 1);
251		--i;
252		n++;
253		REQ(n, ALT);
254		compile_alt(ll, nf, n, &a, &b);
255		addnfaarc(nf, *pa, a, EMPTY);
256		addnfaarc(nf, b, *pb, EMPTY);
257	}
258}
259
260static void
261compile_alt(ll, nf, n, pa, pb)
262	labellist *ll;
263	nfa *nf;
264	node *n;
265	int *pa, *pb;
266{
267	int i;
268	int a, b;
269
270	REQ(n, ALT);
271	i = n->n_nchildren;
272	REQN(i, 1);
273	n = n->n_child;
274	REQ(n, ITEM);
275	compile_item(ll, nf, n, pa, pb);
276	--i;
277	n++;
278	for (; --i >= 0; n++) {
279		if (n->n_type == COMMA) { /* XXX Temporary */
280			REQN(i, 1);
281			--i;
282			n++;
283		}
284		REQ(n, ITEM);
285		compile_item(ll, nf, n, &a, &b);
286		addnfaarc(nf, *pb, a, EMPTY);
287		*pb = b;
288	}
289}
290
291static void
292compile_item(ll, nf, n, pa, pb)
293	labellist *ll;
294	nfa *nf;
295	node *n;
296	int *pa, *pb;
297{
298	int i;
299	int a, b;
300
301	REQ(n, ITEM);
302	i = n->n_nchildren;
303	REQN(i, 1);
304	n = n->n_child;
305	if (n->n_type == LSQB) {
306		REQN(i, 3);
307		n++;
308		REQ(n, RHS);
309		*pa = addnfastate(nf);
310		*pb = addnfastate(nf);
311		addnfaarc(nf, *pa, *pb, EMPTY);
312		compile_rhs(ll, nf, n, &a, &b);
313		addnfaarc(nf, *pa, a, EMPTY);
314		addnfaarc(nf, b, *pb, EMPTY);
315		REQN(i, 1);
316		n++;
317		REQ(n, RSQB);
318	}
319	else {
320		compile_atom(ll, nf, n, pa, pb);
321		if (--i <= 0)
322			return;
323		n++;
324		addnfaarc(nf, *pb, *pa, EMPTY);
325		if (n->n_type == STAR)
326			*pb = *pa;
327		else
328			REQ(n, PLUS);
329	}
330}
331
332static void
333compile_atom(ll, nf, n, pa, pb)
334	labellist *ll;
335	nfa *nf;
336	node *n;
337	int *pa, *pb;
338{
339	int i;
340
341	REQ(n, ATOM);
342	i = n->n_nchildren;
343	REQN(i, 1);
344	n = n->n_child;
345	if (n->n_type == LPAR) {
346		REQN(i, 3);
347		n++;
348		REQ(n, RHS);
349		compile_rhs(ll, nf, n, pa, pb);
350		n++;
351		REQ(n, RPAR);
352	}
353	else if (n->n_type == NAME || n->n_type == STRING) {
354		*pa = addnfastate(nf);
355		*pb = addnfastate(nf);
356		addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
357	}
358	else
359		REQ(n, NAME);
360}
361
362static void
363dumpstate(ll, nf, istate)
364	labellist *ll;
365	nfa *nf;
366	int istate;
367{
368	nfastate *st;
369	int i;
370	nfaarc *ar;
371
372	printf("%c%2d%c",
373		istate == nf->nf_start ? '*' : ' ',
374		istate,
375		istate == nf->nf_finish ? '.' : ' ');
376	st = &nf->nf_state[istate];
377	ar = st->st_arc;
378	for (i = 0; i < st->st_narcs; i++) {
379		if (i > 0)
380			printf("\n    ");
381		printf("-> %2d  %s", ar->ar_arrow,
382			labelrepr(&ll->ll_label[ar->ar_label]));
383		ar++;
384	}
385	printf("\n");
386}
387
388static void
389dumpnfa(ll, nf)
390	labellist *ll;
391	nfa *nf;
392{
393	int i;
394
395	printf("NFA '%s' has %d states; start %d, finish %d\n",
396		nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
397	for (i = 0; i < nf->nf_nstates; i++)
398		dumpstate(ll, nf, i);
399}
400
401
402/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
403
404static void
405addclosure(ss, nf, istate)
406	bitset ss;
407	nfa *nf;
408	int istate;
409{
410	if (addbit(ss, istate)) {
411		nfastate *st = &nf->nf_state[istate];
412		nfaarc *ar = st->st_arc;
413		int i;
414
415		for (i = st->st_narcs; --i >= 0; ) {
416			if (ar->ar_label == EMPTY)
417				addclosure(ss, nf, ar->ar_arrow);
418			ar++;
419		}
420	}
421}
422
423typedef struct _ss_arc {
424	bitset	sa_bitset;
425	int	sa_arrow;
426	int	sa_label;
427} ss_arc;
428
429typedef struct _ss_state {
430	bitset	ss_ss;
431	int	ss_narcs;
432	ss_arc	*ss_arc;
433	int	ss_deleted;
434	int	ss_finish;
435	int	ss_rename;
436} ss_state;
437
438typedef struct _ss_dfa {
439	int	sd_nstates;
440	ss_state *sd_state;
441} ss_dfa;
442
443/* Forward */
444static void printssdfa PROTO((int xx_nstates, ss_state *xx_state, int nbits,
445			      labellist *ll, char *msg));
446static void simplify PROTO((int xx_nstates, ss_state *xx_state));
447static void convert PROTO((dfa *d, int xx_nstates, ss_state *xx_state));
448
449static void
450makedfa(gr, nf, d)
451	nfagrammar *gr;
452	nfa *nf;
453	dfa *d;
454{
455	int nbits = nf->nf_nstates;
456	bitset ss;
457	int xx_nstates;
458	ss_state *xx_state, *yy;
459	ss_arc *zz;
460	int istate, jstate, iarc, jarc, ibit;
461	nfastate *st;
462	nfaarc *ar;
463
464	ss = newbitset(nbits);
465	addclosure(ss, nf, nf->nf_start);
466	xx_state = NEW(ss_state, 1);
467	if (xx_state == NULL)
468		fatal("no mem for xx_state in makedfa");
469	xx_nstates = 1;
470	yy = &xx_state[0];
471	yy->ss_ss = ss;
472	yy->ss_narcs = 0;
473	yy->ss_arc = NULL;
474	yy->ss_deleted = 0;
475	yy->ss_finish = testbit(ss, nf->nf_finish);
476	if (yy->ss_finish)
477		printf("Error: nonterminal '%s' may produce empty.\n",
478			nf->nf_name);
479
480	/* This algorithm is from a book written before
481	   the invention of structured programming... */
482
483	/* For each unmarked state... */
484	for (istate = 0; istate < xx_nstates; ++istate) {
485		yy = &xx_state[istate];
486		ss = yy->ss_ss;
487		/* For all its states... */
488		for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
489			if (!testbit(ss, ibit))
490				continue;
491			st = &nf->nf_state[ibit];
492			/* For all non-empty arcs from this state... */
493			for (iarc = 0; iarc < st->st_narcs; iarc++) {
494				ar = &st->st_arc[iarc];
495				if (ar->ar_label == EMPTY)
496					continue;
497				/* Look up in list of arcs from this state */
498				for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
499					zz = &yy->ss_arc[jarc];
500					if (ar->ar_label == zz->sa_label)
501						goto found;
502				}
503				/* Add new arc for this state */
504				RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1);
505				if (yy->ss_arc == NULL)
506					fatal("out of mem");
507				zz = &yy->ss_arc[yy->ss_narcs++];
508				zz->sa_label = ar->ar_label;
509				zz->sa_bitset = newbitset(nbits);
510				zz->sa_arrow = -1;
511			 found:	;
512				/* Add destination */
513				addclosure(zz->sa_bitset, nf, ar->ar_arrow);
514			}
515		}
516		/* Now look up all the arrow states */
517		for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
518			zz = &xx_state[istate].ss_arc[jarc];
519			for (jstate = 0; jstate < xx_nstates; jstate++) {
520				if (samebitset(zz->sa_bitset,
521					xx_state[jstate].ss_ss, nbits)) {
522					zz->sa_arrow = jstate;
523					goto done;
524				}
525			}
526			RESIZE(xx_state, ss_state, xx_nstates + 1);
527			if (xx_state == NULL)
528				fatal("out of mem");
529			zz->sa_arrow = xx_nstates;
530			yy = &xx_state[xx_nstates++];
531			yy->ss_ss = zz->sa_bitset;
532			yy->ss_narcs = 0;
533			yy->ss_arc = NULL;
534			yy->ss_deleted = 0;
535			yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
536		 done:	;
537		}
538	}
539
540	if (debugging)
541		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
542						"before minimizing");
543
544	simplify(xx_nstates, xx_state);
545
546	if (debugging)
547		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
548						"after minimizing");
549
550	convert(d, xx_nstates, xx_state);
551
552	/* XXX cleanup */
553}
554
555static void
556printssdfa(xx_nstates, xx_state, nbits, ll, msg)
557	int xx_nstates;
558	ss_state *xx_state;
559	int nbits;
560	labellist *ll;
561	char *msg;
562{
563	int i, ibit, iarc;
564	ss_state *yy;
565	ss_arc *zz;
566
567	printf("Subset DFA %s\n", msg);
568	for (i = 0; i < xx_nstates; i++) {
569		yy = &xx_state[i];
570		if (yy->ss_deleted)
571			continue;
572		printf(" Subset %d", i);
573		if (yy->ss_finish)
574			printf(" (finish)");
575		printf(" { ");
576		for (ibit = 0; ibit < nbits; ibit++) {
577			if (testbit(yy->ss_ss, ibit))
578				printf("%d ", ibit);
579		}
580		printf("}\n");
581		for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
582			zz = &yy->ss_arc[iarc];
583			printf("  Arc to state %d, label %s\n",
584				zz->sa_arrow,
585				labelrepr(&ll->ll_label[zz->sa_label]));
586		}
587	}
588}
589
590
591/* PART THREE -- SIMPLIFY DFA */
592
593/* Simplify the DFA by repeatedly eliminating states that are
594   equivalent to another oner.  This is NOT Algorithm 3.3 from
595   [Aho&Ullman 77].  It does not always finds the minimal DFA,
596   but it does usually make a much smaller one...  (For an example
597   of sub-optimal behaviour, try S: x a b+ | y a b+.)
598*/
599
600static int
601samestate(s1, s2)
602	ss_state *s1, *s2;
603{
604	int i;
605
606	if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
607		return 0;
608	for (i = 0; i < s1->ss_narcs; i++) {
609		if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
610			s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
611			return 0;
612	}
613	return 1;
614}
615
616static void
617renamestates(xx_nstates, xx_state, from, to)
618	int xx_nstates;
619	ss_state *xx_state;
620	int from, to;
621{
622	int i, j;
623
624	if (debugging)
625		printf("Rename state %d to %d.\n", from, to);
626	for (i = 0; i < xx_nstates; i++) {
627		if (xx_state[i].ss_deleted)
628			continue;
629		for (j = 0; j < xx_state[i].ss_narcs; j++) {
630			if (xx_state[i].ss_arc[j].sa_arrow == from)
631				xx_state[i].ss_arc[j].sa_arrow = to;
632		}
633	}
634}
635
636static void
637simplify(xx_nstates, xx_state)
638	int xx_nstates;
639	ss_state *xx_state;
640{
641	int changes;
642	int i, j;
643
644	do {
645		changes = 0;
646		for (i = 1; i < xx_nstates; i++) {
647			if (xx_state[i].ss_deleted)
648				continue;
649			for (j = 0; j < i; j++) {
650				if (xx_state[j].ss_deleted)
651					continue;
652				if (samestate(&xx_state[i], &xx_state[j])) {
653					xx_state[i].ss_deleted++;
654					renamestates(xx_nstates, xx_state, i, j);
655					changes++;
656					break;
657				}
658			}
659		}
660	} while (changes);
661}
662
663
664/* PART FOUR -- GENERATE PARSING TABLES */
665
666/* Convert the DFA into a grammar that can be used by our parser */
667
668static void
669convert(d, xx_nstates, xx_state)
670	dfa *d;
671	int xx_nstates;
672	ss_state *xx_state;
673{
674	int i, j;
675	ss_state *yy;
676	ss_arc *zz;
677
678	for (i = 0; i < xx_nstates; i++) {
679		yy = &xx_state[i];
680		if (yy->ss_deleted)
681			continue;
682		yy->ss_rename = addstate(d);
683	}
684
685	for (i = 0; i < xx_nstates; i++) {
686		yy = &xx_state[i];
687		if (yy->ss_deleted)
688			continue;
689		for (j = 0; j < yy->ss_narcs; j++) {
690			zz = &yy->ss_arc[j];
691			addarc(d, yy->ss_rename,
692				xx_state[zz->sa_arrow].ss_rename,
693				zz->sa_label);
694		}
695		if (yy->ss_finish)
696			addarc(d, yy->ss_rename, yy->ss_rename, 0);
697	}
698
699	d->d_initial = 0;
700}
701
702
703/* PART FIVE -- GLUE IT ALL TOGETHER */
704
705static grammar *
706maketables(gr)
707	nfagrammar *gr;
708{
709	int i;
710	nfa *nf;
711	dfa *d;
712	grammar *g;
713
714	if (gr->gr_nnfas == 0)
715		return NULL;
716	g = newgrammar(gr->gr_nfa[0]->nf_type);
717			/* XXX first rule must be start rule */
718	g->g_ll = gr->gr_ll;
719
720	for (i = 0; i < gr->gr_nnfas; i++) {
721		nf = gr->gr_nfa[i];
722		if (debugging) {
723			printf("Dump of NFA for '%s' ...\n", nf->nf_name);
724			dumpnfa(&gr->gr_ll, nf);
725		}
726		printf("Making DFA for '%s' ...\n", nf->nf_name);
727		d = adddfa(g, nf->nf_type, nf->nf_name);
728		makedfa(gr, gr->gr_nfa[i], d);
729	}
730
731	return g;
732}
733
734grammar *
735pgen(n)
736	node *n;
737{
738	nfagrammar *gr;
739	grammar *g;
740
741	gr = metacompile(n);
742	g = maketables(gr);
743	translatelabels(g);
744	addfirstsets(g);
745	return g;
746}
747
748
749/*
750
751Description
752-----------
753
754Input is a grammar in extended BNF (using * for repetition, + for
755at-least-once repetition, [] for optional parts, | for alternatives and
756() for grouping).  This has already been parsed and turned into a parse
757tree.
758
759Each rule is considered as a regular expression in its own right.
760It is turned into a Non-deterministic Finite Automaton (NFA), which
761is then turned into a Deterministic Finite Automaton (DFA), which is then
762optimized to reduce the number of states.  See [Aho&Ullman 77] chapter 3,
763or similar compiler books (this technique is more often used for lexical
764analyzers).
765
766The DFA's are used by the parser as parsing tables in a special way
767that's probably unique.  Before they are usable, the FIRST sets of all
768non-terminals are computed.
769
770Reference
771---------
772
773[Aho&Ullman 77]
774	Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
775	(first edition)
776
777*/
778