pgen.c revision 86bea46b3d16c4ed0453e17f241ddbdfade76c98
1/***********************************************************
2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3The Netherlands.
4
5                        All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI or Corporation for National Research Initiatives or
13CNRI not be used in advertising or publicity pertaining to
14distribution of the software without specific, written prior
15permission.
16
17While CWI is the initial source for this software, a modified version
18is made available by the Corporation for National Research Initiatives
19(CNRI) at the Internet address ftp://ftp.python.org.
20
21STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28PERFORMANCE OF THIS SOFTWARE.
29
30******************************************************************/
31
32/* Parser generator */
33/* XXX This file is not yet fully PROTOized */
34
35/* For a description, see the comments at end of this file */
36
37#include "pgenheaders.h"
38#include "assert.h"
39#include "token.h"
40#include "node.h"
41#include "grammar.h"
42#include "metagrammar.h"
43#include "pgen.h"
44
45extern int Py_DebugFlag;
46
47
48/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
49
50typedef struct _nfaarc {
51	int	ar_label;
52	int	ar_arrow;
53} nfaarc;
54
55typedef struct _nfastate {
56	int	st_narcs;
57	nfaarc	*st_arc;
58} nfastate;
59
60typedef struct _nfa {
61	int		nf_type;
62	char		*nf_name;
63	int		nf_nstates;
64	nfastate	*nf_state;
65	int		nf_start, nf_finish;
66} nfa;
67
68/* Forward */
69static void compile_rhs Py_PROTO((labellist *ll,
70			       nfa *nf, node *n, int *pa, int *pb));
71static void compile_alt Py_PROTO((labellist *ll,
72			       nfa *nf, node *n, int *pa, int *pb));
73static void compile_item Py_PROTO((labellist *ll,
74				nfa *nf, node *n, int *pa, int *pb));
75static void compile_atom Py_PROTO((labellist *ll,
76				nfa *nf, node *n, int *pa, int *pb));
77
78static int
79addnfastate(nf)
80	nfa *nf;
81{
82	nfastate *st;
83
84	PyMem_RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
85	if (nf->nf_state == NULL)
86		Py_FatalError("out of mem");
87	st = &nf->nf_state[nf->nf_nstates++];
88	st->st_narcs = 0;
89	st->st_arc = NULL;
90	return st - nf->nf_state;
91}
92
93static void
94addnfaarc(nf, from, to, lbl)
95	nfa *nf;
96	int from, to, lbl;
97{
98	nfastate *st;
99	nfaarc *ar;
100
101	st = &nf->nf_state[from];
102	PyMem_RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
103	if (st->st_arc == NULL)
104		Py_FatalError("out of mem");
105	ar = &st->st_arc[st->st_narcs++];
106	ar->ar_label = lbl;
107	ar->ar_arrow = to;
108}
109
110static nfa *
111newnfa(name)
112	char *name;
113{
114	nfa *nf;
115	static type = NT_OFFSET; /* All types will be disjunct */
116
117	nf = PyMem_NEW(nfa, 1);
118	if (nf == NULL)
119		Py_FatalError("no mem for new nfa");
120	nf->nf_type = type++;
121	nf->nf_name = name; /* XXX strdup(name) ??? */
122	nf->nf_nstates = 0;
123	nf->nf_state = NULL;
124	nf->nf_start = nf->nf_finish = -1;
125	return nf;
126}
127
128typedef struct _nfagrammar {
129	int		gr_nnfas;
130	nfa		**gr_nfa;
131	labellist	gr_ll;
132} nfagrammar;
133
134/* Forward */
135static void compile_rule Py_PROTO((nfagrammar *gr, node *n));
136
137static nfagrammar *
138newnfagrammar()
139{
140	nfagrammar *gr;
141
142	gr = PyMem_NEW(nfagrammar, 1);
143	if (gr == NULL)
144		Py_FatalError("no mem for new nfa grammar");
145	gr->gr_nnfas = 0;
146	gr->gr_nfa = NULL;
147	gr->gr_ll.ll_nlabels = 0;
148	gr->gr_ll.ll_label = NULL;
149	addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
150	return gr;
151}
152
153static nfa *
154addnfa(gr, name)
155	nfagrammar *gr;
156	char *name;
157{
158	nfa *nf;
159
160	nf = newnfa(name);
161	PyMem_RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
162	if (gr->gr_nfa == NULL)
163		Py_FatalError("out of mem");
164	gr->gr_nfa[gr->gr_nnfas++] = nf;
165	addlabel(&gr->gr_ll, NAME, nf->nf_name);
166	return nf;
167}
168
169#ifdef Py_DEBUG
170
171static char REQNFMT[] = "metacompile: less than %d children\n";
172
173#define REQN(i, count) \
174 	if (i < count) { \
175		fprintf(stderr, REQNFMT, count); \
176		Py_FatalError("REQN"); \
177	} else
178
179#else
180#define REQN(i, count)	/* empty */
181#endif
182
183static nfagrammar *
184metacompile(n)
185	node *n;
186{
187	nfagrammar *gr;
188	int i;
189
190	printf("Compiling (meta-) parse tree into NFA grammar\n");
191	gr = newnfagrammar();
192	REQ(n, MSTART);
193	i = n->n_nchildren - 1; /* Last child is ENDMARKER */
194	n = n->n_child;
195	for (; --i >= 0; n++) {
196		if (n->n_type != NEWLINE)
197			compile_rule(gr, n);
198	}
199	return gr;
200}
201
202static void
203compile_rule(gr, n)
204	nfagrammar *gr;
205	node *n;
206{
207	nfa *nf;
208
209	REQ(n, RULE);
210	REQN(n->n_nchildren, 4);
211	n = n->n_child;
212	REQ(n, NAME);
213	nf = addnfa(gr, n->n_str);
214	n++;
215	REQ(n, COLON);
216	n++;
217	REQ(n, RHS);
218	compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
219	n++;
220	REQ(n, NEWLINE);
221}
222
223static void
224compile_rhs(ll, nf, n, pa, pb)
225	labellist *ll;
226	nfa *nf;
227	node *n;
228	int *pa, *pb;
229{
230	int i;
231	int a, b;
232
233	REQ(n, RHS);
234	i = n->n_nchildren;
235	REQN(i, 1);
236	n = n->n_child;
237	REQ(n, ALT);
238	compile_alt(ll, nf, n, pa, pb);
239	if (--i <= 0)
240		return;
241	n++;
242	a = *pa;
243	b = *pb;
244	*pa = addnfastate(nf);
245	*pb = addnfastate(nf);
246	addnfaarc(nf, *pa, a, EMPTY);
247	addnfaarc(nf, b, *pb, EMPTY);
248	for (; --i >= 0; n++) {
249		REQ(n, VBAR);
250		REQN(i, 1);
251		--i;
252		n++;
253		REQ(n, ALT);
254		compile_alt(ll, nf, n, &a, &b);
255		addnfaarc(nf, *pa, a, EMPTY);
256		addnfaarc(nf, b, *pb, EMPTY);
257	}
258}
259
260static void
261compile_alt(ll, nf, n, pa, pb)
262	labellist *ll;
263	nfa *nf;
264	node *n;
265	int *pa, *pb;
266{
267	int i;
268	int a, b;
269
270	REQ(n, ALT);
271	i = n->n_nchildren;
272	REQN(i, 1);
273	n = n->n_child;
274	REQ(n, ITEM);
275	compile_item(ll, nf, n, pa, pb);
276	--i;
277	n++;
278	for (; --i >= 0; n++) {
279		if (n->n_type == COMMA) { /* XXX Temporary */
280			REQN(i, 1);
281			--i;
282			n++;
283		}
284		REQ(n, ITEM);
285		compile_item(ll, nf, n, &a, &b);
286		addnfaarc(nf, *pb, a, EMPTY);
287		*pb = b;
288	}
289}
290
291static void
292compile_item(ll, nf, n, pa, pb)
293	labellist *ll;
294	nfa *nf;
295	node *n;
296	int *pa, *pb;
297{
298	int i;
299	int a, b;
300
301	REQ(n, ITEM);
302	i = n->n_nchildren;
303	REQN(i, 1);
304	n = n->n_child;
305	if (n->n_type == LSQB) {
306		REQN(i, 3);
307		n++;
308		REQ(n, RHS);
309		*pa = addnfastate(nf);
310		*pb = addnfastate(nf);
311		addnfaarc(nf, *pa, *pb, EMPTY);
312		compile_rhs(ll, nf, n, &a, &b);
313		addnfaarc(nf, *pa, a, EMPTY);
314		addnfaarc(nf, b, *pb, EMPTY);
315		REQN(i, 1);
316		n++;
317		REQ(n, RSQB);
318	}
319	else {
320		compile_atom(ll, nf, n, pa, pb);
321		if (--i <= 0)
322			return;
323		n++;
324		addnfaarc(nf, *pb, *pa, EMPTY);
325		if (n->n_type == STAR)
326			*pb = *pa;
327		else
328			REQ(n, PLUS);
329	}
330}
331
332static void
333compile_atom(ll, nf, n, pa, pb)
334	labellist *ll;
335	nfa *nf;
336	node *n;
337	int *pa, *pb;
338{
339	int i;
340
341	REQ(n, ATOM);
342	i = n->n_nchildren;
343	REQN(i, 1);
344	n = n->n_child;
345	if (n->n_type == LPAR) {
346		REQN(i, 3);
347		n++;
348		REQ(n, RHS);
349		compile_rhs(ll, nf, n, pa, pb);
350		n++;
351		REQ(n, RPAR);
352	}
353	else if (n->n_type == NAME || n->n_type == STRING) {
354		*pa = addnfastate(nf);
355		*pb = addnfastate(nf);
356		addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
357	}
358	else
359		REQ(n, NAME);
360}
361
362static void
363dumpstate(ll, nf, istate)
364	labellist *ll;
365	nfa *nf;
366	int istate;
367{
368	nfastate *st;
369	int i;
370	nfaarc *ar;
371
372	printf("%c%2d%c",
373		istate == nf->nf_start ? '*' : ' ',
374		istate,
375		istate == nf->nf_finish ? '.' : ' ');
376	st = &nf->nf_state[istate];
377	ar = st->st_arc;
378	for (i = 0; i < st->st_narcs; i++) {
379		if (i > 0)
380			printf("\n    ");
381		printf("-> %2d  %s", ar->ar_arrow,
382			PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
383		ar++;
384	}
385	printf("\n");
386}
387
388static void
389dumpnfa(ll, nf)
390	labellist *ll;
391	nfa *nf;
392{
393	int i;
394
395	printf("NFA '%s' has %d states; start %d, finish %d\n",
396		nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
397	for (i = 0; i < nf->nf_nstates; i++)
398		dumpstate(ll, nf, i);
399}
400
401
402/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
403
404static void
405addclosure(ss, nf, istate)
406	bitset ss;
407	nfa *nf;
408	int istate;
409{
410	if (addbit(ss, istate)) {
411		nfastate *st = &nf->nf_state[istate];
412		nfaarc *ar = st->st_arc;
413		int i;
414
415		for (i = st->st_narcs; --i >= 0; ) {
416			if (ar->ar_label == EMPTY)
417				addclosure(ss, nf, ar->ar_arrow);
418			ar++;
419		}
420	}
421}
422
423typedef struct _ss_arc {
424	bitset	sa_bitset;
425	int	sa_arrow;
426	int	sa_label;
427} ss_arc;
428
429typedef struct _ss_state {
430	bitset	ss_ss;
431	int	ss_narcs;
432	ss_arc	*ss_arc;
433	int	ss_deleted;
434	int	ss_finish;
435	int	ss_rename;
436} ss_state;
437
438typedef struct _ss_dfa {
439	int	sd_nstates;
440	ss_state *sd_state;
441} ss_dfa;
442
443/* Forward */
444static void printssdfa Py_PROTO((int xx_nstates, ss_state *xx_state, int nbits,
445			      labellist *ll, char *msg));
446static void simplify Py_PROTO((int xx_nstates, ss_state *xx_state));
447static void convert Py_PROTO((dfa *d, int xx_nstates, ss_state *xx_state));
448
449static void
450makedfa(gr, nf, d)
451	nfagrammar *gr;
452	nfa *nf;
453	dfa *d;
454{
455	int nbits = nf->nf_nstates;
456	bitset ss;
457	int xx_nstates;
458	ss_state *xx_state, *yy;
459	ss_arc *zz;
460	int istate, jstate, iarc, jarc, ibit;
461	nfastate *st;
462	nfaarc *ar;
463
464	ss = newbitset(nbits);
465	addclosure(ss, nf, nf->nf_start);
466	xx_state = PyMem_NEW(ss_state, 1);
467	if (xx_state == NULL)
468		Py_FatalError("no mem for xx_state in makedfa");
469	xx_nstates = 1;
470	yy = &xx_state[0];
471	yy->ss_ss = ss;
472	yy->ss_narcs = 0;
473	yy->ss_arc = NULL;
474	yy->ss_deleted = 0;
475	yy->ss_finish = testbit(ss, nf->nf_finish);
476	if (yy->ss_finish)
477		printf("Error: nonterminal '%s' may produce empty.\n",
478			nf->nf_name);
479
480	/* This algorithm is from a book written before
481	   the invention of structured programming... */
482
483	/* For each unmarked state... */
484	for (istate = 0; istate < xx_nstates; ++istate) {
485		yy = &xx_state[istate];
486		ss = yy->ss_ss;
487		/* For all its states... */
488		for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
489			if (!testbit(ss, ibit))
490				continue;
491			st = &nf->nf_state[ibit];
492			/* For all non-empty arcs from this state... */
493			for (iarc = 0; iarc < st->st_narcs; iarc++) {
494				ar = &st->st_arc[iarc];
495				if (ar->ar_label == EMPTY)
496					continue;
497				/* Look up in list of arcs from this state */
498				for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
499					zz = &yy->ss_arc[jarc];
500					if (ar->ar_label == zz->sa_label)
501						goto found;
502				}
503				/* Add new arc for this state */
504				PyMem_RESIZE(yy->ss_arc, ss_arc,
505					     yy->ss_narcs + 1);
506				if (yy->ss_arc == NULL)
507					Py_FatalError("out of mem");
508				zz = &yy->ss_arc[yy->ss_narcs++];
509				zz->sa_label = ar->ar_label;
510				zz->sa_bitset = newbitset(nbits);
511				zz->sa_arrow = -1;
512			 found:	;
513				/* Add destination */
514				addclosure(zz->sa_bitset, nf, ar->ar_arrow);
515			}
516		}
517		/* Now look up all the arrow states */
518		for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
519			zz = &xx_state[istate].ss_arc[jarc];
520			for (jstate = 0; jstate < xx_nstates; jstate++) {
521				if (samebitset(zz->sa_bitset,
522					xx_state[jstate].ss_ss, nbits)) {
523					zz->sa_arrow = jstate;
524					goto done;
525				}
526			}
527			PyMem_RESIZE(xx_state, ss_state, xx_nstates + 1);
528			if (xx_state == NULL)
529				Py_FatalError("out of mem");
530			zz->sa_arrow = xx_nstates;
531			yy = &xx_state[xx_nstates++];
532			yy->ss_ss = zz->sa_bitset;
533			yy->ss_narcs = 0;
534			yy->ss_arc = NULL;
535			yy->ss_deleted = 0;
536			yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
537		 done:	;
538		}
539	}
540
541	if (Py_DebugFlag)
542		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
543						"before minimizing");
544
545	simplify(xx_nstates, xx_state);
546
547	if (Py_DebugFlag)
548		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
549						"after minimizing");
550
551	convert(d, xx_nstates, xx_state);
552
553	/* XXX cleanup */
554}
555
556static void
557printssdfa(xx_nstates, xx_state, nbits, ll, msg)
558	int xx_nstates;
559	ss_state *xx_state;
560	int nbits;
561	labellist *ll;
562	char *msg;
563{
564	int i, ibit, iarc;
565	ss_state *yy;
566	ss_arc *zz;
567
568	printf("Subset DFA %s\n", msg);
569	for (i = 0; i < xx_nstates; i++) {
570		yy = &xx_state[i];
571		if (yy->ss_deleted)
572			continue;
573		printf(" Subset %d", i);
574		if (yy->ss_finish)
575			printf(" (finish)");
576		printf(" { ");
577		for (ibit = 0; ibit < nbits; ibit++) {
578			if (testbit(yy->ss_ss, ibit))
579				printf("%d ", ibit);
580		}
581		printf("}\n");
582		for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
583			zz = &yy->ss_arc[iarc];
584			printf("  Arc to state %d, label %s\n",
585				zz->sa_arrow,
586				PyGrammar_LabelRepr(
587					&ll->ll_label[zz->sa_label]));
588		}
589	}
590}
591
592
593/* PART THREE -- SIMPLIFY DFA */
594
595/* Simplify the DFA by repeatedly eliminating states that are
596   equivalent to another oner.  This is NOT Algorithm 3.3 from
597   [Aho&Ullman 77].  It does not always finds the minimal DFA,
598   but it does usually make a much smaller one...  (For an example
599   of sub-optimal behaviour, try S: x a b+ | y a b+.)
600*/
601
602static int
603samestate(s1, s2)
604	ss_state *s1, *s2;
605{
606	int i;
607
608	if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
609		return 0;
610	for (i = 0; i < s1->ss_narcs; i++) {
611		if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
612			s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
613			return 0;
614	}
615	return 1;
616}
617
618static void
619renamestates(xx_nstates, xx_state, from, to)
620	int xx_nstates;
621	ss_state *xx_state;
622	int from, to;
623{
624	int i, j;
625
626	if (Py_DebugFlag)
627		printf("Rename state %d to %d.\n", from, to);
628	for (i = 0; i < xx_nstates; i++) {
629		if (xx_state[i].ss_deleted)
630			continue;
631		for (j = 0; j < xx_state[i].ss_narcs; j++) {
632			if (xx_state[i].ss_arc[j].sa_arrow == from)
633				xx_state[i].ss_arc[j].sa_arrow = to;
634		}
635	}
636}
637
638static void
639simplify(xx_nstates, xx_state)
640	int xx_nstates;
641	ss_state *xx_state;
642{
643	int changes;
644	int i, j;
645
646	do {
647		changes = 0;
648		for (i = 1; i < xx_nstates; i++) {
649			if (xx_state[i].ss_deleted)
650				continue;
651			for (j = 0; j < i; j++) {
652				if (xx_state[j].ss_deleted)
653					continue;
654				if (samestate(&xx_state[i], &xx_state[j])) {
655					xx_state[i].ss_deleted++;
656					renamestates(xx_nstates, xx_state,
657						     i, j);
658					changes++;
659					break;
660				}
661			}
662		}
663	} while (changes);
664}
665
666
667/* PART FOUR -- GENERATE PARSING TABLES */
668
669/* Convert the DFA into a grammar that can be used by our parser */
670
671static void
672convert(d, xx_nstates, xx_state)
673	dfa *d;
674	int xx_nstates;
675	ss_state *xx_state;
676{
677	int i, j;
678	ss_state *yy;
679	ss_arc *zz;
680
681	for (i = 0; i < xx_nstates; i++) {
682		yy = &xx_state[i];
683		if (yy->ss_deleted)
684			continue;
685		yy->ss_rename = addstate(d);
686	}
687
688	for (i = 0; i < xx_nstates; i++) {
689		yy = &xx_state[i];
690		if (yy->ss_deleted)
691			continue;
692		for (j = 0; j < yy->ss_narcs; j++) {
693			zz = &yy->ss_arc[j];
694			addarc(d, yy->ss_rename,
695				xx_state[zz->sa_arrow].ss_rename,
696				zz->sa_label);
697		}
698		if (yy->ss_finish)
699			addarc(d, yy->ss_rename, yy->ss_rename, 0);
700	}
701
702	d->d_initial = 0;
703}
704
705
706/* PART FIVE -- GLUE IT ALL TOGETHER */
707
708static grammar *
709maketables(gr)
710	nfagrammar *gr;
711{
712	int i;
713	nfa *nf;
714	dfa *d;
715	grammar *g;
716
717	if (gr->gr_nnfas == 0)
718		return NULL;
719	g = newgrammar(gr->gr_nfa[0]->nf_type);
720			/* XXX first rule must be start rule */
721	g->g_ll = gr->gr_ll;
722
723	for (i = 0; i < gr->gr_nnfas; i++) {
724		nf = gr->gr_nfa[i];
725		if (Py_DebugFlag) {
726			printf("Dump of NFA for '%s' ...\n", nf->nf_name);
727			dumpnfa(&gr->gr_ll, nf);
728		}
729		printf("Making DFA for '%s' ...\n", nf->nf_name);
730		d = adddfa(g, nf->nf_type, nf->nf_name);
731		makedfa(gr, gr->gr_nfa[i], d);
732	}
733
734	return g;
735}
736
737grammar *
738pgen(n)
739	node *n;
740{
741	nfagrammar *gr;
742	grammar *g;
743
744	gr = metacompile(n);
745	g = maketables(gr);
746	translatelabels(g);
747	addfirstsets(g);
748	return g;
749}
750
751
752/*
753
754Description
755-----------
756
757Input is a grammar in extended BNF (using * for repetition, + for
758at-least-once repetition, [] for optional parts, | for alternatives and
759() for grouping).  This has already been parsed and turned into a parse
760tree.
761
762Each rule is considered as a regular expression in its own right.
763It is turned into a Non-deterministic Finite Automaton (NFA), which
764is then turned into a Deterministic Finite Automaton (DFA), which is then
765optimized to reduce the number of states.  See [Aho&Ullman 77] chapter 3,
766or similar compiler books (this technique is more often used for lexical
767analyzers).
768
769The DFA's are used by the parser as parsing tables in a special way
770that's probably unique.  Before they are usable, the FIRST sets of all
771non-terminals are computed.
772
773Reference
774---------
775
776[Aho&Ullman 77]
777	Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
778	(first edition)
779
780*/
781