pgen.c revision fd71b9e9d496caa510dec56a9b69966558d6ba5d
1/***********************************************************
2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3The Netherlands.
4
5                        All Rights Reserved
6
7Copyright (c) 2000, BeOpen.com.
8Copyright (c) 1995-2000, Corporation for National Research Initiatives.
9Copyright (c) 1990-1995, Stichting Mathematisch Centrum.
10All rights reserved.
11
12See the file "Misc/COPYRIGHT" for information on usage and
13redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.
14
15******************************************************************/
16
17/* Parser generator */
18/* XXX This file is not yet fully PROTOized */
19
20/* For a description, see the comments at end of this file */
21
22#include "pgenheaders.h"
23#include "assert.h"
24#include "token.h"
25#include "node.h"
26#include "grammar.h"
27#include "metagrammar.h"
28#include "pgen.h"
29
30extern int Py_DebugFlag;
31
32
33/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
34
35typedef struct _nfaarc {
36	int	ar_label;
37	int	ar_arrow;
38} nfaarc;
39
40typedef struct _nfastate {
41	int	st_narcs;
42	nfaarc	*st_arc;
43} nfastate;
44
45typedef struct _nfa {
46	int		nf_type;
47	char		*nf_name;
48	int		nf_nstates;
49	nfastate	*nf_state;
50	int		nf_start, nf_finish;
51} nfa;
52
53/* Forward */
54static void compile_rhs Py_PROTO((labellist *ll,
55			       nfa *nf, node *n, int *pa, int *pb));
56static void compile_alt Py_PROTO((labellist *ll,
57			       nfa *nf, node *n, int *pa, int *pb));
58static void compile_item Py_PROTO((labellist *ll,
59				nfa *nf, node *n, int *pa, int *pb));
60static void compile_atom Py_PROTO((labellist *ll,
61				nfa *nf, node *n, int *pa, int *pb));
62
63static int
64addnfastate(nf)
65	nfa *nf;
66{
67	nfastate *st;
68
69	PyMem_RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
70	if (nf->nf_state == NULL)
71		Py_FatalError("out of mem");
72	st = &nf->nf_state[nf->nf_nstates++];
73	st->st_narcs = 0;
74	st->st_arc = NULL;
75	return st - nf->nf_state;
76}
77
78static void
79addnfaarc(nf, from, to, lbl)
80	nfa *nf;
81	int from, to, lbl;
82{
83	nfastate *st;
84	nfaarc *ar;
85
86	st = &nf->nf_state[from];
87	PyMem_RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
88	if (st->st_arc == NULL)
89		Py_FatalError("out of mem");
90	ar = &st->st_arc[st->st_narcs++];
91	ar->ar_label = lbl;
92	ar->ar_arrow = to;
93}
94
95static nfa *
96newnfa(name)
97	char *name;
98{
99	nfa *nf;
100	static int type = NT_OFFSET; /* All types will be disjunct */
101
102	nf = PyMem_NEW(nfa, 1);
103	if (nf == NULL)
104		Py_FatalError("no mem for new nfa");
105	nf->nf_type = type++;
106	nf->nf_name = name; /* XXX strdup(name) ??? */
107	nf->nf_nstates = 0;
108	nf->nf_state = NULL;
109	nf->nf_start = nf->nf_finish = -1;
110	return nf;
111}
112
113typedef struct _nfagrammar {
114	int		gr_nnfas;
115	nfa		**gr_nfa;
116	labellist	gr_ll;
117} nfagrammar;
118
119/* Forward */
120static void compile_rule Py_PROTO((nfagrammar *gr, node *n));
121
122static nfagrammar *
123newnfagrammar()
124{
125	nfagrammar *gr;
126
127	gr = PyMem_NEW(nfagrammar, 1);
128	if (gr == NULL)
129		Py_FatalError("no mem for new nfa grammar");
130	gr->gr_nnfas = 0;
131	gr->gr_nfa = NULL;
132	gr->gr_ll.ll_nlabels = 0;
133	gr->gr_ll.ll_label = NULL;
134	addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
135	return gr;
136}
137
138static nfa *
139addnfa(gr, name)
140	nfagrammar *gr;
141	char *name;
142{
143	nfa *nf;
144
145	nf = newnfa(name);
146	PyMem_RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
147	if (gr->gr_nfa == NULL)
148		Py_FatalError("out of mem");
149	gr->gr_nfa[gr->gr_nnfas++] = nf;
150	addlabel(&gr->gr_ll, NAME, nf->nf_name);
151	return nf;
152}
153
154#ifdef Py_DEBUG
155
156static char REQNFMT[] = "metacompile: less than %d children\n";
157
158#define REQN(i, count) \
159 	if (i < count) { \
160		fprintf(stderr, REQNFMT, count); \
161		Py_FatalError("REQN"); \
162	} else
163
164#else
165#define REQN(i, count)	/* empty */
166#endif
167
168static nfagrammar *
169metacompile(n)
170	node *n;
171{
172	nfagrammar *gr;
173	int i;
174
175	printf("Compiling (meta-) parse tree into NFA grammar\n");
176	gr = newnfagrammar();
177	REQ(n, MSTART);
178	i = n->n_nchildren - 1; /* Last child is ENDMARKER */
179	n = n->n_child;
180	for (; --i >= 0; n++) {
181		if (n->n_type != NEWLINE)
182			compile_rule(gr, n);
183	}
184	return gr;
185}
186
187static void
188compile_rule(gr, n)
189	nfagrammar *gr;
190	node *n;
191{
192	nfa *nf;
193
194	REQ(n, RULE);
195	REQN(n->n_nchildren, 4);
196	n = n->n_child;
197	REQ(n, NAME);
198	nf = addnfa(gr, n->n_str);
199	n++;
200	REQ(n, COLON);
201	n++;
202	REQ(n, RHS);
203	compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
204	n++;
205	REQ(n, NEWLINE);
206}
207
208static void
209compile_rhs(ll, nf, n, pa, pb)
210	labellist *ll;
211	nfa *nf;
212	node *n;
213	int *pa, *pb;
214{
215	int i;
216	int a, b;
217
218	REQ(n, RHS);
219	i = n->n_nchildren;
220	REQN(i, 1);
221	n = n->n_child;
222	REQ(n, ALT);
223	compile_alt(ll, nf, n, pa, pb);
224	if (--i <= 0)
225		return;
226	n++;
227	a = *pa;
228	b = *pb;
229	*pa = addnfastate(nf);
230	*pb = addnfastate(nf);
231	addnfaarc(nf, *pa, a, EMPTY);
232	addnfaarc(nf, b, *pb, EMPTY);
233	for (; --i >= 0; n++) {
234		REQ(n, VBAR);
235		REQN(i, 1);
236		--i;
237		n++;
238		REQ(n, ALT);
239		compile_alt(ll, nf, n, &a, &b);
240		addnfaarc(nf, *pa, a, EMPTY);
241		addnfaarc(nf, b, *pb, EMPTY);
242	}
243}
244
245static void
246compile_alt(ll, nf, n, pa, pb)
247	labellist *ll;
248	nfa *nf;
249	node *n;
250	int *pa, *pb;
251{
252	int i;
253	int a, b;
254
255	REQ(n, ALT);
256	i = n->n_nchildren;
257	REQN(i, 1);
258	n = n->n_child;
259	REQ(n, ITEM);
260	compile_item(ll, nf, n, pa, pb);
261	--i;
262	n++;
263	for (; --i >= 0; n++) {
264		if (n->n_type == COMMA) { /* XXX Temporary */
265			REQN(i, 1);
266			--i;
267			n++;
268		}
269		REQ(n, ITEM);
270		compile_item(ll, nf, n, &a, &b);
271		addnfaarc(nf, *pb, a, EMPTY);
272		*pb = b;
273	}
274}
275
276static void
277compile_item(ll, nf, n, pa, pb)
278	labellist *ll;
279	nfa *nf;
280	node *n;
281	int *pa, *pb;
282{
283	int i;
284	int a, b;
285
286	REQ(n, ITEM);
287	i = n->n_nchildren;
288	REQN(i, 1);
289	n = n->n_child;
290	if (n->n_type == LSQB) {
291		REQN(i, 3);
292		n++;
293		REQ(n, RHS);
294		*pa = addnfastate(nf);
295		*pb = addnfastate(nf);
296		addnfaarc(nf, *pa, *pb, EMPTY);
297		compile_rhs(ll, nf, n, &a, &b);
298		addnfaarc(nf, *pa, a, EMPTY);
299		addnfaarc(nf, b, *pb, EMPTY);
300		REQN(i, 1);
301		n++;
302		REQ(n, RSQB);
303	}
304	else {
305		compile_atom(ll, nf, n, pa, pb);
306		if (--i <= 0)
307			return;
308		n++;
309		addnfaarc(nf, *pb, *pa, EMPTY);
310		if (n->n_type == STAR)
311			*pb = *pa;
312		else
313			REQ(n, PLUS);
314	}
315}
316
317static void
318compile_atom(ll, nf, n, pa, pb)
319	labellist *ll;
320	nfa *nf;
321	node *n;
322	int *pa, *pb;
323{
324	int i;
325
326	REQ(n, ATOM);
327	i = n->n_nchildren;
328	REQN(i, 1);
329	n = n->n_child;
330	if (n->n_type == LPAR) {
331		REQN(i, 3);
332		n++;
333		REQ(n, RHS);
334		compile_rhs(ll, nf, n, pa, pb);
335		n++;
336		REQ(n, RPAR);
337	}
338	else if (n->n_type == NAME || n->n_type == STRING) {
339		*pa = addnfastate(nf);
340		*pb = addnfastate(nf);
341		addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
342	}
343	else
344		REQ(n, NAME);
345}
346
347static void
348dumpstate(ll, nf, istate)
349	labellist *ll;
350	nfa *nf;
351	int istate;
352{
353	nfastate *st;
354	int i;
355	nfaarc *ar;
356
357	printf("%c%2d%c",
358		istate == nf->nf_start ? '*' : ' ',
359		istate,
360		istate == nf->nf_finish ? '.' : ' ');
361	st = &nf->nf_state[istate];
362	ar = st->st_arc;
363	for (i = 0; i < st->st_narcs; i++) {
364		if (i > 0)
365			printf("\n    ");
366		printf("-> %2d  %s", ar->ar_arrow,
367			PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
368		ar++;
369	}
370	printf("\n");
371}
372
373static void
374dumpnfa(ll, nf)
375	labellist *ll;
376	nfa *nf;
377{
378	int i;
379
380	printf("NFA '%s' has %d states; start %d, finish %d\n",
381		nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
382	for (i = 0; i < nf->nf_nstates; i++)
383		dumpstate(ll, nf, i);
384}
385
386
387/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
388
389static void
390addclosure(ss, nf, istate)
391	bitset ss;
392	nfa *nf;
393	int istate;
394{
395	if (addbit(ss, istate)) {
396		nfastate *st = &nf->nf_state[istate];
397		nfaarc *ar = st->st_arc;
398		int i;
399
400		for (i = st->st_narcs; --i >= 0; ) {
401			if (ar->ar_label == EMPTY)
402				addclosure(ss, nf, ar->ar_arrow);
403			ar++;
404		}
405	}
406}
407
408typedef struct _ss_arc {
409	bitset	sa_bitset;
410	int	sa_arrow;
411	int	sa_label;
412} ss_arc;
413
414typedef struct _ss_state {
415	bitset	ss_ss;
416	int	ss_narcs;
417	ss_arc	*ss_arc;
418	int	ss_deleted;
419	int	ss_finish;
420	int	ss_rename;
421} ss_state;
422
423typedef struct _ss_dfa {
424	int	sd_nstates;
425	ss_state *sd_state;
426} ss_dfa;
427
428/* Forward */
429static void printssdfa Py_PROTO((int xx_nstates, ss_state *xx_state, int nbits,
430			      labellist *ll, char *msg));
431static void simplify Py_PROTO((int xx_nstates, ss_state *xx_state));
432static void convert Py_PROTO((dfa *d, int xx_nstates, ss_state *xx_state));
433
434static void
435makedfa(gr, nf, d)
436	nfagrammar *gr;
437	nfa *nf;
438	dfa *d;
439{
440	int nbits = nf->nf_nstates;
441	bitset ss;
442	int xx_nstates;
443	ss_state *xx_state, *yy;
444	ss_arc *zz;
445	int istate, jstate, iarc, jarc, ibit;
446	nfastate *st;
447	nfaarc *ar;
448
449	ss = newbitset(nbits);
450	addclosure(ss, nf, nf->nf_start);
451	xx_state = PyMem_NEW(ss_state, 1);
452	if (xx_state == NULL)
453		Py_FatalError("no mem for xx_state in makedfa");
454	xx_nstates = 1;
455	yy = &xx_state[0];
456	yy->ss_ss = ss;
457	yy->ss_narcs = 0;
458	yy->ss_arc = NULL;
459	yy->ss_deleted = 0;
460	yy->ss_finish = testbit(ss, nf->nf_finish);
461	if (yy->ss_finish)
462		printf("Error: nonterminal '%s' may produce empty.\n",
463			nf->nf_name);
464
465	/* This algorithm is from a book written before
466	   the invention of structured programming... */
467
468	/* For each unmarked state... */
469	for (istate = 0; istate < xx_nstates; ++istate) {
470		yy = &xx_state[istate];
471		ss = yy->ss_ss;
472		/* For all its states... */
473		for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
474			if (!testbit(ss, ibit))
475				continue;
476			st = &nf->nf_state[ibit];
477			/* For all non-empty arcs from this state... */
478			for (iarc = 0; iarc < st->st_narcs; iarc++) {
479				ar = &st->st_arc[iarc];
480				if (ar->ar_label == EMPTY)
481					continue;
482				/* Look up in list of arcs from this state */
483				for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
484					zz = &yy->ss_arc[jarc];
485					if (ar->ar_label == zz->sa_label)
486						goto found;
487				}
488				/* Add new arc for this state */
489				PyMem_RESIZE(yy->ss_arc, ss_arc,
490					     yy->ss_narcs + 1);
491				if (yy->ss_arc == NULL)
492					Py_FatalError("out of mem");
493				zz = &yy->ss_arc[yy->ss_narcs++];
494				zz->sa_label = ar->ar_label;
495				zz->sa_bitset = newbitset(nbits);
496				zz->sa_arrow = -1;
497			 found:	;
498				/* Add destination */
499				addclosure(zz->sa_bitset, nf, ar->ar_arrow);
500			}
501		}
502		/* Now look up all the arrow states */
503		for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
504			zz = &xx_state[istate].ss_arc[jarc];
505			for (jstate = 0; jstate < xx_nstates; jstate++) {
506				if (samebitset(zz->sa_bitset,
507					xx_state[jstate].ss_ss, nbits)) {
508					zz->sa_arrow = jstate;
509					goto done;
510				}
511			}
512			PyMem_RESIZE(xx_state, ss_state, xx_nstates + 1);
513			if (xx_state == NULL)
514				Py_FatalError("out of mem");
515			zz->sa_arrow = xx_nstates;
516			yy = &xx_state[xx_nstates++];
517			yy->ss_ss = zz->sa_bitset;
518			yy->ss_narcs = 0;
519			yy->ss_arc = NULL;
520			yy->ss_deleted = 0;
521			yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
522		 done:	;
523		}
524	}
525
526	if (Py_DebugFlag)
527		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
528						"before minimizing");
529
530	simplify(xx_nstates, xx_state);
531
532	if (Py_DebugFlag)
533		printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
534						"after minimizing");
535
536	convert(d, xx_nstates, xx_state);
537
538	/* XXX cleanup */
539}
540
541static void
542printssdfa(xx_nstates, xx_state, nbits, ll, msg)
543	int xx_nstates;
544	ss_state *xx_state;
545	int nbits;
546	labellist *ll;
547	char *msg;
548{
549	int i, ibit, iarc;
550	ss_state *yy;
551	ss_arc *zz;
552
553	printf("Subset DFA %s\n", msg);
554	for (i = 0; i < xx_nstates; i++) {
555		yy = &xx_state[i];
556		if (yy->ss_deleted)
557			continue;
558		printf(" Subset %d", i);
559		if (yy->ss_finish)
560			printf(" (finish)");
561		printf(" { ");
562		for (ibit = 0; ibit < nbits; ibit++) {
563			if (testbit(yy->ss_ss, ibit))
564				printf("%d ", ibit);
565		}
566		printf("}\n");
567		for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
568			zz = &yy->ss_arc[iarc];
569			printf("  Arc to state %d, label %s\n",
570				zz->sa_arrow,
571				PyGrammar_LabelRepr(
572					&ll->ll_label[zz->sa_label]));
573		}
574	}
575}
576
577
578/* PART THREE -- SIMPLIFY DFA */
579
580/* Simplify the DFA by repeatedly eliminating states that are
581   equivalent to another oner.  This is NOT Algorithm 3.3 from
582   [Aho&Ullman 77].  It does not always finds the minimal DFA,
583   but it does usually make a much smaller one...  (For an example
584   of sub-optimal behaviour, try S: x a b+ | y a b+.)
585*/
586
587static int
588samestate(s1, s2)
589	ss_state *s1, *s2;
590{
591	int i;
592
593	if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
594		return 0;
595	for (i = 0; i < s1->ss_narcs; i++) {
596		if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
597			s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
598			return 0;
599	}
600	return 1;
601}
602
603static void
604renamestates(xx_nstates, xx_state, from, to)
605	int xx_nstates;
606	ss_state *xx_state;
607	int from, to;
608{
609	int i, j;
610
611	if (Py_DebugFlag)
612		printf("Rename state %d to %d.\n", from, to);
613	for (i = 0; i < xx_nstates; i++) {
614		if (xx_state[i].ss_deleted)
615			continue;
616		for (j = 0; j < xx_state[i].ss_narcs; j++) {
617			if (xx_state[i].ss_arc[j].sa_arrow == from)
618				xx_state[i].ss_arc[j].sa_arrow = to;
619		}
620	}
621}
622
623static void
624simplify(xx_nstates, xx_state)
625	int xx_nstates;
626	ss_state *xx_state;
627{
628	int changes;
629	int i, j;
630
631	do {
632		changes = 0;
633		for (i = 1; i < xx_nstates; i++) {
634			if (xx_state[i].ss_deleted)
635				continue;
636			for (j = 0; j < i; j++) {
637				if (xx_state[j].ss_deleted)
638					continue;
639				if (samestate(&xx_state[i], &xx_state[j])) {
640					xx_state[i].ss_deleted++;
641					renamestates(xx_nstates, xx_state,
642						     i, j);
643					changes++;
644					break;
645				}
646			}
647		}
648	} while (changes);
649}
650
651
652/* PART FOUR -- GENERATE PARSING TABLES */
653
654/* Convert the DFA into a grammar that can be used by our parser */
655
656static void
657convert(d, xx_nstates, xx_state)
658	dfa *d;
659	int xx_nstates;
660	ss_state *xx_state;
661{
662	int i, j;
663	ss_state *yy;
664	ss_arc *zz;
665
666	for (i = 0; i < xx_nstates; i++) {
667		yy = &xx_state[i];
668		if (yy->ss_deleted)
669			continue;
670		yy->ss_rename = addstate(d);
671	}
672
673	for (i = 0; i < xx_nstates; i++) {
674		yy = &xx_state[i];
675		if (yy->ss_deleted)
676			continue;
677		for (j = 0; j < yy->ss_narcs; j++) {
678			zz = &yy->ss_arc[j];
679			addarc(d, yy->ss_rename,
680				xx_state[zz->sa_arrow].ss_rename,
681				zz->sa_label);
682		}
683		if (yy->ss_finish)
684			addarc(d, yy->ss_rename, yy->ss_rename, 0);
685	}
686
687	d->d_initial = 0;
688}
689
690
691/* PART FIVE -- GLUE IT ALL TOGETHER */
692
693static grammar *
694maketables(gr)
695	nfagrammar *gr;
696{
697	int i;
698	nfa *nf;
699	dfa *d;
700	grammar *g;
701
702	if (gr->gr_nnfas == 0)
703		return NULL;
704	g = newgrammar(gr->gr_nfa[0]->nf_type);
705			/* XXX first rule must be start rule */
706	g->g_ll = gr->gr_ll;
707
708	for (i = 0; i < gr->gr_nnfas; i++) {
709		nf = gr->gr_nfa[i];
710		if (Py_DebugFlag) {
711			printf("Dump of NFA for '%s' ...\n", nf->nf_name);
712			dumpnfa(&gr->gr_ll, nf);
713		}
714		printf("Making DFA for '%s' ...\n", nf->nf_name);
715		d = adddfa(g, nf->nf_type, nf->nf_name);
716		makedfa(gr, gr->gr_nfa[i], d);
717	}
718
719	return g;
720}
721
722grammar *
723pgen(n)
724	node *n;
725{
726	nfagrammar *gr;
727	grammar *g;
728
729	gr = metacompile(n);
730	g = maketables(gr);
731	translatelabels(g);
732	addfirstsets(g);
733	return g;
734}
735
736
737/*
738
739Description
740-----------
741
742Input is a grammar in extended BNF (using * for repetition, + for
743at-least-once repetition, [] for optional parts, | for alternatives and
744() for grouping).  This has already been parsed and turned into a parse
745tree.
746
747Each rule is considered as a regular expression in its own right.
748It is turned into a Non-deterministic Finite Automaton (NFA), which
749is then turned into a Deterministic Finite Automaton (DFA), which is then
750optimized to reduce the number of states.  See [Aho&Ullman 77] chapter 3,
751or similar compiler books (this technique is more often used for lexical
752analyzers).
753
754The DFA's are used by the parser as parsing tables in a special way
755that's probably unique.  Before they are usable, the FIRST sets of all
756non-terminals are computed.
757
758Reference
759---------
760
761[Aho&Ullman 77]
762	Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
763	(first edition)
764
765*/
766