1/* Parser generator */
2
3/* For a description, see the comments at end of this file */
4
5#include "Python.h"
6#include "pgenheaders.h"
7#include "token.h"
8#include "node.h"
9#include "grammar.h"
10#include "metagrammar.h"
11#include "pgen.h"
12
13extern int Py_DebugFlag;
14extern int Py_IgnoreEnvironmentFlag; /* needed by Py_GETENV */
15
16
17/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
18
19typedef struct _nfaarc {
20    int         ar_label;
21    int         ar_arrow;
22} nfaarc;
23
24typedef struct _nfastate {
25    int         st_narcs;
26    nfaarc      *st_arc;
27} nfastate;
28
29typedef struct _nfa {
30    int                 nf_type;
31    char                *nf_name;
32    int                 nf_nstates;
33    nfastate            *nf_state;
34    int                 nf_start, nf_finish;
35} nfa;
36
37/* Forward */
38static void compile_rhs(labellist *ll,
39                        nfa *nf, node *n, int *pa, int *pb);
40static void compile_alt(labellist *ll,
41                        nfa *nf, node *n, int *pa, int *pb);
42static void compile_item(labellist *ll,
43                         nfa *nf, node *n, int *pa, int *pb);
44static void compile_atom(labellist *ll,
45                         nfa *nf, node *n, int *pa, int *pb);
46
47static int
48addnfastate(nfa *nf)
49{
50    nfastate *st;
51
52    nf->nf_state = (nfastate *)PyObject_REALLOC(nf->nf_state,
53                                sizeof(nfastate) * (nf->nf_nstates + 1));
54    if (nf->nf_state == NULL)
55        Py_FatalError("out of mem");
56    st = &nf->nf_state[nf->nf_nstates++];
57    st->st_narcs = 0;
58    st->st_arc = NULL;
59    return st - nf->nf_state;
60}
61
62static void
63addnfaarc(nfa *nf, int from, int to, int lbl)
64{
65    nfastate *st;
66    nfaarc *ar;
67
68    st = &nf->nf_state[from];
69    st->st_arc = (nfaarc *)PyObject_REALLOC(st->st_arc,
70                                  sizeof(nfaarc) * (st->st_narcs + 1));
71    if (st->st_arc == NULL)
72        Py_FatalError("out of mem");
73    ar = &st->st_arc[st->st_narcs++];
74    ar->ar_label = lbl;
75    ar->ar_arrow = to;
76}
77
78static nfa *
79newnfa(char *name)
80{
81    nfa *nf;
82    static int type = NT_OFFSET; /* All types will be disjunct */
83
84    nf = (nfa *)PyObject_MALLOC(sizeof(nfa));
85    if (nf == NULL)
86        Py_FatalError("no mem for new nfa");
87    nf->nf_type = type++;
88    nf->nf_name = name; /* XXX strdup(name) ??? */
89    nf->nf_nstates = 0;
90    nf->nf_state = NULL;
91    nf->nf_start = nf->nf_finish = -1;
92    return nf;
93}
94
95typedef struct _nfagrammar {
96    int                 gr_nnfas;
97    nfa                 **gr_nfa;
98    labellist           gr_ll;
99} nfagrammar;
100
101/* Forward */
102static void compile_rule(nfagrammar *gr, node *n);
103
104static nfagrammar *
105newnfagrammar(void)
106{
107    nfagrammar *gr;
108
109    gr = (nfagrammar *)PyObject_MALLOC(sizeof(nfagrammar));
110    if (gr == NULL)
111        Py_FatalError("no mem for new nfa grammar");
112    gr->gr_nnfas = 0;
113    gr->gr_nfa = NULL;
114    gr->gr_ll.ll_nlabels = 0;
115    gr->gr_ll.ll_label = NULL;
116    addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
117    return gr;
118}
119
120static nfa *
121addnfa(nfagrammar *gr, char *name)
122{
123    nfa *nf;
124
125    nf = newnfa(name);
126    gr->gr_nfa = (nfa **)PyObject_REALLOC(gr->gr_nfa,
127                                  sizeof(nfa*) * (gr->gr_nnfas + 1));
128    if (gr->gr_nfa == NULL)
129        Py_FatalError("out of mem");
130    gr->gr_nfa[gr->gr_nnfas++] = nf;
131    addlabel(&gr->gr_ll, NAME, nf->nf_name);
132    return nf;
133}
134
135#ifdef Py_DEBUG
136
137static char REQNFMT[] = "metacompile: less than %d children\n";
138
139#define REQN(i, count) \
140    if (i < count) { \
141        fprintf(stderr, REQNFMT, count); \
142        Py_FatalError("REQN"); \
143    } else
144
145#else
146#define REQN(i, count)  /* empty */
147#endif
148
149static nfagrammar *
150metacompile(node *n)
151{
152    nfagrammar *gr;
153    int i;
154
155    if (Py_DebugFlag)
156        printf("Compiling (meta-) parse tree into NFA grammar\n");
157    gr = newnfagrammar();
158    REQ(n, MSTART);
159    i = n->n_nchildren - 1; /* Last child is ENDMARKER */
160    n = n->n_child;
161    for (; --i >= 0; n++) {
162        if (n->n_type != NEWLINE)
163            compile_rule(gr, n);
164    }
165    return gr;
166}
167
168static void
169compile_rule(nfagrammar *gr, node *n)
170{
171    nfa *nf;
172
173    REQ(n, RULE);
174    REQN(n->n_nchildren, 4);
175    n = n->n_child;
176    REQ(n, NAME);
177    nf = addnfa(gr, n->n_str);
178    n++;
179    REQ(n, COLON);
180    n++;
181    REQ(n, RHS);
182    compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
183    n++;
184    REQ(n, NEWLINE);
185}
186
187static void
188compile_rhs(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
189{
190    int i;
191    int a, b;
192
193    REQ(n, RHS);
194    i = n->n_nchildren;
195    REQN(i, 1);
196    n = n->n_child;
197    REQ(n, ALT);
198    compile_alt(ll, nf, n, pa, pb);
199    if (--i <= 0)
200        return;
201    n++;
202    a = *pa;
203    b = *pb;
204    *pa = addnfastate(nf);
205    *pb = addnfastate(nf);
206    addnfaarc(nf, *pa, a, EMPTY);
207    addnfaarc(nf, b, *pb, EMPTY);
208    for (; --i >= 0; n++) {
209        REQ(n, VBAR);
210        REQN(i, 1);
211        --i;
212        n++;
213        REQ(n, ALT);
214        compile_alt(ll, nf, n, &a, &b);
215        addnfaarc(nf, *pa, a, EMPTY);
216        addnfaarc(nf, b, *pb, EMPTY);
217    }
218}
219
220static void
221compile_alt(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
222{
223    int i;
224    int a, b;
225
226    REQ(n, ALT);
227    i = n->n_nchildren;
228    REQN(i, 1);
229    n = n->n_child;
230    REQ(n, ITEM);
231    compile_item(ll, nf, n, pa, pb);
232    --i;
233    n++;
234    for (; --i >= 0; n++) {
235        REQ(n, ITEM);
236        compile_item(ll, nf, n, &a, &b);
237        addnfaarc(nf, *pb, a, EMPTY);
238        *pb = b;
239    }
240}
241
242static void
243compile_item(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
244{
245    int i;
246    int a, b;
247
248    REQ(n, ITEM);
249    i = n->n_nchildren;
250    REQN(i, 1);
251    n = n->n_child;
252    if (n->n_type == LSQB) {
253        REQN(i, 3);
254        n++;
255        REQ(n, RHS);
256        *pa = addnfastate(nf);
257        *pb = addnfastate(nf);
258        addnfaarc(nf, *pa, *pb, EMPTY);
259        compile_rhs(ll, nf, n, &a, &b);
260        addnfaarc(nf, *pa, a, EMPTY);
261        addnfaarc(nf, b, *pb, EMPTY);
262        REQN(i, 1);
263        n++;
264        REQ(n, RSQB);
265    }
266    else {
267        compile_atom(ll, nf, n, pa, pb);
268        if (--i <= 0)
269            return;
270        n++;
271        addnfaarc(nf, *pb, *pa, EMPTY);
272        if (n->n_type == STAR)
273            *pb = *pa;
274        else
275            REQ(n, PLUS);
276    }
277}
278
279static void
280compile_atom(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
281{
282    int i;
283
284    REQ(n, ATOM);
285    i = n->n_nchildren;
286    REQN(i, 1);
287    n = n->n_child;
288    if (n->n_type == LPAR) {
289        REQN(i, 3);
290        n++;
291        REQ(n, RHS);
292        compile_rhs(ll, nf, n, pa, pb);
293        n++;
294        REQ(n, RPAR);
295    }
296    else if (n->n_type == NAME || n->n_type == STRING) {
297        *pa = addnfastate(nf);
298        *pb = addnfastate(nf);
299        addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
300    }
301    else
302        REQ(n, NAME);
303}
304
305static void
306dumpstate(labellist *ll, nfa *nf, int istate)
307{
308    nfastate *st;
309    int i;
310    nfaarc *ar;
311
312    printf("%c%2d%c",
313        istate == nf->nf_start ? '*' : ' ',
314        istate,
315        istate == nf->nf_finish ? '.' : ' ');
316    st = &nf->nf_state[istate];
317    ar = st->st_arc;
318    for (i = 0; i < st->st_narcs; i++) {
319        if (i > 0)
320            printf("\n    ");
321        printf("-> %2d  %s", ar->ar_arrow,
322            PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
323        ar++;
324    }
325    printf("\n");
326}
327
328static void
329dumpnfa(labellist *ll, nfa *nf)
330{
331    int i;
332
333    printf("NFA '%s' has %d states; start %d, finish %d\n",
334        nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
335    for (i = 0; i < nf->nf_nstates; i++)
336        dumpstate(ll, nf, i);
337}
338
339
340/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
341
342static void
343addclosure(bitset ss, nfa *nf, int istate)
344{
345    if (addbit(ss, istate)) {
346        nfastate *st = &nf->nf_state[istate];
347        nfaarc *ar = st->st_arc;
348        int i;
349
350        for (i = st->st_narcs; --i >= 0; ) {
351            if (ar->ar_label == EMPTY)
352                addclosure(ss, nf, ar->ar_arrow);
353            ar++;
354        }
355    }
356}
357
358typedef struct _ss_arc {
359    bitset      sa_bitset;
360    int         sa_arrow;
361    int         sa_label;
362} ss_arc;
363
364typedef struct _ss_state {
365    bitset      ss_ss;
366    int         ss_narcs;
367    struct _ss_arc      *ss_arc;
368    int         ss_deleted;
369    int         ss_finish;
370    int         ss_rename;
371} ss_state;
372
373typedef struct _ss_dfa {
374    int         sd_nstates;
375    ss_state *sd_state;
376} ss_dfa;
377
378/* Forward */
379static void printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
380                       labellist *ll, char *msg);
381static void simplify(int xx_nstates, ss_state *xx_state);
382static void convert(dfa *d, int xx_nstates, ss_state *xx_state);
383
384static void
385makedfa(nfagrammar *gr, nfa *nf, dfa *d)
386{
387    int nbits = nf->nf_nstates;
388    bitset ss;
389    int xx_nstates;
390    ss_state *xx_state, *yy;
391    ss_arc *zz;
392    int istate, jstate, iarc, jarc, ibit;
393    nfastate *st;
394    nfaarc *ar;
395
396    ss = newbitset(nbits);
397    addclosure(ss, nf, nf->nf_start);
398    xx_state = (ss_state *)PyObject_MALLOC(sizeof(ss_state));
399    if (xx_state == NULL)
400        Py_FatalError("no mem for xx_state in makedfa");
401    xx_nstates = 1;
402    yy = &xx_state[0];
403    yy->ss_ss = ss;
404    yy->ss_narcs = 0;
405    yy->ss_arc = NULL;
406    yy->ss_deleted = 0;
407    yy->ss_finish = testbit(ss, nf->nf_finish);
408    if (yy->ss_finish)
409        printf("Error: nonterminal '%s' may produce empty.\n",
410            nf->nf_name);
411
412    /* This algorithm is from a book written before
413       the invention of structured programming... */
414
415    /* For each unmarked state... */
416    for (istate = 0; istate < xx_nstates; ++istate) {
417        size_t size;
418        yy = &xx_state[istate];
419        ss = yy->ss_ss;
420        /* For all its states... */
421        for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
422            if (!testbit(ss, ibit))
423                continue;
424            st = &nf->nf_state[ibit];
425            /* For all non-empty arcs from this state... */
426            for (iarc = 0; iarc < st->st_narcs; iarc++) {
427                ar = &st->st_arc[iarc];
428                if (ar->ar_label == EMPTY)
429                    continue;
430                /* Look up in list of arcs from this state */
431                for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
432                    zz = &yy->ss_arc[jarc];
433                    if (ar->ar_label == zz->sa_label)
434                        goto found;
435                }
436                /* Add new arc for this state */
437                size = sizeof(ss_arc) * (yy->ss_narcs + 1);
438                yy->ss_arc = (ss_arc *)PyObject_REALLOC(
439                                            yy->ss_arc, size);
440                if (yy->ss_arc == NULL)
441                    Py_FatalError("out of mem");
442                zz = &yy->ss_arc[yy->ss_narcs++];
443                zz->sa_label = ar->ar_label;
444                zz->sa_bitset = newbitset(nbits);
445                zz->sa_arrow = -1;
446             found:             ;
447                /* Add destination */
448                addclosure(zz->sa_bitset, nf, ar->ar_arrow);
449            }
450        }
451        /* Now look up all the arrow states */
452        for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
453            zz = &xx_state[istate].ss_arc[jarc];
454            for (jstate = 0; jstate < xx_nstates; jstate++) {
455                if (samebitset(zz->sa_bitset,
456                    xx_state[jstate].ss_ss, nbits)) {
457                    zz->sa_arrow = jstate;
458                    goto done;
459                }
460            }
461            size = sizeof(ss_state) * (xx_nstates + 1);
462            xx_state = (ss_state *)PyObject_REALLOC(xx_state,
463                                                        size);
464            if (xx_state == NULL)
465                Py_FatalError("out of mem");
466            zz->sa_arrow = xx_nstates;
467            yy = &xx_state[xx_nstates++];
468            yy->ss_ss = zz->sa_bitset;
469            yy->ss_narcs = 0;
470            yy->ss_arc = NULL;
471            yy->ss_deleted = 0;
472            yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
473         done:          ;
474        }
475    }
476
477    if (Py_DebugFlag)
478        printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
479                                        "before minimizing");
480
481    simplify(xx_nstates, xx_state);
482
483    if (Py_DebugFlag)
484        printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
485                                        "after minimizing");
486
487    convert(d, xx_nstates, xx_state);
488
489    /* XXX cleanup */
490    PyObject_FREE(xx_state);
491}
492
493static void
494printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
495           labellist *ll, char *msg)
496{
497    int i, ibit, iarc;
498    ss_state *yy;
499    ss_arc *zz;
500
501    printf("Subset DFA %s\n", msg);
502    for (i = 0; i < xx_nstates; i++) {
503        yy = &xx_state[i];
504        if (yy->ss_deleted)
505            continue;
506        printf(" Subset %d", i);
507        if (yy->ss_finish)
508            printf(" (finish)");
509        printf(" { ");
510        for (ibit = 0; ibit < nbits; ibit++) {
511            if (testbit(yy->ss_ss, ibit))
512                printf("%d ", ibit);
513        }
514        printf("}\n");
515        for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
516            zz = &yy->ss_arc[iarc];
517            printf("  Arc to state %d, label %s\n",
518                zz->sa_arrow,
519                PyGrammar_LabelRepr(
520                    &ll->ll_label[zz->sa_label]));
521        }
522    }
523}
524
525
526/* PART THREE -- SIMPLIFY DFA */
527
528/* Simplify the DFA by repeatedly eliminating states that are
529   equivalent to another oner.  This is NOT Algorithm 3.3 from
530   [Aho&Ullman 77].  It does not always finds the minimal DFA,
531   but it does usually make a much smaller one...  (For an example
532   of sub-optimal behavior, try S: x a b+ | y a b+.)
533*/
534
535static int
536samestate(ss_state *s1, ss_state *s2)
537{
538    int i;
539
540    if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
541        return 0;
542    for (i = 0; i < s1->ss_narcs; i++) {
543        if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
544            s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
545            return 0;
546    }
547    return 1;
548}
549
550static void
551renamestates(int xx_nstates, ss_state *xx_state, int from, int to)
552{
553    int i, j;
554
555    if (Py_DebugFlag)
556        printf("Rename state %d to %d.\n", from, to);
557    for (i = 0; i < xx_nstates; i++) {
558        if (xx_state[i].ss_deleted)
559            continue;
560        for (j = 0; j < xx_state[i].ss_narcs; j++) {
561            if (xx_state[i].ss_arc[j].sa_arrow == from)
562                xx_state[i].ss_arc[j].sa_arrow = to;
563        }
564    }
565}
566
567static void
568simplify(int xx_nstates, ss_state *xx_state)
569{
570    int changes;
571    int i, j;
572
573    do {
574        changes = 0;
575        for (i = 1; i < xx_nstates; i++) {
576            if (xx_state[i].ss_deleted)
577                continue;
578            for (j = 0; j < i; j++) {
579                if (xx_state[j].ss_deleted)
580                    continue;
581                if (samestate(&xx_state[i], &xx_state[j])) {
582                    xx_state[i].ss_deleted++;
583                    renamestates(xx_nstates, xx_state,
584                                 i, j);
585                    changes++;
586                    break;
587                }
588            }
589        }
590    } while (changes);
591}
592
593
594/* PART FOUR -- GENERATE PARSING TABLES */
595
596/* Convert the DFA into a grammar that can be used by our parser */
597
598static void
599convert(dfa *d, int xx_nstates, ss_state *xx_state)
600{
601    int i, j;
602    ss_state *yy;
603    ss_arc *zz;
604
605    for (i = 0; i < xx_nstates; i++) {
606        yy = &xx_state[i];
607        if (yy->ss_deleted)
608            continue;
609        yy->ss_rename = addstate(d);
610    }
611
612    for (i = 0; i < xx_nstates; i++) {
613        yy = &xx_state[i];
614        if (yy->ss_deleted)
615            continue;
616        for (j = 0; j < yy->ss_narcs; j++) {
617            zz = &yy->ss_arc[j];
618            addarc(d, yy->ss_rename,
619                xx_state[zz->sa_arrow].ss_rename,
620                zz->sa_label);
621        }
622        if (yy->ss_finish)
623            addarc(d, yy->ss_rename, yy->ss_rename, 0);
624    }
625
626    d->d_initial = 0;
627}
628
629
630/* PART FIVE -- GLUE IT ALL TOGETHER */
631
632static grammar *
633maketables(nfagrammar *gr)
634{
635    int i;
636    nfa *nf;
637    dfa *d;
638    grammar *g;
639
640    if (gr->gr_nnfas == 0)
641        return NULL;
642    g = newgrammar(gr->gr_nfa[0]->nf_type);
643                    /* XXX first rule must be start rule */
644    g->g_ll = gr->gr_ll;
645
646    for (i = 0; i < gr->gr_nnfas; i++) {
647        nf = gr->gr_nfa[i];
648        if (Py_DebugFlag) {
649            printf("Dump of NFA for '%s' ...\n", nf->nf_name);
650            dumpnfa(&gr->gr_ll, nf);
651            printf("Making DFA for '%s' ...\n", nf->nf_name);
652        }
653        d = adddfa(g, nf->nf_type, nf->nf_name);
654        makedfa(gr, gr->gr_nfa[i], d);
655    }
656
657    return g;
658}
659
660grammar *
661pgen(node *n)
662{
663    nfagrammar *gr;
664    grammar *g;
665
666    gr = metacompile(n);
667    g = maketables(gr);
668    translatelabels(g);
669    addfirstsets(g);
670    PyObject_FREE(gr);
671    return g;
672}
673
674grammar *
675Py_pgen(node *n)
676{
677  return pgen(n);
678}
679
680/*
681
682Description
683-----------
684
685Input is a grammar in extended BNF (using * for repetition, + for
686at-least-once repetition, [] for optional parts, | for alternatives and
687() for grouping).  This has already been parsed and turned into a parse
688tree.
689
690Each rule is considered as a regular expression in its own right.
691It is turned into a Non-deterministic Finite Automaton (NFA), which
692is then turned into a Deterministic Finite Automaton (DFA), which is then
693optimized to reduce the number of states.  See [Aho&Ullman 77] chapter 3,
694or similar compiler books (this technique is more often used for lexical
695analyzers).
696
697The DFA's are used by the parser as parsing tables in a special way
698that's probably unique.  Before they are usable, the FIRST sets of all
699non-terminals are computed.
700
701Reference
702---------
703
704[Aho&Ullman 77]
705    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
706    (first edition)
707
708*/
709