pgen.c revision fd71b9e9d496caa510dec56a9b69966558d6ba5d
1/*********************************************************** 2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam, 3The Netherlands. 4 5 All Rights Reserved 6 7Copyright (c) 2000, BeOpen.com. 8Copyright (c) 1995-2000, Corporation for National Research Initiatives. 9Copyright (c) 1990-1995, Stichting Mathematisch Centrum. 10All rights reserved. 11 12See the file "Misc/COPYRIGHT" for information on usage and 13redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES. 14 15******************************************************************/ 16 17/* Parser generator */ 18/* XXX This file is not yet fully PROTOized */ 19 20/* For a description, see the comments at end of this file */ 21 22#include "pgenheaders.h" 23#include "assert.h" 24#include "token.h" 25#include "node.h" 26#include "grammar.h" 27#include "metagrammar.h" 28#include "pgen.h" 29 30extern int Py_DebugFlag; 31 32 33/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */ 34 35typedef struct _nfaarc { 36 int ar_label; 37 int ar_arrow; 38} nfaarc; 39 40typedef struct _nfastate { 41 int st_narcs; 42 nfaarc *st_arc; 43} nfastate; 44 45typedef struct _nfa { 46 int nf_type; 47 char *nf_name; 48 int nf_nstates; 49 nfastate *nf_state; 50 int nf_start, nf_finish; 51} nfa; 52 53/* Forward */ 54static void compile_rhs Py_PROTO((labellist *ll, 55 nfa *nf, node *n, int *pa, int *pb)); 56static void compile_alt Py_PROTO((labellist *ll, 57 nfa *nf, node *n, int *pa, int *pb)); 58static void compile_item Py_PROTO((labellist *ll, 59 nfa *nf, node *n, int *pa, int *pb)); 60static void compile_atom Py_PROTO((labellist *ll, 61 nfa *nf, node *n, int *pa, int *pb)); 62 63static int 64addnfastate(nf) 65 nfa *nf; 66{ 67 nfastate *st; 68 69 PyMem_RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1); 70 if (nf->nf_state == NULL) 71 Py_FatalError("out of mem"); 72 st = &nf->nf_state[nf->nf_nstates++]; 73 st->st_narcs = 0; 74 st->st_arc = NULL; 75 return st - nf->nf_state; 76} 77 78static void 79addnfaarc(nf, from, to, lbl) 80 nfa *nf; 81 int from, to, lbl; 82{ 83 nfastate *st; 84 nfaarc *ar; 85 86 st = &nf->nf_state[from]; 87 PyMem_RESIZE(st->st_arc, nfaarc, st->st_narcs + 1); 88 if (st->st_arc == NULL) 89 Py_FatalError("out of mem"); 90 ar = &st->st_arc[st->st_narcs++]; 91 ar->ar_label = lbl; 92 ar->ar_arrow = to; 93} 94 95static nfa * 96newnfa(name) 97 char *name; 98{ 99 nfa *nf; 100 static int type = NT_OFFSET; /* All types will be disjunct */ 101 102 nf = PyMem_NEW(nfa, 1); 103 if (nf == NULL) 104 Py_FatalError("no mem for new nfa"); 105 nf->nf_type = type++; 106 nf->nf_name = name; /* XXX strdup(name) ??? */ 107 nf->nf_nstates = 0; 108 nf->nf_state = NULL; 109 nf->nf_start = nf->nf_finish = -1; 110 return nf; 111} 112 113typedef struct _nfagrammar { 114 int gr_nnfas; 115 nfa **gr_nfa; 116 labellist gr_ll; 117} nfagrammar; 118 119/* Forward */ 120static void compile_rule Py_PROTO((nfagrammar *gr, node *n)); 121 122static nfagrammar * 123newnfagrammar() 124{ 125 nfagrammar *gr; 126 127 gr = PyMem_NEW(nfagrammar, 1); 128 if (gr == NULL) 129 Py_FatalError("no mem for new nfa grammar"); 130 gr->gr_nnfas = 0; 131 gr->gr_nfa = NULL; 132 gr->gr_ll.ll_nlabels = 0; 133 gr->gr_ll.ll_label = NULL; 134 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY"); 135 return gr; 136} 137 138static nfa * 139addnfa(gr, name) 140 nfagrammar *gr; 141 char *name; 142{ 143 nfa *nf; 144 145 nf = newnfa(name); 146 PyMem_RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1); 147 if (gr->gr_nfa == NULL) 148 Py_FatalError("out of mem"); 149 gr->gr_nfa[gr->gr_nnfas++] = nf; 150 addlabel(&gr->gr_ll, NAME, nf->nf_name); 151 return nf; 152} 153 154#ifdef Py_DEBUG 155 156static char REQNFMT[] = "metacompile: less than %d children\n"; 157 158#define REQN(i, count) \ 159 if (i < count) { \ 160 fprintf(stderr, REQNFMT, count); \ 161 Py_FatalError("REQN"); \ 162 } else 163 164#else 165#define REQN(i, count) /* empty */ 166#endif 167 168static nfagrammar * 169metacompile(n) 170 node *n; 171{ 172 nfagrammar *gr; 173 int i; 174 175 printf("Compiling (meta-) parse tree into NFA grammar\n"); 176 gr = newnfagrammar(); 177 REQ(n, MSTART); 178 i = n->n_nchildren - 1; /* Last child is ENDMARKER */ 179 n = n->n_child; 180 for (; --i >= 0; n++) { 181 if (n->n_type != NEWLINE) 182 compile_rule(gr, n); 183 } 184 return gr; 185} 186 187static void 188compile_rule(gr, n) 189 nfagrammar *gr; 190 node *n; 191{ 192 nfa *nf; 193 194 REQ(n, RULE); 195 REQN(n->n_nchildren, 4); 196 n = n->n_child; 197 REQ(n, NAME); 198 nf = addnfa(gr, n->n_str); 199 n++; 200 REQ(n, COLON); 201 n++; 202 REQ(n, RHS); 203 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish); 204 n++; 205 REQ(n, NEWLINE); 206} 207 208static void 209compile_rhs(ll, nf, n, pa, pb) 210 labellist *ll; 211 nfa *nf; 212 node *n; 213 int *pa, *pb; 214{ 215 int i; 216 int a, b; 217 218 REQ(n, RHS); 219 i = n->n_nchildren; 220 REQN(i, 1); 221 n = n->n_child; 222 REQ(n, ALT); 223 compile_alt(ll, nf, n, pa, pb); 224 if (--i <= 0) 225 return; 226 n++; 227 a = *pa; 228 b = *pb; 229 *pa = addnfastate(nf); 230 *pb = addnfastate(nf); 231 addnfaarc(nf, *pa, a, EMPTY); 232 addnfaarc(nf, b, *pb, EMPTY); 233 for (; --i >= 0; n++) { 234 REQ(n, VBAR); 235 REQN(i, 1); 236 --i; 237 n++; 238 REQ(n, ALT); 239 compile_alt(ll, nf, n, &a, &b); 240 addnfaarc(nf, *pa, a, EMPTY); 241 addnfaarc(nf, b, *pb, EMPTY); 242 } 243} 244 245static void 246compile_alt(ll, nf, n, pa, pb) 247 labellist *ll; 248 nfa *nf; 249 node *n; 250 int *pa, *pb; 251{ 252 int i; 253 int a, b; 254 255 REQ(n, ALT); 256 i = n->n_nchildren; 257 REQN(i, 1); 258 n = n->n_child; 259 REQ(n, ITEM); 260 compile_item(ll, nf, n, pa, pb); 261 --i; 262 n++; 263 for (; --i >= 0; n++) { 264 if (n->n_type == COMMA) { /* XXX Temporary */ 265 REQN(i, 1); 266 --i; 267 n++; 268 } 269 REQ(n, ITEM); 270 compile_item(ll, nf, n, &a, &b); 271 addnfaarc(nf, *pb, a, EMPTY); 272 *pb = b; 273 } 274} 275 276static void 277compile_item(ll, nf, n, pa, pb) 278 labellist *ll; 279 nfa *nf; 280 node *n; 281 int *pa, *pb; 282{ 283 int i; 284 int a, b; 285 286 REQ(n, ITEM); 287 i = n->n_nchildren; 288 REQN(i, 1); 289 n = n->n_child; 290 if (n->n_type == LSQB) { 291 REQN(i, 3); 292 n++; 293 REQ(n, RHS); 294 *pa = addnfastate(nf); 295 *pb = addnfastate(nf); 296 addnfaarc(nf, *pa, *pb, EMPTY); 297 compile_rhs(ll, nf, n, &a, &b); 298 addnfaarc(nf, *pa, a, EMPTY); 299 addnfaarc(nf, b, *pb, EMPTY); 300 REQN(i, 1); 301 n++; 302 REQ(n, RSQB); 303 } 304 else { 305 compile_atom(ll, nf, n, pa, pb); 306 if (--i <= 0) 307 return; 308 n++; 309 addnfaarc(nf, *pb, *pa, EMPTY); 310 if (n->n_type == STAR) 311 *pb = *pa; 312 else 313 REQ(n, PLUS); 314 } 315} 316 317static void 318compile_atom(ll, nf, n, pa, pb) 319 labellist *ll; 320 nfa *nf; 321 node *n; 322 int *pa, *pb; 323{ 324 int i; 325 326 REQ(n, ATOM); 327 i = n->n_nchildren; 328 REQN(i, 1); 329 n = n->n_child; 330 if (n->n_type == LPAR) { 331 REQN(i, 3); 332 n++; 333 REQ(n, RHS); 334 compile_rhs(ll, nf, n, pa, pb); 335 n++; 336 REQ(n, RPAR); 337 } 338 else if (n->n_type == NAME || n->n_type == STRING) { 339 *pa = addnfastate(nf); 340 *pb = addnfastate(nf); 341 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str)); 342 } 343 else 344 REQ(n, NAME); 345} 346 347static void 348dumpstate(ll, nf, istate) 349 labellist *ll; 350 nfa *nf; 351 int istate; 352{ 353 nfastate *st; 354 int i; 355 nfaarc *ar; 356 357 printf("%c%2d%c", 358 istate == nf->nf_start ? '*' : ' ', 359 istate, 360 istate == nf->nf_finish ? '.' : ' '); 361 st = &nf->nf_state[istate]; 362 ar = st->st_arc; 363 for (i = 0; i < st->st_narcs; i++) { 364 if (i > 0) 365 printf("\n "); 366 printf("-> %2d %s", ar->ar_arrow, 367 PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label])); 368 ar++; 369 } 370 printf("\n"); 371} 372 373static void 374dumpnfa(ll, nf) 375 labellist *ll; 376 nfa *nf; 377{ 378 int i; 379 380 printf("NFA '%s' has %d states; start %d, finish %d\n", 381 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish); 382 for (i = 0; i < nf->nf_nstates; i++) 383 dumpstate(ll, nf, i); 384} 385 386 387/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */ 388 389static void 390addclosure(ss, nf, istate) 391 bitset ss; 392 nfa *nf; 393 int istate; 394{ 395 if (addbit(ss, istate)) { 396 nfastate *st = &nf->nf_state[istate]; 397 nfaarc *ar = st->st_arc; 398 int i; 399 400 for (i = st->st_narcs; --i >= 0; ) { 401 if (ar->ar_label == EMPTY) 402 addclosure(ss, nf, ar->ar_arrow); 403 ar++; 404 } 405 } 406} 407 408typedef struct _ss_arc { 409 bitset sa_bitset; 410 int sa_arrow; 411 int sa_label; 412} ss_arc; 413 414typedef struct _ss_state { 415 bitset ss_ss; 416 int ss_narcs; 417 ss_arc *ss_arc; 418 int ss_deleted; 419 int ss_finish; 420 int ss_rename; 421} ss_state; 422 423typedef struct _ss_dfa { 424 int sd_nstates; 425 ss_state *sd_state; 426} ss_dfa; 427 428/* Forward */ 429static void printssdfa Py_PROTO((int xx_nstates, ss_state *xx_state, int nbits, 430 labellist *ll, char *msg)); 431static void simplify Py_PROTO((int xx_nstates, ss_state *xx_state)); 432static void convert Py_PROTO((dfa *d, int xx_nstates, ss_state *xx_state)); 433 434static void 435makedfa(gr, nf, d) 436 nfagrammar *gr; 437 nfa *nf; 438 dfa *d; 439{ 440 int nbits = nf->nf_nstates; 441 bitset ss; 442 int xx_nstates; 443 ss_state *xx_state, *yy; 444 ss_arc *zz; 445 int istate, jstate, iarc, jarc, ibit; 446 nfastate *st; 447 nfaarc *ar; 448 449 ss = newbitset(nbits); 450 addclosure(ss, nf, nf->nf_start); 451 xx_state = PyMem_NEW(ss_state, 1); 452 if (xx_state == NULL) 453 Py_FatalError("no mem for xx_state in makedfa"); 454 xx_nstates = 1; 455 yy = &xx_state[0]; 456 yy->ss_ss = ss; 457 yy->ss_narcs = 0; 458 yy->ss_arc = NULL; 459 yy->ss_deleted = 0; 460 yy->ss_finish = testbit(ss, nf->nf_finish); 461 if (yy->ss_finish) 462 printf("Error: nonterminal '%s' may produce empty.\n", 463 nf->nf_name); 464 465 /* This algorithm is from a book written before 466 the invention of structured programming... */ 467 468 /* For each unmarked state... */ 469 for (istate = 0; istate < xx_nstates; ++istate) { 470 yy = &xx_state[istate]; 471 ss = yy->ss_ss; 472 /* For all its states... */ 473 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) { 474 if (!testbit(ss, ibit)) 475 continue; 476 st = &nf->nf_state[ibit]; 477 /* For all non-empty arcs from this state... */ 478 for (iarc = 0; iarc < st->st_narcs; iarc++) { 479 ar = &st->st_arc[iarc]; 480 if (ar->ar_label == EMPTY) 481 continue; 482 /* Look up in list of arcs from this state */ 483 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) { 484 zz = &yy->ss_arc[jarc]; 485 if (ar->ar_label == zz->sa_label) 486 goto found; 487 } 488 /* Add new arc for this state */ 489 PyMem_RESIZE(yy->ss_arc, ss_arc, 490 yy->ss_narcs + 1); 491 if (yy->ss_arc == NULL) 492 Py_FatalError("out of mem"); 493 zz = &yy->ss_arc[yy->ss_narcs++]; 494 zz->sa_label = ar->ar_label; 495 zz->sa_bitset = newbitset(nbits); 496 zz->sa_arrow = -1; 497 found: ; 498 /* Add destination */ 499 addclosure(zz->sa_bitset, nf, ar->ar_arrow); 500 } 501 } 502 /* Now look up all the arrow states */ 503 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) { 504 zz = &xx_state[istate].ss_arc[jarc]; 505 for (jstate = 0; jstate < xx_nstates; jstate++) { 506 if (samebitset(zz->sa_bitset, 507 xx_state[jstate].ss_ss, nbits)) { 508 zz->sa_arrow = jstate; 509 goto done; 510 } 511 } 512 PyMem_RESIZE(xx_state, ss_state, xx_nstates + 1); 513 if (xx_state == NULL) 514 Py_FatalError("out of mem"); 515 zz->sa_arrow = xx_nstates; 516 yy = &xx_state[xx_nstates++]; 517 yy->ss_ss = zz->sa_bitset; 518 yy->ss_narcs = 0; 519 yy->ss_arc = NULL; 520 yy->ss_deleted = 0; 521 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish); 522 done: ; 523 } 524 } 525 526 if (Py_DebugFlag) 527 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 528 "before minimizing"); 529 530 simplify(xx_nstates, xx_state); 531 532 if (Py_DebugFlag) 533 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 534 "after minimizing"); 535 536 convert(d, xx_nstates, xx_state); 537 538 /* XXX cleanup */ 539} 540 541static void 542printssdfa(xx_nstates, xx_state, nbits, ll, msg) 543 int xx_nstates; 544 ss_state *xx_state; 545 int nbits; 546 labellist *ll; 547 char *msg; 548{ 549 int i, ibit, iarc; 550 ss_state *yy; 551 ss_arc *zz; 552 553 printf("Subset DFA %s\n", msg); 554 for (i = 0; i < xx_nstates; i++) { 555 yy = &xx_state[i]; 556 if (yy->ss_deleted) 557 continue; 558 printf(" Subset %d", i); 559 if (yy->ss_finish) 560 printf(" (finish)"); 561 printf(" { "); 562 for (ibit = 0; ibit < nbits; ibit++) { 563 if (testbit(yy->ss_ss, ibit)) 564 printf("%d ", ibit); 565 } 566 printf("}\n"); 567 for (iarc = 0; iarc < yy->ss_narcs; iarc++) { 568 zz = &yy->ss_arc[iarc]; 569 printf(" Arc to state %d, label %s\n", 570 zz->sa_arrow, 571 PyGrammar_LabelRepr( 572 &ll->ll_label[zz->sa_label])); 573 } 574 } 575} 576 577 578/* PART THREE -- SIMPLIFY DFA */ 579 580/* Simplify the DFA by repeatedly eliminating states that are 581 equivalent to another oner. This is NOT Algorithm 3.3 from 582 [Aho&Ullman 77]. It does not always finds the minimal DFA, 583 but it does usually make a much smaller one... (For an example 584 of sub-optimal behaviour, try S: x a b+ | y a b+.) 585*/ 586 587static int 588samestate(s1, s2) 589 ss_state *s1, *s2; 590{ 591 int i; 592 593 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish) 594 return 0; 595 for (i = 0; i < s1->ss_narcs; i++) { 596 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow || 597 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label) 598 return 0; 599 } 600 return 1; 601} 602 603static void 604renamestates(xx_nstates, xx_state, from, to) 605 int xx_nstates; 606 ss_state *xx_state; 607 int from, to; 608{ 609 int i, j; 610 611 if (Py_DebugFlag) 612 printf("Rename state %d to %d.\n", from, to); 613 for (i = 0; i < xx_nstates; i++) { 614 if (xx_state[i].ss_deleted) 615 continue; 616 for (j = 0; j < xx_state[i].ss_narcs; j++) { 617 if (xx_state[i].ss_arc[j].sa_arrow == from) 618 xx_state[i].ss_arc[j].sa_arrow = to; 619 } 620 } 621} 622 623static void 624simplify(xx_nstates, xx_state) 625 int xx_nstates; 626 ss_state *xx_state; 627{ 628 int changes; 629 int i, j; 630 631 do { 632 changes = 0; 633 for (i = 1; i < xx_nstates; i++) { 634 if (xx_state[i].ss_deleted) 635 continue; 636 for (j = 0; j < i; j++) { 637 if (xx_state[j].ss_deleted) 638 continue; 639 if (samestate(&xx_state[i], &xx_state[j])) { 640 xx_state[i].ss_deleted++; 641 renamestates(xx_nstates, xx_state, 642 i, j); 643 changes++; 644 break; 645 } 646 } 647 } 648 } while (changes); 649} 650 651 652/* PART FOUR -- GENERATE PARSING TABLES */ 653 654/* Convert the DFA into a grammar that can be used by our parser */ 655 656static void 657convert(d, xx_nstates, xx_state) 658 dfa *d; 659 int xx_nstates; 660 ss_state *xx_state; 661{ 662 int i, j; 663 ss_state *yy; 664 ss_arc *zz; 665 666 for (i = 0; i < xx_nstates; i++) { 667 yy = &xx_state[i]; 668 if (yy->ss_deleted) 669 continue; 670 yy->ss_rename = addstate(d); 671 } 672 673 for (i = 0; i < xx_nstates; i++) { 674 yy = &xx_state[i]; 675 if (yy->ss_deleted) 676 continue; 677 for (j = 0; j < yy->ss_narcs; j++) { 678 zz = &yy->ss_arc[j]; 679 addarc(d, yy->ss_rename, 680 xx_state[zz->sa_arrow].ss_rename, 681 zz->sa_label); 682 } 683 if (yy->ss_finish) 684 addarc(d, yy->ss_rename, yy->ss_rename, 0); 685 } 686 687 d->d_initial = 0; 688} 689 690 691/* PART FIVE -- GLUE IT ALL TOGETHER */ 692 693static grammar * 694maketables(gr) 695 nfagrammar *gr; 696{ 697 int i; 698 nfa *nf; 699 dfa *d; 700 grammar *g; 701 702 if (gr->gr_nnfas == 0) 703 return NULL; 704 g = newgrammar(gr->gr_nfa[0]->nf_type); 705 /* XXX first rule must be start rule */ 706 g->g_ll = gr->gr_ll; 707 708 for (i = 0; i < gr->gr_nnfas; i++) { 709 nf = gr->gr_nfa[i]; 710 if (Py_DebugFlag) { 711 printf("Dump of NFA for '%s' ...\n", nf->nf_name); 712 dumpnfa(&gr->gr_ll, nf); 713 } 714 printf("Making DFA for '%s' ...\n", nf->nf_name); 715 d = adddfa(g, nf->nf_type, nf->nf_name); 716 makedfa(gr, gr->gr_nfa[i], d); 717 } 718 719 return g; 720} 721 722grammar * 723pgen(n) 724 node *n; 725{ 726 nfagrammar *gr; 727 grammar *g; 728 729 gr = metacompile(n); 730 g = maketables(gr); 731 translatelabels(g); 732 addfirstsets(g); 733 return g; 734} 735 736 737/* 738 739Description 740----------- 741 742Input is a grammar in extended BNF (using * for repetition, + for 743at-least-once repetition, [] for optional parts, | for alternatives and 744() for grouping). This has already been parsed and turned into a parse 745tree. 746 747Each rule is considered as a regular expression in its own right. 748It is turned into a Non-deterministic Finite Automaton (NFA), which 749is then turned into a Deterministic Finite Automaton (DFA), which is then 750optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, 751or similar compiler books (this technique is more often used for lexical 752analyzers). 753 754The DFA's are used by the parser as parsing tables in a special way 755that's probably unique. Before they are usable, the FIRST sets of all 756non-terminals are computed. 757 758Reference 759--------- 760 761[Aho&Ullman 77] 762 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 763 (first edition) 764 765*/ 766