pgen.c revision 1d5735e84621a7fe68d361fa0e289fa2c3310836
1/*********************************************************** 2Copyright 1991, 1992, 1993, 1994 by Stichting Mathematisch Centrum, 3Amsterdam, The Netherlands. 4 5 All Rights Reserved 6 7Permission to use, copy, modify, and distribute this software and its 8documentation for any purpose and without fee is hereby granted, 9provided that the above copyright notice appear in all copies and that 10both that copyright notice and this permission notice appear in 11supporting documentation, and that the names of Stichting Mathematisch 12Centrum or CWI not be used in advertising or publicity pertaining to 13distribution of the software without specific, written prior permission. 14 15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO 16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE 18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 22 23******************************************************************/ 24 25/* Parser generator */ 26/* XXX This file is not yet fully PROTOized */ 27 28/* For a description, see the comments at end of this file */ 29 30#include "pgenheaders.h" 31#include "assert.h" 32#include "token.h" 33#include "node.h" 34#include "grammar.h" 35#include "metagrammar.h" 36#include "pgen.h" 37 38extern int debugging; 39 40 41/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */ 42 43typedef struct _nfaarc { 44 int ar_label; 45 int ar_arrow; 46} nfaarc; 47 48typedef struct _nfastate { 49 int st_narcs; 50 nfaarc *st_arc; 51} nfastate; 52 53typedef struct _nfa { 54 int nf_type; 55 char *nf_name; 56 int nf_nstates; 57 nfastate *nf_state; 58 int nf_start, nf_finish; 59} nfa; 60 61/* Forward */ 62static compile_rhs PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb)); 63static compile_alt PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb)); 64static compile_item PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb)); 65static compile_atom PROTO((labellist *ll, nfa *nf, node *n, int *pa, int *pb)); 66 67static int 68addnfastate(nf) 69 nfa *nf; 70{ 71 nfastate *st; 72 73 RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1); 74 if (nf->nf_state == NULL) 75 fatal("out of mem"); 76 st = &nf->nf_state[nf->nf_nstates++]; 77 st->st_narcs = 0; 78 st->st_arc = NULL; 79 return st - nf->nf_state; 80} 81 82static void 83addnfaarc(nf, from, to, lbl) 84 nfa *nf; 85 int from, to, lbl; 86{ 87 nfastate *st; 88 nfaarc *ar; 89 90 st = &nf->nf_state[from]; 91 RESIZE(st->st_arc, nfaarc, st->st_narcs + 1); 92 if (st->st_arc == NULL) 93 fatal("out of mem"); 94 ar = &st->st_arc[st->st_narcs++]; 95 ar->ar_label = lbl; 96 ar->ar_arrow = to; 97} 98 99static nfa * 100newnfa(name) 101 char *name; 102{ 103 nfa *nf; 104 static type = NT_OFFSET; /* All types will be disjunct */ 105 106 nf = NEW(nfa, 1); 107 if (nf == NULL) 108 fatal("no mem for new nfa"); 109 nf->nf_type = type++; 110 nf->nf_name = name; /* XXX strdup(name) ??? */ 111 nf->nf_nstates = 0; 112 nf->nf_state = NULL; 113 nf->nf_start = nf->nf_finish = -1; 114 return nf; 115} 116 117typedef struct _nfagrammar { 118 int gr_nnfas; 119 nfa **gr_nfa; 120 labellist gr_ll; 121} nfagrammar; 122 123/* Forward */ 124static compile_rule PROTO((nfagrammar *gr, node *n)); 125 126static nfagrammar * 127newnfagrammar() 128{ 129 nfagrammar *gr; 130 131 gr = NEW(nfagrammar, 1); 132 if (gr == NULL) 133 fatal("no mem for new nfa grammar"); 134 gr->gr_nnfas = 0; 135 gr->gr_nfa = NULL; 136 gr->gr_ll.ll_nlabels = 0; 137 gr->gr_ll.ll_label = NULL; 138 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY"); 139 return gr; 140} 141 142static nfa * 143addnfa(gr, name) 144 nfagrammar *gr; 145 char *name; 146{ 147 nfa *nf; 148 149 nf = newnfa(name); 150 RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1); 151 if (gr->gr_nfa == NULL) 152 fatal("out of mem"); 153 gr->gr_nfa[gr->gr_nnfas++] = nf; 154 addlabel(&gr->gr_ll, NAME, nf->nf_name); 155 return nf; 156} 157 158#ifdef DEBUG 159 160static char REQNFMT[] = "metacompile: less than %d children\n"; 161 162#define REQN(i, count) \ 163 if (i < count) { \ 164 fprintf(stderr, REQNFMT, count); \ 165 abort(); \ 166 } else 167 168#else 169#define REQN(i, count) /* empty */ 170#endif 171 172static nfagrammar * 173metacompile(n) 174 node *n; 175{ 176 nfagrammar *gr; 177 int i; 178 179 printf("Compiling (meta-) parse tree into NFA grammar\n"); 180 gr = newnfagrammar(); 181 REQ(n, MSTART); 182 i = n->n_nchildren - 1; /* Last child is ENDMARKER */ 183 n = n->n_child; 184 for (; --i >= 0; n++) { 185 if (n->n_type != NEWLINE) 186 compile_rule(gr, n); 187 } 188 return gr; 189} 190 191static 192compile_rule(gr, n) 193 nfagrammar *gr; 194 node *n; 195{ 196 nfa *nf; 197 198 REQ(n, RULE); 199 REQN(n->n_nchildren, 4); 200 n = n->n_child; 201 REQ(n, NAME); 202 nf = addnfa(gr, n->n_str); 203 n++; 204 REQ(n, COLON); 205 n++; 206 REQ(n, RHS); 207 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish); 208 n++; 209 REQ(n, NEWLINE); 210} 211 212static 213compile_rhs(ll, nf, n, pa, pb) 214 labellist *ll; 215 nfa *nf; 216 node *n; 217 int *pa, *pb; 218{ 219 int i; 220 int a, b; 221 222 REQ(n, RHS); 223 i = n->n_nchildren; 224 REQN(i, 1); 225 n = n->n_child; 226 REQ(n, ALT); 227 compile_alt(ll, nf, n, pa, pb); 228 if (--i <= 0) 229 return; 230 n++; 231 a = *pa; 232 b = *pb; 233 *pa = addnfastate(nf); 234 *pb = addnfastate(nf); 235 addnfaarc(nf, *pa, a, EMPTY); 236 addnfaarc(nf, b, *pb, EMPTY); 237 for (; --i >= 0; n++) { 238 REQ(n, VBAR); 239 REQN(i, 1); 240 --i; 241 n++; 242 REQ(n, ALT); 243 compile_alt(ll, nf, n, &a, &b); 244 addnfaarc(nf, *pa, a, EMPTY); 245 addnfaarc(nf, b, *pb, EMPTY); 246 } 247} 248 249static 250compile_alt(ll, nf, n, pa, pb) 251 labellist *ll; 252 nfa *nf; 253 node *n; 254 int *pa, *pb; 255{ 256 int i; 257 int a, b; 258 259 REQ(n, ALT); 260 i = n->n_nchildren; 261 REQN(i, 1); 262 n = n->n_child; 263 REQ(n, ITEM); 264 compile_item(ll, nf, n, pa, pb); 265 --i; 266 n++; 267 for (; --i >= 0; n++) { 268 if (n->n_type == COMMA) { /* XXX Temporary */ 269 REQN(i, 1); 270 --i; 271 n++; 272 } 273 REQ(n, ITEM); 274 compile_item(ll, nf, n, &a, &b); 275 addnfaarc(nf, *pb, a, EMPTY); 276 *pb = b; 277 } 278} 279 280static 281compile_item(ll, nf, n, pa, pb) 282 labellist *ll; 283 nfa *nf; 284 node *n; 285 int *pa, *pb; 286{ 287 int i; 288 int a, b; 289 290 REQ(n, ITEM); 291 i = n->n_nchildren; 292 REQN(i, 1); 293 n = n->n_child; 294 if (n->n_type == LSQB) { 295 REQN(i, 3); 296 n++; 297 REQ(n, RHS); 298 *pa = addnfastate(nf); 299 *pb = addnfastate(nf); 300 addnfaarc(nf, *pa, *pb, EMPTY); 301 compile_rhs(ll, nf, n, &a, &b); 302 addnfaarc(nf, *pa, a, EMPTY); 303 addnfaarc(nf, b, *pb, EMPTY); 304 REQN(i, 1); 305 n++; 306 REQ(n, RSQB); 307 } 308 else { 309 compile_atom(ll, nf, n, pa, pb); 310 if (--i <= 0) 311 return; 312 n++; 313 addnfaarc(nf, *pb, *pa, EMPTY); 314 if (n->n_type == STAR) 315 *pb = *pa; 316 else 317 REQ(n, PLUS); 318 } 319} 320 321static 322compile_atom(ll, nf, n, pa, pb) 323 labellist *ll; 324 nfa *nf; 325 node *n; 326 int *pa, *pb; 327{ 328 int i; 329 330 REQ(n, ATOM); 331 i = n->n_nchildren; 332 REQN(i, 1); 333 n = n->n_child; 334 if (n->n_type == LPAR) { 335 REQN(i, 3); 336 n++; 337 REQ(n, RHS); 338 compile_rhs(ll, nf, n, pa, pb); 339 n++; 340 REQ(n, RPAR); 341 } 342 else if (n->n_type == NAME || n->n_type == STRING) { 343 *pa = addnfastate(nf); 344 *pb = addnfastate(nf); 345 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str)); 346 } 347 else 348 REQ(n, NAME); 349} 350 351static void 352dumpstate(ll, nf, istate) 353 labellist *ll; 354 nfa *nf; 355 int istate; 356{ 357 nfastate *st; 358 int i; 359 nfaarc *ar; 360 361 printf("%c%2d%c", 362 istate == nf->nf_start ? '*' : ' ', 363 istate, 364 istate == nf->nf_finish ? '.' : ' '); 365 st = &nf->nf_state[istate]; 366 ar = st->st_arc; 367 for (i = 0; i < st->st_narcs; i++) { 368 if (i > 0) 369 printf("\n "); 370 printf("-> %2d %s", ar->ar_arrow, 371 labelrepr(&ll->ll_label[ar->ar_label])); 372 ar++; 373 } 374 printf("\n"); 375} 376 377static void 378dumpnfa(ll, nf) 379 labellist *ll; 380 nfa *nf; 381{ 382 int i; 383 384 printf("NFA '%s' has %d states; start %d, finish %d\n", 385 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish); 386 for (i = 0; i < nf->nf_nstates; i++) 387 dumpstate(ll, nf, i); 388} 389 390 391/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */ 392 393static int 394addclosure(ss, nf, istate) 395 bitset ss; 396 nfa *nf; 397 int istate; 398{ 399 if (addbit(ss, istate)) { 400 nfastate *st = &nf->nf_state[istate]; 401 nfaarc *ar = st->st_arc; 402 int i; 403 404 for (i = st->st_narcs; --i >= 0; ) { 405 if (ar->ar_label == EMPTY) 406 addclosure(ss, nf, ar->ar_arrow); 407 ar++; 408 } 409 } 410} 411 412typedef struct _ss_arc { 413 bitset sa_bitset; 414 int sa_arrow; 415 int sa_label; 416} ss_arc; 417 418typedef struct _ss_state { 419 bitset ss_ss; 420 int ss_narcs; 421 ss_arc *ss_arc; 422 int ss_deleted; 423 int ss_finish; 424 int ss_rename; 425} ss_state; 426 427typedef struct _ss_dfa { 428 int sd_nstates; 429 ss_state *sd_state; 430} ss_dfa; 431 432/* Forward */ 433static printssdfa PROTO((int xx_nstates, ss_state *xx_state, int nbits, 434 labellist *ll, char *msg)); 435static simplify PROTO((int xx_nstates, ss_state *xx_state)); 436static convert PROTO((dfa *d, int xx_nstates, ss_state *xx_state)); 437 438static 439makedfa(gr, nf, d) 440 nfagrammar *gr; 441 nfa *nf; 442 dfa *d; 443{ 444 int nbits = nf->nf_nstates; 445 bitset ss; 446 int xx_nstates; 447 ss_state *xx_state, *yy; 448 ss_arc *zz; 449 int istate, jstate, iarc, jarc, ibit; 450 nfastate *st; 451 nfaarc *ar; 452 453 ss = newbitset(nbits); 454 addclosure(ss, nf, nf->nf_start); 455 xx_state = NEW(ss_state, 1); 456 if (xx_state == NULL) 457 fatal("no mem for xx_state in makedfa"); 458 xx_nstates = 1; 459 yy = &xx_state[0]; 460 yy->ss_ss = ss; 461 yy->ss_narcs = 0; 462 yy->ss_arc = NULL; 463 yy->ss_deleted = 0; 464 yy->ss_finish = testbit(ss, nf->nf_finish); 465 if (yy->ss_finish) 466 printf("Error: nonterminal '%s' may produce empty.\n", 467 nf->nf_name); 468 469 /* This algorithm is from a book written before 470 the invention of structured programming... */ 471 472 /* For each unmarked state... */ 473 for (istate = 0; istate < xx_nstates; ++istate) { 474 yy = &xx_state[istate]; 475 ss = yy->ss_ss; 476 /* For all its states... */ 477 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) { 478 if (!testbit(ss, ibit)) 479 continue; 480 st = &nf->nf_state[ibit]; 481 /* For all non-empty arcs from this state... */ 482 for (iarc = 0; iarc < st->st_narcs; iarc++) { 483 ar = &st->st_arc[iarc]; 484 if (ar->ar_label == EMPTY) 485 continue; 486 /* Look up in list of arcs from this state */ 487 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) { 488 zz = &yy->ss_arc[jarc]; 489 if (ar->ar_label == zz->sa_label) 490 goto found; 491 } 492 /* Add new arc for this state */ 493 RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1); 494 if (yy->ss_arc == NULL) 495 fatal("out of mem"); 496 zz = &yy->ss_arc[yy->ss_narcs++]; 497 zz->sa_label = ar->ar_label; 498 zz->sa_bitset = newbitset(nbits); 499 zz->sa_arrow = -1; 500 found: ; 501 /* Add destination */ 502 addclosure(zz->sa_bitset, nf, ar->ar_arrow); 503 } 504 } 505 /* Now look up all the arrow states */ 506 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) { 507 zz = &xx_state[istate].ss_arc[jarc]; 508 for (jstate = 0; jstate < xx_nstates; jstate++) { 509 if (samebitset(zz->sa_bitset, 510 xx_state[jstate].ss_ss, nbits)) { 511 zz->sa_arrow = jstate; 512 goto done; 513 } 514 } 515 RESIZE(xx_state, ss_state, xx_nstates + 1); 516 if (xx_state == NULL) 517 fatal("out of mem"); 518 zz->sa_arrow = xx_nstates; 519 yy = &xx_state[xx_nstates++]; 520 yy->ss_ss = zz->sa_bitset; 521 yy->ss_narcs = 0; 522 yy->ss_arc = NULL; 523 yy->ss_deleted = 0; 524 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish); 525 done: ; 526 } 527 } 528 529 if (debugging) 530 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 531 "before minimizing"); 532 533 simplify(xx_nstates, xx_state); 534 535 if (debugging) 536 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 537 "after minimizing"); 538 539 convert(d, xx_nstates, xx_state); 540 541 /* XXX cleanup */ 542} 543 544static 545printssdfa(xx_nstates, xx_state, nbits, ll, msg) 546 int xx_nstates; 547 ss_state *xx_state; 548 int nbits; 549 labellist *ll; 550 char *msg; 551{ 552 int i, ibit, iarc; 553 ss_state *yy; 554 ss_arc *zz; 555 556 printf("Subset DFA %s\n", msg); 557 for (i = 0; i < xx_nstates; i++) { 558 yy = &xx_state[i]; 559 if (yy->ss_deleted) 560 continue; 561 printf(" Subset %d", i); 562 if (yy->ss_finish) 563 printf(" (finish)"); 564 printf(" { "); 565 for (ibit = 0; ibit < nbits; ibit++) { 566 if (testbit(yy->ss_ss, ibit)) 567 printf("%d ", ibit); 568 } 569 printf("}\n"); 570 for (iarc = 0; iarc < yy->ss_narcs; iarc++) { 571 zz = &yy->ss_arc[iarc]; 572 printf(" Arc to state %d, label %s\n", 573 zz->sa_arrow, 574 labelrepr(&ll->ll_label[zz->sa_label])); 575 } 576 } 577} 578 579 580/* PART THREE -- SIMPLIFY DFA */ 581 582/* Simplify the DFA by repeatedly eliminating states that are 583 equivalent to another oner. This is NOT Algorithm 3.3 from 584 [Aho&Ullman 77]. It does not always finds the minimal DFA, 585 but it does usually make a much smaller one... (For an example 586 of sub-optimal behaviour, try S: x a b+ | y a b+.) 587*/ 588 589static int 590samestate(s1, s2) 591 ss_state *s1, *s2; 592{ 593 int i; 594 595 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish) 596 return 0; 597 for (i = 0; i < s1->ss_narcs; i++) { 598 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow || 599 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label) 600 return 0; 601 } 602 return 1; 603} 604 605static void 606renamestates(xx_nstates, xx_state, from, to) 607 int xx_nstates; 608 ss_state *xx_state; 609 int from, to; 610{ 611 int i, j; 612 613 if (debugging) 614 printf("Rename state %d to %d.\n", from, to); 615 for (i = 0; i < xx_nstates; i++) { 616 if (xx_state[i].ss_deleted) 617 continue; 618 for (j = 0; j < xx_state[i].ss_narcs; j++) { 619 if (xx_state[i].ss_arc[j].sa_arrow == from) 620 xx_state[i].ss_arc[j].sa_arrow = to; 621 } 622 } 623} 624 625static 626simplify(xx_nstates, xx_state) 627 int xx_nstates; 628 ss_state *xx_state; 629{ 630 int changes; 631 int i, j, k; 632 633 do { 634 changes = 0; 635 for (i = 1; i < xx_nstates; i++) { 636 if (xx_state[i].ss_deleted) 637 continue; 638 for (j = 0; j < i; j++) { 639 if (xx_state[j].ss_deleted) 640 continue; 641 if (samestate(&xx_state[i], &xx_state[j])) { 642 xx_state[i].ss_deleted++; 643 renamestates(xx_nstates, xx_state, i, j); 644 changes++; 645 break; 646 } 647 } 648 } 649 } while (changes); 650} 651 652 653/* PART FOUR -- GENERATE PARSING TABLES */ 654 655/* Convert the DFA into a grammar that can be used by our parser */ 656 657static 658convert(d, xx_nstates, xx_state) 659 dfa *d; 660 int xx_nstates; 661 ss_state *xx_state; 662{ 663 int i, j; 664 ss_state *yy; 665 ss_arc *zz; 666 667 for (i = 0; i < xx_nstates; i++) { 668 yy = &xx_state[i]; 669 if (yy->ss_deleted) 670 continue; 671 yy->ss_rename = addstate(d); 672 } 673 674 for (i = 0; i < xx_nstates; i++) { 675 yy = &xx_state[i]; 676 if (yy->ss_deleted) 677 continue; 678 for (j = 0; j < yy->ss_narcs; j++) { 679 zz = &yy->ss_arc[j]; 680 addarc(d, yy->ss_rename, 681 xx_state[zz->sa_arrow].ss_rename, 682 zz->sa_label); 683 } 684 if (yy->ss_finish) 685 addarc(d, yy->ss_rename, yy->ss_rename, 0); 686 } 687 688 d->d_initial = 0; 689} 690 691 692/* PART FIVE -- GLUE IT ALL TOGETHER */ 693 694static grammar * 695maketables(gr) 696 nfagrammar *gr; 697{ 698 int i; 699 nfa *nf; 700 dfa *d; 701 grammar *g; 702 703 if (gr->gr_nnfas == 0) 704 return NULL; 705 g = newgrammar(gr->gr_nfa[0]->nf_type); 706 /* XXX first rule must be start rule */ 707 g->g_ll = gr->gr_ll; 708 709 for (i = 0; i < gr->gr_nnfas; i++) { 710 nf = gr->gr_nfa[i]; 711 if (debugging) { 712 printf("Dump of NFA for '%s' ...\n", nf->nf_name); 713 dumpnfa(&gr->gr_ll, nf); 714 } 715 printf("Making DFA for '%s' ...\n", nf->nf_name); 716 d = adddfa(g, nf->nf_type, nf->nf_name); 717 makedfa(gr, gr->gr_nfa[i], d); 718 } 719 720 return g; 721} 722 723grammar * 724pgen(n) 725 node *n; 726{ 727 nfagrammar *gr; 728 grammar *g; 729 730 gr = metacompile(n); 731 g = maketables(gr); 732 translatelabels(g); 733 addfirstsets(g); 734 return g; 735} 736 737 738/* 739 740Description 741----------- 742 743Input is a grammar in extended BNF (using * for repetition, + for 744at-least-once repetition, [] for optional parts, | for alternatives and 745() for grouping). This has already been parsed and turned into a parse 746tree. 747 748Each rule is considered as a regular expression in its own right. 749It is turned into a Non-deterministic Finite Automaton (NFA), which 750is then turned into a Deterministic Finite Automaton (DFA), which is then 751optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, 752or similar compiler books (this technique is more often used for lexical 753analyzers). 754 755The DFA's are used by the parser as parsing tables in a special way 756that's probably unique. Before they are usable, the FIRST sets of all 757non-terminals are computed. 758 759Reference 760--------- 761 762[Aho&Ullman 77] 763 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 764 (first edition) 765 766*/ 767