pgen.c revision f70e43a073b36c6f6e9894c01025243a77a452d4
1/*********************************************************** 2Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The 3Netherlands. 4 5 All Rights Reserved 6 7Permission to use, copy, modify, and distribute this software and its 8documentation for any purpose and without fee is hereby granted, 9provided that the above copyright notice appear in all copies and that 10both that copyright notice and this permission notice appear in 11supporting documentation, and that the names of Stichting Mathematisch 12Centrum or CWI not be used in advertising or publicity pertaining to 13distribution of the software without specific, written prior permission. 14 15STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO 16THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 17FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE 18FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 19WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 20ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 21OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 22 23******************************************************************/ 24 25/* Parser generator */ 26/* XXX This file is not yet fully PROTOized */ 27 28/* For a description, see the comments at end of this file */ 29 30#include "pgenheaders.h" 31#include "assert.h" 32#include "token.h" 33#include "node.h" 34#include "grammar.h" 35#include "metagrammar.h" 36#include "pgen.h" 37 38extern int debugging; 39 40 41/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */ 42 43typedef struct _nfaarc { 44 int ar_label; 45 int ar_arrow; 46} nfaarc; 47 48typedef struct _nfastate { 49 int st_narcs; 50 nfaarc *st_arc; 51} nfastate; 52 53typedef struct _nfa { 54 int nf_type; 55 char *nf_name; 56 int nf_nstates; 57 nfastate *nf_state; 58 int nf_start, nf_finish; 59} nfa; 60 61static int 62addnfastate(nf) 63 nfa *nf; 64{ 65 nfastate *st; 66 67 RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1); 68 if (nf->nf_state == NULL) 69 fatal("out of mem"); 70 st = &nf->nf_state[nf->nf_nstates++]; 71 st->st_narcs = 0; 72 st->st_arc = NULL; 73 return st - nf->nf_state; 74} 75 76static void 77addnfaarc(nf, from, to, lbl) 78 nfa *nf; 79 int from, to, lbl; 80{ 81 nfastate *st; 82 nfaarc *ar; 83 84 st = &nf->nf_state[from]; 85 RESIZE(st->st_arc, nfaarc, st->st_narcs + 1); 86 if (st->st_arc == NULL) 87 fatal("out of mem"); 88 ar = &st->st_arc[st->st_narcs++]; 89 ar->ar_label = lbl; 90 ar->ar_arrow = to; 91} 92 93static nfa * 94newnfa(name) 95 char *name; 96{ 97 nfa *nf; 98 static type = NT_OFFSET; /* All types will be disjunct */ 99 100 nf = NEW(nfa, 1); 101 if (nf == NULL) 102 fatal("no mem for new nfa"); 103 nf->nf_type = type++; 104 nf->nf_name = name; /* XXX strdup(name) ??? */ 105 nf->nf_nstates = 0; 106 nf->nf_state = NULL; 107 nf->nf_start = nf->nf_finish = -1; 108 return nf; 109} 110 111typedef struct _nfagrammar { 112 int gr_nnfas; 113 nfa **gr_nfa; 114 labellist gr_ll; 115} nfagrammar; 116 117static nfagrammar * 118newnfagrammar() 119{ 120 nfagrammar *gr; 121 122 gr = NEW(nfagrammar, 1); 123 if (gr == NULL) 124 fatal("no mem for new nfa grammar"); 125 gr->gr_nnfas = 0; 126 gr->gr_nfa = NULL; 127 gr->gr_ll.ll_nlabels = 0; 128 gr->gr_ll.ll_label = NULL; 129 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY"); 130 return gr; 131} 132 133static nfa * 134addnfa(gr, name) 135 nfagrammar *gr; 136 char *name; 137{ 138 nfa *nf; 139 140 nf = newnfa(name); 141 RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1); 142 if (gr->gr_nfa == NULL) 143 fatal("out of mem"); 144 gr->gr_nfa[gr->gr_nnfas++] = nf; 145 addlabel(&gr->gr_ll, NAME, nf->nf_name); 146 return nf; 147} 148 149#ifdef DEBUG 150 151static char REQNFMT[] = "metacompile: less than %d children\n"; 152 153#define REQN(i, count) \ 154 if (i < count) { \ 155 fprintf(stderr, REQNFMT, count); \ 156 abort(); \ 157 } else 158 159#else 160#define REQN(i, count) /* empty */ 161#endif 162 163static nfagrammar * 164metacompile(n) 165 node *n; 166{ 167 nfagrammar *gr; 168 int i; 169 170 printf("Compiling (meta-) parse tree into NFA grammar\n"); 171 gr = newnfagrammar(); 172 REQ(n, MSTART); 173 i = n->n_nchildren - 1; /* Last child is ENDMARKER */ 174 n = n->n_child; 175 for (; --i >= 0; n++) { 176 if (n->n_type != NEWLINE) 177 compile_rule(gr, n); 178 } 179 return gr; 180} 181 182static 183compile_rule(gr, n) 184 nfagrammar *gr; 185 node *n; 186{ 187 nfa *nf; 188 189 REQ(n, RULE); 190 REQN(n->n_nchildren, 4); 191 n = n->n_child; 192 REQ(n, NAME); 193 nf = addnfa(gr, n->n_str); 194 n++; 195 REQ(n, COLON); 196 n++; 197 REQ(n, RHS); 198 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish); 199 n++; 200 REQ(n, NEWLINE); 201} 202 203static 204compile_rhs(ll, nf, n, pa, pb) 205 labellist *ll; 206 nfa *nf; 207 node *n; 208 int *pa, *pb; 209{ 210 int i; 211 int a, b; 212 213 REQ(n, RHS); 214 i = n->n_nchildren; 215 REQN(i, 1); 216 n = n->n_child; 217 REQ(n, ALT); 218 compile_alt(ll, nf, n, pa, pb); 219 if (--i <= 0) 220 return; 221 n++; 222 a = *pa; 223 b = *pb; 224 *pa = addnfastate(nf); 225 *pb = addnfastate(nf); 226 addnfaarc(nf, *pa, a, EMPTY); 227 addnfaarc(nf, b, *pb, EMPTY); 228 for (; --i >= 0; n++) { 229 REQ(n, VBAR); 230 REQN(i, 1); 231 --i; 232 n++; 233 REQ(n, ALT); 234 compile_alt(ll, nf, n, &a, &b); 235 addnfaarc(nf, *pa, a, EMPTY); 236 addnfaarc(nf, b, *pb, EMPTY); 237 } 238} 239 240static 241compile_alt(ll, nf, n, pa, pb) 242 labellist *ll; 243 nfa *nf; 244 node *n; 245 int *pa, *pb; 246{ 247 int i; 248 int a, b; 249 250 REQ(n, ALT); 251 i = n->n_nchildren; 252 REQN(i, 1); 253 n = n->n_child; 254 REQ(n, ITEM); 255 compile_item(ll, nf, n, pa, pb); 256 --i; 257 n++; 258 for (; --i >= 0; n++) { 259 if (n->n_type == COMMA) { /* XXX Temporary */ 260 REQN(i, 1); 261 --i; 262 n++; 263 } 264 REQ(n, ITEM); 265 compile_item(ll, nf, n, &a, &b); 266 addnfaarc(nf, *pb, a, EMPTY); 267 *pb = b; 268 } 269} 270 271static 272compile_item(ll, nf, n, pa, pb) 273 labellist *ll; 274 nfa *nf; 275 node *n; 276 int *pa, *pb; 277{ 278 int i; 279 int a, b; 280 281 REQ(n, ITEM); 282 i = n->n_nchildren; 283 REQN(i, 1); 284 n = n->n_child; 285 if (n->n_type == LSQB) { 286 REQN(i, 3); 287 n++; 288 REQ(n, RHS); 289 *pa = addnfastate(nf); 290 *pb = addnfastate(nf); 291 addnfaarc(nf, *pa, *pb, EMPTY); 292 compile_rhs(ll, nf, n, &a, &b); 293 addnfaarc(nf, *pa, a, EMPTY); 294 addnfaarc(nf, b, *pb, EMPTY); 295 REQN(i, 1); 296 n++; 297 REQ(n, RSQB); 298 } 299 else { 300 compile_atom(ll, nf, n, pa, pb); 301 if (--i <= 0) 302 return; 303 n++; 304 addnfaarc(nf, *pb, *pa, EMPTY); 305 if (n->n_type == STAR) 306 *pb = *pa; 307 else 308 REQ(n, PLUS); 309 } 310} 311 312static 313compile_atom(ll, nf, n, pa, pb) 314 labellist *ll; 315 nfa *nf; 316 node *n; 317 int *pa, *pb; 318{ 319 int i; 320 321 REQ(n, ATOM); 322 i = n->n_nchildren; 323 REQN(i, 1); 324 n = n->n_child; 325 if (n->n_type == LPAR) { 326 REQN(i, 3); 327 n++; 328 REQ(n, RHS); 329 compile_rhs(ll, nf, n, pa, pb); 330 n++; 331 REQ(n, RPAR); 332 } 333 else if (n->n_type == NAME || n->n_type == STRING) { 334 *pa = addnfastate(nf); 335 *pb = addnfastate(nf); 336 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str)); 337 } 338 else 339 REQ(n, NAME); 340} 341 342static void 343dumpstate(ll, nf, istate) 344 labellist *ll; 345 nfa *nf; 346 int istate; 347{ 348 nfastate *st; 349 int i; 350 nfaarc *ar; 351 352 printf("%c%2d%c", 353 istate == nf->nf_start ? '*' : ' ', 354 istate, 355 istate == nf->nf_finish ? '.' : ' '); 356 st = &nf->nf_state[istate]; 357 ar = st->st_arc; 358 for (i = 0; i < st->st_narcs; i++) { 359 if (i > 0) 360 printf("\n "); 361 printf("-> %2d %s", ar->ar_arrow, 362 labelrepr(&ll->ll_label[ar->ar_label])); 363 ar++; 364 } 365 printf("\n"); 366} 367 368static void 369dumpnfa(ll, nf) 370 labellist *ll; 371 nfa *nf; 372{ 373 int i; 374 375 printf("NFA '%s' has %d states; start %d, finish %d\n", 376 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish); 377 for (i = 0; i < nf->nf_nstates; i++) 378 dumpstate(ll, nf, i); 379} 380 381 382/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */ 383 384static int 385addclosure(ss, nf, istate) 386 bitset ss; 387 nfa *nf; 388 int istate; 389{ 390 if (addbit(ss, istate)) { 391 nfastate *st = &nf->nf_state[istate]; 392 nfaarc *ar = st->st_arc; 393 int i; 394 395 for (i = st->st_narcs; --i >= 0; ) { 396 if (ar->ar_label == EMPTY) 397 addclosure(ss, nf, ar->ar_arrow); 398 ar++; 399 } 400 } 401} 402 403typedef struct _ss_arc { 404 bitset sa_bitset; 405 int sa_arrow; 406 int sa_label; 407} ss_arc; 408 409typedef struct _ss_state { 410 bitset ss_ss; 411 int ss_narcs; 412 ss_arc *ss_arc; 413 int ss_deleted; 414 int ss_finish; 415 int ss_rename; 416} ss_state; 417 418typedef struct _ss_dfa { 419 int sd_nstates; 420 ss_state *sd_state; 421} ss_dfa; 422 423static 424makedfa(gr, nf, d) 425 nfagrammar *gr; 426 nfa *nf; 427 dfa *d; 428{ 429 int nbits = nf->nf_nstates; 430 bitset ss; 431 int xx_nstates; 432 ss_state *xx_state, *yy; 433 ss_arc *zz; 434 int istate, jstate, iarc, jarc, ibit; 435 nfastate *st; 436 nfaarc *ar; 437 438 ss = newbitset(nbits); 439 addclosure(ss, nf, nf->nf_start); 440 xx_state = NEW(ss_state, 1); 441 if (xx_state == NULL) 442 fatal("no mem for xx_state in makedfa"); 443 xx_nstates = 1; 444 yy = &xx_state[0]; 445 yy->ss_ss = ss; 446 yy->ss_narcs = 0; 447 yy->ss_arc = NULL; 448 yy->ss_deleted = 0; 449 yy->ss_finish = testbit(ss, nf->nf_finish); 450 if (yy->ss_finish) 451 printf("Error: nonterminal '%s' may produce empty.\n", 452 nf->nf_name); 453 454 /* This algorithm is from a book written before 455 the invention of structured programming... */ 456 457 /* For each unmarked state... */ 458 for (istate = 0; istate < xx_nstates; ++istate) { 459 yy = &xx_state[istate]; 460 ss = yy->ss_ss; 461 /* For all its states... */ 462 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) { 463 if (!testbit(ss, ibit)) 464 continue; 465 st = &nf->nf_state[ibit]; 466 /* For all non-empty arcs from this state... */ 467 for (iarc = 0; iarc < st->st_narcs; iarc++) { 468 ar = &st->st_arc[iarc]; 469 if (ar->ar_label == EMPTY) 470 continue; 471 /* Look up in list of arcs from this state */ 472 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) { 473 zz = &yy->ss_arc[jarc]; 474 if (ar->ar_label == zz->sa_label) 475 goto found; 476 } 477 /* Add new arc for this state */ 478 RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1); 479 if (yy->ss_arc == NULL) 480 fatal("out of mem"); 481 zz = &yy->ss_arc[yy->ss_narcs++]; 482 zz->sa_label = ar->ar_label; 483 zz->sa_bitset = newbitset(nbits); 484 zz->sa_arrow = -1; 485 found: ; 486 /* Add destination */ 487 addclosure(zz->sa_bitset, nf, ar->ar_arrow); 488 } 489 } 490 /* Now look up all the arrow states */ 491 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) { 492 zz = &xx_state[istate].ss_arc[jarc]; 493 for (jstate = 0; jstate < xx_nstates; jstate++) { 494 if (samebitset(zz->sa_bitset, 495 xx_state[jstate].ss_ss, nbits)) { 496 zz->sa_arrow = jstate; 497 goto done; 498 } 499 } 500 RESIZE(xx_state, ss_state, xx_nstates + 1); 501 if (xx_state == NULL) 502 fatal("out of mem"); 503 zz->sa_arrow = xx_nstates; 504 yy = &xx_state[xx_nstates++]; 505 yy->ss_ss = zz->sa_bitset; 506 yy->ss_narcs = 0; 507 yy->ss_arc = NULL; 508 yy->ss_deleted = 0; 509 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish); 510 done: ; 511 } 512 } 513 514 if (debugging) 515 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 516 "before minimizing"); 517 518 simplify(xx_nstates, xx_state); 519 520 if (debugging) 521 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 522 "after minimizing"); 523 524 convert(d, xx_nstates, xx_state); 525 526 /* XXX cleanup */ 527} 528 529static 530printssdfa(xx_nstates, xx_state, nbits, ll, msg) 531 int xx_nstates; 532 ss_state *xx_state; 533 int nbits; 534 labellist *ll; 535 char *msg; 536{ 537 int i, ibit, iarc; 538 ss_state *yy; 539 ss_arc *zz; 540 541 printf("Subset DFA %s\n", msg); 542 for (i = 0; i < xx_nstates; i++) { 543 yy = &xx_state[i]; 544 if (yy->ss_deleted) 545 continue; 546 printf(" Subset %d", i); 547 if (yy->ss_finish) 548 printf(" (finish)"); 549 printf(" { "); 550 for (ibit = 0; ibit < nbits; ibit++) { 551 if (testbit(yy->ss_ss, ibit)) 552 printf("%d ", ibit); 553 } 554 printf("}\n"); 555 for (iarc = 0; iarc < yy->ss_narcs; iarc++) { 556 zz = &yy->ss_arc[iarc]; 557 printf(" Arc to state %d, label %s\n", 558 zz->sa_arrow, 559 labelrepr(&ll->ll_label[zz->sa_label])); 560 } 561 } 562} 563 564 565/* PART THREE -- SIMPLIFY DFA */ 566 567/* Simplify the DFA by repeatedly eliminating states that are 568 equivalent to another oner. This is NOT Algorithm 3.3 from 569 [Aho&Ullman 77]. It does not always finds the minimal DFA, 570 but it does usually make a much smaller one... (For an example 571 of sub-optimal behaviour, try S: x a b+ | y a b+.) 572*/ 573 574static int 575samestate(s1, s2) 576 ss_state *s1, *s2; 577{ 578 int i; 579 580 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish) 581 return 0; 582 for (i = 0; i < s1->ss_narcs; i++) { 583 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow || 584 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label) 585 return 0; 586 } 587 return 1; 588} 589 590static void 591renamestates(xx_nstates, xx_state, from, to) 592 int xx_nstates; 593 ss_state *xx_state; 594 int from, to; 595{ 596 int i, j; 597 598 if (debugging) 599 printf("Rename state %d to %d.\n", from, to); 600 for (i = 0; i < xx_nstates; i++) { 601 if (xx_state[i].ss_deleted) 602 continue; 603 for (j = 0; j < xx_state[i].ss_narcs; j++) { 604 if (xx_state[i].ss_arc[j].sa_arrow == from) 605 xx_state[i].ss_arc[j].sa_arrow = to; 606 } 607 } 608} 609 610static 611simplify(xx_nstates, xx_state) 612 int xx_nstates; 613 ss_state *xx_state; 614{ 615 int changes; 616 int i, j, k; 617 618 do { 619 changes = 0; 620 for (i = 1; i < xx_nstates; i++) { 621 if (xx_state[i].ss_deleted) 622 continue; 623 for (j = 0; j < i; j++) { 624 if (xx_state[j].ss_deleted) 625 continue; 626 if (samestate(&xx_state[i], &xx_state[j])) { 627 xx_state[i].ss_deleted++; 628 renamestates(xx_nstates, xx_state, i, j); 629 changes++; 630 break; 631 } 632 } 633 } 634 } while (changes); 635} 636 637 638/* PART FOUR -- GENERATE PARSING TABLES */ 639 640/* Convert the DFA into a grammar that can be used by our parser */ 641 642static 643convert(d, xx_nstates, xx_state) 644 dfa *d; 645 int xx_nstates; 646 ss_state *xx_state; 647{ 648 int i, j; 649 ss_state *yy; 650 ss_arc *zz; 651 652 for (i = 0; i < xx_nstates; i++) { 653 yy = &xx_state[i]; 654 if (yy->ss_deleted) 655 continue; 656 yy->ss_rename = addstate(d); 657 } 658 659 for (i = 0; i < xx_nstates; i++) { 660 yy = &xx_state[i]; 661 if (yy->ss_deleted) 662 continue; 663 for (j = 0; j < yy->ss_narcs; j++) { 664 zz = &yy->ss_arc[j]; 665 addarc(d, yy->ss_rename, 666 xx_state[zz->sa_arrow].ss_rename, 667 zz->sa_label); 668 } 669 if (yy->ss_finish) 670 addarc(d, yy->ss_rename, yy->ss_rename, 0); 671 } 672 673 d->d_initial = 0; 674} 675 676 677/* PART FIVE -- GLUE IT ALL TOGETHER */ 678 679static grammar * 680maketables(gr) 681 nfagrammar *gr; 682{ 683 int i; 684 nfa *nf; 685 dfa *d; 686 grammar *g; 687 688 if (gr->gr_nnfas == 0) 689 return NULL; 690 g = newgrammar(gr->gr_nfa[0]->nf_type); 691 /* XXX first rule must be start rule */ 692 g->g_ll = gr->gr_ll; 693 694 for (i = 0; i < gr->gr_nnfas; i++) { 695 nf = gr->gr_nfa[i]; 696 if (debugging) { 697 printf("Dump of NFA for '%s' ...\n", nf->nf_name); 698 dumpnfa(&gr->gr_ll, nf); 699 } 700 printf("Making DFA for '%s' ...\n", nf->nf_name); 701 d = adddfa(g, nf->nf_type, nf->nf_name); 702 makedfa(gr, gr->gr_nfa[i], d); 703 } 704 705 return g; 706} 707 708grammar * 709pgen(n) 710 node *n; 711{ 712 nfagrammar *gr; 713 grammar *g; 714 715 gr = metacompile(n); 716 g = maketables(gr); 717 translatelabels(g); 718 addfirstsets(g); 719 return g; 720} 721 722 723/* 724 725Description 726----------- 727 728Input is a grammar in extended BNF (using * for repetition, + for 729at-least-once repetition, [] for optional parts, | for alternatives and 730() for grouping). This has already been parsed and turned into a parse 731tree. 732 733Each rule is considered as a regular expression in its own right. 734It is turned into a Non-deterministic Finite Automaton (NFA), which 735is then turned into a Deterministic Finite Automaton (DFA), which is then 736optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, 737or similar compiler books (this technique is more often used for lexical 738analyzers). 739 740The DFA's are used by the parser as parsing tables in a special way 741that's probably unique. Before they are usable, the FIRST sets of all 742non-terminals are computed. 743 744Reference 745--------- 746 747[Aho&Ullman 77] 748 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 749 (first edition) 750 751*/ 752