pgen.c revision 85a5fbbdfea617f6cc8fae82c9e8c2b5c424436d
1/* Parser generator */ 2 3/* For a description, see the comments at end of this file */ 4 5#include <stdio.h> 6#include "assert.h" 7 8#include "PROTO.h" 9#include "malloc.h" 10#include "token.h" 11#include "node.h" 12#include "grammar.h" 13#include "metagrammar.h" 14#include "pgen.h" 15 16extern int debugging; 17 18 19/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */ 20 21typedef struct _nfaarc { 22 int ar_label; 23 int ar_arrow; 24} nfaarc; 25 26typedef struct _nfastate { 27 int st_narcs; 28 nfaarc *st_arc; 29} nfastate; 30 31typedef struct _nfa { 32 int nf_type; 33 char *nf_name; 34 int nf_nstates; 35 nfastate *nf_state; 36 int nf_start, nf_finish; 37} nfa; 38 39static int 40addnfastate(nf) 41 nfa *nf; 42{ 43 nfastate *st; 44 45 RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1); 46 if (nf->nf_state == NULL) 47 fatal("out of mem"); 48 st = &nf->nf_state[nf->nf_nstates++]; 49 st->st_narcs = 0; 50 st->st_arc = NULL; 51 return st - nf->nf_state; 52} 53 54static void 55addnfaarc(nf, from, to, lbl) 56 nfa *nf; 57 int from, to, lbl; 58{ 59 nfastate *st; 60 nfaarc *ar; 61 62 st = &nf->nf_state[from]; 63 RESIZE(st->st_arc, nfaarc, st->st_narcs + 1); 64 if (st->st_arc == NULL) 65 fatal("out of mem"); 66 ar = &st->st_arc[st->st_narcs++]; 67 ar->ar_label = lbl; 68 ar->ar_arrow = to; 69} 70 71static nfa * 72newnfa(name) 73 char *name; 74{ 75 nfa *nf; 76 static type = NT_OFFSET; /* All types will be disjunct */ 77 78 nf = NEW(nfa, 1); 79 if (nf == NULL) 80 fatal("no mem for new nfa"); 81 nf->nf_type = type++; 82 nf->nf_name = name; /* XXX strdup(name) ??? */ 83 nf->nf_nstates = 0; 84 nf->nf_state = NULL; 85 nf->nf_start = nf->nf_finish = -1; 86 return nf; 87} 88 89typedef struct _nfagrammar { 90 int gr_nnfas; 91 nfa **gr_nfa; 92 labellist gr_ll; 93} nfagrammar; 94 95static nfagrammar * 96newnfagrammar() 97{ 98 nfagrammar *gr; 99 100 gr = NEW(nfagrammar, 1); 101 if (gr == NULL) 102 fatal("no mem for new nfa grammar"); 103 gr->gr_nnfas = 0; 104 gr->gr_nfa = NULL; 105 gr->gr_ll.ll_nlabels = 0; 106 gr->gr_ll.ll_label = NULL; 107 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY"); 108 return gr; 109} 110 111static nfa * 112addnfa(gr, name) 113 nfagrammar *gr; 114 char *name; 115{ 116 nfa *nf; 117 118 nf = newnfa(name); 119 RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1); 120 if (gr->gr_nfa == NULL) 121 fatal("out of mem"); 122 gr->gr_nfa[gr->gr_nnfas++] = nf; 123 addlabel(&gr->gr_ll, NAME, nf->nf_name); 124 return nf; 125} 126 127#ifdef DEBUG 128 129static char REQNFMT[] = "metacompile: less than %d children\n"; 130 131#define REQN(i, count) \ 132 if (i < count) { \ 133 fprintf(stderr, REQNFMT, count); \ 134 abort(); \ 135 } else 136 137#else 138#define REQN(i, count) /* empty */ 139#endif 140 141static nfagrammar * 142metacompile(n) 143 node *n; 144{ 145 nfagrammar *gr; 146 int i; 147 148 printf("Compiling (meta-) parse tree into NFA grammar\n"); 149 gr = newnfagrammar(); 150 REQ(n, MSTART); 151 i = n->n_nchildren - 1; /* Last child is ENDMARKER */ 152 n = n->n_child; 153 for (; --i >= 0; n++) { 154 if (n->n_type != NEWLINE) 155 compile_rule(gr, n); 156 } 157 return gr; 158} 159 160static 161compile_rule(gr, n) 162 nfagrammar *gr; 163 node *n; 164{ 165 nfa *nf; 166 167 REQ(n, RULE); 168 REQN(n->n_nchildren, 4); 169 n = n->n_child; 170 REQ(n, NAME); 171 nf = addnfa(gr, n->n_str); 172 n++; 173 REQ(n, COLON); 174 n++; 175 REQ(n, RHS); 176 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish); 177 n++; 178 REQ(n, NEWLINE); 179} 180 181static 182compile_rhs(ll, nf, n, pa, pb) 183 labellist *ll; 184 nfa *nf; 185 node *n; 186 int *pa, *pb; 187{ 188 int i; 189 int a, b; 190 191 REQ(n, RHS); 192 i = n->n_nchildren; 193 REQN(i, 1); 194 n = n->n_child; 195 REQ(n, ALT); 196 compile_alt(ll, nf, n, pa, pb); 197 if (--i <= 0) 198 return; 199 n++; 200 a = *pa; 201 b = *pb; 202 *pa = addnfastate(nf); 203 *pb = addnfastate(nf); 204 addnfaarc(nf, *pa, a, EMPTY); 205 addnfaarc(nf, b, *pb, EMPTY); 206 for (; --i >= 0; n++) { 207 REQ(n, VBAR); 208 REQN(i, 1); 209 --i; 210 n++; 211 REQ(n, ALT); 212 compile_alt(ll, nf, n, &a, &b); 213 addnfaarc(nf, *pa, a, EMPTY); 214 addnfaarc(nf, b, *pb, EMPTY); 215 } 216} 217 218static 219compile_alt(ll, nf, n, pa, pb) 220 labellist *ll; 221 nfa *nf; 222 node *n; 223 int *pa, *pb; 224{ 225 int i; 226 int a, b; 227 228 REQ(n, ALT); 229 i = n->n_nchildren; 230 REQN(i, 1); 231 n = n->n_child; 232 REQ(n, ITEM); 233 compile_item(ll, nf, n, pa, pb); 234 --i; 235 n++; 236 for (; --i >= 0; n++) { 237 if (n->n_type == COMMA) { /* XXX Temporary */ 238 REQN(i, 1); 239 --i; 240 n++; 241 } 242 REQ(n, ITEM); 243 compile_item(ll, nf, n, &a, &b); 244 addnfaarc(nf, *pb, a, EMPTY); 245 *pb = b; 246 } 247} 248 249static 250compile_item(ll, nf, n, pa, pb) 251 labellist *ll; 252 nfa *nf; 253 node *n; 254 int *pa, *pb; 255{ 256 int i; 257 int a, b; 258 259 REQ(n, ITEM); 260 i = n->n_nchildren; 261 REQN(i, 1); 262 n = n->n_child; 263 if (n->n_type == LSQB) { 264 REQN(i, 3); 265 n++; 266 REQ(n, RHS); 267 *pa = addnfastate(nf); 268 *pb = addnfastate(nf); 269 addnfaarc(nf, *pa, *pb, EMPTY); 270 compile_rhs(ll, nf, n, &a, &b); 271 addnfaarc(nf, *pa, a, EMPTY); 272 addnfaarc(nf, b, *pb, EMPTY); 273 REQN(i, 1); 274 n++; 275 REQ(n, RSQB); 276 } 277 else { 278 compile_atom(ll, nf, n, pa, pb); 279 if (--i <= 0) 280 return; 281 n++; 282 addnfaarc(nf, *pb, *pa, EMPTY); 283 if (n->n_type == STAR) 284 *pb = *pa; 285 else 286 REQ(n, PLUS); 287 } 288} 289 290static 291compile_atom(ll, nf, n, pa, pb) 292 labellist *ll; 293 nfa *nf; 294 node *n; 295 int *pa, *pb; 296{ 297 int i; 298 299 REQ(n, ATOM); 300 i = n->n_nchildren; 301 REQN(i, 1); 302 n = n->n_child; 303 if (n->n_type == LPAR) { 304 REQN(i, 3); 305 n++; 306 REQ(n, RHS); 307 compile_rhs(ll, nf, n, pa, pb); 308 n++; 309 REQ(n, RPAR); 310 } 311 else if (n->n_type == NAME || n->n_type == STRING) { 312 *pa = addnfastate(nf); 313 *pb = addnfastate(nf); 314 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str)); 315 } 316 else 317 REQ(n, NAME); 318} 319 320static void 321dumpstate(ll, nf, istate) 322 labellist *ll; 323 nfa *nf; 324 int istate; 325{ 326 nfastate *st; 327 int i; 328 nfaarc *ar; 329 330 printf("%c%2d%c", 331 istate == nf->nf_start ? '*' : ' ', 332 istate, 333 istate == nf->nf_finish ? '.' : ' '); 334 st = &nf->nf_state[istate]; 335 ar = st->st_arc; 336 for (i = 0; i < st->st_narcs; i++) { 337 if (i > 0) 338 printf("\n "); 339 printf("-> %2d %s", ar->ar_arrow, 340 labelrepr(&ll->ll_label[ar->ar_label])); 341 ar++; 342 } 343 printf("\n"); 344} 345 346static void 347dumpnfa(ll, nf) 348 labellist *ll; 349 nfa *nf; 350{ 351 int i; 352 353 printf("NFA '%s' has %d states; start %d, finish %d\n", 354 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish); 355 for (i = 0; i < nf->nf_nstates; i++) 356 dumpstate(ll, nf, i); 357} 358 359 360/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */ 361 362static int 363addclosure(ss, nf, istate) 364 bitset ss; 365 nfa *nf; 366 int istate; 367{ 368 if (addbit(ss, istate)) { 369 nfastate *st = &nf->nf_state[istate]; 370 nfaarc *ar = st->st_arc; 371 int i; 372 373 for (i = st->st_narcs; --i >= 0; ) { 374 if (ar->ar_label == EMPTY) 375 addclosure(ss, nf, ar->ar_arrow); 376 ar++; 377 } 378 } 379} 380 381typedef struct _ss_arc { 382 bitset sa_bitset; 383 int sa_arrow; 384 int sa_label; 385} ss_arc; 386 387typedef struct _ss_state { 388 bitset ss_ss; 389 int ss_narcs; 390 ss_arc *ss_arc; 391 int ss_deleted; 392 int ss_finish; 393 int ss_rename; 394} ss_state; 395 396typedef struct _ss_dfa { 397 int sd_nstates; 398 ss_state *sd_state; 399} ss_dfa; 400 401static 402makedfa(gr, nf, d) 403 nfagrammar *gr; 404 nfa *nf; 405 dfa *d; 406{ 407 int nbits = nf->nf_nstates; 408 bitset ss; 409 int xx_nstates; 410 ss_state *xx_state, *yy; 411 ss_arc *zz; 412 int istate, jstate, iarc, jarc, ibit; 413 nfastate *st; 414 nfaarc *ar; 415 416 ss = newbitset(nbits); 417 addclosure(ss, nf, nf->nf_start); 418 xx_state = NEW(ss_state, 1); 419 if (xx_state == NULL) 420 fatal("no mem for xx_state in makedfa"); 421 xx_nstates = 1; 422 yy = &xx_state[0]; 423 yy->ss_ss = ss; 424 yy->ss_narcs = 0; 425 yy->ss_arc = NULL; 426 yy->ss_deleted = 0; 427 yy->ss_finish = testbit(ss, nf->nf_finish); 428 if (yy->ss_finish) 429 printf("Error: nonterminal '%s' may produce empty.\n", 430 nf->nf_name); 431 432 /* This algorithm is from a book written before 433 the invention of structured programming... */ 434 435 /* For each unmarked state... */ 436 for (istate = 0; istate < xx_nstates; ++istate) { 437 yy = &xx_state[istate]; 438 ss = yy->ss_ss; 439 /* For all its states... */ 440 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) { 441 if (!testbit(ss, ibit)) 442 continue; 443 st = &nf->nf_state[ibit]; 444 /* For all non-empty arcs from this state... */ 445 for (iarc = 0; iarc < st->st_narcs; iarc++) { 446 ar = &st->st_arc[iarc]; 447 if (ar->ar_label == EMPTY) 448 continue; 449 /* Look up in list of arcs from this state */ 450 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) { 451 zz = &yy->ss_arc[jarc]; 452 if (ar->ar_label == zz->sa_label) 453 goto found; 454 } 455 /* Add new arc for this state */ 456 RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1); 457 if (yy->ss_arc == NULL) 458 fatal("out of mem"); 459 zz = &yy->ss_arc[yy->ss_narcs++]; 460 zz->sa_label = ar->ar_label; 461 zz->sa_bitset = newbitset(nbits); 462 zz->sa_arrow = -1; 463 found: ; 464 /* Add destination */ 465 addclosure(zz->sa_bitset, nf, ar->ar_arrow); 466 } 467 } 468 /* Now look up all the arrow states */ 469 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) { 470 zz = &xx_state[istate].ss_arc[jarc]; 471 for (jstate = 0; jstate < xx_nstates; jstate++) { 472 if (samebitset(zz->sa_bitset, 473 xx_state[jstate].ss_ss, nbits)) { 474 zz->sa_arrow = jstate; 475 goto done; 476 } 477 } 478 RESIZE(xx_state, ss_state, xx_nstates + 1); 479 if (xx_state == NULL) 480 fatal("out of mem"); 481 zz->sa_arrow = xx_nstates; 482 yy = &xx_state[xx_nstates++]; 483 yy->ss_ss = zz->sa_bitset; 484 yy->ss_narcs = 0; 485 yy->ss_arc = NULL; 486 yy->ss_deleted = 0; 487 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish); 488 done: ; 489 } 490 } 491 492 if (debugging) 493 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 494 "before minimizing"); 495 496 simplify(xx_nstates, xx_state); 497 498 if (debugging) 499 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 500 "after minimizing"); 501 502 convert(d, xx_nstates, xx_state); 503 504 /* XXX cleanup */ 505} 506 507static 508printssdfa(xx_nstates, xx_state, nbits, ll, msg) 509 int xx_nstates; 510 ss_state *xx_state; 511 int nbits; 512 labellist *ll; 513 char *msg; 514{ 515 int i, ibit, iarc; 516 ss_state *yy; 517 ss_arc *zz; 518 519 printf("Subset DFA %s\n", msg); 520 for (i = 0; i < xx_nstates; i++) { 521 yy = &xx_state[i]; 522 if (yy->ss_deleted) 523 continue; 524 printf(" Subset %d", i); 525 if (yy->ss_finish) 526 printf(" (finish)"); 527 printf(" { "); 528 for (ibit = 0; ibit < nbits; ibit++) { 529 if (testbit(yy->ss_ss, ibit)) 530 printf("%d ", ibit); 531 } 532 printf("}\n"); 533 for (iarc = 0; iarc < yy->ss_narcs; iarc++) { 534 zz = &yy->ss_arc[iarc]; 535 printf(" Arc to state %d, label %s\n", 536 zz->sa_arrow, 537 labelrepr(&ll->ll_label[zz->sa_label])); 538 } 539 } 540} 541 542 543/* PART THREE -- SIMPLIFY DFA */ 544 545/* Simplify the DFA by repeatedly eliminating states that are 546 equivalent to another oner. This is NOT Algorithm 3.3 from 547 [Aho&Ullman 77]. It does not always finds the minimal DFA, 548 but it does usually make a much smaller one... (For an example 549 of sub-optimal behaviour, try S: x a b+ | y a b+.) 550*/ 551 552static int 553samestate(s1, s2) 554 ss_state *s1, *s2; 555{ 556 int i; 557 558 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish) 559 return 0; 560 for (i = 0; i < s1->ss_narcs; i++) { 561 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow || 562 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label) 563 return 0; 564 } 565 return 1; 566} 567 568static void 569renamestates(xx_nstates, xx_state, from, to) 570 int xx_nstates; 571 ss_state *xx_state; 572 int from, to; 573{ 574 int i, j; 575 576 if (debugging) 577 printf("Rename state %d to %d.\n", from, to); 578 for (i = 0; i < xx_nstates; i++) { 579 if (xx_state[i].ss_deleted) 580 continue; 581 for (j = 0; j < xx_state[i].ss_narcs; j++) { 582 if (xx_state[i].ss_arc[j].sa_arrow == from) 583 xx_state[i].ss_arc[j].sa_arrow = to; 584 } 585 } 586} 587 588static 589simplify(xx_nstates, xx_state) 590 int xx_nstates; 591 ss_state *xx_state; 592{ 593 int changes; 594 int i, j, k; 595 596 do { 597 changes = 0; 598 for (i = 1; i < xx_nstates; i++) { 599 if (xx_state[i].ss_deleted) 600 continue; 601 for (j = 0; j < i; j++) { 602 if (xx_state[j].ss_deleted) 603 continue; 604 if (samestate(&xx_state[i], &xx_state[j])) { 605 xx_state[i].ss_deleted++; 606 renamestates(xx_nstates, xx_state, i, j); 607 changes++; 608 break; 609 } 610 } 611 } 612 } while (changes); 613} 614 615 616/* PART FOUR -- GENERATE PARSING TABLES */ 617 618/* Convert the DFA into a grammar that can be used by our parser */ 619 620static 621convert(d, xx_nstates, xx_state) 622 dfa *d; 623 int xx_nstates; 624 ss_state *xx_state; 625{ 626 int i, j; 627 ss_state *yy; 628 ss_arc *zz; 629 630 for (i = 0; i < xx_nstates; i++) { 631 yy = &xx_state[i]; 632 if (yy->ss_deleted) 633 continue; 634 yy->ss_rename = addstate(d); 635 } 636 637 for (i = 0; i < xx_nstates; i++) { 638 yy = &xx_state[i]; 639 if (yy->ss_deleted) 640 continue; 641 for (j = 0; j < yy->ss_narcs; j++) { 642 zz = &yy->ss_arc[j]; 643 addarc(d, yy->ss_rename, 644 xx_state[zz->sa_arrow].ss_rename, 645 zz->sa_label); 646 } 647 if (yy->ss_finish) 648 addarc(d, yy->ss_rename, yy->ss_rename, 0); 649 } 650 651 d->d_initial = 0; 652} 653 654 655/* PART FIVE -- GLUE IT ALL TOGETHER */ 656 657static grammar * 658maketables(gr) 659 nfagrammar *gr; 660{ 661 int i; 662 nfa *nf; 663 dfa *d; 664 grammar *g; 665 666 if (gr->gr_nnfas == 0) 667 return NULL; 668 g = newgrammar(gr->gr_nfa[0]->nf_type); 669 /* XXX first rule must be start rule */ 670 g->g_ll = gr->gr_ll; 671 672 for (i = 0; i < gr->gr_nnfas; i++) { 673 nf = gr->gr_nfa[i]; 674 if (debugging) { 675 printf("Dump of NFA for '%s' ...\n", nf->nf_name); 676 dumpnfa(&gr->gr_ll, nf); 677 } 678 printf("Making DFA for '%s' ...\n", nf->nf_name); 679 d = adddfa(g, nf->nf_type, nf->nf_name); 680 makedfa(gr, gr->gr_nfa[i], d); 681 } 682 683 return g; 684} 685 686grammar * 687pgen(n) 688 node *n; 689{ 690 nfagrammar *gr; 691 grammar *g; 692 693 gr = metacompile(n); 694 g = maketables(gr); 695 translatelabels(g); 696 addfirstsets(g); 697 return g; 698} 699 700 701/* 702 703Description 704----------- 705 706Input is a grammar in extended BNF (using * for repetition, + for 707at-least-once repetition, [] for optional parts, | for alternatives and 708() for grouping). This has already been parsed and turned into a parse 709tree. 710 711Each rule is considered as a regular expression in its own right. 712It is turned into a Non-deterministic Finite Automaton (NFA), which 713is then turned into a Deterministic Finite Automaton (DFA), which is then 714optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, 715or similar compiler books (this technique is more often used for lexical 716analyzers). 717 718The DFA's are used by the parser as parsing tables in a special way 719that's probably unique. Before they are usable, the FIRST sets of all 720non-terminals are computed. 721 722Reference 723--------- 724 725[Aho&Ullman 77] 726 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 727 (first edition) 728 729*/ 730