pgen.c revision 408027ea4635de5c892b28c4fdf41840eb9089a4
1/*********************************************************** 2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam, 3The Netherlands. 4 5 All Rights Reserved 6 7Permission to use, copy, modify, and distribute this software and its 8documentation for any purpose and without fee is hereby granted, 9provided that the above copyright notice appear in all copies and that 10both that copyright notice and this permission notice appear in 11supporting documentation, and that the names of Stichting Mathematisch 12Centrum or CWI or Corporation for National Research Initiatives or 13CNRI not be used in advertising or publicity pertaining to 14distribution of the software without specific, written prior 15permission. 16 17While CWI is the initial source for this software, a modified version 18is made available by the Corporation for National Research Initiatives 19(CNRI) at the Internet address ftp://ftp.python.org. 20 21STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH 22REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF 23MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH 24CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL 25DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR 26PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 28PERFORMANCE OF THIS SOFTWARE. 29 30******************************************************************/ 31 32/* Parser generator */ 33/* XXX This file is not yet fully PROTOized */ 34 35/* For a description, see the comments at end of this file */ 36 37#include "pgenheaders.h" 38#include "assert.h" 39#include "token.h" 40#include "node.h" 41#include "grammar.h" 42#include "metagrammar.h" 43#include "pgen.h" 44 45extern int debugging; 46 47 48/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */ 49 50typedef struct _nfaarc { 51 int ar_label; 52 int ar_arrow; 53} nfaarc; 54 55typedef struct _nfastate { 56 int st_narcs; 57 nfaarc *st_arc; 58} nfastate; 59 60typedef struct _nfa { 61 int nf_type; 62 char *nf_name; 63 int nf_nstates; 64 nfastate *nf_state; 65 int nf_start, nf_finish; 66} nfa; 67 68/* Forward */ 69static void compile_rhs PROTO((labellist *ll, 70 nfa *nf, node *n, int *pa, int *pb)); 71static void compile_alt PROTO((labellist *ll, 72 nfa *nf, node *n, int *pa, int *pb)); 73static void compile_item PROTO((labellist *ll, 74 nfa *nf, node *n, int *pa, int *pb)); 75static void compile_atom PROTO((labellist *ll, 76 nfa *nf, node *n, int *pa, int *pb)); 77 78static int 79addnfastate(nf) 80 nfa *nf; 81{ 82 nfastate *st; 83 84 RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1); 85 if (nf->nf_state == NULL) 86 fatal("out of mem"); 87 st = &nf->nf_state[nf->nf_nstates++]; 88 st->st_narcs = 0; 89 st->st_arc = NULL; 90 return st - nf->nf_state; 91} 92 93static void 94addnfaarc(nf, from, to, lbl) 95 nfa *nf; 96 int from, to, lbl; 97{ 98 nfastate *st; 99 nfaarc *ar; 100 101 st = &nf->nf_state[from]; 102 RESIZE(st->st_arc, nfaarc, st->st_narcs + 1); 103 if (st->st_arc == NULL) 104 fatal("out of mem"); 105 ar = &st->st_arc[st->st_narcs++]; 106 ar->ar_label = lbl; 107 ar->ar_arrow = to; 108} 109 110static nfa * 111newnfa(name) 112 char *name; 113{ 114 nfa *nf; 115 static type = NT_OFFSET; /* All types will be disjunct */ 116 117 nf = NEW(nfa, 1); 118 if (nf == NULL) 119 fatal("no mem for new nfa"); 120 nf->nf_type = type++; 121 nf->nf_name = name; /* XXX strdup(name) ??? */ 122 nf->nf_nstates = 0; 123 nf->nf_state = NULL; 124 nf->nf_start = nf->nf_finish = -1; 125 return nf; 126} 127 128typedef struct _nfagrammar { 129 int gr_nnfas; 130 nfa **gr_nfa; 131 labellist gr_ll; 132} nfagrammar; 133 134/* Forward */ 135static void compile_rule PROTO((nfagrammar *gr, node *n)); 136 137static nfagrammar * 138newnfagrammar() 139{ 140 nfagrammar *gr; 141 142 gr = NEW(nfagrammar, 1); 143 if (gr == NULL) 144 fatal("no mem for new nfa grammar"); 145 gr->gr_nnfas = 0; 146 gr->gr_nfa = NULL; 147 gr->gr_ll.ll_nlabels = 0; 148 gr->gr_ll.ll_label = NULL; 149 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY"); 150 return gr; 151} 152 153static nfa * 154addnfa(gr, name) 155 nfagrammar *gr; 156 char *name; 157{ 158 nfa *nf; 159 160 nf = newnfa(name); 161 RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1); 162 if (gr->gr_nfa == NULL) 163 fatal("out of mem"); 164 gr->gr_nfa[gr->gr_nnfas++] = nf; 165 addlabel(&gr->gr_ll, NAME, nf->nf_name); 166 return nf; 167} 168 169#ifdef Py_DEBUG 170 171static char REQNFMT[] = "metacompile: less than %d children\n"; 172 173#define REQN(i, count) \ 174 if (i < count) { \ 175 fprintf(stderr, REQNFMT, count); \ 176 fatal("REQN"); \ 177 } else 178 179#else 180#define REQN(i, count) /* empty */ 181#endif 182 183static nfagrammar * 184metacompile(n) 185 node *n; 186{ 187 nfagrammar *gr; 188 int i; 189 190 printf("Compiling (meta-) parse tree into NFA grammar\n"); 191 gr = newnfagrammar(); 192 REQ(n, MSTART); 193 i = n->n_nchildren - 1; /* Last child is ENDMARKER */ 194 n = n->n_child; 195 for (; --i >= 0; n++) { 196 if (n->n_type != NEWLINE) 197 compile_rule(gr, n); 198 } 199 return gr; 200} 201 202static void 203compile_rule(gr, n) 204 nfagrammar *gr; 205 node *n; 206{ 207 nfa *nf; 208 209 REQ(n, RULE); 210 REQN(n->n_nchildren, 4); 211 n = n->n_child; 212 REQ(n, NAME); 213 nf = addnfa(gr, n->n_str); 214 n++; 215 REQ(n, COLON); 216 n++; 217 REQ(n, RHS); 218 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish); 219 n++; 220 REQ(n, NEWLINE); 221} 222 223static void 224compile_rhs(ll, nf, n, pa, pb) 225 labellist *ll; 226 nfa *nf; 227 node *n; 228 int *pa, *pb; 229{ 230 int i; 231 int a, b; 232 233 REQ(n, RHS); 234 i = n->n_nchildren; 235 REQN(i, 1); 236 n = n->n_child; 237 REQ(n, ALT); 238 compile_alt(ll, nf, n, pa, pb); 239 if (--i <= 0) 240 return; 241 n++; 242 a = *pa; 243 b = *pb; 244 *pa = addnfastate(nf); 245 *pb = addnfastate(nf); 246 addnfaarc(nf, *pa, a, EMPTY); 247 addnfaarc(nf, b, *pb, EMPTY); 248 for (; --i >= 0; n++) { 249 REQ(n, VBAR); 250 REQN(i, 1); 251 --i; 252 n++; 253 REQ(n, ALT); 254 compile_alt(ll, nf, n, &a, &b); 255 addnfaarc(nf, *pa, a, EMPTY); 256 addnfaarc(nf, b, *pb, EMPTY); 257 } 258} 259 260static void 261compile_alt(ll, nf, n, pa, pb) 262 labellist *ll; 263 nfa *nf; 264 node *n; 265 int *pa, *pb; 266{ 267 int i; 268 int a, b; 269 270 REQ(n, ALT); 271 i = n->n_nchildren; 272 REQN(i, 1); 273 n = n->n_child; 274 REQ(n, ITEM); 275 compile_item(ll, nf, n, pa, pb); 276 --i; 277 n++; 278 for (; --i >= 0; n++) { 279 if (n->n_type == COMMA) { /* XXX Temporary */ 280 REQN(i, 1); 281 --i; 282 n++; 283 } 284 REQ(n, ITEM); 285 compile_item(ll, nf, n, &a, &b); 286 addnfaarc(nf, *pb, a, EMPTY); 287 *pb = b; 288 } 289} 290 291static void 292compile_item(ll, nf, n, pa, pb) 293 labellist *ll; 294 nfa *nf; 295 node *n; 296 int *pa, *pb; 297{ 298 int i; 299 int a, b; 300 301 REQ(n, ITEM); 302 i = n->n_nchildren; 303 REQN(i, 1); 304 n = n->n_child; 305 if (n->n_type == LSQB) { 306 REQN(i, 3); 307 n++; 308 REQ(n, RHS); 309 *pa = addnfastate(nf); 310 *pb = addnfastate(nf); 311 addnfaarc(nf, *pa, *pb, EMPTY); 312 compile_rhs(ll, nf, n, &a, &b); 313 addnfaarc(nf, *pa, a, EMPTY); 314 addnfaarc(nf, b, *pb, EMPTY); 315 REQN(i, 1); 316 n++; 317 REQ(n, RSQB); 318 } 319 else { 320 compile_atom(ll, nf, n, pa, pb); 321 if (--i <= 0) 322 return; 323 n++; 324 addnfaarc(nf, *pb, *pa, EMPTY); 325 if (n->n_type == STAR) 326 *pb = *pa; 327 else 328 REQ(n, PLUS); 329 } 330} 331 332static void 333compile_atom(ll, nf, n, pa, pb) 334 labellist *ll; 335 nfa *nf; 336 node *n; 337 int *pa, *pb; 338{ 339 int i; 340 341 REQ(n, ATOM); 342 i = n->n_nchildren; 343 REQN(i, 1); 344 n = n->n_child; 345 if (n->n_type == LPAR) { 346 REQN(i, 3); 347 n++; 348 REQ(n, RHS); 349 compile_rhs(ll, nf, n, pa, pb); 350 n++; 351 REQ(n, RPAR); 352 } 353 else if (n->n_type == NAME || n->n_type == STRING) { 354 *pa = addnfastate(nf); 355 *pb = addnfastate(nf); 356 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str)); 357 } 358 else 359 REQ(n, NAME); 360} 361 362static void 363dumpstate(ll, nf, istate) 364 labellist *ll; 365 nfa *nf; 366 int istate; 367{ 368 nfastate *st; 369 int i; 370 nfaarc *ar; 371 372 printf("%c%2d%c", 373 istate == nf->nf_start ? '*' : ' ', 374 istate, 375 istate == nf->nf_finish ? '.' : ' '); 376 st = &nf->nf_state[istate]; 377 ar = st->st_arc; 378 for (i = 0; i < st->st_narcs; i++) { 379 if (i > 0) 380 printf("\n "); 381 printf("-> %2d %s", ar->ar_arrow, 382 labelrepr(&ll->ll_label[ar->ar_label])); 383 ar++; 384 } 385 printf("\n"); 386} 387 388static void 389dumpnfa(ll, nf) 390 labellist *ll; 391 nfa *nf; 392{ 393 int i; 394 395 printf("NFA '%s' has %d states; start %d, finish %d\n", 396 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish); 397 for (i = 0; i < nf->nf_nstates; i++) 398 dumpstate(ll, nf, i); 399} 400 401 402/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */ 403 404static void 405addclosure(ss, nf, istate) 406 bitset ss; 407 nfa *nf; 408 int istate; 409{ 410 if (addbit(ss, istate)) { 411 nfastate *st = &nf->nf_state[istate]; 412 nfaarc *ar = st->st_arc; 413 int i; 414 415 for (i = st->st_narcs; --i >= 0; ) { 416 if (ar->ar_label == EMPTY) 417 addclosure(ss, nf, ar->ar_arrow); 418 ar++; 419 } 420 } 421} 422 423typedef struct _ss_arc { 424 bitset sa_bitset; 425 int sa_arrow; 426 int sa_label; 427} ss_arc; 428 429typedef struct _ss_state { 430 bitset ss_ss; 431 int ss_narcs; 432 ss_arc *ss_arc; 433 int ss_deleted; 434 int ss_finish; 435 int ss_rename; 436} ss_state; 437 438typedef struct _ss_dfa { 439 int sd_nstates; 440 ss_state *sd_state; 441} ss_dfa; 442 443/* Forward */ 444static void printssdfa PROTO((int xx_nstates, ss_state *xx_state, int nbits, 445 labellist *ll, char *msg)); 446static void simplify PROTO((int xx_nstates, ss_state *xx_state)); 447static void convert PROTO((dfa *d, int xx_nstates, ss_state *xx_state)); 448 449static void 450makedfa(gr, nf, d) 451 nfagrammar *gr; 452 nfa *nf; 453 dfa *d; 454{ 455 int nbits = nf->nf_nstates; 456 bitset ss; 457 int xx_nstates; 458 ss_state *xx_state, *yy; 459 ss_arc *zz; 460 int istate, jstate, iarc, jarc, ibit; 461 nfastate *st; 462 nfaarc *ar; 463 464 ss = newbitset(nbits); 465 addclosure(ss, nf, nf->nf_start); 466 xx_state = NEW(ss_state, 1); 467 if (xx_state == NULL) 468 fatal("no mem for xx_state in makedfa"); 469 xx_nstates = 1; 470 yy = &xx_state[0]; 471 yy->ss_ss = ss; 472 yy->ss_narcs = 0; 473 yy->ss_arc = NULL; 474 yy->ss_deleted = 0; 475 yy->ss_finish = testbit(ss, nf->nf_finish); 476 if (yy->ss_finish) 477 printf("Error: nonterminal '%s' may produce empty.\n", 478 nf->nf_name); 479 480 /* This algorithm is from a book written before 481 the invention of structured programming... */ 482 483 /* For each unmarked state... */ 484 for (istate = 0; istate < xx_nstates; ++istate) { 485 yy = &xx_state[istate]; 486 ss = yy->ss_ss; 487 /* For all its states... */ 488 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) { 489 if (!testbit(ss, ibit)) 490 continue; 491 st = &nf->nf_state[ibit]; 492 /* For all non-empty arcs from this state... */ 493 for (iarc = 0; iarc < st->st_narcs; iarc++) { 494 ar = &st->st_arc[iarc]; 495 if (ar->ar_label == EMPTY) 496 continue; 497 /* Look up in list of arcs from this state */ 498 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) { 499 zz = &yy->ss_arc[jarc]; 500 if (ar->ar_label == zz->sa_label) 501 goto found; 502 } 503 /* Add new arc for this state */ 504 RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1); 505 if (yy->ss_arc == NULL) 506 fatal("out of mem"); 507 zz = &yy->ss_arc[yy->ss_narcs++]; 508 zz->sa_label = ar->ar_label; 509 zz->sa_bitset = newbitset(nbits); 510 zz->sa_arrow = -1; 511 found: ; 512 /* Add destination */ 513 addclosure(zz->sa_bitset, nf, ar->ar_arrow); 514 } 515 } 516 /* Now look up all the arrow states */ 517 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) { 518 zz = &xx_state[istate].ss_arc[jarc]; 519 for (jstate = 0; jstate < xx_nstates; jstate++) { 520 if (samebitset(zz->sa_bitset, 521 xx_state[jstate].ss_ss, nbits)) { 522 zz->sa_arrow = jstate; 523 goto done; 524 } 525 } 526 RESIZE(xx_state, ss_state, xx_nstates + 1); 527 if (xx_state == NULL) 528 fatal("out of mem"); 529 zz->sa_arrow = xx_nstates; 530 yy = &xx_state[xx_nstates++]; 531 yy->ss_ss = zz->sa_bitset; 532 yy->ss_narcs = 0; 533 yy->ss_arc = NULL; 534 yy->ss_deleted = 0; 535 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish); 536 done: ; 537 } 538 } 539 540 if (debugging) 541 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 542 "before minimizing"); 543 544 simplify(xx_nstates, xx_state); 545 546 if (debugging) 547 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 548 "after minimizing"); 549 550 convert(d, xx_nstates, xx_state); 551 552 /* XXX cleanup */ 553} 554 555static void 556printssdfa(xx_nstates, xx_state, nbits, ll, msg) 557 int xx_nstates; 558 ss_state *xx_state; 559 int nbits; 560 labellist *ll; 561 char *msg; 562{ 563 int i, ibit, iarc; 564 ss_state *yy; 565 ss_arc *zz; 566 567 printf("Subset DFA %s\n", msg); 568 for (i = 0; i < xx_nstates; i++) { 569 yy = &xx_state[i]; 570 if (yy->ss_deleted) 571 continue; 572 printf(" Subset %d", i); 573 if (yy->ss_finish) 574 printf(" (finish)"); 575 printf(" { "); 576 for (ibit = 0; ibit < nbits; ibit++) { 577 if (testbit(yy->ss_ss, ibit)) 578 printf("%d ", ibit); 579 } 580 printf("}\n"); 581 for (iarc = 0; iarc < yy->ss_narcs; iarc++) { 582 zz = &yy->ss_arc[iarc]; 583 printf(" Arc to state %d, label %s\n", 584 zz->sa_arrow, 585 labelrepr(&ll->ll_label[zz->sa_label])); 586 } 587 } 588} 589 590 591/* PART THREE -- SIMPLIFY DFA */ 592 593/* Simplify the DFA by repeatedly eliminating states that are 594 equivalent to another oner. This is NOT Algorithm 3.3 from 595 [Aho&Ullman 77]. It does not always finds the minimal DFA, 596 but it does usually make a much smaller one... (For an example 597 of sub-optimal behaviour, try S: x a b+ | y a b+.) 598*/ 599 600static int 601samestate(s1, s2) 602 ss_state *s1, *s2; 603{ 604 int i; 605 606 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish) 607 return 0; 608 for (i = 0; i < s1->ss_narcs; i++) { 609 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow || 610 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label) 611 return 0; 612 } 613 return 1; 614} 615 616static void 617renamestates(xx_nstates, xx_state, from, to) 618 int xx_nstates; 619 ss_state *xx_state; 620 int from, to; 621{ 622 int i, j; 623 624 if (debugging) 625 printf("Rename state %d to %d.\n", from, to); 626 for (i = 0; i < xx_nstates; i++) { 627 if (xx_state[i].ss_deleted) 628 continue; 629 for (j = 0; j < xx_state[i].ss_narcs; j++) { 630 if (xx_state[i].ss_arc[j].sa_arrow == from) 631 xx_state[i].ss_arc[j].sa_arrow = to; 632 } 633 } 634} 635 636static void 637simplify(xx_nstates, xx_state) 638 int xx_nstates; 639 ss_state *xx_state; 640{ 641 int changes; 642 int i, j; 643 644 do { 645 changes = 0; 646 for (i = 1; i < xx_nstates; i++) { 647 if (xx_state[i].ss_deleted) 648 continue; 649 for (j = 0; j < i; j++) { 650 if (xx_state[j].ss_deleted) 651 continue; 652 if (samestate(&xx_state[i], &xx_state[j])) { 653 xx_state[i].ss_deleted++; 654 renamestates(xx_nstates, xx_state, i, j); 655 changes++; 656 break; 657 } 658 } 659 } 660 } while (changes); 661} 662 663 664/* PART FOUR -- GENERATE PARSING TABLES */ 665 666/* Convert the DFA into a grammar that can be used by our parser */ 667 668static void 669convert(d, xx_nstates, xx_state) 670 dfa *d; 671 int xx_nstates; 672 ss_state *xx_state; 673{ 674 int i, j; 675 ss_state *yy; 676 ss_arc *zz; 677 678 for (i = 0; i < xx_nstates; i++) { 679 yy = &xx_state[i]; 680 if (yy->ss_deleted) 681 continue; 682 yy->ss_rename = addstate(d); 683 } 684 685 for (i = 0; i < xx_nstates; i++) { 686 yy = &xx_state[i]; 687 if (yy->ss_deleted) 688 continue; 689 for (j = 0; j < yy->ss_narcs; j++) { 690 zz = &yy->ss_arc[j]; 691 addarc(d, yy->ss_rename, 692 xx_state[zz->sa_arrow].ss_rename, 693 zz->sa_label); 694 } 695 if (yy->ss_finish) 696 addarc(d, yy->ss_rename, yy->ss_rename, 0); 697 } 698 699 d->d_initial = 0; 700} 701 702 703/* PART FIVE -- GLUE IT ALL TOGETHER */ 704 705static grammar * 706maketables(gr) 707 nfagrammar *gr; 708{ 709 int i; 710 nfa *nf; 711 dfa *d; 712 grammar *g; 713 714 if (gr->gr_nnfas == 0) 715 return NULL; 716 g = newgrammar(gr->gr_nfa[0]->nf_type); 717 /* XXX first rule must be start rule */ 718 g->g_ll = gr->gr_ll; 719 720 for (i = 0; i < gr->gr_nnfas; i++) { 721 nf = gr->gr_nfa[i]; 722 if (debugging) { 723 printf("Dump of NFA for '%s' ...\n", nf->nf_name); 724 dumpnfa(&gr->gr_ll, nf); 725 } 726 printf("Making DFA for '%s' ...\n", nf->nf_name); 727 d = adddfa(g, nf->nf_type, nf->nf_name); 728 makedfa(gr, gr->gr_nfa[i], d); 729 } 730 731 return g; 732} 733 734grammar * 735pgen(n) 736 node *n; 737{ 738 nfagrammar *gr; 739 grammar *g; 740 741 gr = metacompile(n); 742 g = maketables(gr); 743 translatelabels(g); 744 addfirstsets(g); 745 return g; 746} 747 748 749/* 750 751Description 752----------- 753 754Input is a grammar in extended BNF (using * for repetition, + for 755at-least-once repetition, [] for optional parts, | for alternatives and 756() for grouping). This has already been parsed and turned into a parse 757tree. 758 759Each rule is considered as a regular expression in its own right. 760It is turned into a Non-deterministic Finite Automaton (NFA), which 761is then turned into a Deterministic Finite Automaton (DFA), which is then 762optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, 763or similar compiler books (this technique is more often used for lexical 764analyzers). 765 766The DFA's are used by the parser as parsing tables in a special way 767that's probably unique. Before they are usable, the FIRST sets of all 768non-terminals are computed. 769 770Reference 771--------- 772 773[Aho&Ullman 77] 774 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 775 (first edition) 776 777*/ 778