pgen.c revision 86bea46b3d16c4ed0453e17f241ddbdfade76c98
1/*********************************************************** 2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam, 3The Netherlands. 4 5 All Rights Reserved 6 7Permission to use, copy, modify, and distribute this software and its 8documentation for any purpose and without fee is hereby granted, 9provided that the above copyright notice appear in all copies and that 10both that copyright notice and this permission notice appear in 11supporting documentation, and that the names of Stichting Mathematisch 12Centrum or CWI or Corporation for National Research Initiatives or 13CNRI not be used in advertising or publicity pertaining to 14distribution of the software without specific, written prior 15permission. 16 17While CWI is the initial source for this software, a modified version 18is made available by the Corporation for National Research Initiatives 19(CNRI) at the Internet address ftp://ftp.python.org. 20 21STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH 22REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF 23MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH 24CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL 25DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR 26PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 28PERFORMANCE OF THIS SOFTWARE. 29 30******************************************************************/ 31 32/* Parser generator */ 33/* XXX This file is not yet fully PROTOized */ 34 35/* For a description, see the comments at end of this file */ 36 37#include "pgenheaders.h" 38#include "assert.h" 39#include "token.h" 40#include "node.h" 41#include "grammar.h" 42#include "metagrammar.h" 43#include "pgen.h" 44 45extern int Py_DebugFlag; 46 47 48/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */ 49 50typedef struct _nfaarc { 51 int ar_label; 52 int ar_arrow; 53} nfaarc; 54 55typedef struct _nfastate { 56 int st_narcs; 57 nfaarc *st_arc; 58} nfastate; 59 60typedef struct _nfa { 61 int nf_type; 62 char *nf_name; 63 int nf_nstates; 64 nfastate *nf_state; 65 int nf_start, nf_finish; 66} nfa; 67 68/* Forward */ 69static void compile_rhs Py_PROTO((labellist *ll, 70 nfa *nf, node *n, int *pa, int *pb)); 71static void compile_alt Py_PROTO((labellist *ll, 72 nfa *nf, node *n, int *pa, int *pb)); 73static void compile_item Py_PROTO((labellist *ll, 74 nfa *nf, node *n, int *pa, int *pb)); 75static void compile_atom Py_PROTO((labellist *ll, 76 nfa *nf, node *n, int *pa, int *pb)); 77 78static int 79addnfastate(nf) 80 nfa *nf; 81{ 82 nfastate *st; 83 84 PyMem_RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1); 85 if (nf->nf_state == NULL) 86 Py_FatalError("out of mem"); 87 st = &nf->nf_state[nf->nf_nstates++]; 88 st->st_narcs = 0; 89 st->st_arc = NULL; 90 return st - nf->nf_state; 91} 92 93static void 94addnfaarc(nf, from, to, lbl) 95 nfa *nf; 96 int from, to, lbl; 97{ 98 nfastate *st; 99 nfaarc *ar; 100 101 st = &nf->nf_state[from]; 102 PyMem_RESIZE(st->st_arc, nfaarc, st->st_narcs + 1); 103 if (st->st_arc == NULL) 104 Py_FatalError("out of mem"); 105 ar = &st->st_arc[st->st_narcs++]; 106 ar->ar_label = lbl; 107 ar->ar_arrow = to; 108} 109 110static nfa * 111newnfa(name) 112 char *name; 113{ 114 nfa *nf; 115 static type = NT_OFFSET; /* All types will be disjunct */ 116 117 nf = PyMem_NEW(nfa, 1); 118 if (nf == NULL) 119 Py_FatalError("no mem for new nfa"); 120 nf->nf_type = type++; 121 nf->nf_name = name; /* XXX strdup(name) ??? */ 122 nf->nf_nstates = 0; 123 nf->nf_state = NULL; 124 nf->nf_start = nf->nf_finish = -1; 125 return nf; 126} 127 128typedef struct _nfagrammar { 129 int gr_nnfas; 130 nfa **gr_nfa; 131 labellist gr_ll; 132} nfagrammar; 133 134/* Forward */ 135static void compile_rule Py_PROTO((nfagrammar *gr, node *n)); 136 137static nfagrammar * 138newnfagrammar() 139{ 140 nfagrammar *gr; 141 142 gr = PyMem_NEW(nfagrammar, 1); 143 if (gr == NULL) 144 Py_FatalError("no mem for new nfa grammar"); 145 gr->gr_nnfas = 0; 146 gr->gr_nfa = NULL; 147 gr->gr_ll.ll_nlabels = 0; 148 gr->gr_ll.ll_label = NULL; 149 addlabel(&gr->gr_ll, ENDMARKER, "EMPTY"); 150 return gr; 151} 152 153static nfa * 154addnfa(gr, name) 155 nfagrammar *gr; 156 char *name; 157{ 158 nfa *nf; 159 160 nf = newnfa(name); 161 PyMem_RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1); 162 if (gr->gr_nfa == NULL) 163 Py_FatalError("out of mem"); 164 gr->gr_nfa[gr->gr_nnfas++] = nf; 165 addlabel(&gr->gr_ll, NAME, nf->nf_name); 166 return nf; 167} 168 169#ifdef Py_DEBUG 170 171static char REQNFMT[] = "metacompile: less than %d children\n"; 172 173#define REQN(i, count) \ 174 if (i < count) { \ 175 fprintf(stderr, REQNFMT, count); \ 176 Py_FatalError("REQN"); \ 177 } else 178 179#else 180#define REQN(i, count) /* empty */ 181#endif 182 183static nfagrammar * 184metacompile(n) 185 node *n; 186{ 187 nfagrammar *gr; 188 int i; 189 190 printf("Compiling (meta-) parse tree into NFA grammar\n"); 191 gr = newnfagrammar(); 192 REQ(n, MSTART); 193 i = n->n_nchildren - 1; /* Last child is ENDMARKER */ 194 n = n->n_child; 195 for (; --i >= 0; n++) { 196 if (n->n_type != NEWLINE) 197 compile_rule(gr, n); 198 } 199 return gr; 200} 201 202static void 203compile_rule(gr, n) 204 nfagrammar *gr; 205 node *n; 206{ 207 nfa *nf; 208 209 REQ(n, RULE); 210 REQN(n->n_nchildren, 4); 211 n = n->n_child; 212 REQ(n, NAME); 213 nf = addnfa(gr, n->n_str); 214 n++; 215 REQ(n, COLON); 216 n++; 217 REQ(n, RHS); 218 compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish); 219 n++; 220 REQ(n, NEWLINE); 221} 222 223static void 224compile_rhs(ll, nf, n, pa, pb) 225 labellist *ll; 226 nfa *nf; 227 node *n; 228 int *pa, *pb; 229{ 230 int i; 231 int a, b; 232 233 REQ(n, RHS); 234 i = n->n_nchildren; 235 REQN(i, 1); 236 n = n->n_child; 237 REQ(n, ALT); 238 compile_alt(ll, nf, n, pa, pb); 239 if (--i <= 0) 240 return; 241 n++; 242 a = *pa; 243 b = *pb; 244 *pa = addnfastate(nf); 245 *pb = addnfastate(nf); 246 addnfaarc(nf, *pa, a, EMPTY); 247 addnfaarc(nf, b, *pb, EMPTY); 248 for (; --i >= 0; n++) { 249 REQ(n, VBAR); 250 REQN(i, 1); 251 --i; 252 n++; 253 REQ(n, ALT); 254 compile_alt(ll, nf, n, &a, &b); 255 addnfaarc(nf, *pa, a, EMPTY); 256 addnfaarc(nf, b, *pb, EMPTY); 257 } 258} 259 260static void 261compile_alt(ll, nf, n, pa, pb) 262 labellist *ll; 263 nfa *nf; 264 node *n; 265 int *pa, *pb; 266{ 267 int i; 268 int a, b; 269 270 REQ(n, ALT); 271 i = n->n_nchildren; 272 REQN(i, 1); 273 n = n->n_child; 274 REQ(n, ITEM); 275 compile_item(ll, nf, n, pa, pb); 276 --i; 277 n++; 278 for (; --i >= 0; n++) { 279 if (n->n_type == COMMA) { /* XXX Temporary */ 280 REQN(i, 1); 281 --i; 282 n++; 283 } 284 REQ(n, ITEM); 285 compile_item(ll, nf, n, &a, &b); 286 addnfaarc(nf, *pb, a, EMPTY); 287 *pb = b; 288 } 289} 290 291static void 292compile_item(ll, nf, n, pa, pb) 293 labellist *ll; 294 nfa *nf; 295 node *n; 296 int *pa, *pb; 297{ 298 int i; 299 int a, b; 300 301 REQ(n, ITEM); 302 i = n->n_nchildren; 303 REQN(i, 1); 304 n = n->n_child; 305 if (n->n_type == LSQB) { 306 REQN(i, 3); 307 n++; 308 REQ(n, RHS); 309 *pa = addnfastate(nf); 310 *pb = addnfastate(nf); 311 addnfaarc(nf, *pa, *pb, EMPTY); 312 compile_rhs(ll, nf, n, &a, &b); 313 addnfaarc(nf, *pa, a, EMPTY); 314 addnfaarc(nf, b, *pb, EMPTY); 315 REQN(i, 1); 316 n++; 317 REQ(n, RSQB); 318 } 319 else { 320 compile_atom(ll, nf, n, pa, pb); 321 if (--i <= 0) 322 return; 323 n++; 324 addnfaarc(nf, *pb, *pa, EMPTY); 325 if (n->n_type == STAR) 326 *pb = *pa; 327 else 328 REQ(n, PLUS); 329 } 330} 331 332static void 333compile_atom(ll, nf, n, pa, pb) 334 labellist *ll; 335 nfa *nf; 336 node *n; 337 int *pa, *pb; 338{ 339 int i; 340 341 REQ(n, ATOM); 342 i = n->n_nchildren; 343 REQN(i, 1); 344 n = n->n_child; 345 if (n->n_type == LPAR) { 346 REQN(i, 3); 347 n++; 348 REQ(n, RHS); 349 compile_rhs(ll, nf, n, pa, pb); 350 n++; 351 REQ(n, RPAR); 352 } 353 else if (n->n_type == NAME || n->n_type == STRING) { 354 *pa = addnfastate(nf); 355 *pb = addnfastate(nf); 356 addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str)); 357 } 358 else 359 REQ(n, NAME); 360} 361 362static void 363dumpstate(ll, nf, istate) 364 labellist *ll; 365 nfa *nf; 366 int istate; 367{ 368 nfastate *st; 369 int i; 370 nfaarc *ar; 371 372 printf("%c%2d%c", 373 istate == nf->nf_start ? '*' : ' ', 374 istate, 375 istate == nf->nf_finish ? '.' : ' '); 376 st = &nf->nf_state[istate]; 377 ar = st->st_arc; 378 for (i = 0; i < st->st_narcs; i++) { 379 if (i > 0) 380 printf("\n "); 381 printf("-> %2d %s", ar->ar_arrow, 382 PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label])); 383 ar++; 384 } 385 printf("\n"); 386} 387 388static void 389dumpnfa(ll, nf) 390 labellist *ll; 391 nfa *nf; 392{ 393 int i; 394 395 printf("NFA '%s' has %d states; start %d, finish %d\n", 396 nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish); 397 for (i = 0; i < nf->nf_nstates; i++) 398 dumpstate(ll, nf, i); 399} 400 401 402/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */ 403 404static void 405addclosure(ss, nf, istate) 406 bitset ss; 407 nfa *nf; 408 int istate; 409{ 410 if (addbit(ss, istate)) { 411 nfastate *st = &nf->nf_state[istate]; 412 nfaarc *ar = st->st_arc; 413 int i; 414 415 for (i = st->st_narcs; --i >= 0; ) { 416 if (ar->ar_label == EMPTY) 417 addclosure(ss, nf, ar->ar_arrow); 418 ar++; 419 } 420 } 421} 422 423typedef struct _ss_arc { 424 bitset sa_bitset; 425 int sa_arrow; 426 int sa_label; 427} ss_arc; 428 429typedef struct _ss_state { 430 bitset ss_ss; 431 int ss_narcs; 432 ss_arc *ss_arc; 433 int ss_deleted; 434 int ss_finish; 435 int ss_rename; 436} ss_state; 437 438typedef struct _ss_dfa { 439 int sd_nstates; 440 ss_state *sd_state; 441} ss_dfa; 442 443/* Forward */ 444static void printssdfa Py_PROTO((int xx_nstates, ss_state *xx_state, int nbits, 445 labellist *ll, char *msg)); 446static void simplify Py_PROTO((int xx_nstates, ss_state *xx_state)); 447static void convert Py_PROTO((dfa *d, int xx_nstates, ss_state *xx_state)); 448 449static void 450makedfa(gr, nf, d) 451 nfagrammar *gr; 452 nfa *nf; 453 dfa *d; 454{ 455 int nbits = nf->nf_nstates; 456 bitset ss; 457 int xx_nstates; 458 ss_state *xx_state, *yy; 459 ss_arc *zz; 460 int istate, jstate, iarc, jarc, ibit; 461 nfastate *st; 462 nfaarc *ar; 463 464 ss = newbitset(nbits); 465 addclosure(ss, nf, nf->nf_start); 466 xx_state = PyMem_NEW(ss_state, 1); 467 if (xx_state == NULL) 468 Py_FatalError("no mem for xx_state in makedfa"); 469 xx_nstates = 1; 470 yy = &xx_state[0]; 471 yy->ss_ss = ss; 472 yy->ss_narcs = 0; 473 yy->ss_arc = NULL; 474 yy->ss_deleted = 0; 475 yy->ss_finish = testbit(ss, nf->nf_finish); 476 if (yy->ss_finish) 477 printf("Error: nonterminal '%s' may produce empty.\n", 478 nf->nf_name); 479 480 /* This algorithm is from a book written before 481 the invention of structured programming... */ 482 483 /* For each unmarked state... */ 484 for (istate = 0; istate < xx_nstates; ++istate) { 485 yy = &xx_state[istate]; 486 ss = yy->ss_ss; 487 /* For all its states... */ 488 for (ibit = 0; ibit < nf->nf_nstates; ++ibit) { 489 if (!testbit(ss, ibit)) 490 continue; 491 st = &nf->nf_state[ibit]; 492 /* For all non-empty arcs from this state... */ 493 for (iarc = 0; iarc < st->st_narcs; iarc++) { 494 ar = &st->st_arc[iarc]; 495 if (ar->ar_label == EMPTY) 496 continue; 497 /* Look up in list of arcs from this state */ 498 for (jarc = 0; jarc < yy->ss_narcs; ++jarc) { 499 zz = &yy->ss_arc[jarc]; 500 if (ar->ar_label == zz->sa_label) 501 goto found; 502 } 503 /* Add new arc for this state */ 504 PyMem_RESIZE(yy->ss_arc, ss_arc, 505 yy->ss_narcs + 1); 506 if (yy->ss_arc == NULL) 507 Py_FatalError("out of mem"); 508 zz = &yy->ss_arc[yy->ss_narcs++]; 509 zz->sa_label = ar->ar_label; 510 zz->sa_bitset = newbitset(nbits); 511 zz->sa_arrow = -1; 512 found: ; 513 /* Add destination */ 514 addclosure(zz->sa_bitset, nf, ar->ar_arrow); 515 } 516 } 517 /* Now look up all the arrow states */ 518 for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) { 519 zz = &xx_state[istate].ss_arc[jarc]; 520 for (jstate = 0; jstate < xx_nstates; jstate++) { 521 if (samebitset(zz->sa_bitset, 522 xx_state[jstate].ss_ss, nbits)) { 523 zz->sa_arrow = jstate; 524 goto done; 525 } 526 } 527 PyMem_RESIZE(xx_state, ss_state, xx_nstates + 1); 528 if (xx_state == NULL) 529 Py_FatalError("out of mem"); 530 zz->sa_arrow = xx_nstates; 531 yy = &xx_state[xx_nstates++]; 532 yy->ss_ss = zz->sa_bitset; 533 yy->ss_narcs = 0; 534 yy->ss_arc = NULL; 535 yy->ss_deleted = 0; 536 yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish); 537 done: ; 538 } 539 } 540 541 if (Py_DebugFlag) 542 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 543 "before minimizing"); 544 545 simplify(xx_nstates, xx_state); 546 547 if (Py_DebugFlag) 548 printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll, 549 "after minimizing"); 550 551 convert(d, xx_nstates, xx_state); 552 553 /* XXX cleanup */ 554} 555 556static void 557printssdfa(xx_nstates, xx_state, nbits, ll, msg) 558 int xx_nstates; 559 ss_state *xx_state; 560 int nbits; 561 labellist *ll; 562 char *msg; 563{ 564 int i, ibit, iarc; 565 ss_state *yy; 566 ss_arc *zz; 567 568 printf("Subset DFA %s\n", msg); 569 for (i = 0; i < xx_nstates; i++) { 570 yy = &xx_state[i]; 571 if (yy->ss_deleted) 572 continue; 573 printf(" Subset %d", i); 574 if (yy->ss_finish) 575 printf(" (finish)"); 576 printf(" { "); 577 for (ibit = 0; ibit < nbits; ibit++) { 578 if (testbit(yy->ss_ss, ibit)) 579 printf("%d ", ibit); 580 } 581 printf("}\n"); 582 for (iarc = 0; iarc < yy->ss_narcs; iarc++) { 583 zz = &yy->ss_arc[iarc]; 584 printf(" Arc to state %d, label %s\n", 585 zz->sa_arrow, 586 PyGrammar_LabelRepr( 587 &ll->ll_label[zz->sa_label])); 588 } 589 } 590} 591 592 593/* PART THREE -- SIMPLIFY DFA */ 594 595/* Simplify the DFA by repeatedly eliminating states that are 596 equivalent to another oner. This is NOT Algorithm 3.3 from 597 [Aho&Ullman 77]. It does not always finds the minimal DFA, 598 but it does usually make a much smaller one... (For an example 599 of sub-optimal behaviour, try S: x a b+ | y a b+.) 600*/ 601 602static int 603samestate(s1, s2) 604 ss_state *s1, *s2; 605{ 606 int i; 607 608 if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish) 609 return 0; 610 for (i = 0; i < s1->ss_narcs; i++) { 611 if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow || 612 s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label) 613 return 0; 614 } 615 return 1; 616} 617 618static void 619renamestates(xx_nstates, xx_state, from, to) 620 int xx_nstates; 621 ss_state *xx_state; 622 int from, to; 623{ 624 int i, j; 625 626 if (Py_DebugFlag) 627 printf("Rename state %d to %d.\n", from, to); 628 for (i = 0; i < xx_nstates; i++) { 629 if (xx_state[i].ss_deleted) 630 continue; 631 for (j = 0; j < xx_state[i].ss_narcs; j++) { 632 if (xx_state[i].ss_arc[j].sa_arrow == from) 633 xx_state[i].ss_arc[j].sa_arrow = to; 634 } 635 } 636} 637 638static void 639simplify(xx_nstates, xx_state) 640 int xx_nstates; 641 ss_state *xx_state; 642{ 643 int changes; 644 int i, j; 645 646 do { 647 changes = 0; 648 for (i = 1; i < xx_nstates; i++) { 649 if (xx_state[i].ss_deleted) 650 continue; 651 for (j = 0; j < i; j++) { 652 if (xx_state[j].ss_deleted) 653 continue; 654 if (samestate(&xx_state[i], &xx_state[j])) { 655 xx_state[i].ss_deleted++; 656 renamestates(xx_nstates, xx_state, 657 i, j); 658 changes++; 659 break; 660 } 661 } 662 } 663 } while (changes); 664} 665 666 667/* PART FOUR -- GENERATE PARSING TABLES */ 668 669/* Convert the DFA into a grammar that can be used by our parser */ 670 671static void 672convert(d, xx_nstates, xx_state) 673 dfa *d; 674 int xx_nstates; 675 ss_state *xx_state; 676{ 677 int i, j; 678 ss_state *yy; 679 ss_arc *zz; 680 681 for (i = 0; i < xx_nstates; i++) { 682 yy = &xx_state[i]; 683 if (yy->ss_deleted) 684 continue; 685 yy->ss_rename = addstate(d); 686 } 687 688 for (i = 0; i < xx_nstates; i++) { 689 yy = &xx_state[i]; 690 if (yy->ss_deleted) 691 continue; 692 for (j = 0; j < yy->ss_narcs; j++) { 693 zz = &yy->ss_arc[j]; 694 addarc(d, yy->ss_rename, 695 xx_state[zz->sa_arrow].ss_rename, 696 zz->sa_label); 697 } 698 if (yy->ss_finish) 699 addarc(d, yy->ss_rename, yy->ss_rename, 0); 700 } 701 702 d->d_initial = 0; 703} 704 705 706/* PART FIVE -- GLUE IT ALL TOGETHER */ 707 708static grammar * 709maketables(gr) 710 nfagrammar *gr; 711{ 712 int i; 713 nfa *nf; 714 dfa *d; 715 grammar *g; 716 717 if (gr->gr_nnfas == 0) 718 return NULL; 719 g = newgrammar(gr->gr_nfa[0]->nf_type); 720 /* XXX first rule must be start rule */ 721 g->g_ll = gr->gr_ll; 722 723 for (i = 0; i < gr->gr_nnfas; i++) { 724 nf = gr->gr_nfa[i]; 725 if (Py_DebugFlag) { 726 printf("Dump of NFA for '%s' ...\n", nf->nf_name); 727 dumpnfa(&gr->gr_ll, nf); 728 } 729 printf("Making DFA for '%s' ...\n", nf->nf_name); 730 d = adddfa(g, nf->nf_type, nf->nf_name); 731 makedfa(gr, gr->gr_nfa[i], d); 732 } 733 734 return g; 735} 736 737grammar * 738pgen(n) 739 node *n; 740{ 741 nfagrammar *gr; 742 grammar *g; 743 744 gr = metacompile(n); 745 g = maketables(gr); 746 translatelabels(g); 747 addfirstsets(g); 748 return g; 749} 750 751 752/* 753 754Description 755----------- 756 757Input is a grammar in extended BNF (using * for repetition, + for 758at-least-once repetition, [] for optional parts, | for alternatives and 759() for grouping). This has already been parsed and turned into a parse 760tree. 761 762Each rule is considered as a regular expression in its own right. 763It is turned into a Non-deterministic Finite Automaton (NFA), which 764is then turned into a Deterministic Finite Automaton (DFA), which is then 765optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, 766or similar compiler books (this technique is more often used for lexical 767analyzers). 768 769The DFA's are used by the parser as parsing tables in a special way 770that's probably unique. Before they are usable, the FIRST sets of all 771non-terminals are computed. 772 773Reference 774--------- 775 776[Aho&Ullman 77] 777 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977 778 (first edition) 779 780*/ 781