1/* 2********************************************************************** 3* Copyright (c) 2002-2008, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6*/ 7// 8// rbbitblb.cpp 9// 10 11 12#include "unicode/utypes.h" 13 14#if !UCONFIG_NO_BREAK_ITERATION 15 16#include "unicode/unistr.h" 17#include "rbbitblb.h" 18#include "rbbirb.h" 19#include "rbbisetb.h" 20#include "rbbidata.h" 21#include "cstring.h" 22#include "uassert.h" 23#include "cmemory.h" 24 25U_NAMESPACE_BEGIN 26 27RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) : 28 fTree(*rootNode) { 29 fRB = rb; 30 fStatus = fRB->fStatus; 31 UErrorCode status = U_ZERO_ERROR; 32 fDStates = new UVector(status); 33 if (U_FAILURE(*fStatus)) { 34 return; 35 } 36 if (U_FAILURE(status)) { 37 *fStatus = status; 38 return; 39 } 40 if (fDStates == NULL) { 41 *fStatus = U_MEMORY_ALLOCATION_ERROR;; 42 } 43} 44 45 46 47RBBITableBuilder::~RBBITableBuilder() { 48 int i; 49 for (i=0; i<fDStates->size(); i++) { 50 delete (RBBIStateDescriptor *)fDStates->elementAt(i); 51 } 52 delete fDStates; 53} 54 55 56//----------------------------------------------------------------------------- 57// 58// RBBITableBuilder::build - This is the main function for building the DFA state transtion 59// table from the RBBI rules parse tree. 60// 61//----------------------------------------------------------------------------- 62void RBBITableBuilder::build() { 63 64 if (U_FAILURE(*fStatus)) { 65 return; 66 } 67 68 // If there were no rules, just return. This situation can easily arise 69 // for the reverse rules. 70 if (fTree==NULL) { 71 return; 72 } 73 74 // 75 // Walk through the tree, replacing any references to $variables with a copy of the 76 // parse tree for the substition expression. 77 // 78 fTree = fTree->flattenVariables(); 79#ifdef RBBI_DEBUG 80 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { 81 RBBIDebugPuts("Parse tree after flattening variable references."); 82 fTree->printTree(TRUE); 83 } 84#endif 85 86 // 87 // If the rules contained any references to {bof} 88 // add a {bof} <cat> <former root of tree> to the 89 // tree. Means that all matches must start out with the 90 // {bof} fake character. 91 // 92 if (fRB->fSetBuilder->sawBOF()) { 93 RBBINode *bofTop = new RBBINode(RBBINode::opCat); 94 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar); 95 // Delete and exit if memory allocation failed. 96 if (bofTop == NULL || bofLeaf == NULL) { 97 *fStatus = U_MEMORY_ALLOCATION_ERROR; 98 delete bofTop; 99 delete bofLeaf; 100 return; 101 } 102 bofTop->fLeftChild = bofLeaf; 103 bofTop->fRightChild = fTree; 104 bofLeaf->fParent = bofTop; 105 bofLeaf->fVal = 2; // Reserved value for {bof}. 106 fTree = bofTop; 107 } 108 109 // 110 // Add a unique right-end marker to the expression. 111 // Appears as a cat-node, left child being the original tree, 112 // right child being the end marker. 113 // 114 RBBINode *cn = new RBBINode(RBBINode::opCat); 115 // Exit if memory allocation failed. 116 if (cn == NULL) { 117 *fStatus = U_MEMORY_ALLOCATION_ERROR; 118 return; 119 } 120 cn->fLeftChild = fTree; 121 fTree->fParent = cn; 122 cn->fRightChild = new RBBINode(RBBINode::endMark); 123 // Delete and exit if memory allocation failed. 124 if (cn->fRightChild == NULL) { 125 *fStatus = U_MEMORY_ALLOCATION_ERROR; 126 delete cn; 127 return; 128 } 129 cn->fRightChild->fParent = cn; 130 fTree = cn; 131 132 // 133 // Replace all references to UnicodeSets with the tree for the equivalent 134 // expression. 135 // 136 fTree->flattenSets(); 137#ifdef RBBI_DEBUG 138 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { 139 RBBIDebugPuts("Parse tree after flattening Unicode Set references."); 140 fTree->printTree(TRUE); 141 } 142#endif 143 144 145 // 146 // calculate the functions nullable, firstpos, lastpos and followpos on 147 // nodes in the parse tree. 148 // See the alogrithm description in Aho. 149 // Understanding how this works by looking at the code alone will be 150 // nearly impossible. 151 // 152 calcNullable(fTree); 153 calcFirstPos(fTree); 154 calcLastPos(fTree); 155 calcFollowPos(fTree); 156 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) { 157 RBBIDebugPuts("\n"); 158 printPosSets(fTree); 159 } 160 161 // 162 // For "chained" rules, modify the followPos sets 163 // 164 if (fRB->fChainRules) { 165 calcChainedFollowPos(fTree); 166 } 167 168 // 169 // BOF (start of input) test fixup. 170 // 171 if (fRB->fSetBuilder->sawBOF()) { 172 bofFixup(); 173 } 174 175 // 176 // Build the DFA state transition tables. 177 // 178 buildStateTable(); 179 flagAcceptingStates(); 180 flagLookAheadStates(); 181 flagTaggedStates(); 182 183 // 184 // Update the global table of rule status {tag} values 185 // The rule builder has a global vector of status values that are common 186 // for all tables. Merge the ones from this table into the global set. 187 // 188 mergeRuleStatusVals(); 189 190 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();}; 191} 192 193 194 195//----------------------------------------------------------------------------- 196// 197// calcNullable. Impossible to explain succinctly. See Aho, section 3.9 198// 199//----------------------------------------------------------------------------- 200void RBBITableBuilder::calcNullable(RBBINode *n) { 201 if (n == NULL) { 202 return; 203 } 204 if (n->fType == RBBINode::setRef || 205 n->fType == RBBINode::endMark ) { 206 // These are non-empty leaf node types. 207 n->fNullable = FALSE; 208 return; 209 } 210 211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) { 212 // Lookahead marker node. It's a leaf, so no recursion on children. 213 // It's nullable because it does not match any literal text from the input stream. 214 n->fNullable = TRUE; 215 return; 216 } 217 218 219 // The node is not a leaf. 220 // Calculate nullable on its children. 221 calcNullable(n->fLeftChild); 222 calcNullable(n->fRightChild); 223 224 // Apply functions from table 3.40 in Aho 225 if (n->fType == RBBINode::opOr) { 226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable; 227 } 228 else if (n->fType == RBBINode::opCat) { 229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable; 230 } 231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) { 232 n->fNullable = TRUE; 233 } 234 else { 235 n->fNullable = FALSE; 236 } 237} 238 239 240 241 242//----------------------------------------------------------------------------- 243// 244// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 245// 246//----------------------------------------------------------------------------- 247void RBBITableBuilder::calcFirstPos(RBBINode *n) { 248 if (n == NULL) { 249 return; 250 } 251 if (n->fType == RBBINode::leafChar || 252 n->fType == RBBINode::endMark || 253 n->fType == RBBINode::lookAhead || 254 n->fType == RBBINode::tag) { 255 // These are non-empty leaf node types. 256 // Note: In order to maintain the sort invariant on the set, 257 // this function should only be called on a node whose set is 258 // empty to start with. 259 n->fFirstPosSet->addElement(n, *fStatus); 260 return; 261 } 262 263 // The node is not a leaf. 264 // Calculate firstPos on its children. 265 calcFirstPos(n->fLeftChild); 266 calcFirstPos(n->fRightChild); 267 268 // Apply functions from table 3.40 in Aho 269 if (n->fType == RBBINode::opOr) { 270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 272 } 273 else if (n->fType == RBBINode::opCat) { 274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 275 if (n->fLeftChild->fNullable) { 276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 277 } 278 } 279 else if (n->fType == RBBINode::opStar || 280 n->fType == RBBINode::opQuestion || 281 n->fType == RBBINode::opPlus) { 282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 283 } 284} 285 286 287 288//----------------------------------------------------------------------------- 289// 290// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 291// 292//----------------------------------------------------------------------------- 293void RBBITableBuilder::calcLastPos(RBBINode *n) { 294 if (n == NULL) { 295 return; 296 } 297 if (n->fType == RBBINode::leafChar || 298 n->fType == RBBINode::endMark || 299 n->fType == RBBINode::lookAhead || 300 n->fType == RBBINode::tag) { 301 // These are non-empty leaf node types. 302 // Note: In order to maintain the sort invariant on the set, 303 // this function should only be called on a node whose set is 304 // empty to start with. 305 n->fLastPosSet->addElement(n, *fStatus); 306 return; 307 } 308 309 // The node is not a leaf. 310 // Calculate lastPos on its children. 311 calcLastPos(n->fLeftChild); 312 calcLastPos(n->fRightChild); 313 314 // Apply functions from table 3.40 in Aho 315 if (n->fType == RBBINode::opOr) { 316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 318 } 319 else if (n->fType == RBBINode::opCat) { 320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 321 if (n->fRightChild->fNullable) { 322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 323 } 324 } 325 else if (n->fType == RBBINode::opStar || 326 n->fType == RBBINode::opQuestion || 327 n->fType == RBBINode::opPlus) { 328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 329 } 330} 331 332 333 334//----------------------------------------------------------------------------- 335// 336// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 337// 338//----------------------------------------------------------------------------- 339void RBBITableBuilder::calcFollowPos(RBBINode *n) { 340 if (n == NULL || 341 n->fType == RBBINode::leafChar || 342 n->fType == RBBINode::endMark) { 343 return; 344 } 345 346 calcFollowPos(n->fLeftChild); 347 calcFollowPos(n->fRightChild); 348 349 // Aho rule #1 350 if (n->fType == RBBINode::opCat) { 351 RBBINode *i; // is 'i' in Aho's description 352 uint32_t ix; 353 354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet; 355 356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) { 357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix); 358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet); 359 } 360 } 361 362 // Aho rule #2 363 if (n->fType == RBBINode::opStar || 364 n->fType == RBBINode::opPlus) { 365 RBBINode *i; // again, n and i are the names from Aho's description. 366 uint32_t ix; 367 368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) { 369 i = (RBBINode *)n->fLastPosSet->elementAt(ix); 370 setAdd(i->fFollowPos, n->fFirstPosSet); 371 } 372 } 373 374 375 376} 377 378 379//----------------------------------------------------------------------------- 380// 381// calcChainedFollowPos. Modify the previously calculated followPos sets 382// to implement rule chaining. NOT described by Aho 383// 384//----------------------------------------------------------------------------- 385void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { 386 387 UVector endMarkerNodes(*fStatus); 388 UVector leafNodes(*fStatus); 389 int32_t i; 390 391 if (U_FAILURE(*fStatus)) { 392 return; 393 } 394 395 // get a list of all endmarker nodes. 396 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); 397 398 // get a list all leaf nodes 399 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); 400 if (U_FAILURE(*fStatus)) { 401 return; 402 } 403 404 // Get all nodes that can be the start a match, which is FirstPosition() 405 // of the portion of the tree corresponding to user-written rules. 406 // See the tree description in bofFixup(). 407 RBBINode *userRuleRoot = tree; 408 if (fRB->fSetBuilder->sawBOF()) { 409 userRuleRoot = tree->fLeftChild->fRightChild; 410 } 411 U_ASSERT(userRuleRoot != NULL); 412 UVector *matchStartNodes = userRuleRoot->fFirstPosSet; 413 414 415 // Iteratate over all leaf nodes, 416 // 417 int32_t endNodeIx; 418 int32_t startNodeIx; 419 420 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) { 421 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx); 422 RBBINode *endNode = NULL; 423 424 // Identify leaf nodes that correspond to overall rule match positions. 425 // These include an endMarkerNode in their followPos sets. 426 for (i=0; i<endMarkerNodes.size(); i++) { 427 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) { 428 endNode = tNode; 429 break; 430 } 431 } 432 if (endNode == NULL) { 433 // node wasn't an end node. Try again with the next. 434 continue; 435 } 436 437 // We've got a node that can end a match. 438 439 // Line Break Specific hack: If this node's val correspond to the $CM char class, 440 // don't chain from it. 441 // TODO: Add rule syntax for this behavior, get specifics out of here and 442 // into the rule file. 443 if (fRB->fLBCMNoChain) { 444 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal); 445 if (c != -1) { 446 // c == -1 occurs with sets containing only the {eof} marker string. 447 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 448 if (cLBProp == U_LB_COMBINING_MARK) { 449 continue; 450 } 451 } 452 } 453 454 455 // Now iterate over the nodes that can start a match, looking for ones 456 // with the same char class as our ending node. 457 RBBINode *startNode; 458 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { 459 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); 460 if (startNode->fType != RBBINode::leafChar) { 461 continue; 462 } 463 464 if (endNode->fVal == startNode->fVal) { 465 // The end val (character class) of one possible match is the 466 // same as the start of another. 467 468 // Add all nodes from the followPos of the start node to the 469 // followPos set of the end node, which will have the effect of 470 // letting matches transition from a match state at endNode 471 // to the second char of a match starting with startNode. 472 setAdd(endNode->fFollowPos, startNode->fFollowPos); 473 } 474 } 475 } 476} 477 478 479//----------------------------------------------------------------------------- 480// 481// bofFixup. Fixup for state tables that include {bof} beginning of input testing. 482// Do an swizzle similar to chaining, modifying the followPos set of 483// the bofNode to include the followPos nodes from other {bot} nodes 484// scattered through the tree. 485// 486// This function has much in common with calcChainedFollowPos(). 487// 488//----------------------------------------------------------------------------- 489void RBBITableBuilder::bofFixup() { 490 491 if (U_FAILURE(*fStatus)) { 492 return; 493 } 494 495 // The parse tree looks like this ... 496 // fTree root ---> <cat> 497 // / \ . 498 // <cat> <#end node> 499 // / \ . 500 // <bofNode> rest 501 // of tree 502 // 503 // We will be adding things to the followPos set of the <bofNode> 504 // 505 RBBINode *bofNode = fTree->fLeftChild->fLeftChild; 506 U_ASSERT(bofNode->fType == RBBINode::leafChar); 507 U_ASSERT(bofNode->fVal == 2); 508 509 // Get all nodes that can be the start a match of the user-written rules 510 // (excluding the fake bofNode) 511 // We want the nodes that can start a match in the 512 // part labeled "rest of tree" 513 // 514 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; 515 516 RBBINode *startNode; 517 int startNodeIx; 518 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { 519 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); 520 if (startNode->fType != RBBINode::leafChar) { 521 continue; 522 } 523 524 if (startNode->fVal == bofNode->fVal) { 525 // We found a leaf node corresponding to a {bof} that was 526 // explicitly written into a rule. 527 // Add everything from the followPos set of this node to the 528 // followPos set of the fake bofNode at the start of the tree. 529 // 530 setAdd(bofNode->fFollowPos, startNode->fFollowPos); 531 } 532 } 533} 534 535//----------------------------------------------------------------------------- 536// 537// buildStateTable() Determine the set of runtime DFA states and the 538// transition tables for these states, by the algorithm 539// of fig. 3.44 in Aho. 540// 541// Most of the comments are quotes of Aho's psuedo-code. 542// 543//----------------------------------------------------------------------------- 544void RBBITableBuilder::buildStateTable() { 545 if (U_FAILURE(*fStatus)) { 546 return; 547 } 548 RBBIStateDescriptor *failState; 549 // Set it to NULL to avoid uninitialized warning 550 RBBIStateDescriptor *initialState = NULL; 551 // 552 // Add a dummy state 0 - the stop state. Not from Aho. 553 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1; 554 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 555 if (failState == NULL) { 556 *fStatus = U_MEMORY_ALLOCATION_ERROR; 557 goto ExitBuildSTdeleteall; 558 } 559 failState->fPositions = new UVector(*fStatus); 560 if (failState->fPositions == NULL) { 561 *fStatus = U_MEMORY_ALLOCATION_ERROR; 562 } 563 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) { 564 goto ExitBuildSTdeleteall; 565 } 566 fDStates->addElement(failState, *fStatus); 567 if (U_FAILURE(*fStatus)) { 568 goto ExitBuildSTdeleteall; 569 } 570 571 // initially, the only unmarked state in Dstates is firstpos(root), 572 // where toot is the root of the syntax tree for (r)#; 573 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 574 if (initialState == NULL) { 575 *fStatus = U_MEMORY_ALLOCATION_ERROR; 576 } 577 if (U_FAILURE(*fStatus)) { 578 goto ExitBuildSTdeleteall; 579 } 580 initialState->fPositions = new UVector(*fStatus); 581 if (initialState->fPositions == NULL) { 582 *fStatus = U_MEMORY_ALLOCATION_ERROR; 583 } 584 if (U_FAILURE(*fStatus)) { 585 goto ExitBuildSTdeleteall; 586 } 587 setAdd(initialState->fPositions, fTree->fFirstPosSet); 588 fDStates->addElement(initialState, *fStatus); 589 if (U_FAILURE(*fStatus)) { 590 goto ExitBuildSTdeleteall; 591 } 592 593 // while there is an unmarked state T in Dstates do begin 594 for (;;) { 595 RBBIStateDescriptor *T = NULL; 596 int32_t tx; 597 for (tx=1; tx<fDStates->size(); tx++) { 598 RBBIStateDescriptor *temp; 599 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx); 600 if (temp->fMarked == FALSE) { 601 T = temp; 602 break; 603 } 604 } 605 if (T == NULL) { 606 break; 607 } 608 609 // mark T; 610 T->fMarked = TRUE; 611 612 // for each input symbol a do begin 613 int32_t a; 614 for (a = 1; a<=lastInputSymbol; a++) { 615 // let U be the set of positions that are in followpos(p) 616 // for some position p in T 617 // such that the symbol at position p is a; 618 UVector *U = NULL; 619 RBBINode *p; 620 int32_t px; 621 for (px=0; px<T->fPositions->size(); px++) { 622 p = (RBBINode *)T->fPositions->elementAt(px); 623 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) { 624 if (U == NULL) { 625 U = new UVector(*fStatus); 626 if (U == NULL) { 627 *fStatus = U_MEMORY_ALLOCATION_ERROR; 628 goto ExitBuildSTdeleteall; 629 } 630 } 631 setAdd(U, p->fFollowPos); 632 } 633 } 634 635 // if U is not empty and not in DStates then 636 int32_t ux = 0; 637 UBool UinDstates = FALSE; 638 if (U != NULL) { 639 U_ASSERT(U->size() > 0); 640 int ix; 641 for (ix=0; ix<fDStates->size(); ix++) { 642 RBBIStateDescriptor *temp2; 643 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix); 644 if (setEquals(U, temp2->fPositions)) { 645 delete U; 646 U = temp2->fPositions; 647 ux = ix; 648 UinDstates = TRUE; 649 break; 650 } 651 } 652 653 // Add U as an unmarked state to Dstates 654 if (!UinDstates) 655 { 656 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 657 if (newState == NULL) { 658 *fStatus = U_MEMORY_ALLOCATION_ERROR; 659 } 660 if (U_FAILURE(*fStatus)) { 661 goto ExitBuildSTdeleteall; 662 } 663 newState->fPositions = U; 664 fDStates->addElement(newState, *fStatus); 665 if (U_FAILURE(*fStatus)) { 666 return; 667 } 668 ux = fDStates->size()-1; 669 } 670 671 // Dtran[T, a] := U; 672 T->fDtran->setElementAt(ux, a); 673 } 674 } 675 } 676 return; 677 // delete local pointers only if error occured. 678ExitBuildSTdeleteall: 679 delete initialState; 680 delete failState; 681} 682 683 684 685//----------------------------------------------------------------------------- 686// 687// flagAcceptingStates Identify accepting states. 688// First get a list of all of the end marker nodes. 689// Then, for each state s, 690// if s contains one of the end marker nodes in its list of tree positions then 691// s is an accepting state. 692// 693//----------------------------------------------------------------------------- 694void RBBITableBuilder::flagAcceptingStates() { 695 if (U_FAILURE(*fStatus)) { 696 return; 697 } 698 UVector endMarkerNodes(*fStatus); 699 RBBINode *endMarker; 700 int32_t i; 701 int32_t n; 702 703 if (U_FAILURE(*fStatus)) { 704 return; 705 } 706 707 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); 708 if (U_FAILURE(*fStatus)) { 709 return; 710 } 711 712 for (i=0; i<endMarkerNodes.size(); i++) { 713 endMarker = (RBBINode *)endMarkerNodes.elementAt(i); 714 for (n=0; n<fDStates->size(); n++) { 715 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 716 if (sd->fPositions->indexOf(endMarker) >= 0) { 717 // Any non-zero value for fAccepting means this is an accepting node. 718 // The value is what will be returned to the user as the break status. 719 // If no other value was specified, force it to -1. 720 721 if (sd->fAccepting==0) { 722 // State hasn't been marked as accepting yet. Do it now. 723 sd->fAccepting = endMarker->fVal; 724 if (sd->fAccepting == 0) { 725 sd->fAccepting = -1; 726 } 727 } 728 if (sd->fAccepting==-1 && endMarker->fVal != 0) { 729 // Both lookahead and non-lookahead accepting for this state. 730 // Favor the look-ahead. Expedient for line break. 731 // TODO: need a more elegant resolution for conflicting rules. 732 sd->fAccepting = endMarker->fVal; 733 } 734 // implicit else: 735 // if sd->fAccepting already had a value other than 0 or -1, leave it be. 736 737 // If the end marker node is from a look-ahead rule, set 738 // the fLookAhead field or this state also. 739 if (endMarker->fLookAheadEnd) { 740 // TODO: don't change value if already set? 741 // TODO: allow for more than one active look-ahead rule in engine. 742 // Make value here an index to a side array in engine? 743 sd->fLookAhead = sd->fAccepting; 744 } 745 } 746 } 747 } 748} 749 750 751//----------------------------------------------------------------------------- 752// 753// flagLookAheadStates Very similar to flagAcceptingStates, above. 754// 755//----------------------------------------------------------------------------- 756void RBBITableBuilder::flagLookAheadStates() { 757 if (U_FAILURE(*fStatus)) { 758 return; 759 } 760 UVector lookAheadNodes(*fStatus); 761 RBBINode *lookAheadNode; 762 int32_t i; 763 int32_t n; 764 765 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus); 766 if (U_FAILURE(*fStatus)) { 767 return; 768 } 769 for (i=0; i<lookAheadNodes.size(); i++) { 770 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i); 771 772 for (n=0; n<fDStates->size(); n++) { 773 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 774 if (sd->fPositions->indexOf(lookAheadNode) >= 0) { 775 sd->fLookAhead = lookAheadNode->fVal; 776 } 777 } 778 } 779} 780 781 782 783 784//----------------------------------------------------------------------------- 785// 786// flagTaggedStates 787// 788//----------------------------------------------------------------------------- 789void RBBITableBuilder::flagTaggedStates() { 790 if (U_FAILURE(*fStatus)) { 791 return; 792 } 793 UVector tagNodes(*fStatus); 794 RBBINode *tagNode; 795 int32_t i; 796 int32_t n; 797 798 if (U_FAILURE(*fStatus)) { 799 return; 800 } 801 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus); 802 if (U_FAILURE(*fStatus)) { 803 return; 804 } 805 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em) 806 tagNode = (RBBINode *)tagNodes.elementAt(i); 807 808 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table) 809 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 810 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t 811 sortedAdd(&sd->fTagVals, tagNode->fVal); 812 } 813 } 814 } 815} 816 817 818 819 820//----------------------------------------------------------------------------- 821// 822// mergeRuleStatusVals 823// 824// Update the global table of rule status {tag} values 825// The rule builder has a global vector of status values that are common 826// for all tables. Merge the ones from this table into the global set. 827// 828//----------------------------------------------------------------------------- 829void RBBITableBuilder::mergeRuleStatusVals() { 830 // 831 // The basic outline of what happens here is this... 832 // 833 // for each state in this state table 834 // if the status tag list for this state is in the global statuses list 835 // record where and 836 // continue with the next state 837 // else 838 // add the tag list for this state to the global list. 839 // 840 int i; 841 int n; 842 843 // Pre-set a single tag of {0} into the table. 844 // We will need this as a default, for rule sets with no explicit tagging. 845 if (fRB->fRuleStatusVals->size() == 0) { 846 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group 847 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero 848 } 849 850 // For each state 851 for (n=0; n<fDStates->size(); n++) { 852 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 853 UVector *thisStatesTagValues = sd->fTagVals; 854 if (thisStatesTagValues == NULL) { 855 // No tag values are explicitly associated with this state. 856 // Set the default tag value. 857 sd->fTagsIdx = 0; 858 continue; 859 } 860 861 // There are tag(s) associated with this state. 862 // fTagsIdx will be the index into the global tag list for this state's tag values. 863 // Initial value of -1 flags that we haven't got it set yet. 864 sd->fTagsIdx = -1; 865 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list 866 int32_t nextTagGroupStart = 0; 867 868 // Loop runs once per group of tags in the global list 869 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) { 870 thisTagGroupStart = nextTagGroupStart; 871 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1; 872 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) { 873 // The number of tags for this state is different from 874 // the number of tags in this group from the global list. 875 // Continue with the next group from the global list. 876 continue; 877 } 878 // The lengths match, go ahead and compare the actual tag values 879 // between this state and the group from the global list. 880 for (i=0; i<thisStatesTagValues->size(); i++) { 881 if (thisStatesTagValues->elementAti(i) != 882 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) { 883 // Mismatch. 884 break; 885 } 886 } 887 888 if (i == thisStatesTagValues->size()) { 889 // We found a set of tag values in the global list that match 890 // those for this state. Use them. 891 sd->fTagsIdx = thisTagGroupStart; 892 break; 893 } 894 } 895 896 if (sd->fTagsIdx == -1) { 897 // No suitable entry in the global tag list already. Add one 898 sd->fTagsIdx = fRB->fRuleStatusVals->size(); 899 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus); 900 for (i=0; i<thisStatesTagValues->size(); i++) { 901 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus); 902 } 903 } 904 } 905} 906 907 908 909 910 911 912 913//----------------------------------------------------------------------------- 914// 915// sortedAdd Add a value to a vector of sorted values (ints). 916// Do not replicate entries; if the value is already there, do not 917// add a second one. 918// Lazily create the vector if it does not already exist. 919// 920//----------------------------------------------------------------------------- 921void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) { 922 int32_t i; 923 924 if (*vector == NULL) { 925 *vector = new UVector(*fStatus); 926 } 927 if (*vector == NULL || U_FAILURE(*fStatus)) { 928 return; 929 } 930 UVector *vec = *vector; 931 int32_t vSize = vec->size(); 932 for (i=0; i<vSize; i++) { 933 int32_t valAtI = vec->elementAti(i); 934 if (valAtI == val) { 935 // The value is already in the vector. Don't add it again. 936 return; 937 } 938 if (valAtI > val) { 939 break; 940 } 941 } 942 vec->insertElementAt(val, i, *fStatus); 943} 944 945 946 947//----------------------------------------------------------------------------- 948// 949// setAdd Set operation on UVector 950// dest = dest union source 951// Elements may only appear once and must be sorted. 952// 953//----------------------------------------------------------------------------- 954void RBBITableBuilder::setAdd(UVector *dest, UVector *source) { 955 int32_t destOriginalSize = dest->size(); 956 int32_t sourceSize = source->size(); 957 int32_t di = 0; 958 void *(destS[16]), *(sourceS[16]); // Handle small cases without malloc 959 void **destH = 0, **sourceH = 0; 960 void **destBuff, **sourceBuff; 961 void **destLim, **sourceLim; 962 963 if (destOriginalSize > (int32_t)(sizeof(destS)/sizeof(destS[0]))) { 964 destH = (void **)uprv_malloc(sizeof(void *) * destOriginalSize); 965 destBuff = destH; 966 } 967 else { 968 destBuff = destS; 969 } 970 if (destBuff == 0) { 971 return; 972 } 973 destLim = destBuff + destOriginalSize; 974 975 if (sourceSize > (int32_t)(sizeof(sourceS)/sizeof(sourceS[0]))) { 976 sourceH = (void **)uprv_malloc(sizeof(void *) * sourceSize); 977 sourceBuff = sourceH; 978 } 979 else { 980 sourceBuff = sourceS; 981 } 982 if (sourceBuff == 0) { 983 if (destH) { 984 uprv_free(destH); 985 } 986 return; 987 } 988 sourceLim = sourceBuff + sourceSize; 989 990 // Avoid multiple "get element" calls by getting the contents into arrays 991 (void) dest->toArray(destBuff); 992 (void) source->toArray(sourceBuff); 993 994 dest->setSize(sourceSize+destOriginalSize, *fStatus); 995 996 while (sourceBuff < sourceLim && destBuff < destLim) { 997 if (*destBuff == *sourceBuff) { 998 dest->setElementAt(*sourceBuff++, di++); 999 destBuff++; 1000 } 1001 // This check is required for machines with segmented memory, like i5/OS. 1002 // Direct pointer comparison is not recommended. 1003 else if (uprv_memcmp(destBuff, sourceBuff, sizeof(void *)) < 0) { 1004 dest->setElementAt(*destBuff++, di++); 1005 } 1006 else { /* *sourceBuff < *destBuff */ 1007 dest->setElementAt(*sourceBuff++, di++); 1008 } 1009 } 1010 1011 // At most one of these two cleanup loops will execute 1012 while (destBuff < destLim) { 1013 dest->setElementAt(*destBuff++, di++); 1014 } 1015 while (sourceBuff < sourceLim) { 1016 dest->setElementAt(*sourceBuff++, di++); 1017 } 1018 1019 dest->setSize(di, *fStatus); 1020 if (destH) { 1021 uprv_free(destH); 1022 } 1023 if (sourceH) { 1024 uprv_free(sourceH); 1025 } 1026} 1027 1028 1029 1030//----------------------------------------------------------------------------- 1031// 1032// setEqual Set operation on UVector. 1033// Compare for equality. 1034// Elements must be sorted. 1035// 1036//----------------------------------------------------------------------------- 1037UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) { 1038 return a->equals(*b); 1039} 1040 1041 1042//----------------------------------------------------------------------------- 1043// 1044// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos 1045// for each node in the tree. 1046// 1047//----------------------------------------------------------------------------- 1048#ifdef RBBI_DEBUG 1049void RBBITableBuilder::printPosSets(RBBINode *n) { 1050 if (n==NULL) { 1051 return; 1052 } 1053 n->printNode(); 1054 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE"); 1055 1056 RBBIDebugPrintf(" firstpos: "); 1057 printSet(n->fFirstPosSet); 1058 1059 RBBIDebugPrintf(" lastpos: "); 1060 printSet(n->fLastPosSet); 1061 1062 RBBIDebugPrintf(" followpos: "); 1063 printSet(n->fFollowPos); 1064 1065 printPosSets(n->fLeftChild); 1066 printPosSets(n->fRightChild); 1067} 1068#endif 1069 1070 1071 1072//----------------------------------------------------------------------------- 1073// 1074// getTableSize() Calculate the size of the runtime form of this 1075// state transition table. 1076// 1077//----------------------------------------------------------------------------- 1078int32_t RBBITableBuilder::getTableSize() const { 1079 int32_t size = 0; 1080 int32_t numRows; 1081 int32_t numCols; 1082 int32_t rowSize; 1083 1084 if (fTree == NULL) { 1085 return 0; 1086 } 1087 1088 size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table. 1089 1090 numRows = fDStates->size(); 1091 numCols = fRB->fSetBuilder->getNumCharCategories(); 1092 1093 // Note The declaration of RBBIStateTableRow is for a table of two columns. 1094 // Therefore we subtract two from numCols when determining 1095 // how much storage to add to a row for the total columns. 1096 rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2); 1097 size += numRows * rowSize; 1098 return size; 1099} 1100 1101 1102 1103//----------------------------------------------------------------------------- 1104// 1105// exportTable() export the state transition table in the format required 1106// by the runtime engine. getTableSize() bytes of memory 1107// must be available at the output address "where". 1108// 1109//----------------------------------------------------------------------------- 1110void RBBITableBuilder::exportTable(void *where) { 1111 RBBIStateTable *table = (RBBIStateTable *)where; 1112 uint32_t state; 1113 int col; 1114 1115 if (U_FAILURE(*fStatus) || fTree == NULL) { 1116 return; 1117 } 1118 1119 if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff || 1120 fDStates->size() > 0x7fff) { 1121 *fStatus = U_BRK_INTERNAL_ERROR; 1122 return; 1123 } 1124 1125 table->fRowLen = sizeof(RBBIStateTableRow) + 1126 sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2); 1127 table->fNumStates = fDStates->size(); 1128 table->fFlags = 0; 1129 if (fRB->fLookAheadHardBreak) { 1130 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; 1131 } 1132 if (fRB->fSetBuilder->sawBOF()) { 1133 table->fFlags |= RBBI_BOF_REQUIRED; 1134 } 1135 table->fReserved = 0; 1136 1137 for (state=0; state<table->fNumStates; state++) { 1138 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); 1139 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); 1140 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767); 1141 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767); 1142 row->fAccepting = (int16_t)sd->fAccepting; 1143 row->fLookAhead = (int16_t)sd->fLookAhead; 1144 row->fTagIdx = (int16_t)sd->fTagsIdx; 1145 for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) { 1146 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col); 1147 } 1148 } 1149} 1150 1151 1152 1153//----------------------------------------------------------------------------- 1154// 1155// printSet Debug function. Print the contents of a UVector 1156// 1157//----------------------------------------------------------------------------- 1158#ifdef RBBI_DEBUG 1159void RBBITableBuilder::printSet(UVector *s) { 1160 int32_t i; 1161 for (i=0; i<s->size(); i++) { 1162 void *v = s->elementAt(i); 1163 RBBIDebugPrintf("%10p", v); 1164 } 1165 RBBIDebugPrintf("\n"); 1166} 1167#endif 1168 1169 1170//----------------------------------------------------------------------------- 1171// 1172// printStates Debug Function. Dump the fully constructed state transition table. 1173// 1174//----------------------------------------------------------------------------- 1175#ifdef RBBI_DEBUG 1176void RBBITableBuilder::printStates() { 1177 int c; // input "character" 1178 int n; // state number 1179 1180 RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); 1181 RBBIDebugPrintf(" | Acc LA Tag"); 1182 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1183 RBBIDebugPrintf(" %2d", c); 1184 } 1185 RBBIDebugPrintf("\n"); 1186 RBBIDebugPrintf(" |---------------"); 1187 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1188 RBBIDebugPrintf("---"); 1189 } 1190 RBBIDebugPrintf("\n"); 1191 1192 for (n=0; n<fDStates->size(); n++) { 1193 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 1194 RBBIDebugPrintf(" %3d | " , n); 1195 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); 1196 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1197 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c)); 1198 } 1199 RBBIDebugPrintf("\n"); 1200 } 1201 RBBIDebugPrintf("\n\n"); 1202} 1203#endif 1204 1205 1206 1207//----------------------------------------------------------------------------- 1208// 1209// printRuleStatusTable Debug Function. Dump the common rule status table 1210// 1211//----------------------------------------------------------------------------- 1212#ifdef RBBI_DEBUG 1213void RBBITableBuilder::printRuleStatusTable() { 1214 int32_t thisRecord = 0; 1215 int32_t nextRecord = 0; 1216 int i; 1217 UVector *tbl = fRB->fRuleStatusVals; 1218 1219 RBBIDebugPrintf("index | tags \n"); 1220 RBBIDebugPrintf("-------------------\n"); 1221 1222 while (nextRecord < tbl->size()) { 1223 thisRecord = nextRecord; 1224 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1; 1225 RBBIDebugPrintf("%4d ", thisRecord); 1226 for (i=thisRecord+1; i<nextRecord; i++) { 1227 RBBIDebugPrintf(" %5d", tbl->elementAti(i)); 1228 } 1229 RBBIDebugPrintf("\n"); 1230 } 1231 RBBIDebugPrintf("\n\n"); 1232} 1233#endif 1234 1235 1236//----------------------------------------------------------------------------- 1237// 1238// RBBIStateDescriptor Methods. This is a very struct-like class 1239// Most access is directly to the fields. 1240// 1241//----------------------------------------------------------------------------- 1242 1243RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) { 1244 fMarked = FALSE; 1245 fAccepting = 0; 1246 fLookAhead = 0; 1247 fTagsIdx = 0; 1248 fTagVals = NULL; 1249 fPositions = NULL; 1250 fDtran = NULL; 1251 1252 fDtran = new UVector(lastInputSymbol+1, *fStatus); 1253 if (U_FAILURE(*fStatus)) { 1254 return; 1255 } 1256 if (fDtran == NULL) { 1257 *fStatus = U_MEMORY_ALLOCATION_ERROR; 1258 return; 1259 } 1260 fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized. 1261 // It is indexed by input symbols, and will 1262 // hold the next state number for each 1263 // symbol. 1264} 1265 1266 1267RBBIStateDescriptor::~RBBIStateDescriptor() { 1268 delete fPositions; 1269 delete fDtran; 1270 delete fTagVals; 1271 fPositions = NULL; 1272 fDtran = NULL; 1273 fTagVals = NULL; 1274} 1275 1276U_NAMESPACE_END 1277 1278#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1279