rbbitblb.cpp revision c73f511526464f8e56c242df80552e9b0d94ae3d
1/* 2********************************************************************** 3* Copyright (c) 2002-2009, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6*/ 7// 8// rbbitblb.cpp 9// 10 11 12#include "unicode/utypes.h" 13 14#if !UCONFIG_NO_BREAK_ITERATION 15 16#include "unicode/unistr.h" 17#include "rbbitblb.h" 18#include "rbbirb.h" 19#include "rbbisetb.h" 20#include "rbbidata.h" 21#include "cstring.h" 22#include "uassert.h" 23#include "cmemory.h" 24 25U_NAMESPACE_BEGIN 26 27RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) : 28 fTree(*rootNode) { 29 fRB = rb; 30 fStatus = fRB->fStatus; 31 UErrorCode status = U_ZERO_ERROR; 32 fDStates = new UVector(status); 33 if (U_FAILURE(*fStatus)) { 34 return; 35 } 36 if (U_FAILURE(status)) { 37 *fStatus = status; 38 return; 39 } 40 if (fDStates == NULL) { 41 *fStatus = U_MEMORY_ALLOCATION_ERROR;; 42 } 43} 44 45 46 47RBBITableBuilder::~RBBITableBuilder() { 48 int i; 49 for (i=0; i<fDStates->size(); i++) { 50 delete (RBBIStateDescriptor *)fDStates->elementAt(i); 51 } 52 delete fDStates; 53} 54 55 56//----------------------------------------------------------------------------- 57// 58// RBBITableBuilder::build - This is the main function for building the DFA state transtion 59// table from the RBBI rules parse tree. 60// 61//----------------------------------------------------------------------------- 62void RBBITableBuilder::build() { 63 64 if (U_FAILURE(*fStatus)) { 65 return; 66 } 67 68 // If there were no rules, just return. This situation can easily arise 69 // for the reverse rules. 70 if (fTree==NULL) { 71 return; 72 } 73 74 // 75 // Walk through the tree, replacing any references to $variables with a copy of the 76 // parse tree for the substition expression. 77 // 78 fTree = fTree->flattenVariables(); 79#ifdef RBBI_DEBUG 80 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { 81 RBBIDebugPuts("Parse tree after flattening variable references."); 82 fTree->printTree(TRUE); 83 } 84#endif 85 86 // 87 // If the rules contained any references to {bof} 88 // add a {bof} <cat> <former root of tree> to the 89 // tree. Means that all matches must start out with the 90 // {bof} fake character. 91 // 92 if (fRB->fSetBuilder->sawBOF()) { 93 RBBINode *bofTop = new RBBINode(RBBINode::opCat); 94 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar); 95 // Delete and exit if memory allocation failed. 96 if (bofTop == NULL || bofLeaf == NULL) { 97 *fStatus = U_MEMORY_ALLOCATION_ERROR; 98 delete bofTop; 99 delete bofLeaf; 100 return; 101 } 102 bofTop->fLeftChild = bofLeaf; 103 bofTop->fRightChild = fTree; 104 bofLeaf->fParent = bofTop; 105 bofLeaf->fVal = 2; // Reserved value for {bof}. 106 fTree = bofTop; 107 } 108 109 // 110 // Add a unique right-end marker to the expression. 111 // Appears as a cat-node, left child being the original tree, 112 // right child being the end marker. 113 // 114 RBBINode *cn = new RBBINode(RBBINode::opCat); 115 // Exit if memory allocation failed. 116 if (cn == NULL) { 117 *fStatus = U_MEMORY_ALLOCATION_ERROR; 118 return; 119 } 120 cn->fLeftChild = fTree; 121 fTree->fParent = cn; 122 cn->fRightChild = new RBBINode(RBBINode::endMark); 123 // Delete and exit if memory allocation failed. 124 if (cn->fRightChild == NULL) { 125 *fStatus = U_MEMORY_ALLOCATION_ERROR; 126 delete cn; 127 return; 128 } 129 cn->fRightChild->fParent = cn; 130 fTree = cn; 131 132 // 133 // Replace all references to UnicodeSets with the tree for the equivalent 134 // expression. 135 // 136 fTree->flattenSets(); 137#ifdef RBBI_DEBUG 138 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { 139 RBBIDebugPuts("Parse tree after flattening Unicode Set references."); 140 fTree->printTree(TRUE); 141 } 142#endif 143 144 145 // 146 // calculate the functions nullable, firstpos, lastpos and followpos on 147 // nodes in the parse tree. 148 // See the alogrithm description in Aho. 149 // Understanding how this works by looking at the code alone will be 150 // nearly impossible. 151 // 152 calcNullable(fTree); 153 calcFirstPos(fTree); 154 calcLastPos(fTree); 155 calcFollowPos(fTree); 156 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) { 157 RBBIDebugPuts("\n"); 158 printPosSets(fTree); 159 } 160 161 // 162 // For "chained" rules, modify the followPos sets 163 // 164 if (fRB->fChainRules) { 165 calcChainedFollowPos(fTree); 166 } 167 168 // 169 // BOF (start of input) test fixup. 170 // 171 if (fRB->fSetBuilder->sawBOF()) { 172 bofFixup(); 173 } 174 175 // 176 // Build the DFA state transition tables. 177 // 178 buildStateTable(); 179 flagAcceptingStates(); 180 flagLookAheadStates(); 181 flagTaggedStates(); 182 183 // 184 // Update the global table of rule status {tag} values 185 // The rule builder has a global vector of status values that are common 186 // for all tables. Merge the ones from this table into the global set. 187 // 188 mergeRuleStatusVals(); 189 190 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();}; 191} 192 193 194 195//----------------------------------------------------------------------------- 196// 197// calcNullable. Impossible to explain succinctly. See Aho, section 3.9 198// 199//----------------------------------------------------------------------------- 200void RBBITableBuilder::calcNullable(RBBINode *n) { 201 if (n == NULL) { 202 return; 203 } 204 if (n->fType == RBBINode::setRef || 205 n->fType == RBBINode::endMark ) { 206 // These are non-empty leaf node types. 207 n->fNullable = FALSE; 208 return; 209 } 210 211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) { 212 // Lookahead marker node. It's a leaf, so no recursion on children. 213 // It's nullable because it does not match any literal text from the input stream. 214 n->fNullable = TRUE; 215 return; 216 } 217 218 219 // The node is not a leaf. 220 // Calculate nullable on its children. 221 calcNullable(n->fLeftChild); 222 calcNullable(n->fRightChild); 223 224 // Apply functions from table 3.40 in Aho 225 if (n->fType == RBBINode::opOr) { 226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable; 227 } 228 else if (n->fType == RBBINode::opCat) { 229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable; 230 } 231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) { 232 n->fNullable = TRUE; 233 } 234 else { 235 n->fNullable = FALSE; 236 } 237} 238 239 240 241 242//----------------------------------------------------------------------------- 243// 244// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 245// 246//----------------------------------------------------------------------------- 247void RBBITableBuilder::calcFirstPos(RBBINode *n) { 248 if (n == NULL) { 249 return; 250 } 251 if (n->fType == RBBINode::leafChar || 252 n->fType == RBBINode::endMark || 253 n->fType == RBBINode::lookAhead || 254 n->fType == RBBINode::tag) { 255 // These are non-empty leaf node types. 256 // Note: In order to maintain the sort invariant on the set, 257 // this function should only be called on a node whose set is 258 // empty to start with. 259 n->fFirstPosSet->addElement(n, *fStatus); 260 return; 261 } 262 263 // The node is not a leaf. 264 // Calculate firstPos on its children. 265 calcFirstPos(n->fLeftChild); 266 calcFirstPos(n->fRightChild); 267 268 // Apply functions from table 3.40 in Aho 269 if (n->fType == RBBINode::opOr) { 270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 272 } 273 else if (n->fType == RBBINode::opCat) { 274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 275 if (n->fLeftChild->fNullable) { 276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 277 } 278 } 279 else if (n->fType == RBBINode::opStar || 280 n->fType == RBBINode::opQuestion || 281 n->fType == RBBINode::opPlus) { 282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 283 } 284} 285 286 287 288//----------------------------------------------------------------------------- 289// 290// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 291// 292//----------------------------------------------------------------------------- 293void RBBITableBuilder::calcLastPos(RBBINode *n) { 294 if (n == NULL) { 295 return; 296 } 297 if (n->fType == RBBINode::leafChar || 298 n->fType == RBBINode::endMark || 299 n->fType == RBBINode::lookAhead || 300 n->fType == RBBINode::tag) { 301 // These are non-empty leaf node types. 302 // Note: In order to maintain the sort invariant on the set, 303 // this function should only be called on a node whose set is 304 // empty to start with. 305 n->fLastPosSet->addElement(n, *fStatus); 306 return; 307 } 308 309 // The node is not a leaf. 310 // Calculate lastPos on its children. 311 calcLastPos(n->fLeftChild); 312 calcLastPos(n->fRightChild); 313 314 // Apply functions from table 3.40 in Aho 315 if (n->fType == RBBINode::opOr) { 316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 318 } 319 else if (n->fType == RBBINode::opCat) { 320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 321 if (n->fRightChild->fNullable) { 322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 323 } 324 } 325 else if (n->fType == RBBINode::opStar || 326 n->fType == RBBINode::opQuestion || 327 n->fType == RBBINode::opPlus) { 328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 329 } 330} 331 332 333 334//----------------------------------------------------------------------------- 335// 336// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 337// 338//----------------------------------------------------------------------------- 339void RBBITableBuilder::calcFollowPos(RBBINode *n) { 340 if (n == NULL || 341 n->fType == RBBINode::leafChar || 342 n->fType == RBBINode::endMark) { 343 return; 344 } 345 346 calcFollowPos(n->fLeftChild); 347 calcFollowPos(n->fRightChild); 348 349 // Aho rule #1 350 if (n->fType == RBBINode::opCat) { 351 RBBINode *i; // is 'i' in Aho's description 352 uint32_t ix; 353 354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet; 355 356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) { 357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix); 358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet); 359 } 360 } 361 362 // Aho rule #2 363 if (n->fType == RBBINode::opStar || 364 n->fType == RBBINode::opPlus) { 365 RBBINode *i; // again, n and i are the names from Aho's description. 366 uint32_t ix; 367 368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) { 369 i = (RBBINode *)n->fLastPosSet->elementAt(ix); 370 setAdd(i->fFollowPos, n->fFirstPosSet); 371 } 372 } 373 374 375 376} 377 378 379//----------------------------------------------------------------------------- 380// 381// calcChainedFollowPos. Modify the previously calculated followPos sets 382// to implement rule chaining. NOT described by Aho 383// 384//----------------------------------------------------------------------------- 385void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { 386 387 UVector endMarkerNodes(*fStatus); 388 UVector leafNodes(*fStatus); 389 int32_t i; 390 391 if (U_FAILURE(*fStatus)) { 392 return; 393 } 394 395 // get a list of all endmarker nodes. 396 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); 397 398 // get a list all leaf nodes 399 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); 400 if (U_FAILURE(*fStatus)) { 401 return; 402 } 403 404 // Get all nodes that can be the start a match, which is FirstPosition() 405 // of the portion of the tree corresponding to user-written rules. 406 // See the tree description in bofFixup(). 407 RBBINode *userRuleRoot = tree; 408 if (fRB->fSetBuilder->sawBOF()) { 409 userRuleRoot = tree->fLeftChild->fRightChild; 410 } 411 U_ASSERT(userRuleRoot != NULL); 412 UVector *matchStartNodes = userRuleRoot->fFirstPosSet; 413 414 415 // Iteratate over all leaf nodes, 416 // 417 int32_t endNodeIx; 418 int32_t startNodeIx; 419 420 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) { 421 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx); 422 RBBINode *endNode = NULL; 423 424 // Identify leaf nodes that correspond to overall rule match positions. 425 // These include an endMarkerNode in their followPos sets. 426 for (i=0; i<endMarkerNodes.size(); i++) { 427 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) { 428 endNode = tNode; 429 break; 430 } 431 } 432 if (endNode == NULL) { 433 // node wasn't an end node. Try again with the next. 434 continue; 435 } 436 437 // We've got a node that can end a match. 438 439 // Line Break Specific hack: If this node's val correspond to the $CM char class, 440 // don't chain from it. 441 // TODO: Add rule syntax for this behavior, get specifics out of here and 442 // into the rule file. 443 if (fRB->fLBCMNoChain) { 444 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal); 445 if (c != -1) { 446 // c == -1 occurs with sets containing only the {eof} marker string. 447 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 448 if (cLBProp == U_LB_COMBINING_MARK) { 449 continue; 450 } 451 } 452 } 453 454 455 // Now iterate over the nodes that can start a match, looking for ones 456 // with the same char class as our ending node. 457 RBBINode *startNode; 458 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { 459 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); 460 if (startNode->fType != RBBINode::leafChar) { 461 continue; 462 } 463 464 if (endNode->fVal == startNode->fVal) { 465 // The end val (character class) of one possible match is the 466 // same as the start of another. 467 468 // Add all nodes from the followPos of the start node to the 469 // followPos set of the end node, which will have the effect of 470 // letting matches transition from a match state at endNode 471 // to the second char of a match starting with startNode. 472 setAdd(endNode->fFollowPos, startNode->fFollowPos); 473 } 474 } 475 } 476} 477 478 479//----------------------------------------------------------------------------- 480// 481// bofFixup. Fixup for state tables that include {bof} beginning of input testing. 482// Do an swizzle similar to chaining, modifying the followPos set of 483// the bofNode to include the followPos nodes from other {bot} nodes 484// scattered through the tree. 485// 486// This function has much in common with calcChainedFollowPos(). 487// 488//----------------------------------------------------------------------------- 489void RBBITableBuilder::bofFixup() { 490 491 if (U_FAILURE(*fStatus)) { 492 return; 493 } 494 495 // The parse tree looks like this ... 496 // fTree root ---> <cat> 497 // / \ . 498 // <cat> <#end node> 499 // / \ . 500 // <bofNode> rest 501 // of tree 502 // 503 // We will be adding things to the followPos set of the <bofNode> 504 // 505 RBBINode *bofNode = fTree->fLeftChild->fLeftChild; 506 U_ASSERT(bofNode->fType == RBBINode::leafChar); 507 U_ASSERT(bofNode->fVal == 2); 508 509 // Get all nodes that can be the start a match of the user-written rules 510 // (excluding the fake bofNode) 511 // We want the nodes that can start a match in the 512 // part labeled "rest of tree" 513 // 514 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; 515 516 RBBINode *startNode; 517 int startNodeIx; 518 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { 519 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); 520 if (startNode->fType != RBBINode::leafChar) { 521 continue; 522 } 523 524 if (startNode->fVal == bofNode->fVal) { 525 // We found a leaf node corresponding to a {bof} that was 526 // explicitly written into a rule. 527 // Add everything from the followPos set of this node to the 528 // followPos set of the fake bofNode at the start of the tree. 529 // 530 setAdd(bofNode->fFollowPos, startNode->fFollowPos); 531 } 532 } 533} 534 535//----------------------------------------------------------------------------- 536// 537// buildStateTable() Determine the set of runtime DFA states and the 538// transition tables for these states, by the algorithm 539// of fig. 3.44 in Aho. 540// 541// Most of the comments are quotes of Aho's psuedo-code. 542// 543//----------------------------------------------------------------------------- 544void RBBITableBuilder::buildStateTable() { 545 if (U_FAILURE(*fStatus)) { 546 return; 547 } 548 RBBIStateDescriptor *failState; 549 // Set it to NULL to avoid uninitialized warning 550 RBBIStateDescriptor *initialState = NULL; 551 // 552 // Add a dummy state 0 - the stop state. Not from Aho. 553 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1; 554 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 555 if (failState == NULL) { 556 *fStatus = U_MEMORY_ALLOCATION_ERROR; 557 goto ExitBuildSTdeleteall; 558 } 559 failState->fPositions = new UVector(*fStatus); 560 if (failState->fPositions == NULL) { 561 *fStatus = U_MEMORY_ALLOCATION_ERROR; 562 } 563 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) { 564 goto ExitBuildSTdeleteall; 565 } 566 fDStates->addElement(failState, *fStatus); 567 if (U_FAILURE(*fStatus)) { 568 goto ExitBuildSTdeleteall; 569 } 570 571 // initially, the only unmarked state in Dstates is firstpos(root), 572 // where toot is the root of the syntax tree for (r)#; 573 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 574 if (initialState == NULL) { 575 *fStatus = U_MEMORY_ALLOCATION_ERROR; 576 } 577 if (U_FAILURE(*fStatus)) { 578 goto ExitBuildSTdeleteall; 579 } 580 initialState->fPositions = new UVector(*fStatus); 581 if (initialState->fPositions == NULL) { 582 *fStatus = U_MEMORY_ALLOCATION_ERROR; 583 } 584 if (U_FAILURE(*fStatus)) { 585 goto ExitBuildSTdeleteall; 586 } 587 setAdd(initialState->fPositions, fTree->fFirstPosSet); 588 fDStates->addElement(initialState, *fStatus); 589 if (U_FAILURE(*fStatus)) { 590 goto ExitBuildSTdeleteall; 591 } 592 593 // while there is an unmarked state T in Dstates do begin 594 for (;;) { 595 RBBIStateDescriptor *T = NULL; 596 int32_t tx; 597 for (tx=1; tx<fDStates->size(); tx++) { 598 RBBIStateDescriptor *temp; 599 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx); 600 if (temp->fMarked == FALSE) { 601 T = temp; 602 break; 603 } 604 } 605 if (T == NULL) { 606 break; 607 } 608 609 // mark T; 610 T->fMarked = TRUE; 611 612 // for each input symbol a do begin 613 int32_t a; 614 for (a = 1; a<=lastInputSymbol; a++) { 615 // let U be the set of positions that are in followpos(p) 616 // for some position p in T 617 // such that the symbol at position p is a; 618 UVector *U = NULL; 619 RBBINode *p; 620 int32_t px; 621 for (px=0; px<T->fPositions->size(); px++) { 622 p = (RBBINode *)T->fPositions->elementAt(px); 623 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) { 624 if (U == NULL) { 625 U = new UVector(*fStatus); 626 if (U == NULL) { 627 *fStatus = U_MEMORY_ALLOCATION_ERROR; 628 goto ExitBuildSTdeleteall; 629 } 630 } 631 setAdd(U, p->fFollowPos); 632 } 633 } 634 635 // if U is not empty and not in DStates then 636 int32_t ux = 0; 637 UBool UinDstates = FALSE; 638 if (U != NULL) { 639 U_ASSERT(U->size() > 0); 640 int ix; 641 for (ix=0; ix<fDStates->size(); ix++) { 642 RBBIStateDescriptor *temp2; 643 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix); 644 if (setEquals(U, temp2->fPositions)) { 645 delete U; 646 U = temp2->fPositions; 647 ux = ix; 648 UinDstates = TRUE; 649 break; 650 } 651 } 652 653 // Add U as an unmarked state to Dstates 654 if (!UinDstates) 655 { 656 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 657 if (newState == NULL) { 658 *fStatus = U_MEMORY_ALLOCATION_ERROR; 659 } 660 if (U_FAILURE(*fStatus)) { 661 goto ExitBuildSTdeleteall; 662 } 663 newState->fPositions = U; 664 fDStates->addElement(newState, *fStatus); 665 if (U_FAILURE(*fStatus)) { 666 return; 667 } 668 ux = fDStates->size()-1; 669 } 670 671 // Dtran[T, a] := U; 672 T->fDtran->setElementAt(ux, a); 673 } 674 } 675 } 676 return; 677 // delete local pointers only if error occured. 678ExitBuildSTdeleteall: 679 delete initialState; 680 delete failState; 681} 682 683 684 685//----------------------------------------------------------------------------- 686// 687// flagAcceptingStates Identify accepting states. 688// First get a list of all of the end marker nodes. 689// Then, for each state s, 690// if s contains one of the end marker nodes in its list of tree positions then 691// s is an accepting state. 692// 693//----------------------------------------------------------------------------- 694void RBBITableBuilder::flagAcceptingStates() { 695 if (U_FAILURE(*fStatus)) { 696 return; 697 } 698 UVector endMarkerNodes(*fStatus); 699 RBBINode *endMarker; 700 int32_t i; 701 int32_t n; 702 703 if (U_FAILURE(*fStatus)) { 704 return; 705 } 706 707 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); 708 if (U_FAILURE(*fStatus)) { 709 return; 710 } 711 712 for (i=0; i<endMarkerNodes.size(); i++) { 713 endMarker = (RBBINode *)endMarkerNodes.elementAt(i); 714 for (n=0; n<fDStates->size(); n++) { 715 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 716 if (sd->fPositions->indexOf(endMarker) >= 0) { 717 // Any non-zero value for fAccepting means this is an accepting node. 718 // The value is what will be returned to the user as the break status. 719 // If no other value was specified, force it to -1. 720 721 if (sd->fAccepting==0) { 722 // State hasn't been marked as accepting yet. Do it now. 723 sd->fAccepting = endMarker->fVal; 724 if (sd->fAccepting == 0) { 725 sd->fAccepting = -1; 726 } 727 } 728 if (sd->fAccepting==-1 && endMarker->fVal != 0) { 729 // Both lookahead and non-lookahead accepting for this state. 730 // Favor the look-ahead. Expedient for line break. 731 // TODO: need a more elegant resolution for conflicting rules. 732 sd->fAccepting = endMarker->fVal; 733 } 734 // implicit else: 735 // if sd->fAccepting already had a value other than 0 or -1, leave it be. 736 737 // If the end marker node is from a look-ahead rule, set 738 // the fLookAhead field or this state also. 739 if (endMarker->fLookAheadEnd) { 740 // TODO: don't change value if already set? 741 // TODO: allow for more than one active look-ahead rule in engine. 742 // Make value here an index to a side array in engine? 743 sd->fLookAhead = sd->fAccepting; 744 } 745 } 746 } 747 } 748} 749 750 751//----------------------------------------------------------------------------- 752// 753// flagLookAheadStates Very similar to flagAcceptingStates, above. 754// 755//----------------------------------------------------------------------------- 756void RBBITableBuilder::flagLookAheadStates() { 757 if (U_FAILURE(*fStatus)) { 758 return; 759 } 760 UVector lookAheadNodes(*fStatus); 761 RBBINode *lookAheadNode; 762 int32_t i; 763 int32_t n; 764 765 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus); 766 if (U_FAILURE(*fStatus)) { 767 return; 768 } 769 for (i=0; i<lookAheadNodes.size(); i++) { 770 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i); 771 772 for (n=0; n<fDStates->size(); n++) { 773 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 774 if (sd->fPositions->indexOf(lookAheadNode) >= 0) { 775 sd->fLookAhead = lookAheadNode->fVal; 776 } 777 } 778 } 779} 780 781 782 783 784//----------------------------------------------------------------------------- 785// 786// flagTaggedStates 787// 788//----------------------------------------------------------------------------- 789void RBBITableBuilder::flagTaggedStates() { 790 if (U_FAILURE(*fStatus)) { 791 return; 792 } 793 UVector tagNodes(*fStatus); 794 RBBINode *tagNode; 795 int32_t i; 796 int32_t n; 797 798 if (U_FAILURE(*fStatus)) { 799 return; 800 } 801 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus); 802 if (U_FAILURE(*fStatus)) { 803 return; 804 } 805 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em) 806 tagNode = (RBBINode *)tagNodes.elementAt(i); 807 808 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table) 809 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 810 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t 811 sortedAdd(&sd->fTagVals, tagNode->fVal); 812 } 813 } 814 } 815} 816 817 818 819 820//----------------------------------------------------------------------------- 821// 822// mergeRuleStatusVals 823// 824// Update the global table of rule status {tag} values 825// The rule builder has a global vector of status values that are common 826// for all tables. Merge the ones from this table into the global set. 827// 828//----------------------------------------------------------------------------- 829void RBBITableBuilder::mergeRuleStatusVals() { 830 // 831 // The basic outline of what happens here is this... 832 // 833 // for each state in this state table 834 // if the status tag list for this state is in the global statuses list 835 // record where and 836 // continue with the next state 837 // else 838 // add the tag list for this state to the global list. 839 // 840 int i; 841 int n; 842 843 // Pre-set a single tag of {0} into the table. 844 // We will need this as a default, for rule sets with no explicit tagging. 845 if (fRB->fRuleStatusVals->size() == 0) { 846 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group 847 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero 848 } 849 850 // For each state 851 for (n=0; n<fDStates->size(); n++) { 852 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 853 UVector *thisStatesTagValues = sd->fTagVals; 854 if (thisStatesTagValues == NULL) { 855 // No tag values are explicitly associated with this state. 856 // Set the default tag value. 857 sd->fTagsIdx = 0; 858 continue; 859 } 860 861 // There are tag(s) associated with this state. 862 // fTagsIdx will be the index into the global tag list for this state's tag values. 863 // Initial value of -1 flags that we haven't got it set yet. 864 sd->fTagsIdx = -1; 865 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list 866 int32_t nextTagGroupStart = 0; 867 868 // Loop runs once per group of tags in the global list 869 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) { 870 thisTagGroupStart = nextTagGroupStart; 871 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1; 872 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) { 873 // The number of tags for this state is different from 874 // the number of tags in this group from the global list. 875 // Continue with the next group from the global list. 876 continue; 877 } 878 // The lengths match, go ahead and compare the actual tag values 879 // between this state and the group from the global list. 880 for (i=0; i<thisStatesTagValues->size(); i++) { 881 if (thisStatesTagValues->elementAti(i) != 882 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) { 883 // Mismatch. 884 break; 885 } 886 } 887 888 if (i == thisStatesTagValues->size()) { 889 // We found a set of tag values in the global list that match 890 // those for this state. Use them. 891 sd->fTagsIdx = thisTagGroupStart; 892 break; 893 } 894 } 895 896 if (sd->fTagsIdx == -1) { 897 // No suitable entry in the global tag list already. Add one 898 sd->fTagsIdx = fRB->fRuleStatusVals->size(); 899 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus); 900 for (i=0; i<thisStatesTagValues->size(); i++) { 901 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus); 902 } 903 } 904 } 905} 906 907 908 909 910 911 912 913//----------------------------------------------------------------------------- 914// 915// sortedAdd Add a value to a vector of sorted values (ints). 916// Do not replicate entries; if the value is already there, do not 917// add a second one. 918// Lazily create the vector if it does not already exist. 919// 920//----------------------------------------------------------------------------- 921void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) { 922 int32_t i; 923 924 if (*vector == NULL) { 925 *vector = new UVector(*fStatus); 926 } 927 if (*vector == NULL || U_FAILURE(*fStatus)) { 928 return; 929 } 930 UVector *vec = *vector; 931 int32_t vSize = vec->size(); 932 for (i=0; i<vSize; i++) { 933 int32_t valAtI = vec->elementAti(i); 934 if (valAtI == val) { 935 // The value is already in the vector. Don't add it again. 936 return; 937 } 938 if (valAtI > val) { 939 break; 940 } 941 } 942 vec->insertElementAt(val, i, *fStatus); 943} 944 945 946 947//----------------------------------------------------------------------------- 948// 949// setAdd Set operation on UVector 950// dest = dest union source 951// Elements may only appear once and must be sorted. 952// 953//----------------------------------------------------------------------------- 954void RBBITableBuilder::setAdd(UVector *dest, UVector *source) { 955 int32_t destOriginalSize = dest->size(); 956 int32_t sourceSize = source->size(); 957 int32_t di = 0; 958 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc 959 void **destPtr, **sourcePtr; 960 void **destLim, **sourceLim; 961 962 if (destOriginalSize > destArray.getCapacity()) { 963 if (destArray.resize(destOriginalSize) == NULL) { 964 return; 965 } 966 } 967 destPtr = destArray.getAlias(); 968 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()? 969 970 if (sourceSize > sourceArray.getCapacity()) { 971 if (sourceArray.resize(sourceSize) == NULL) { 972 return; 973 } 974 } 975 sourcePtr = sourceArray.getAlias(); 976 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()? 977 978 // Avoid multiple "get element" calls by getting the contents into arrays 979 (void) dest->toArray(destPtr); 980 (void) source->toArray(sourcePtr); 981 982 dest->setSize(sourceSize+destOriginalSize, *fStatus); 983 984 while (sourcePtr < sourceLim && destPtr < destLim) { 985 if (*destPtr == *sourcePtr) { 986 dest->setElementAt(*sourcePtr++, di++); 987 destPtr++; 988 } 989 // This check is required for machines with segmented memory, like i5/OS. 990 // Direct pointer comparison is not recommended. 991 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) { 992 dest->setElementAt(*destPtr++, di++); 993 } 994 else { /* *sourcePtr < *destPtr */ 995 dest->setElementAt(*sourcePtr++, di++); 996 } 997 } 998 999 // At most one of these two cleanup loops will execute 1000 while (destPtr < destLim) { 1001 dest->setElementAt(*destPtr++, di++); 1002 } 1003 while (sourcePtr < sourceLim) { 1004 dest->setElementAt(*sourcePtr++, di++); 1005 } 1006 1007 dest->setSize(di, *fStatus); 1008} 1009 1010 1011 1012//----------------------------------------------------------------------------- 1013// 1014// setEqual Set operation on UVector. 1015// Compare for equality. 1016// Elements must be sorted. 1017// 1018//----------------------------------------------------------------------------- 1019UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) { 1020 return a->equals(*b); 1021} 1022 1023 1024//----------------------------------------------------------------------------- 1025// 1026// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos 1027// for each node in the tree. 1028// 1029//----------------------------------------------------------------------------- 1030#ifdef RBBI_DEBUG 1031void RBBITableBuilder::printPosSets(RBBINode *n) { 1032 if (n==NULL) { 1033 return; 1034 } 1035 n->printNode(); 1036 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE"); 1037 1038 RBBIDebugPrintf(" firstpos: "); 1039 printSet(n->fFirstPosSet); 1040 1041 RBBIDebugPrintf(" lastpos: "); 1042 printSet(n->fLastPosSet); 1043 1044 RBBIDebugPrintf(" followpos: "); 1045 printSet(n->fFollowPos); 1046 1047 printPosSets(n->fLeftChild); 1048 printPosSets(n->fRightChild); 1049} 1050#endif 1051 1052 1053 1054//----------------------------------------------------------------------------- 1055// 1056// getTableSize() Calculate the size of the runtime form of this 1057// state transition table. 1058// 1059//----------------------------------------------------------------------------- 1060int32_t RBBITableBuilder::getTableSize() const { 1061 int32_t size = 0; 1062 int32_t numRows; 1063 int32_t numCols; 1064 int32_t rowSize; 1065 1066 if (fTree == NULL) { 1067 return 0; 1068 } 1069 1070 size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table. 1071 1072 numRows = fDStates->size(); 1073 numCols = fRB->fSetBuilder->getNumCharCategories(); 1074 1075 // Note The declaration of RBBIStateTableRow is for a table of two columns. 1076 // Therefore we subtract two from numCols when determining 1077 // how much storage to add to a row for the total columns. 1078 rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2); 1079 size += numRows * rowSize; 1080 return size; 1081} 1082 1083 1084 1085//----------------------------------------------------------------------------- 1086// 1087// exportTable() export the state transition table in the format required 1088// by the runtime engine. getTableSize() bytes of memory 1089// must be available at the output address "where". 1090// 1091//----------------------------------------------------------------------------- 1092void RBBITableBuilder::exportTable(void *where) { 1093 RBBIStateTable *table = (RBBIStateTable *)where; 1094 uint32_t state; 1095 int col; 1096 1097 if (U_FAILURE(*fStatus) || fTree == NULL) { 1098 return; 1099 } 1100 1101 if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff || 1102 fDStates->size() > 0x7fff) { 1103 *fStatus = U_BRK_INTERNAL_ERROR; 1104 return; 1105 } 1106 1107 table->fRowLen = sizeof(RBBIStateTableRow) + 1108 sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2); 1109 table->fNumStates = fDStates->size(); 1110 table->fFlags = 0; 1111 if (fRB->fLookAheadHardBreak) { 1112 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; 1113 } 1114 if (fRB->fSetBuilder->sawBOF()) { 1115 table->fFlags |= RBBI_BOF_REQUIRED; 1116 } 1117 table->fReserved = 0; 1118 1119 for (state=0; state<table->fNumStates; state++) { 1120 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); 1121 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); 1122 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767); 1123 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767); 1124 row->fAccepting = (int16_t)sd->fAccepting; 1125 row->fLookAhead = (int16_t)sd->fLookAhead; 1126 row->fTagIdx = (int16_t)sd->fTagsIdx; 1127 for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) { 1128 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col); 1129 } 1130 } 1131} 1132 1133 1134 1135//----------------------------------------------------------------------------- 1136// 1137// printSet Debug function. Print the contents of a UVector 1138// 1139//----------------------------------------------------------------------------- 1140#ifdef RBBI_DEBUG 1141void RBBITableBuilder::printSet(UVector *s) { 1142 int32_t i; 1143 for (i=0; i<s->size(); i++) { 1144 void *v = s->elementAt(i); 1145 RBBIDebugPrintf("%10p", v); 1146 } 1147 RBBIDebugPrintf("\n"); 1148} 1149#endif 1150 1151 1152//----------------------------------------------------------------------------- 1153// 1154// printStates Debug Function. Dump the fully constructed state transition table. 1155// 1156//----------------------------------------------------------------------------- 1157#ifdef RBBI_DEBUG 1158void RBBITableBuilder::printStates() { 1159 int c; // input "character" 1160 int n; // state number 1161 1162 RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); 1163 RBBIDebugPrintf(" | Acc LA Tag"); 1164 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1165 RBBIDebugPrintf(" %2d", c); 1166 } 1167 RBBIDebugPrintf("\n"); 1168 RBBIDebugPrintf(" |---------------"); 1169 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1170 RBBIDebugPrintf("---"); 1171 } 1172 RBBIDebugPrintf("\n"); 1173 1174 for (n=0; n<fDStates->size(); n++) { 1175 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 1176 RBBIDebugPrintf(" %3d | " , n); 1177 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); 1178 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1179 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c)); 1180 } 1181 RBBIDebugPrintf("\n"); 1182 } 1183 RBBIDebugPrintf("\n\n"); 1184} 1185#endif 1186 1187 1188 1189//----------------------------------------------------------------------------- 1190// 1191// printRuleStatusTable Debug Function. Dump the common rule status table 1192// 1193//----------------------------------------------------------------------------- 1194#ifdef RBBI_DEBUG 1195void RBBITableBuilder::printRuleStatusTable() { 1196 int32_t thisRecord = 0; 1197 int32_t nextRecord = 0; 1198 int i; 1199 UVector *tbl = fRB->fRuleStatusVals; 1200 1201 RBBIDebugPrintf("index | tags \n"); 1202 RBBIDebugPrintf("-------------------\n"); 1203 1204 while (nextRecord < tbl->size()) { 1205 thisRecord = nextRecord; 1206 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1; 1207 RBBIDebugPrintf("%4d ", thisRecord); 1208 for (i=thisRecord+1; i<nextRecord; i++) { 1209 RBBIDebugPrintf(" %5d", tbl->elementAti(i)); 1210 } 1211 RBBIDebugPrintf("\n"); 1212 } 1213 RBBIDebugPrintf("\n\n"); 1214} 1215#endif 1216 1217 1218//----------------------------------------------------------------------------- 1219// 1220// RBBIStateDescriptor Methods. This is a very struct-like class 1221// Most access is directly to the fields. 1222// 1223//----------------------------------------------------------------------------- 1224 1225RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) { 1226 fMarked = FALSE; 1227 fAccepting = 0; 1228 fLookAhead = 0; 1229 fTagsIdx = 0; 1230 fTagVals = NULL; 1231 fPositions = NULL; 1232 fDtran = NULL; 1233 1234 fDtran = new UVector(lastInputSymbol+1, *fStatus); 1235 if (U_FAILURE(*fStatus)) { 1236 return; 1237 } 1238 if (fDtran == NULL) { 1239 *fStatus = U_MEMORY_ALLOCATION_ERROR; 1240 return; 1241 } 1242 fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized. 1243 // It is indexed by input symbols, and will 1244 // hold the next state number for each 1245 // symbol. 1246} 1247 1248 1249RBBIStateDescriptor::~RBBIStateDescriptor() { 1250 delete fPositions; 1251 delete fDtran; 1252 delete fTagVals; 1253 fPositions = NULL; 1254 fDtran = NULL; 1255 fTagVals = NULL; 1256} 1257 1258U_NAMESPACE_END 1259 1260#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1261