1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4********************************************************************** 5* Copyright (c) 2002-2016, International Business Machines 6* Corporation and others. All Rights Reserved. 7********************************************************************** 8*/ 9// 10// rbbitblb.cpp 11// 12 13 14#include "unicode/utypes.h" 15 16#if !UCONFIG_NO_BREAK_ITERATION 17 18#include "unicode/unistr.h" 19#include "rbbitblb.h" 20#include "rbbirb.h" 21#include "rbbisetb.h" 22#include "rbbidata.h" 23#include "cstring.h" 24#include "uassert.h" 25#include "cmemory.h" 26 27U_NAMESPACE_BEGIN 28 29RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) : 30 fTree(*rootNode) { 31 fRB = rb; 32 fStatus = fRB->fStatus; 33 UErrorCode status = U_ZERO_ERROR; 34 fDStates = new UVector(status); 35 if (U_FAILURE(*fStatus)) { 36 return; 37 } 38 if (U_FAILURE(status)) { 39 *fStatus = status; 40 return; 41 } 42 if (fDStates == NULL) { 43 *fStatus = U_MEMORY_ALLOCATION_ERROR;; 44 } 45} 46 47 48 49RBBITableBuilder::~RBBITableBuilder() { 50 int i; 51 for (i=0; i<fDStates->size(); i++) { 52 delete (RBBIStateDescriptor *)fDStates->elementAt(i); 53 } 54 delete fDStates; 55} 56 57 58//----------------------------------------------------------------------------- 59// 60// RBBITableBuilder::build - This is the main function for building the DFA state transtion 61// table from the RBBI rules parse tree. 62// 63//----------------------------------------------------------------------------- 64void RBBITableBuilder::build() { 65 66 if (U_FAILURE(*fStatus)) { 67 return; 68 } 69 70 // If there were no rules, just return. This situation can easily arise 71 // for the reverse rules. 72 if (fTree==NULL) { 73 return; 74 } 75 76 // 77 // Walk through the tree, replacing any references to $variables with a copy of the 78 // parse tree for the substition expression. 79 // 80 fTree = fTree->flattenVariables(); 81#ifdef RBBI_DEBUG 82 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) { 83 RBBIDebugPuts("\nParse tree after flattening variable references."); 84 RBBINode::printTree(fTree, TRUE); 85 } 86#endif 87 88 // 89 // If the rules contained any references to {bof} 90 // add a {bof} <cat> <former root of tree> to the 91 // tree. Means that all matches must start out with the 92 // {bof} fake character. 93 // 94 if (fRB->fSetBuilder->sawBOF()) { 95 RBBINode *bofTop = new RBBINode(RBBINode::opCat); 96 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar); 97 // Delete and exit if memory allocation failed. 98 if (bofTop == NULL || bofLeaf == NULL) { 99 *fStatus = U_MEMORY_ALLOCATION_ERROR; 100 delete bofTop; 101 delete bofLeaf; 102 return; 103 } 104 bofTop->fLeftChild = bofLeaf; 105 bofTop->fRightChild = fTree; 106 bofLeaf->fParent = bofTop; 107 bofLeaf->fVal = 2; // Reserved value for {bof}. 108 fTree = bofTop; 109 } 110 111 // 112 // Add a unique right-end marker to the expression. 113 // Appears as a cat-node, left child being the original tree, 114 // right child being the end marker. 115 // 116 RBBINode *cn = new RBBINode(RBBINode::opCat); 117 // Exit if memory allocation failed. 118 if (cn == NULL) { 119 *fStatus = U_MEMORY_ALLOCATION_ERROR; 120 return; 121 } 122 cn->fLeftChild = fTree; 123 fTree->fParent = cn; 124 cn->fRightChild = new RBBINode(RBBINode::endMark); 125 // Delete and exit if memory allocation failed. 126 if (cn->fRightChild == NULL) { 127 *fStatus = U_MEMORY_ALLOCATION_ERROR; 128 delete cn; 129 return; 130 } 131 cn->fRightChild->fParent = cn; 132 fTree = cn; 133 134 // 135 // Replace all references to UnicodeSets with the tree for the equivalent 136 // expression. 137 // 138 fTree->flattenSets(); 139#ifdef RBBI_DEBUG 140 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) { 141 RBBIDebugPuts("\nParse tree after flattening Unicode Set references."); 142 RBBINode::printTree(fTree, TRUE); 143 } 144#endif 145 146 147 // 148 // calculate the functions nullable, firstpos, lastpos and followpos on 149 // nodes in the parse tree. 150 // See the alogrithm description in Aho. 151 // Understanding how this works by looking at the code alone will be 152 // nearly impossible. 153 // 154 calcNullable(fTree); 155 calcFirstPos(fTree); 156 calcLastPos(fTree); 157 calcFollowPos(fTree); 158 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) { 159 RBBIDebugPuts("\n"); 160 printPosSets(fTree); 161 } 162 163 // 164 // For "chained" rules, modify the followPos sets 165 // 166 if (fRB->fChainRules) { 167 calcChainedFollowPos(fTree); 168 } 169 170 // 171 // BOF (start of input) test fixup. 172 // 173 if (fRB->fSetBuilder->sawBOF()) { 174 bofFixup(); 175 } 176 177 // 178 // Build the DFA state transition tables. 179 // 180 buildStateTable(); 181 flagAcceptingStates(); 182 flagLookAheadStates(); 183 flagTaggedStates(); 184 185 // 186 // Update the global table of rule status {tag} values 187 // The rule builder has a global vector of status values that are common 188 // for all tables. Merge the ones from this table into the global set. 189 // 190 mergeRuleStatusVals(); 191 192 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();}; 193} 194 195 196 197//----------------------------------------------------------------------------- 198// 199// calcNullable. Impossible to explain succinctly. See Aho, section 3.9 200// 201//----------------------------------------------------------------------------- 202void RBBITableBuilder::calcNullable(RBBINode *n) { 203 if (n == NULL) { 204 return; 205 } 206 if (n->fType == RBBINode::setRef || 207 n->fType == RBBINode::endMark ) { 208 // These are non-empty leaf node types. 209 n->fNullable = FALSE; 210 return; 211 } 212 213 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) { 214 // Lookahead marker node. It's a leaf, so no recursion on children. 215 // It's nullable because it does not match any literal text from the input stream. 216 n->fNullable = TRUE; 217 return; 218 } 219 220 221 // The node is not a leaf. 222 // Calculate nullable on its children. 223 calcNullable(n->fLeftChild); 224 calcNullable(n->fRightChild); 225 226 // Apply functions from table 3.40 in Aho 227 if (n->fType == RBBINode::opOr) { 228 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable; 229 } 230 else if (n->fType == RBBINode::opCat) { 231 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable; 232 } 233 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) { 234 n->fNullable = TRUE; 235 } 236 else { 237 n->fNullable = FALSE; 238 } 239} 240 241 242 243 244//----------------------------------------------------------------------------- 245// 246// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9 247// 248//----------------------------------------------------------------------------- 249void RBBITableBuilder::calcFirstPos(RBBINode *n) { 250 if (n == NULL) { 251 return; 252 } 253 if (n->fType == RBBINode::leafChar || 254 n->fType == RBBINode::endMark || 255 n->fType == RBBINode::lookAhead || 256 n->fType == RBBINode::tag) { 257 // These are non-empty leaf node types. 258 // Note: In order to maintain the sort invariant on the set, 259 // this function should only be called on a node whose set is 260 // empty to start with. 261 n->fFirstPosSet->addElement(n, *fStatus); 262 return; 263 } 264 265 // The node is not a leaf. 266 // Calculate firstPos on its children. 267 calcFirstPos(n->fLeftChild); 268 calcFirstPos(n->fRightChild); 269 270 // Apply functions from table 3.40 in Aho 271 if (n->fType == RBBINode::opOr) { 272 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 273 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 274 } 275 else if (n->fType == RBBINode::opCat) { 276 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 277 if (n->fLeftChild->fNullable) { 278 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet); 279 } 280 } 281 else if (n->fType == RBBINode::opStar || 282 n->fType == RBBINode::opQuestion || 283 n->fType == RBBINode::opPlus) { 284 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet); 285 } 286} 287 288 289 290//----------------------------------------------------------------------------- 291// 292// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9 293// 294//----------------------------------------------------------------------------- 295void RBBITableBuilder::calcLastPos(RBBINode *n) { 296 if (n == NULL) { 297 return; 298 } 299 if (n->fType == RBBINode::leafChar || 300 n->fType == RBBINode::endMark || 301 n->fType == RBBINode::lookAhead || 302 n->fType == RBBINode::tag) { 303 // These are non-empty leaf node types. 304 // Note: In order to maintain the sort invariant on the set, 305 // this function should only be called on a node whose set is 306 // empty to start with. 307 n->fLastPosSet->addElement(n, *fStatus); 308 return; 309 } 310 311 // The node is not a leaf. 312 // Calculate lastPos on its children. 313 calcLastPos(n->fLeftChild); 314 calcLastPos(n->fRightChild); 315 316 // Apply functions from table 3.40 in Aho 317 if (n->fType == RBBINode::opOr) { 318 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 319 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 320 } 321 else if (n->fType == RBBINode::opCat) { 322 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet); 323 if (n->fRightChild->fNullable) { 324 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 325 } 326 } 327 else if (n->fType == RBBINode::opStar || 328 n->fType == RBBINode::opQuestion || 329 n->fType == RBBINode::opPlus) { 330 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet); 331 } 332} 333 334 335 336//----------------------------------------------------------------------------- 337// 338// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9 339// 340//----------------------------------------------------------------------------- 341void RBBITableBuilder::calcFollowPos(RBBINode *n) { 342 if (n == NULL || 343 n->fType == RBBINode::leafChar || 344 n->fType == RBBINode::endMark) { 345 return; 346 } 347 348 calcFollowPos(n->fLeftChild); 349 calcFollowPos(n->fRightChild); 350 351 // Aho rule #1 352 if (n->fType == RBBINode::opCat) { 353 RBBINode *i; // is 'i' in Aho's description 354 uint32_t ix; 355 356 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet; 357 358 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) { 359 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix); 360 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet); 361 } 362 } 363 364 // Aho rule #2 365 if (n->fType == RBBINode::opStar || 366 n->fType == RBBINode::opPlus) { 367 RBBINode *i; // again, n and i are the names from Aho's description. 368 uint32_t ix; 369 370 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) { 371 i = (RBBINode *)n->fLastPosSet->elementAt(ix); 372 setAdd(i->fFollowPos, n->fFirstPosSet); 373 } 374 } 375 376 377 378} 379 380//----------------------------------------------------------------------------- 381// 382// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged 383// as roots of a rule to a destination vector. 384// 385//----------------------------------------------------------------------------- 386void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) { 387 if (node == NULL || U_FAILURE(*fStatus)) { 388 return; 389 } 390 if (node->fRuleRoot) { 391 dest->addElement(node, *fStatus); 392 // Note: rules cannot nest. If we found a rule start node, 393 // no child node can also be a start node. 394 return; 395 } 396 addRuleRootNodes(dest, node->fLeftChild); 397 addRuleRootNodes(dest, node->fRightChild); 398} 399 400//----------------------------------------------------------------------------- 401// 402// calcChainedFollowPos. Modify the previously calculated followPos sets 403// to implement rule chaining. NOT described by Aho 404// 405//----------------------------------------------------------------------------- 406void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { 407 408 UVector endMarkerNodes(*fStatus); 409 UVector leafNodes(*fStatus); 410 int32_t i; 411 412 if (U_FAILURE(*fStatus)) { 413 return; 414 } 415 416 // get a list of all endmarker nodes. 417 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); 418 419 // get a list all leaf nodes 420 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus); 421 if (U_FAILURE(*fStatus)) { 422 return; 423 } 424 425 // Collect all leaf nodes that can start matches for rules 426 // with inbound chaining enabled, which is the union of the 427 // firstPosition sets from each of the rule root nodes. 428 429 UVector ruleRootNodes(*fStatus); 430 addRuleRootNodes(&ruleRootNodes, tree); 431 432 UVector matchStartNodes(*fStatus); 433 for (int i=0; i<ruleRootNodes.size(); ++i) { 434 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i)); 435 if (node->fChainIn) { 436 setAdd(&matchStartNodes, node->fFirstPosSet); 437 } 438 } 439 if (U_FAILURE(*fStatus)) { 440 return; 441 } 442 443 int32_t endNodeIx; 444 int32_t startNodeIx; 445 446 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) { 447 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx); 448 RBBINode *endNode = NULL; 449 450 // Identify leaf nodes that correspond to overall rule match positions. 451 // These include an endMarkerNode in their followPos sets. 452 for (i=0; i<endMarkerNodes.size(); i++) { 453 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) { 454 endNode = tNode; 455 break; 456 } 457 } 458 if (endNode == NULL) { 459 // node wasn't an end node. Try again with the next. 460 continue; 461 } 462 463 // We've got a node that can end a match. 464 465 // Line Break Specific hack: If this node's val correspond to the $CM char class, 466 // don't chain from it. 467 // TODO: Add rule syntax for this behavior, get specifics out of here and 468 // into the rule file. 469 if (fRB->fLBCMNoChain) { 470 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal); 471 if (c != -1) { 472 // c == -1 occurs with sets containing only the {eof} marker string. 473 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 474 if (cLBProp == U_LB_COMBINING_MARK) { 475 continue; 476 } 477 } 478 } 479 480 481 // Now iterate over the nodes that can start a match, looking for ones 482 // with the same char class as our ending node. 483 RBBINode *startNode; 484 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) { 485 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx); 486 if (startNode->fType != RBBINode::leafChar) { 487 continue; 488 } 489 490 if (endNode->fVal == startNode->fVal) { 491 // The end val (character class) of one possible match is the 492 // same as the start of another. 493 494 // Add all nodes from the followPos of the start node to the 495 // followPos set of the end node, which will have the effect of 496 // letting matches transition from a match state at endNode 497 // to the second char of a match starting with startNode. 498 setAdd(endNode->fFollowPos, startNode->fFollowPos); 499 } 500 } 501 } 502} 503 504 505//----------------------------------------------------------------------------- 506// 507// bofFixup. Fixup for state tables that include {bof} beginning of input testing. 508// Do an swizzle similar to chaining, modifying the followPos set of 509// the bofNode to include the followPos nodes from other {bot} nodes 510// scattered through the tree. 511// 512// This function has much in common with calcChainedFollowPos(). 513// 514//----------------------------------------------------------------------------- 515void RBBITableBuilder::bofFixup() { 516 517 if (U_FAILURE(*fStatus)) { 518 return; 519 } 520 521 // The parse tree looks like this ... 522 // fTree root ---> <cat> 523 // / \ . 524 // <cat> <#end node> 525 // / \ . 526 // <bofNode> rest 527 // of tree 528 // 529 // We will be adding things to the followPos set of the <bofNode> 530 // 531 RBBINode *bofNode = fTree->fLeftChild->fLeftChild; 532 U_ASSERT(bofNode->fType == RBBINode::leafChar); 533 U_ASSERT(bofNode->fVal == 2); 534 535 // Get all nodes that can be the start a match of the user-written rules 536 // (excluding the fake bofNode) 537 // We want the nodes that can start a match in the 538 // part labeled "rest of tree" 539 // 540 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; 541 542 RBBINode *startNode; 543 int startNodeIx; 544 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { 545 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); 546 if (startNode->fType != RBBINode::leafChar) { 547 continue; 548 } 549 550 if (startNode->fVal == bofNode->fVal) { 551 // We found a leaf node corresponding to a {bof} that was 552 // explicitly written into a rule. 553 // Add everything from the followPos set of this node to the 554 // followPos set of the fake bofNode at the start of the tree. 555 // 556 setAdd(bofNode->fFollowPos, startNode->fFollowPos); 557 } 558 } 559} 560 561//----------------------------------------------------------------------------- 562// 563// buildStateTable() Determine the set of runtime DFA states and the 564// transition tables for these states, by the algorithm 565// of fig. 3.44 in Aho. 566// 567// Most of the comments are quotes of Aho's psuedo-code. 568// 569//----------------------------------------------------------------------------- 570void RBBITableBuilder::buildStateTable() { 571 if (U_FAILURE(*fStatus)) { 572 return; 573 } 574 RBBIStateDescriptor *failState; 575 // Set it to NULL to avoid uninitialized warning 576 RBBIStateDescriptor *initialState = NULL; 577 // 578 // Add a dummy state 0 - the stop state. Not from Aho. 579 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1; 580 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 581 if (failState == NULL) { 582 *fStatus = U_MEMORY_ALLOCATION_ERROR; 583 goto ExitBuildSTdeleteall; 584 } 585 failState->fPositions = new UVector(*fStatus); 586 if (failState->fPositions == NULL) { 587 *fStatus = U_MEMORY_ALLOCATION_ERROR; 588 } 589 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) { 590 goto ExitBuildSTdeleteall; 591 } 592 fDStates->addElement(failState, *fStatus); 593 if (U_FAILURE(*fStatus)) { 594 goto ExitBuildSTdeleteall; 595 } 596 597 // initially, the only unmarked state in Dstates is firstpos(root), 598 // where toot is the root of the syntax tree for (r)#; 599 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 600 if (initialState == NULL) { 601 *fStatus = U_MEMORY_ALLOCATION_ERROR; 602 } 603 if (U_FAILURE(*fStatus)) { 604 goto ExitBuildSTdeleteall; 605 } 606 initialState->fPositions = new UVector(*fStatus); 607 if (initialState->fPositions == NULL) { 608 *fStatus = U_MEMORY_ALLOCATION_ERROR; 609 } 610 if (U_FAILURE(*fStatus)) { 611 goto ExitBuildSTdeleteall; 612 } 613 setAdd(initialState->fPositions, fTree->fFirstPosSet); 614 fDStates->addElement(initialState, *fStatus); 615 if (U_FAILURE(*fStatus)) { 616 goto ExitBuildSTdeleteall; 617 } 618 619 // while there is an unmarked state T in Dstates do begin 620 for (;;) { 621 RBBIStateDescriptor *T = NULL; 622 int32_t tx; 623 for (tx=1; tx<fDStates->size(); tx++) { 624 RBBIStateDescriptor *temp; 625 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx); 626 if (temp->fMarked == FALSE) { 627 T = temp; 628 break; 629 } 630 } 631 if (T == NULL) { 632 break; 633 } 634 635 // mark T; 636 T->fMarked = TRUE; 637 638 // for each input symbol a do begin 639 int32_t a; 640 for (a = 1; a<=lastInputSymbol; a++) { 641 // let U be the set of positions that are in followpos(p) 642 // for some position p in T 643 // such that the symbol at position p is a; 644 UVector *U = NULL; 645 RBBINode *p; 646 int32_t px; 647 for (px=0; px<T->fPositions->size(); px++) { 648 p = (RBBINode *)T->fPositions->elementAt(px); 649 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) { 650 if (U == NULL) { 651 U = new UVector(*fStatus); 652 if (U == NULL) { 653 *fStatus = U_MEMORY_ALLOCATION_ERROR; 654 goto ExitBuildSTdeleteall; 655 } 656 } 657 setAdd(U, p->fFollowPos); 658 } 659 } 660 661 // if U is not empty and not in DStates then 662 int32_t ux = 0; 663 UBool UinDstates = FALSE; 664 if (U != NULL) { 665 U_ASSERT(U->size() > 0); 666 int ix; 667 for (ix=0; ix<fDStates->size(); ix++) { 668 RBBIStateDescriptor *temp2; 669 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix); 670 if (setEquals(U, temp2->fPositions)) { 671 delete U; 672 U = temp2->fPositions; 673 ux = ix; 674 UinDstates = TRUE; 675 break; 676 } 677 } 678 679 // Add U as an unmarked state to Dstates 680 if (!UinDstates) 681 { 682 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus); 683 if (newState == NULL) { 684 *fStatus = U_MEMORY_ALLOCATION_ERROR; 685 } 686 if (U_FAILURE(*fStatus)) { 687 goto ExitBuildSTdeleteall; 688 } 689 newState->fPositions = U; 690 fDStates->addElement(newState, *fStatus); 691 if (U_FAILURE(*fStatus)) { 692 return; 693 } 694 ux = fDStates->size()-1; 695 } 696 697 // Dtran[T, a] := U; 698 T->fDtran->setElementAt(ux, a); 699 } 700 } 701 } 702 return; 703 // delete local pointers only if error occured. 704ExitBuildSTdeleteall: 705 delete initialState; 706 delete failState; 707} 708 709 710 711//----------------------------------------------------------------------------- 712// 713// flagAcceptingStates Identify accepting states. 714// First get a list of all of the end marker nodes. 715// Then, for each state s, 716// if s contains one of the end marker nodes in its list of tree positions then 717// s is an accepting state. 718// 719//----------------------------------------------------------------------------- 720void RBBITableBuilder::flagAcceptingStates() { 721 if (U_FAILURE(*fStatus)) { 722 return; 723 } 724 UVector endMarkerNodes(*fStatus); 725 RBBINode *endMarker; 726 int32_t i; 727 int32_t n; 728 729 if (U_FAILURE(*fStatus)) { 730 return; 731 } 732 733 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus); 734 if (U_FAILURE(*fStatus)) { 735 return; 736 } 737 738 for (i=0; i<endMarkerNodes.size(); i++) { 739 endMarker = (RBBINode *)endMarkerNodes.elementAt(i); 740 for (n=0; n<fDStates->size(); n++) { 741 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 742 if (sd->fPositions->indexOf(endMarker) >= 0) { 743 // Any non-zero value for fAccepting means this is an accepting node. 744 // The value is what will be returned to the user as the break status. 745 // If no other value was specified, force it to -1. 746 747 if (sd->fAccepting==0) { 748 // State hasn't been marked as accepting yet. Do it now. 749 sd->fAccepting = endMarker->fVal; 750 if (sd->fAccepting == 0) { 751 sd->fAccepting = -1; 752 } 753 } 754 if (sd->fAccepting==-1 && endMarker->fVal != 0) { 755 // Both lookahead and non-lookahead accepting for this state. 756 // Favor the look-ahead. Expedient for line break. 757 // TODO: need a more elegant resolution for conflicting rules. 758 sd->fAccepting = endMarker->fVal; 759 } 760 // implicit else: 761 // if sd->fAccepting already had a value other than 0 or -1, leave it be. 762 763 // If the end marker node is from a look-ahead rule, set 764 // the fLookAhead field or this state also. 765 if (endMarker->fLookAheadEnd) { 766 // TODO: don't change value if already set? 767 // TODO: allow for more than one active look-ahead rule in engine. 768 // Make value here an index to a side array in engine? 769 sd->fLookAhead = sd->fAccepting; 770 } 771 } 772 } 773 } 774} 775 776 777//----------------------------------------------------------------------------- 778// 779// flagLookAheadStates Very similar to flagAcceptingStates, above. 780// 781//----------------------------------------------------------------------------- 782void RBBITableBuilder::flagLookAheadStates() { 783 if (U_FAILURE(*fStatus)) { 784 return; 785 } 786 UVector lookAheadNodes(*fStatus); 787 RBBINode *lookAheadNode; 788 int32_t i; 789 int32_t n; 790 791 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus); 792 if (U_FAILURE(*fStatus)) { 793 return; 794 } 795 for (i=0; i<lookAheadNodes.size(); i++) { 796 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i); 797 798 for (n=0; n<fDStates->size(); n++) { 799 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 800 if (sd->fPositions->indexOf(lookAheadNode) >= 0) { 801 sd->fLookAhead = lookAheadNode->fVal; 802 } 803 } 804 } 805} 806 807 808 809 810//----------------------------------------------------------------------------- 811// 812// flagTaggedStates 813// 814//----------------------------------------------------------------------------- 815void RBBITableBuilder::flagTaggedStates() { 816 if (U_FAILURE(*fStatus)) { 817 return; 818 } 819 UVector tagNodes(*fStatus); 820 RBBINode *tagNode; 821 int32_t i; 822 int32_t n; 823 824 if (U_FAILURE(*fStatus)) { 825 return; 826 } 827 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus); 828 if (U_FAILURE(*fStatus)) { 829 return; 830 } 831 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em) 832 tagNode = (RBBINode *)tagNodes.elementAt(i); 833 834 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table) 835 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 836 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t 837 sortedAdd(&sd->fTagVals, tagNode->fVal); 838 } 839 } 840 } 841} 842 843 844 845 846//----------------------------------------------------------------------------- 847// 848// mergeRuleStatusVals 849// 850// Update the global table of rule status {tag} values 851// The rule builder has a global vector of status values that are common 852// for all tables. Merge the ones from this table into the global set. 853// 854//----------------------------------------------------------------------------- 855void RBBITableBuilder::mergeRuleStatusVals() { 856 // 857 // The basic outline of what happens here is this... 858 // 859 // for each state in this state table 860 // if the status tag list for this state is in the global statuses list 861 // record where and 862 // continue with the next state 863 // else 864 // add the tag list for this state to the global list. 865 // 866 int i; 867 int n; 868 869 // Pre-set a single tag of {0} into the table. 870 // We will need this as a default, for rule sets with no explicit tagging. 871 if (fRB->fRuleStatusVals->size() == 0) { 872 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group 873 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero 874 } 875 876 // For each state 877 for (n=0; n<fDStates->size(); n++) { 878 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 879 UVector *thisStatesTagValues = sd->fTagVals; 880 if (thisStatesTagValues == NULL) { 881 // No tag values are explicitly associated with this state. 882 // Set the default tag value. 883 sd->fTagsIdx = 0; 884 continue; 885 } 886 887 // There are tag(s) associated with this state. 888 // fTagsIdx will be the index into the global tag list for this state's tag values. 889 // Initial value of -1 flags that we haven't got it set yet. 890 sd->fTagsIdx = -1; 891 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list 892 int32_t nextTagGroupStart = 0; 893 894 // Loop runs once per group of tags in the global list 895 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) { 896 thisTagGroupStart = nextTagGroupStart; 897 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1; 898 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) { 899 // The number of tags for this state is different from 900 // the number of tags in this group from the global list. 901 // Continue with the next group from the global list. 902 continue; 903 } 904 // The lengths match, go ahead and compare the actual tag values 905 // between this state and the group from the global list. 906 for (i=0; i<thisStatesTagValues->size(); i++) { 907 if (thisStatesTagValues->elementAti(i) != 908 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) { 909 // Mismatch. 910 break; 911 } 912 } 913 914 if (i == thisStatesTagValues->size()) { 915 // We found a set of tag values in the global list that match 916 // those for this state. Use them. 917 sd->fTagsIdx = thisTagGroupStart; 918 break; 919 } 920 } 921 922 if (sd->fTagsIdx == -1) { 923 // No suitable entry in the global tag list already. Add one 924 sd->fTagsIdx = fRB->fRuleStatusVals->size(); 925 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus); 926 for (i=0; i<thisStatesTagValues->size(); i++) { 927 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus); 928 } 929 } 930 } 931} 932 933 934 935 936 937 938 939//----------------------------------------------------------------------------- 940// 941// sortedAdd Add a value to a vector of sorted values (ints). 942// Do not replicate entries; if the value is already there, do not 943// add a second one. 944// Lazily create the vector if it does not already exist. 945// 946//----------------------------------------------------------------------------- 947void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) { 948 int32_t i; 949 950 if (*vector == NULL) { 951 *vector = new UVector(*fStatus); 952 } 953 if (*vector == NULL || U_FAILURE(*fStatus)) { 954 return; 955 } 956 UVector *vec = *vector; 957 int32_t vSize = vec->size(); 958 for (i=0; i<vSize; i++) { 959 int32_t valAtI = vec->elementAti(i); 960 if (valAtI == val) { 961 // The value is already in the vector. Don't add it again. 962 return; 963 } 964 if (valAtI > val) { 965 break; 966 } 967 } 968 vec->insertElementAt(val, i, *fStatus); 969} 970 971 972 973//----------------------------------------------------------------------------- 974// 975// setAdd Set operation on UVector 976// dest = dest union source 977// Elements may only appear once and must be sorted. 978// 979//----------------------------------------------------------------------------- 980void RBBITableBuilder::setAdd(UVector *dest, UVector *source) { 981 int32_t destOriginalSize = dest->size(); 982 int32_t sourceSize = source->size(); 983 int32_t di = 0; 984 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc 985 void **destPtr, **sourcePtr; 986 void **destLim, **sourceLim; 987 988 if (destOriginalSize > destArray.getCapacity()) { 989 if (destArray.resize(destOriginalSize) == NULL) { 990 return; 991 } 992 } 993 destPtr = destArray.getAlias(); 994 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()? 995 996 if (sourceSize > sourceArray.getCapacity()) { 997 if (sourceArray.resize(sourceSize) == NULL) { 998 return; 999 } 1000 } 1001 sourcePtr = sourceArray.getAlias(); 1002 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()? 1003 1004 // Avoid multiple "get element" calls by getting the contents into arrays 1005 (void) dest->toArray(destPtr); 1006 (void) source->toArray(sourcePtr); 1007 1008 dest->setSize(sourceSize+destOriginalSize, *fStatus); 1009 1010 while (sourcePtr < sourceLim && destPtr < destLim) { 1011 if (*destPtr == *sourcePtr) { 1012 dest->setElementAt(*sourcePtr++, di++); 1013 destPtr++; 1014 } 1015 // This check is required for machines with segmented memory, like i5/OS. 1016 // Direct pointer comparison is not recommended. 1017 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) { 1018 dest->setElementAt(*destPtr++, di++); 1019 } 1020 else { /* *sourcePtr < *destPtr */ 1021 dest->setElementAt(*sourcePtr++, di++); 1022 } 1023 } 1024 1025 // At most one of these two cleanup loops will execute 1026 while (destPtr < destLim) { 1027 dest->setElementAt(*destPtr++, di++); 1028 } 1029 while (sourcePtr < sourceLim) { 1030 dest->setElementAt(*sourcePtr++, di++); 1031 } 1032 1033 dest->setSize(di, *fStatus); 1034} 1035 1036 1037 1038//----------------------------------------------------------------------------- 1039// 1040// setEqual Set operation on UVector. 1041// Compare for equality. 1042// Elements must be sorted. 1043// 1044//----------------------------------------------------------------------------- 1045UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) { 1046 return a->equals(*b); 1047} 1048 1049 1050//----------------------------------------------------------------------------- 1051// 1052// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos 1053// for each node in the tree. 1054// 1055//----------------------------------------------------------------------------- 1056#ifdef RBBI_DEBUG 1057void RBBITableBuilder::printPosSets(RBBINode *n) { 1058 if (n==NULL) { 1059 return; 1060 } 1061 printf("\n"); 1062 RBBINode::printNodeHeader(); 1063 RBBINode::printNode(n); 1064 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE"); 1065 1066 RBBIDebugPrintf(" firstpos: "); 1067 printSet(n->fFirstPosSet); 1068 1069 RBBIDebugPrintf(" lastpos: "); 1070 printSet(n->fLastPosSet); 1071 1072 RBBIDebugPrintf(" followpos: "); 1073 printSet(n->fFollowPos); 1074 1075 printPosSets(n->fLeftChild); 1076 printPosSets(n->fRightChild); 1077} 1078#endif 1079 1080 1081 1082//----------------------------------------------------------------------------- 1083// 1084// getTableSize() Calculate the size of the runtime form of this 1085// state transition table. 1086// 1087//----------------------------------------------------------------------------- 1088int32_t RBBITableBuilder::getTableSize() const { 1089 int32_t size = 0; 1090 int32_t numRows; 1091 int32_t numCols; 1092 int32_t rowSize; 1093 1094 if (fTree == NULL) { 1095 return 0; 1096 } 1097 1098 size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table. 1099 1100 numRows = fDStates->size(); 1101 numCols = fRB->fSetBuilder->getNumCharCategories(); 1102 1103 // Note The declaration of RBBIStateTableRow is for a table of two columns. 1104 // Therefore we subtract two from numCols when determining 1105 // how much storage to add to a row for the total columns. 1106 rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2); 1107 size += numRows * rowSize; 1108 return size; 1109} 1110 1111 1112 1113//----------------------------------------------------------------------------- 1114// 1115// exportTable() export the state transition table in the format required 1116// by the runtime engine. getTableSize() bytes of memory 1117// must be available at the output address "where". 1118// 1119//----------------------------------------------------------------------------- 1120void RBBITableBuilder::exportTable(void *where) { 1121 RBBIStateTable *table = (RBBIStateTable *)where; 1122 uint32_t state; 1123 int col; 1124 1125 if (U_FAILURE(*fStatus) || fTree == NULL) { 1126 return; 1127 } 1128 1129 if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff || 1130 fDStates->size() > 0x7fff) { 1131 *fStatus = U_BRK_INTERNAL_ERROR; 1132 return; 1133 } 1134 1135 table->fRowLen = sizeof(RBBIStateTableRow) + 1136 sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2); 1137 table->fNumStates = fDStates->size(); 1138 table->fFlags = 0; 1139 if (fRB->fLookAheadHardBreak) { 1140 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; 1141 } 1142 if (fRB->fSetBuilder->sawBOF()) { 1143 table->fFlags |= RBBI_BOF_REQUIRED; 1144 } 1145 table->fReserved = 0; 1146 1147 for (state=0; state<table->fNumStates; state++) { 1148 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); 1149 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); 1150 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767); 1151 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767); 1152 row->fAccepting = (int16_t)sd->fAccepting; 1153 row->fLookAhead = (int16_t)sd->fLookAhead; 1154 row->fTagIdx = (int16_t)sd->fTagsIdx; 1155 for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) { 1156 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col); 1157 } 1158 } 1159} 1160 1161 1162 1163//----------------------------------------------------------------------------- 1164// 1165// printSet Debug function. Print the contents of a UVector 1166// 1167//----------------------------------------------------------------------------- 1168#ifdef RBBI_DEBUG 1169void RBBITableBuilder::printSet(UVector *s) { 1170 int32_t i; 1171 for (i=0; i<s->size(); i++) { 1172 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i)); 1173 RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum); 1174 } 1175 RBBIDebugPrintf("\n"); 1176} 1177#endif 1178 1179 1180//----------------------------------------------------------------------------- 1181// 1182// printStates Debug Function. Dump the fully constructed state transition table. 1183// 1184//----------------------------------------------------------------------------- 1185#ifdef RBBI_DEBUG 1186void RBBITableBuilder::printStates() { 1187 int c; // input "character" 1188 int n; // state number 1189 1190 RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); 1191 RBBIDebugPrintf(" | Acc LA Tag"); 1192 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1193 RBBIDebugPrintf(" %2d", c); 1194 } 1195 RBBIDebugPrintf("\n"); 1196 RBBIDebugPrintf(" |---------------"); 1197 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1198 RBBIDebugPrintf("---"); 1199 } 1200 RBBIDebugPrintf("\n"); 1201 1202 for (n=0; n<fDStates->size(); n++) { 1203 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); 1204 RBBIDebugPrintf(" %3d | " , n); 1205 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx); 1206 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) { 1207 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c)); 1208 } 1209 RBBIDebugPrintf("\n"); 1210 } 1211 RBBIDebugPrintf("\n\n"); 1212} 1213#endif 1214 1215 1216 1217//----------------------------------------------------------------------------- 1218// 1219// printRuleStatusTable Debug Function. Dump the common rule status table 1220// 1221//----------------------------------------------------------------------------- 1222#ifdef RBBI_DEBUG 1223void RBBITableBuilder::printRuleStatusTable() { 1224 int32_t thisRecord = 0; 1225 int32_t nextRecord = 0; 1226 int i; 1227 UVector *tbl = fRB->fRuleStatusVals; 1228 1229 RBBIDebugPrintf("index | tags \n"); 1230 RBBIDebugPrintf("-------------------\n"); 1231 1232 while (nextRecord < tbl->size()) { 1233 thisRecord = nextRecord; 1234 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1; 1235 RBBIDebugPrintf("%4d ", thisRecord); 1236 for (i=thisRecord+1; i<nextRecord; i++) { 1237 RBBIDebugPrintf(" %5d", tbl->elementAti(i)); 1238 } 1239 RBBIDebugPrintf("\n"); 1240 } 1241 RBBIDebugPrintf("\n\n"); 1242} 1243#endif 1244 1245 1246//----------------------------------------------------------------------------- 1247// 1248// RBBIStateDescriptor Methods. This is a very struct-like class 1249// Most access is directly to the fields. 1250// 1251//----------------------------------------------------------------------------- 1252 1253RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) { 1254 fMarked = FALSE; 1255 fAccepting = 0; 1256 fLookAhead = 0; 1257 fTagsIdx = 0; 1258 fTagVals = NULL; 1259 fPositions = NULL; 1260 fDtran = NULL; 1261 1262 fDtran = new UVector(lastInputSymbol+1, *fStatus); 1263 if (U_FAILURE(*fStatus)) { 1264 return; 1265 } 1266 if (fDtran == NULL) { 1267 *fStatus = U_MEMORY_ALLOCATION_ERROR; 1268 return; 1269 } 1270 fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized. 1271 // It is indexed by input symbols, and will 1272 // hold the next state number for each 1273 // symbol. 1274} 1275 1276 1277RBBIStateDescriptor::~RBBIStateDescriptor() { 1278 delete fPositions; 1279 delete fDtran; 1280 delete fTagVals; 1281 fPositions = NULL; 1282 fDtran = NULL; 1283 fTagVals = NULL; 1284} 1285 1286U_NAMESPACE_END 1287 1288#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1289