1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3// 4// file: rbbiscan.cpp 5// 6// Copyright (C) 2002-2016, International Business Machines Corporation and others. 7// All Rights Reserved. 8// 9// This file contains the Rule Based Break Iterator Rule Builder functions for 10// scanning the rules and assembling a parse tree. This is the first phase 11// of compiling the rules. 12// 13// The overall of the rules is managed by class RBBIRuleBuilder, which will 14// create and use an instance of this class as part of the process. 15// 16 17#include "unicode/utypes.h" 18 19#if !UCONFIG_NO_BREAK_ITERATION 20 21#include "unicode/unistr.h" 22#include "unicode/uniset.h" 23#include "unicode/uchar.h" 24#include "unicode/uchriter.h" 25#include "unicode/parsepos.h" 26#include "unicode/parseerr.h" 27#include "cmemory.h" 28#include "cstring.h" 29 30#include "rbbirpt.h" // Contains state table for the rbbi rules parser. 31 // generated by a Perl script. 32#include "rbbirb.h" 33#include "rbbinode.h" 34#include "rbbiscan.h" 35#include "rbbitblb.h" 36 37#include "uassert.h" 38 39//------------------------------------------------------------------------------ 40// 41// Unicode Set init strings for each of the character classes needed for parsing a rule file. 42// (Initialized with hex values for portability to EBCDIC based machines. 43// Really ugly, but there's no good way to avoid it.) 44// 45// The sets are referred to by name in the rbbirpt.txt, which is the 46// source form of the state transition table for the RBBI rule parser. 47// 48//------------------------------------------------------------------------------ 49static const UChar gRuleSet_rule_char_pattern[] = { 50 // Characters that may appear as literals in patterns without escaping or quoting. 51 // [ ^ [ \ p { Z } \ u 0 0 2 0 52 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, 53 // - \ u 0 0 7 f ] - [ \ p 54 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 55 // { L } ] - [ \ p { N } ] ] 56 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0}; 57 58static const UChar gRuleSet_name_char_pattern[] = { 59// [ _ \ p { L } \ p { N } ] 60 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; 61 62static const UChar gRuleSet_digit_char_pattern[] = { 63// [ 0 - 9 ] 64 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; 65 66static const UChar gRuleSet_name_start_char_pattern[] = { 67// [ _ \ p { L } ] 68 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; 69 70static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any" 71 72 73U_CDECL_BEGIN 74static void U_CALLCONV RBBISetTable_deleter(void *p) { 75 icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p; 76 delete px->key; 77 // Note: px->val is owned by the linked list "fSetsListHead" in scanner. 78 // Don't delete the value nodes here. 79 uprv_free(px); 80} 81U_CDECL_END 82 83U_NAMESPACE_BEGIN 84 85//------------------------------------------------------------------------------ 86// 87// Constructor. 88// 89//------------------------------------------------------------------------------ 90RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) 91{ 92 fRB = rb; 93 fScanIndex = 0; 94 fNextIndex = 0; 95 fQuoteMode = FALSE; 96 fLineNum = 1; 97 fCharNum = 0; 98 fLastChar = 0; 99 100 fStateTable = NULL; 101 fStack[0] = 0; 102 fStackPtr = 0; 103 fNodeStack[0] = NULL; 104 fNodeStackPtr = 0; 105 106 fReverseRule = FALSE; 107 fLookAheadRule = FALSE; 108 fNoChainInRule = FALSE; 109 110 fSymbolTable = NULL; 111 fSetTable = NULL; 112 fRuleNum = 0; 113 fOptionStart = 0; 114 115 // Do not check status until after all critical fields are sufficiently initialized 116 // that the destructor can run cleanly. 117 if (U_FAILURE(*rb->fStatus)) { 118 return; 119 } 120 121 // 122 // Set up the constant Unicode Sets. 123 // Note: These could be made static, lazily initialized, and shared among 124 // all instances of RBBIRuleScanners. BUT this is quite a bit simpler, 125 // and the time to build these few sets should be small compared to a 126 // full break iterator build. 127 fRuleSets[kRuleSet_rule_char-128] 128 = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus); 129 // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:] 130 fRuleSets[kRuleSet_white_space-128]. 131 add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029); 132 fRuleSets[kRuleSet_name_char-128] 133 = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus); 134 fRuleSets[kRuleSet_name_start_char-128] 135 = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus); 136 fRuleSets[kRuleSet_digit_char-128] 137 = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus); 138 if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { 139 // This case happens if ICU's data is missing. UnicodeSet tries to look up property 140 // names from the init string, can't find them, and claims an illegal argument. 141 // Change the error so that the actual problem will be clearer to users. 142 *rb->fStatus = U_BRK_INIT_ERROR; 143 } 144 if (U_FAILURE(*rb->fStatus)) { 145 return; 146 } 147 148 fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); 149 if (fSymbolTable == NULL) { 150 *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; 151 return; 152 } 153 fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus); 154 if (U_FAILURE(*rb->fStatus)) { 155 return; 156 } 157 uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); 158} 159 160 161 162//------------------------------------------------------------------------------ 163// 164// Destructor 165// 166//------------------------------------------------------------------------------ 167RBBIRuleScanner::~RBBIRuleScanner() { 168 delete fSymbolTable; 169 if (fSetTable != NULL) { 170 uhash_close(fSetTable); 171 fSetTable = NULL; 172 173 } 174 175 176 // Node Stack. 177 // Normally has one entry, which is the entire parse tree for the rules. 178 // If errors occured, there may be additional subtrees left on the stack. 179 while (fNodeStackPtr > 0) { 180 delete fNodeStack[fNodeStackPtr]; 181 fNodeStackPtr--; 182 } 183 184} 185 186//------------------------------------------------------------------------------ 187// 188// doParseAction Do some action during rule parsing. 189// Called by the parse state machine. 190// Actions build the parse tree and Unicode Sets, 191// and maintain the parse stack for nested expressions. 192// 193// TODO: unify EParseAction and RBBI_RuleParseAction enum types. 194// They represent exactly the same thing. They're separate 195// only to work around enum forward declaration restrictions 196// in some compilers, while at the same time avoiding multiple 197// definitions problems. I'm sure that there's a better way. 198// 199//------------------------------------------------------------------------------ 200UBool RBBIRuleScanner::doParseActions(int32_t action) 201{ 202 RBBINode *n = NULL; 203 204 UBool returnVal = TRUE; 205 206 switch (action) { 207 208 case doExprStart: 209 pushNewNode(RBBINode::opStart); 210 fRuleNum++; 211 break; 212 213 214 case doNoChain: 215 // Scanned a '^' while on the rule start state. 216 fNoChainInRule = TRUE; 217 break; 218 219 220 case doExprOrOperator: 221 { 222 fixOpStack(RBBINode::precOpCat); 223 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 224 RBBINode *orNode = pushNewNode(RBBINode::opOr); 225 if (U_FAILURE(*fRB->fStatus)) { 226 break; 227 } 228 orNode->fLeftChild = operandNode; 229 operandNode->fParent = orNode; 230 } 231 break; 232 233 case doExprCatOperator: 234 // concatenation operator. 235 // For the implicit concatenation of adjacent terms in an expression that are 236 // not separated by any other operator. Action is invoked between the 237 // actions for the two terms. 238 { 239 fixOpStack(RBBINode::precOpCat); 240 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 241 RBBINode *catNode = pushNewNode(RBBINode::opCat); 242 if (U_FAILURE(*fRB->fStatus)) { 243 break; 244 } 245 catNode->fLeftChild = operandNode; 246 operandNode->fParent = catNode; 247 } 248 break; 249 250 case doLParen: 251 // Open Paren. 252 // The openParen node is a dummy operation type with a low precedence, 253 // which has the affect of ensuring that any real binary op that 254 // follows within the parens binds more tightly to the operands than 255 // stuff outside of the parens. 256 pushNewNode(RBBINode::opLParen); 257 break; 258 259 case doExprRParen: 260 fixOpStack(RBBINode::precLParen); 261 break; 262 263 case doNOP: 264 break; 265 266 case doStartAssign: 267 // We've just scanned "$variable = " 268 // The top of the node stack has the $variable ref node. 269 270 // Save the start position of the RHS text in the StartExpression node 271 // that precedes the $variableReference node on the stack. 272 // This will eventually be used when saving the full $variable replacement 273 // text as a string. 274 n = fNodeStack[fNodeStackPtr-1]; 275 n->fFirstPos = fNextIndex; // move past the '=' 276 277 // Push a new start-of-expression node; needed to keep parse of the 278 // RHS expression happy. 279 pushNewNode(RBBINode::opStart); 280 break; 281 282 283 284 285 case doEndAssign: 286 { 287 // We have reached the end of an assignement statement. 288 // Current scan char is the ';' that terminates the assignment. 289 290 // Terminate expression, leaves expression parse tree rooted in TOS node. 291 fixOpStack(RBBINode::precStart); 292 293 RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2]; 294 RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1]; 295 RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr]; 296 297 // Save original text of right side of assignment, excluding the terminating ';' 298 // in the root of the node for the right-hand-side expression. 299 RHSExprNode->fFirstPos = startExprNode->fFirstPos; 300 RHSExprNode->fLastPos = fScanIndex; 301 fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText); 302 303 // Expression parse tree becomes l. child of the $variable reference node. 304 varRefNode->fLeftChild = RHSExprNode; 305 RHSExprNode->fParent = varRefNode; 306 307 // Make a symbol table entry for the $variableRef node. 308 fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); 309 if (U_FAILURE(*fRB->fStatus)) { 310 // This is a round-about way to get the parse position set 311 // so that duplicate symbols error messages include a line number. 312 UErrorCode t = *fRB->fStatus; 313 *fRB->fStatus = U_ZERO_ERROR; 314 error(t); 315 } 316 317 // Clean up the stack. 318 delete startExprNode; 319 fNodeStackPtr-=3; 320 break; 321 } 322 323 case doEndOfRule: 324 { 325 fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression 326 if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node. 327 break; 328 } 329#ifdef RBBI_DEBUG 330 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} 331#endif 332 U_ASSERT(fNodeStackPtr == 1); 333 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 334 335 // If this rule includes a look-ahead '/', add a endMark node to the 336 // expression tree. 337 if (fLookAheadRule) { 338 RBBINode *endNode = pushNewNode(RBBINode::endMark); 339 RBBINode *catNode = pushNewNode(RBBINode::opCat); 340 if (U_FAILURE(*fRB->fStatus)) { 341 break; 342 } 343 fNodeStackPtr -= 2; 344 catNode->fLeftChild = thisRule; 345 catNode->fRightChild = endNode; 346 fNodeStack[fNodeStackPtr] = catNode; 347 endNode->fVal = fRuleNum; 348 endNode->fLookAheadEnd = TRUE; 349 thisRule = catNode; 350 351 // TODO: Disable chaining out of look-ahead (hard break) rules. 352 // The break on rule match is forced, so there is no point in building up 353 // the state table to chain into another rule for a longer match. 354 } 355 356 // Mark this node as being the root of a rule. 357 thisRule->fRuleRoot = TRUE; 358 359 // Flag if chaining into this rule is wanted. 360 // 361 if (fRB->fChainRules && // If rule chaining is enabled globally via !!chain 362 !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule 363 thisRule->fChainIn = TRUE; 364 } 365 366 367 // All rule expressions are ORed together. 368 // The ';' that terminates an expression really just functions as a '|' with 369 // a low operator prededence. 370 // 371 // Each of the four sets of rules are collected separately. 372 // (forward, reverse, safe_forward, safe_reverse) 373 // OR this rule into the appropriate group of them. 374 // 375 RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree); 376 377 if (*destRules != NULL) { 378 // This is not the first rule encounted. 379 // OR previous stuff (from *destRules) 380 // with the current rule expression (on the Node Stack) 381 // with the resulting OR expression going to *destRules 382 // 383 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 384 RBBINode *prevRules = *destRules; 385 RBBINode *orNode = pushNewNode(RBBINode::opOr); 386 if (U_FAILURE(*fRB->fStatus)) { 387 break; 388 } 389 orNode->fLeftChild = prevRules; 390 prevRules->fParent = orNode; 391 orNode->fRightChild = thisRule; 392 thisRule->fParent = orNode; 393 *destRules = orNode; 394 } 395 else 396 { 397 // This is the first rule encountered (for this direction). 398 // Just move its parse tree from the stack to *destRules. 399 *destRules = fNodeStack[fNodeStackPtr]; 400 } 401 fReverseRule = FALSE; // in preparation for the next rule. 402 fLookAheadRule = FALSE; 403 fNoChainInRule = FALSE; 404 fNodeStackPtr = 0; 405 } 406 break; 407 408 409 case doRuleError: 410 error(U_BRK_RULE_SYNTAX); 411 returnVal = FALSE; 412 break; 413 414 415 case doVariableNameExpectedErr: 416 error(U_BRK_RULE_SYNTAX); 417 break; 418 419 420 // 421 // Unary operands + ? * 422 // These all appear after the operand to which they apply. 423 // When we hit one, the operand (may be a whole sub expression) 424 // will be on the top of the stack. 425 // Unary Operator becomes TOS, with the old TOS as its one child. 426 case doUnaryOpPlus: 427 { 428 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 429 RBBINode *plusNode = pushNewNode(RBBINode::opPlus); 430 if (U_FAILURE(*fRB->fStatus)) { 431 break; 432 } 433 plusNode->fLeftChild = operandNode; 434 operandNode->fParent = plusNode; 435 } 436 break; 437 438 case doUnaryOpQuestion: 439 { 440 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 441 RBBINode *qNode = pushNewNode(RBBINode::opQuestion); 442 if (U_FAILURE(*fRB->fStatus)) { 443 break; 444 } 445 qNode->fLeftChild = operandNode; 446 operandNode->fParent = qNode; 447 } 448 break; 449 450 case doUnaryOpStar: 451 { 452 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 453 RBBINode *starNode = pushNewNode(RBBINode::opStar); 454 if (U_FAILURE(*fRB->fStatus)) { 455 break; 456 } 457 starNode->fLeftChild = operandNode; 458 operandNode->fParent = starNode; 459 } 460 break; 461 462 case doRuleChar: 463 // A "Rule Character" is any single character that is a literal part 464 // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]" 465 // These are pretty uncommon in break rules; the terms are more commonly 466 // sets. To keep things uniform, treat these characters like as 467 // sets that just happen to contain only one character. 468 { 469 n = pushNewNode(RBBINode::setRef); 470 if (U_FAILURE(*fRB->fStatus)) { 471 break; 472 } 473 findSetFor(UnicodeString(fC.fChar), n); 474 n->fFirstPos = fScanIndex; 475 n->fLastPos = fNextIndex; 476 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 477 break; 478 } 479 480 case doDotAny: 481 // scanned a ".", meaning match any single character. 482 { 483 n = pushNewNode(RBBINode::setRef); 484 if (U_FAILURE(*fRB->fStatus)) { 485 break; 486 } 487 findSetFor(UnicodeString(TRUE, kAny, 3), n); 488 n->fFirstPos = fScanIndex; 489 n->fLastPos = fNextIndex; 490 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 491 break; 492 } 493 494 case doSlash: 495 // Scanned a '/', which identifies a look-ahead break position in a rule. 496 n = pushNewNode(RBBINode::lookAhead); 497 if (U_FAILURE(*fRB->fStatus)) { 498 break; 499 } 500 n->fVal = fRuleNum; 501 n->fFirstPos = fScanIndex; 502 n->fLastPos = fNextIndex; 503 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 504 fLookAheadRule = TRUE; 505 break; 506 507 508 case doStartTagValue: 509 // Scanned a '{', the opening delimiter for a tag value within a rule. 510 n = pushNewNode(RBBINode::tag); 511 if (U_FAILURE(*fRB->fStatus)) { 512 break; 513 } 514 n->fVal = 0; 515 n->fFirstPos = fScanIndex; 516 n->fLastPos = fNextIndex; 517 break; 518 519 case doTagDigit: 520 // Just scanned a decimal digit that's part of a tag value 521 { 522 n = fNodeStack[fNodeStackPtr]; 523 uint32_t v = u_charDigitValue(fC.fChar); 524 U_ASSERT(v < 10); 525 n->fVal = n->fVal*10 + v; 526 break; 527 } 528 529 case doTagValue: 530 n = fNodeStack[fNodeStackPtr]; 531 n->fLastPos = fNextIndex; 532 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 533 break; 534 535 case doTagExpectedError: 536 error(U_BRK_MALFORMED_RULE_TAG); 537 returnVal = FALSE; 538 break; 539 540 case doOptionStart: 541 // Scanning a !!option. At the start of string. 542 fOptionStart = fScanIndex; 543 break; 544 545 case doOptionEnd: 546 { 547 UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); 548 if (opt == UNICODE_STRING("chain", 5)) { 549 fRB->fChainRules = TRUE; 550 } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) { 551 fRB->fLBCMNoChain = TRUE; 552 } else if (opt == UNICODE_STRING("forward", 7)) { 553 fRB->fDefaultTree = &fRB->fForwardTree; 554 } else if (opt == UNICODE_STRING("reverse", 7)) { 555 fRB->fDefaultTree = &fRB->fReverseTree; 556 } else if (opt == UNICODE_STRING("safe_forward", 12)) { 557 fRB->fDefaultTree = &fRB->fSafeFwdTree; 558 } else if (opt == UNICODE_STRING("safe_reverse", 12)) { 559 fRB->fDefaultTree = &fRB->fSafeRevTree; 560 } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { 561 fRB->fLookAheadHardBreak = TRUE; 562 } else if (opt == UNICODE_STRING("quoted_literals_only", 20)) { 563 fRuleSets[kRuleSet_rule_char-128].clear(); 564 } else if (opt == UNICODE_STRING("unquoted_literals", 17)) { 565 fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus); 566 } else { 567 error(U_BRK_UNRECOGNIZED_OPTION); 568 } 569 } 570 break; 571 572 case doReverseDir: 573 fReverseRule = TRUE; 574 break; 575 576 case doStartVariableName: 577 n = pushNewNode(RBBINode::varRef); 578 if (U_FAILURE(*fRB->fStatus)) { 579 break; 580 } 581 n->fFirstPos = fScanIndex; 582 break; 583 584 case doEndVariableName: 585 n = fNodeStack[fNodeStackPtr]; 586 if (n==NULL || n->fType != RBBINode::varRef) { 587 error(U_BRK_INTERNAL_ERROR); 588 break; 589 } 590 n->fLastPos = fScanIndex; 591 fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); 592 // Look the newly scanned name up in the symbol table 593 // If there's an entry, set the l. child of the var ref to the replacement expression. 594 // (We also pass through here when scanning assignments, but no harm is done, other 595 // than a slight wasted effort that seems hard to avoid. Lookup will be null) 596 n->fLeftChild = fSymbolTable->lookupNode(n->fText); 597 break; 598 599 case doCheckVarDef: 600 n = fNodeStack[fNodeStackPtr]; 601 if (n->fLeftChild == NULL) { 602 error(U_BRK_UNDEFINED_VARIABLE); 603 returnVal = FALSE; 604 } 605 break; 606 607 case doExprFinished: 608 break; 609 610 case doRuleErrorAssignExpr: 611 error(U_BRK_ASSIGN_ERROR); 612 returnVal = FALSE; 613 break; 614 615 case doExit: 616 returnVal = FALSE; 617 break; 618 619 case doScanUnicodeSet: 620 scanSet(); 621 break; 622 623 default: 624 error(U_BRK_INTERNAL_ERROR); 625 returnVal = FALSE; 626 break; 627 } 628 return returnVal && U_SUCCESS(*fRB->fStatus); 629} 630 631 632 633 634//------------------------------------------------------------------------------ 635// 636// Error Report a rule parse error. 637// Only report it if no previous error has been recorded. 638// 639//------------------------------------------------------------------------------ 640void RBBIRuleScanner::error(UErrorCode e) { 641 if (U_SUCCESS(*fRB->fStatus)) { 642 *fRB->fStatus = e; 643 if (fRB->fParseError) { 644 fRB->fParseError->line = fLineNum; 645 fRB->fParseError->offset = fCharNum; 646 fRB->fParseError->preContext[0] = 0; 647 fRB->fParseError->postContext[0] = 0; 648 } 649 } 650} 651 652 653 654 655//------------------------------------------------------------------------------ 656// 657// fixOpStack The parse stack holds partially assembled chunks of the parse tree. 658// An entry on the stack may be as small as a single setRef node, 659// or as large as the parse tree 660// for an entire expression (this will be the one item left on the stack 661// when the parsing of an RBBI rule completes. 662// 663// This function is called when a binary operator is encountered. 664// It looks back up the stack for operators that are not yet associated 665// with a right operand, and if the precedence of the stacked operator >= 666// the precedence of the current operator, binds the operand left, 667// to the previously encountered operator. 668// 669//------------------------------------------------------------------------------ 670void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { 671 RBBINode *n; 672 // printNodeStack("entering fixOpStack()"); 673 for (;;) { 674 n = fNodeStack[fNodeStackPtr-1]; // an operator node 675 if (n->fPrecedence == 0) { 676 RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node"); 677 error(U_BRK_INTERNAL_ERROR); 678 return; 679 } 680 681 if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { 682 // The most recent operand goes with the current operator, 683 // not with the previously stacked one. 684 break; 685 } 686 // Stack operator is a binary op ( '|' or concatenation) 687 // TOS operand becomes right child of this operator. 688 // Resulting subexpression becomes the TOS operand. 689 n->fRightChild = fNodeStack[fNodeStackPtr]; 690 fNodeStack[fNodeStackPtr]->fParent = n; 691 fNodeStackPtr--; 692 // printNodeStack("looping in fixOpStack() "); 693 } 694 695 if (p <= RBBINode::precLParen) { 696 // Scan is at a right paren or end of expression. 697 // The scanned item must match the stack, or else there was an error. 698 // Discard the left paren (or start expr) node from the stack, 699 // leaving the completed (sub)expression as TOS. 700 if (n->fPrecedence != p) { 701 // Right paren encountered matched start of expression node, or 702 // end of expression matched with a left paren node. 703 error(U_BRK_MISMATCHED_PAREN); 704 } 705 fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; 706 fNodeStackPtr--; 707 // Delete the now-discarded LParen or Start node. 708 delete n; 709 } 710 // printNodeStack("leaving fixOpStack()"); 711} 712 713 714 715 716//------------------------------------------------------------------------------ 717// 718// findSetFor given a UnicodeString, 719// - find the corresponding Unicode Set (uset node) 720// (create one if necessary) 721// - Set fLeftChild of the caller's node (should be a setRef node) 722// to the uset node 723// Maintain a hash table of uset nodes, so the same one is always used 724// for the same string. 725// If a "to adopt" set is provided and we haven't seen this key before, 726// add the provided set to the hash table. 727// If the string is one (32 bit) char in length, the set contains 728// just one element which is the char in question. 729// If the string is "any", return a set containing all chars. 730// 731//------------------------------------------------------------------------------ 732void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { 733 734 RBBISetTableEl *el; 735 736 // First check whether we've already cached a set for this string. 737 // If so, just use the cached set in the new node. 738 // delete any set provided by the caller, since we own it. 739 el = (RBBISetTableEl *)uhash_get(fSetTable, &s); 740 if (el != NULL) { 741 delete setToAdopt; 742 node->fLeftChild = el->val; 743 U_ASSERT(node->fLeftChild->fType == RBBINode::uset); 744 return; 745 } 746 747 // Haven't seen this set before. 748 // If the caller didn't provide us with a prebuilt set, 749 // create a new UnicodeSet now. 750 if (setToAdopt == NULL) { 751 if (s.compare(kAny, -1) == 0) { 752 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 753 } else { 754 UChar32 c; 755 c = s.char32At(0); 756 setToAdopt = new UnicodeSet(c, c); 757 } 758 } 759 760 // 761 // Make a new uset node to refer to this UnicodeSet 762 // This new uset node becomes the child of the caller's setReference node. 763 // 764 RBBINode *usetNode = new RBBINode(RBBINode::uset); 765 if (usetNode == NULL) { 766 error(U_MEMORY_ALLOCATION_ERROR); 767 return; 768 } 769 usetNode->fInputSet = setToAdopt; 770 usetNode->fParent = node; 771 node->fLeftChild = usetNode; 772 usetNode->fText = s; 773 774 775 // 776 // Add the new uset node to the list of all uset nodes. 777 // 778 fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); 779 780 781 // 782 // Add the new set to the set hash table. 783 // 784 el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); 785 UnicodeString *tkey = new UnicodeString(s); 786 if (tkey == NULL || el == NULL || setToAdopt == NULL) { 787 // Delete to avoid memory leak 788 delete tkey; 789 tkey = NULL; 790 uprv_free(el); 791 el = NULL; 792 delete setToAdopt; 793 setToAdopt = NULL; 794 795 error(U_MEMORY_ALLOCATION_ERROR); 796 return; 797 } 798 el->key = tkey; 799 el->val = usetNode; 800 uhash_put(fSetTable, el->key, el, fRB->fStatus); 801 802 return; 803} 804 805 806 807// 808// Assorted Unicode character constants. 809// Numeric because there is no portable way to enter them as literals. 810// (Think EBCDIC). 811// 812static const UChar chCR = 0x0d; // New lines, for terminating comments. 813static const UChar chLF = 0x0a; 814static const UChar chNEL = 0x85; // NEL newline variant 815static const UChar chLS = 0x2028; // Unicode Line Separator 816static const UChar chApos = 0x27; // single quote, for quoted chars. 817static const UChar chPound = 0x23; // '#', introduces a comment. 818static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 819static const UChar chLParen = 0x28; 820static const UChar chRParen = 0x29; 821 822 823//------------------------------------------------------------------------------ 824// 825// stripRules Return a rules string without unnecessary 826// characters. 827// 828//------------------------------------------------------------------------------ 829UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { 830 UnicodeString strippedRules; 831 int rulesLength = rules.length(); 832 for (int idx = 0; idx < rulesLength; ) { 833 UChar ch = rules[idx++]; 834 if (ch == chPound) { 835 while (idx < rulesLength 836 && ch != chCR && ch != chLF && ch != chNEL) 837 { 838 ch = rules[idx++]; 839 } 840 } 841 if (!u_isISOControl(ch)) { 842 strippedRules.append(ch); 843 } 844 } 845 // strippedRules = strippedRules.unescape(); 846 return strippedRules; 847} 848 849 850//------------------------------------------------------------------------------ 851// 852// nextCharLL Low Level Next Char from rule input source. 853// Get a char from the input character iterator, 854// keep track of input position for error reporting. 855// 856//------------------------------------------------------------------------------ 857UChar32 RBBIRuleScanner::nextCharLL() { 858 UChar32 ch; 859 860 if (fNextIndex >= fRB->fRules.length()) { 861 return (UChar32)-1; 862 } 863 ch = fRB->fRules.char32At(fNextIndex); 864 fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); 865 866 if (ch == chCR || 867 ch == chNEL || 868 ch == chLS || 869 (ch == chLF && fLastChar != chCR)) { 870 // Character is starting a new line. Bump up the line number, and 871 // reset the column to 0. 872 fLineNum++; 873 fCharNum=0; 874 if (fQuoteMode) { 875 error(U_BRK_NEW_LINE_IN_QUOTED_STRING); 876 fQuoteMode = FALSE; 877 } 878 } 879 else { 880 // Character is not starting a new line. Except in the case of a 881 // LF following a CR, increment the column position. 882 if (ch != chLF) { 883 fCharNum++; 884 } 885 } 886 fLastChar = ch; 887 return ch; 888} 889 890 891//------------------------------------------------------------------------------ 892// 893// nextChar for rules scanning. At this level, we handle stripping 894// out comments and processing backslash character escapes. 895// The rest of the rules grammar is handled at the next level up. 896// 897//------------------------------------------------------------------------------ 898void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { 899 900 // Unicode Character constants needed for the processing done by nextChar(), 901 // in hex because literals wont work on EBCDIC machines. 902 903 fScanIndex = fNextIndex; 904 c.fChar = nextCharLL(); 905 c.fEscaped = FALSE; 906 907 // 908 // check for '' sequence. 909 // These are recognized in all contexts, whether in quoted text or not. 910 // 911 if (c.fChar == chApos) { 912 if (fRB->fRules.char32At(fNextIndex) == chApos) { 913 c.fChar = nextCharLL(); // get nextChar officially so character counts 914 c.fEscaped = TRUE; // stay correct. 915 } 916 else 917 { 918 // Single quote, by itself. 919 // Toggle quoting mode. 920 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 921 fQuoteMode = !fQuoteMode; 922 if (fQuoteMode == TRUE) { 923 c.fChar = chLParen; 924 } else { 925 c.fChar = chRParen; 926 } 927 c.fEscaped = FALSE; // The paren that we return is not escaped. 928 return; 929 } 930 } 931 932 if (fQuoteMode) { 933 c.fEscaped = TRUE; 934 } 935 else 936 { 937 // We are not in a 'quoted region' of the source. 938 // 939 if (c.fChar == chPound) { 940 // Start of a comment. Consume the rest of it. 941 // The new-line char that terminates the comment is always returned. 942 // It will be treated as white-space, and serves to break up anything 943 // that might otherwise incorrectly clump together with a comment in 944 // the middle (a variable name, for example.) 945 for (;;) { 946 c.fChar = nextCharLL(); 947 if (c.fChar == (UChar32)-1 || // EOF 948 c.fChar == chCR || 949 c.fChar == chLF || 950 c.fChar == chNEL || 951 c.fChar == chLS) {break;} 952 } 953 } 954 if (c.fChar == (UChar32)-1) { 955 return; 956 } 957 958 // 959 // check for backslash escaped characters. 960 // Use UnicodeString::unescapeAt() to handle them. 961 // 962 if (c.fChar == chBackSlash) { 963 c.fEscaped = TRUE; 964 int32_t startX = fNextIndex; 965 c.fChar = fRB->fRules.unescapeAt(fNextIndex); 966 if (fNextIndex == startX) { 967 error(U_BRK_HEX_DIGITS_EXPECTED); 968 } 969 fCharNum += fNextIndex-startX; 970 } 971 } 972 // putc(c.fChar, stdout); 973} 974 975//------------------------------------------------------------------------------ 976// 977// Parse RBBI rules. The state machine for rules parsing is here. 978// The state tables are hand-written in the file rbbirpt.txt, 979// and converted to the form used here by a perl 980// script rbbicst.pl 981// 982//------------------------------------------------------------------------------ 983void RBBIRuleScanner::parse() { 984 uint16_t state; 985 const RBBIRuleTableEl *tableEl; 986 987 if (U_FAILURE(*fRB->fStatus)) { 988 return; 989 } 990 991 state = 1; 992 nextChar(fC); 993 // 994 // Main loop for the rule parsing state machine. 995 // Runs once per state transition. 996 // Each time through optionally performs, depending on the state table, 997 // - an advance to the the next input char 998 // - an action to be performed. 999 // - pushing or popping a state to/from the local state return stack. 1000 // 1001 for (;;) { 1002 // Bail out if anything has gone wrong. 1003 // RBBI rule file parsing stops on the first error encountered. 1004 if (U_FAILURE(*fRB->fStatus)) { 1005 break; 1006 } 1007 1008 // Quit if state == 0. This is the normal way to exit the state machine. 1009 // 1010 if (state == 0) { 1011 break; 1012 } 1013 1014 // Find the state table element that matches the input char from the rule, or the 1015 // class of the input character. Start with the first table row for this 1016 // state, then linearly scan forward until we find a row that matches the 1017 // character. The last row for each state always matches all characters, so 1018 // the search will stop there, if not before. 1019 // 1020 tableEl = &gRuleParseStateTable[state]; 1021 #ifdef RBBI_DEBUG 1022 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { 1023 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ", 1024 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); 1025 } 1026 #endif 1027 1028 for (;;) { 1029 #ifdef RBBI_DEBUG 1030 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);} 1031 #endif 1032 if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) { 1033 // Table row specified an individual character, not a set, and 1034 // the input character is not escaped, and 1035 // the input character matched it. 1036 break; 1037 } 1038 if (tableEl->fCharClass == 255) { 1039 // Table row specified default, match anything character class. 1040 break; 1041 } 1042 if (tableEl->fCharClass == 254 && fC.fEscaped) { 1043 // Table row specified "escaped" and the char was escaped. 1044 break; 1045 } 1046 if (tableEl->fCharClass == 253 && fC.fEscaped && 1047 (fC.fChar == 0x50 || fC.fChar == 0x70 )) { 1048 // Table row specified "escaped P" and the char is either 'p' or 'P'. 1049 break; 1050 } 1051 if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) { 1052 // Table row specified eof and we hit eof on the input. 1053 break; 1054 } 1055 1056 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 1057 fC.fEscaped == FALSE && // char is not escaped && 1058 fC.fChar != (UChar32)-1) { // char is not EOF 1059 U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets)); 1060 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 1061 // Table row specified a character class, or set of characters, 1062 // and the current char matches it. 1063 break; 1064 } 1065 } 1066 1067 // No match on this row, advance to the next row for this state, 1068 tableEl++; 1069 } 1070 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");} 1071 1072 // 1073 // We've found the row of the state table that matches the current input 1074 // character from the rules string. 1075 // Perform any action specified by this row in the state table. 1076 if (doParseActions((int32_t)tableEl->fAction) == FALSE) { 1077 // Break out of the state machine loop if the 1078 // the action signalled some kind of error, or 1079 // the action was to exit, occurs on normal end-of-rules-input. 1080 break; 1081 } 1082 1083 if (tableEl->fPushState != 0) { 1084 fStackPtr++; 1085 if (fStackPtr >= kStackSize) { 1086 error(U_BRK_INTERNAL_ERROR); 1087 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."); 1088 fStackPtr--; 1089 } 1090 fStack[fStackPtr] = tableEl->fPushState; 1091 } 1092 1093 if (tableEl->fNextChar) { 1094 nextChar(fC); 1095 } 1096 1097 // Get the next state from the table entry, or from the 1098 // state stack if the next state was specified as "pop". 1099 if (tableEl->fNextState != 255) { 1100 state = tableEl->fNextState; 1101 } else { 1102 state = fStack[fStackPtr]; 1103 fStackPtr--; 1104 if (fStackPtr < 0) { 1105 error(U_BRK_INTERNAL_ERROR); 1106 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow."); 1107 fStackPtr++; 1108 } 1109 } 1110 1111 } 1112 1113 if (U_FAILURE(*fRB->fStatus)) { 1114 return; 1115 } 1116 1117 // If there are no forward rules set an error. 1118 // 1119 if (fRB->fForwardTree == NULL) { 1120 error(U_BRK_RULE_SYNTAX); 1121 return; 1122 } 1123 1124 // 1125 // If there were NO user specified reverse rules, set up the equivalent of ".*;" 1126 // 1127 if (fRB->fReverseTree == NULL) { 1128 fRB->fReverseTree = pushNewNode(RBBINode::opStar); 1129 RBBINode *operand = pushNewNode(RBBINode::setRef); 1130 if (U_FAILURE(*fRB->fStatus)) { 1131 return; 1132 } 1133 findSetFor(UnicodeString(TRUE, kAny, 3), operand); 1134 fRB->fReverseTree->fLeftChild = operand; 1135 operand->fParent = fRB->fReverseTree; 1136 fNodeStackPtr -= 2; 1137 } 1138 1139 1140 // 1141 // Parsing of the input RBBI rules is complete. 1142 // We now have a parse tree for the rule expressions 1143 // and a list of all UnicodeSets that are referenced. 1144 // 1145#ifdef RBBI_DEBUG 1146 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} 1147 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) { 1148 RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); 1149 RBBINode::printTree(fRB->fForwardTree, TRUE); 1150 RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); 1151 RBBINode::printTree(fRB->fReverseTree, TRUE); 1152 RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); 1153 RBBINode::printTree(fRB->fSafeFwdTree, TRUE); 1154 RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); 1155 RBBINode::printTree(fRB->fSafeRevTree, TRUE); 1156 } 1157#endif 1158} 1159 1160 1161//------------------------------------------------------------------------------ 1162// 1163// printNodeStack for debugging... 1164// 1165//------------------------------------------------------------------------------ 1166#ifdef RBBI_DEBUG 1167void RBBIRuleScanner::printNodeStack(const char *title) { 1168 int i; 1169 RBBIDebugPrintf("%s. Dumping node stack...\n", title); 1170 for (i=fNodeStackPtr; i>0; i--) {RBBINode::printTree(fNodeStack[i], TRUE);} 1171} 1172#endif 1173 1174 1175 1176 1177//------------------------------------------------------------------------------ 1178// 1179// pushNewNode create a new RBBINode of the specified type and push it 1180// onto the stack of nodes. 1181// 1182//------------------------------------------------------------------------------ 1183RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { 1184 if (U_FAILURE(*fRB->fStatus)) { 1185 return NULL; 1186 } 1187 if (fNodeStackPtr >= kStackSize - 1) { 1188 error(U_BRK_RULE_SYNTAX); 1189 RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow."); 1190 return NULL; 1191 } 1192 fNodeStackPtr++; 1193 fNodeStack[fNodeStackPtr] = new RBBINode(t); 1194 if (fNodeStack[fNodeStackPtr] == NULL) { 1195 *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; 1196 } 1197 return fNodeStack[fNodeStackPtr]; 1198} 1199 1200 1201 1202//------------------------------------------------------------------------------ 1203// 1204// scanSet Construct a UnicodeSet from the text at the current scan 1205// position. Advance the scan position to the first character 1206// after the set. 1207// 1208// A new RBBI setref node referring to the set is pushed onto the node 1209// stack. 1210// 1211// The scan position is normally under the control of the state machine 1212// that controls rule parsing. UnicodeSets, however, are parsed by 1213// the UnicodeSet constructor, not by the RBBI rule parser. 1214// 1215//------------------------------------------------------------------------------ 1216void RBBIRuleScanner::scanSet() { 1217 UnicodeSet *uset; 1218 ParsePosition pos; 1219 int startPos; 1220 int i; 1221 1222 if (U_FAILURE(*fRB->fStatus)) { 1223 return; 1224 } 1225 1226 pos.setIndex(fScanIndex); 1227 startPos = fScanIndex; 1228 UErrorCode localStatus = U_ZERO_ERROR; 1229 uset = new UnicodeSet(); 1230 if (uset == NULL) { 1231 localStatus = U_MEMORY_ALLOCATION_ERROR; 1232 } else { 1233 uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); 1234 } 1235 if (U_FAILURE(localStatus)) { 1236 // TODO: Get more accurate position of the error from UnicodeSet's return info. 1237 // UnicodeSet appears to not be reporting correctly at this time. 1238 #ifdef RBBI_DEBUG 1239 RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); 1240 #endif 1241 error(localStatus); 1242 delete uset; 1243 return; 1244 } 1245 1246 // Verify that the set contains at least one code point. 1247 // 1248 U_ASSERT(uset!=NULL); 1249 if (uset->isEmpty()) { 1250 // This set is empty. 1251 // Make it an error, because it almost certainly is not what the user wanted. 1252 // Also, avoids having to think about corner cases in the tree manipulation code 1253 // that occurs later on. 1254 error(U_BRK_RULE_EMPTY_SET); 1255 delete uset; 1256 return; 1257 } 1258 1259 1260 // Advance the RBBI parse postion over the UnicodeSet pattern. 1261 // Don't just set fScanIndex because the line/char positions maintained 1262 // for error reporting would be thrown off. 1263 i = pos.getIndex(); 1264 for (;;) { 1265 if (fNextIndex >= i) { 1266 break; 1267 } 1268 nextCharLL(); 1269 } 1270 1271 if (U_SUCCESS(*fRB->fStatus)) { 1272 RBBINode *n; 1273 1274 n = pushNewNode(RBBINode::setRef); 1275 if (U_FAILURE(*fRB->fStatus)) { 1276 return; 1277 } 1278 n->fFirstPos = startPos; 1279 n->fLastPos = fNextIndex; 1280 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1281 // findSetFor() serves several purposes here: 1282 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1283 // - Mantains collection of all sets in use, needed later for establishing 1284 // character categories for run time engine. 1285 // - Eliminates mulitiple instances of the same set. 1286 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1287 findSetFor(n->fText, n, uset); 1288 } 1289 1290} 1291 1292U_NAMESPACE_END 1293 1294#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1295