1 2// 3// file: rbbiscan.cpp 4// 5// Copyright (C) 2002-2010, International Business Machines Corporation and others. 6// All Rights Reserved. 7// 8// This file contains the Rule Based Break Iterator Rule Builder functions for 9// scanning the rules and assembling a parse tree. This is the first phase 10// of compiling the rules. 11// 12// The overall of the rules is managed by class RBBIRuleBuilder, which will 13// create and use an instance of this class as part of the process. 14// 15 16#include "unicode/utypes.h" 17 18#if !UCONFIG_NO_BREAK_ITERATION 19 20#include "unicode/unistr.h" 21#include "unicode/uniset.h" 22#include "unicode/uchar.h" 23#include "unicode/uchriter.h" 24#include "unicode/parsepos.h" 25#include "unicode/parseerr.h" 26#include "util.h" 27#include "cmemory.h" 28#include "cstring.h" 29 30#include "rbbirpt.h" // Contains state table for the rbbi rules parser. 31 // generated by a Perl script. 32#include "rbbirb.h" 33#include "rbbinode.h" 34#include "rbbiscan.h" 35#include "rbbitblb.h" 36 37#include "uassert.h" 38 39 40//------------------------------------------------------------------------------ 41// 42// Unicode Set init strings for each of the character classes needed for parsing a rule file. 43// (Initialized with hex values for portability to EBCDIC based machines. 44// Really ugly, but there's no good way to avoid it.) 45// 46// The sets are referred to by name in the rbbirpt.txt, which is the 47// source form of the state transition table for the RBBI rule parser. 48// 49//------------------------------------------------------------------------------ 50static const UChar gRuleSet_rule_char_pattern[] = { 51 // [ ^ [ \ p { Z } \ u 0 0 2 0 52 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, 53 // - \ u 0 0 7 f ] - [ \ p 54 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 55 // { L } ] - [ \ p { N } ] ] 56 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0}; 57 58static const UChar gRuleSet_name_char_pattern[] = { 59// [ _ \ p { L } \ p { N } ] 60 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; 61 62static const UChar gRuleSet_digit_char_pattern[] = { 63// [ 0 - 9 ] 64 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; 65 66static const UChar gRuleSet_name_start_char_pattern[] = { 67// [ _ \ p { L } ] 68 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; 69 70static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any" 71 72 73U_CDECL_BEGIN 74static void U_CALLCONV RBBISetTable_deleter(void *p) { 75 U_NAMESPACE_QUALIFIER RBBISetTableEl *px = (U_NAMESPACE_QUALIFIER RBBISetTableEl *)p; 76 delete px->key; 77 // Note: px->val is owned by the linked list "fSetsListHead" in scanner. 78 // Don't delete the value nodes here. 79 uprv_free(px); 80} 81U_CDECL_END 82 83U_NAMESPACE_BEGIN 84 85//------------------------------------------------------------------------------ 86// 87// Constructor. 88// 89//------------------------------------------------------------------------------ 90RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) 91{ 92 fRB = rb; 93 fStackPtr = 0; 94 fStack[fStackPtr] = 0; 95 fNodeStackPtr = 0; 96 fRuleNum = 0; 97 fNodeStack[0] = NULL; 98 99 fSymbolTable = NULL; 100 fSetTable = NULL; 101 102 fScanIndex = 0; 103 fNextIndex = 0; 104 105 fReverseRule = FALSE; 106 fLookAheadRule = FALSE; 107 108 fLineNum = 1; 109 fCharNum = 0; 110 fQuoteMode = FALSE; 111 112 // Do not check status until after all critical fields are sufficiently initialized 113 // that the destructor can run cleanly. 114 if (U_FAILURE(*rb->fStatus)) { 115 return; 116 } 117 118 // 119 // Set up the constant Unicode Sets. 120 // Note: These could be made static, lazily initialized, and shared among 121 // all instances of RBBIRuleScanners. BUT this is quite a bit simpler, 122 // and the time to build these few sets should be small compared to a 123 // full break iterator build. 124 fRuleSets[kRuleSet_rule_char-128] = UnicodeSet(gRuleSet_rule_char_pattern, *rb->fStatus); 125 UnicodeSet *whitespaceSet = uprv_openRuleWhiteSpaceSet(rb->fStatus); 126 if (U_FAILURE(*rb->fStatus)) { 127 return; 128 } 129 fRuleSets[kRuleSet_white_space-128] = *whitespaceSet; 130 delete whitespaceSet; 131 fRuleSets[kRuleSet_name_char-128] = UnicodeSet(gRuleSet_name_char_pattern, *rb->fStatus); 132 fRuleSets[kRuleSet_name_start_char-128] = UnicodeSet(gRuleSet_name_start_char_pattern, *rb->fStatus); 133 fRuleSets[kRuleSet_digit_char-128] = UnicodeSet(gRuleSet_digit_char_pattern, *rb->fStatus); 134 if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { 135 // This case happens if ICU's data is missing. UnicodeSet tries to look up property 136 // names from the init string, can't find them, and claims an illegal arguement. 137 // Change the error so that the actual problem will be clearer to users. 138 *rb->fStatus = U_BRK_INIT_ERROR; 139 } 140 if (U_FAILURE(*rb->fStatus)) { 141 return; 142 } 143 144 fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); 145 if (fSymbolTable == NULL) { 146 *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; 147 return; 148 } 149 fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus); 150 if (U_FAILURE(*rb->fStatus)) { 151 return; 152 } 153 uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); 154} 155 156 157 158//------------------------------------------------------------------------------ 159// 160// Destructor 161// 162//------------------------------------------------------------------------------ 163RBBIRuleScanner::~RBBIRuleScanner() { 164 delete fSymbolTable; 165 if (fSetTable != NULL) { 166 uhash_close(fSetTable); 167 fSetTable = NULL; 168 169 } 170 171 172 // Node Stack. 173 // Normally has one entry, which is the entire parse tree for the rules. 174 // If errors occured, there may be additional subtrees left on the stack. 175 while (fNodeStackPtr > 0) { 176 delete fNodeStack[fNodeStackPtr]; 177 fNodeStackPtr--; 178 } 179 180} 181 182//------------------------------------------------------------------------------ 183// 184// doParseAction Do some action during rule parsing. 185// Called by the parse state machine. 186// Actions build the parse tree and Unicode Sets, 187// and maintain the parse stack for nested expressions. 188// 189// TODO: unify EParseAction and RBBI_RuleParseAction enum types. 190// They represent exactly the same thing. They're separate 191// only to work around enum forward declaration restrictions 192// in some compilers, while at the same time avoiding multiple 193// definitions problems. I'm sure that there's a better way. 194// 195//------------------------------------------------------------------------------ 196UBool RBBIRuleScanner::doParseActions(int32_t action) 197{ 198 RBBINode *n = NULL; 199 200 UBool returnVal = TRUE; 201 202 switch (action) { 203 204 case doExprStart: 205 pushNewNode(RBBINode::opStart); 206 fRuleNum++; 207 break; 208 209 210 case doExprOrOperator: 211 { 212 fixOpStack(RBBINode::precOpCat); 213 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 214 RBBINode *orNode = pushNewNode(RBBINode::opOr); 215 orNode->fLeftChild = operandNode; 216 operandNode->fParent = orNode; 217 } 218 break; 219 220 case doExprCatOperator: 221 // concatenation operator. 222 // For the implicit concatenation of adjacent terms in an expression that are 223 // not separated by any other operator. Action is invoked between the 224 // actions for the two terms. 225 { 226 fixOpStack(RBBINode::precOpCat); 227 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 228 RBBINode *catNode = pushNewNode(RBBINode::opCat); 229 catNode->fLeftChild = operandNode; 230 operandNode->fParent = catNode; 231 } 232 break; 233 234 case doLParen: 235 // Open Paren. 236 // The openParen node is a dummy operation type with a low precedence, 237 // which has the affect of ensuring that any real binary op that 238 // follows within the parens binds more tightly to the operands than 239 // stuff outside of the parens. 240 pushNewNode(RBBINode::opLParen); 241 break; 242 243 case doExprRParen: 244 fixOpStack(RBBINode::precLParen); 245 break; 246 247 case doNOP: 248 break; 249 250 case doStartAssign: 251 // We've just scanned "$variable = " 252 // The top of the node stack has the $variable ref node. 253 254 // Save the start position of the RHS text in the StartExpression node 255 // that precedes the $variableReference node on the stack. 256 // This will eventually be used when saving the full $variable replacement 257 // text as a string. 258 n = fNodeStack[fNodeStackPtr-1]; 259 n->fFirstPos = fNextIndex; // move past the '=' 260 261 // Push a new start-of-expression node; needed to keep parse of the 262 // RHS expression happy. 263 pushNewNode(RBBINode::opStart); 264 break; 265 266 267 268 269 case doEndAssign: 270 { 271 // We have reached the end of an assignement statement. 272 // Current scan char is the ';' that terminates the assignment. 273 274 // Terminate expression, leaves expression parse tree rooted in TOS node. 275 fixOpStack(RBBINode::precStart); 276 277 RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2]; 278 RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1]; 279 RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr]; 280 281 // Save original text of right side of assignment, excluding the terminating ';' 282 // in the root of the node for the right-hand-side expression. 283 RHSExprNode->fFirstPos = startExprNode->fFirstPos; 284 RHSExprNode->fLastPos = fScanIndex; 285 fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText); 286 287 // Expression parse tree becomes l. child of the $variable reference node. 288 varRefNode->fLeftChild = RHSExprNode; 289 RHSExprNode->fParent = varRefNode; 290 291 // Make a symbol table entry for the $variableRef node. 292 fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); 293 if (U_FAILURE(*fRB->fStatus)) { 294 // This is a round-about way to get the parse position set 295 // so that duplicate symbols error messages include a line number. 296 UErrorCode t = *fRB->fStatus; 297 *fRB->fStatus = U_ZERO_ERROR; 298 error(t); 299 } 300 301 // Clean up the stack. 302 delete startExprNode; 303 fNodeStackPtr-=3; 304 break; 305 } 306 307 case doEndOfRule: 308 { 309 fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression 310 if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node. 311 break; 312 } 313#ifdef RBBI_DEBUG 314 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} 315#endif 316 U_ASSERT(fNodeStackPtr == 1); 317 318 // If this rule includes a look-ahead '/', add a endMark node to the 319 // expression tree. 320 if (fLookAheadRule) { 321 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 322 RBBINode *endNode = pushNewNode(RBBINode::endMark); 323 RBBINode *catNode = pushNewNode(RBBINode::opCat); 324 fNodeStackPtr -= 2; 325 catNode->fLeftChild = thisRule; 326 catNode->fRightChild = endNode; 327 fNodeStack[fNodeStackPtr] = catNode; 328 endNode->fVal = fRuleNum; 329 endNode->fLookAheadEnd = TRUE; 330 } 331 332 // All rule expressions are ORed together. 333 // The ';' that terminates an expression really just functions as a '|' with 334 // a low operator prededence. 335 // 336 // Each of the four sets of rules are collected separately. 337 // (forward, reverse, safe_forward, safe_reverse) 338 // OR this rule into the appropriate group of them. 339 // 340 RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree); 341 342 if (*destRules != NULL) { 343 // This is not the first rule encounted. 344 // OR previous stuff (from *destRules) 345 // with the current rule expression (on the Node Stack) 346 // with the resulting OR expression going to *destRules 347 // 348 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 349 RBBINode *prevRules = *destRules; 350 RBBINode *orNode = pushNewNode(RBBINode::opOr); 351 orNode->fLeftChild = prevRules; 352 prevRules->fParent = orNode; 353 orNode->fRightChild = thisRule; 354 thisRule->fParent = orNode; 355 *destRules = orNode; 356 } 357 else 358 { 359 // This is the first rule encountered (for this direction). 360 // Just move its parse tree from the stack to *destRules. 361 *destRules = fNodeStack[fNodeStackPtr]; 362 } 363 fReverseRule = FALSE; // in preparation for the next rule. 364 fLookAheadRule = FALSE; 365 fNodeStackPtr = 0; 366 } 367 break; 368 369 370 case doRuleError: 371 error(U_BRK_RULE_SYNTAX); 372 returnVal = FALSE; 373 break; 374 375 376 case doVariableNameExpectedErr: 377 error(U_BRK_RULE_SYNTAX); 378 break; 379 380 381 // 382 // Unary operands + ? * 383 // These all appear after the operand to which they apply. 384 // When we hit one, the operand (may be a whole sub expression) 385 // will be on the top of the stack. 386 // Unary Operator becomes TOS, with the old TOS as its one child. 387 case doUnaryOpPlus: 388 { 389 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 390 RBBINode *plusNode = pushNewNode(RBBINode::opPlus); 391 plusNode->fLeftChild = operandNode; 392 operandNode->fParent = plusNode; 393 } 394 break; 395 396 case doUnaryOpQuestion: 397 { 398 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 399 RBBINode *qNode = pushNewNode(RBBINode::opQuestion); 400 qNode->fLeftChild = operandNode; 401 operandNode->fParent = qNode; 402 } 403 break; 404 405 case doUnaryOpStar: 406 { 407 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 408 RBBINode *starNode = pushNewNode(RBBINode::opStar); 409 starNode->fLeftChild = operandNode; 410 operandNode->fParent = starNode; 411 } 412 break; 413 414 case doRuleChar: 415 // A "Rule Character" is any single character that is a literal part 416 // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]" 417 // These are pretty uncommon in break rules; the terms are more commonly 418 // sets. To keep things uniform, treat these characters like as 419 // sets that just happen to contain only one character. 420 { 421 n = pushNewNode(RBBINode::setRef); 422 findSetFor(fC.fChar, n); 423 n->fFirstPos = fScanIndex; 424 n->fLastPos = fNextIndex; 425 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 426 break; 427 } 428 429 case doDotAny: 430 // scanned a ".", meaning match any single character. 431 { 432 n = pushNewNode(RBBINode::setRef); 433 findSetFor(kAny, n); 434 n->fFirstPos = fScanIndex; 435 n->fLastPos = fNextIndex; 436 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 437 break; 438 } 439 440 case doSlash: 441 // Scanned a '/', which identifies a look-ahead break position in a rule. 442 n = pushNewNode(RBBINode::lookAhead); 443 n->fVal = fRuleNum; 444 n->fFirstPos = fScanIndex; 445 n->fLastPos = fNextIndex; 446 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 447 fLookAheadRule = TRUE; 448 break; 449 450 451 case doStartTagValue: 452 // Scanned a '{', the opening delimiter for a tag value within a rule. 453 n = pushNewNode(RBBINode::tag); 454 n->fVal = 0; 455 n->fFirstPos = fScanIndex; 456 n->fLastPos = fNextIndex; 457 break; 458 459 case doTagDigit: 460 // Just scanned a decimal digit that's part of a tag value 461 { 462 n = fNodeStack[fNodeStackPtr]; 463 uint32_t v = u_charDigitValue(fC.fChar); 464 U_ASSERT(v < 10); 465 n->fVal = n->fVal*10 + v; 466 break; 467 } 468 469 case doTagValue: 470 n = fNodeStack[fNodeStackPtr]; 471 n->fLastPos = fNextIndex; 472 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 473 break; 474 475 case doTagExpectedError: 476 error(U_BRK_MALFORMED_RULE_TAG); 477 returnVal = FALSE; 478 break; 479 480 case doOptionStart: 481 // Scanning a !!option. At the start of string. 482 fOptionStart = fScanIndex; 483 break; 484 485 case doOptionEnd: 486 { 487 UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); 488 if (opt == UNICODE_STRING("chain", 5)) { 489 fRB->fChainRules = TRUE; 490 } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) { 491 fRB->fLBCMNoChain = TRUE; 492 } else if (opt == UNICODE_STRING("forward", 7)) { 493 fRB->fDefaultTree = &fRB->fForwardTree; 494 } else if (opt == UNICODE_STRING("reverse", 7)) { 495 fRB->fDefaultTree = &fRB->fReverseTree; 496 } else if (opt == UNICODE_STRING("safe_forward", 12)) { 497 fRB->fDefaultTree = &fRB->fSafeFwdTree; 498 } else if (opt == UNICODE_STRING("safe_reverse", 12)) { 499 fRB->fDefaultTree = &fRB->fSafeRevTree; 500 } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { 501 fRB->fLookAheadHardBreak = TRUE; 502 } else { 503 error(U_BRK_UNRECOGNIZED_OPTION); 504 } 505 } 506 break; 507 508 case doReverseDir: 509 fReverseRule = TRUE; 510 break; 511 512 case doStartVariableName: 513 n = pushNewNode(RBBINode::varRef); 514 if (U_FAILURE(*fRB->fStatus)) { 515 break; 516 } 517 n->fFirstPos = fScanIndex; 518 break; 519 520 case doEndVariableName: 521 n = fNodeStack[fNodeStackPtr]; 522 if (n==NULL || n->fType != RBBINode::varRef) { 523 error(U_BRK_INTERNAL_ERROR); 524 break; 525 } 526 n->fLastPos = fScanIndex; 527 fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); 528 // Look the newly scanned name up in the symbol table 529 // If there's an entry, set the l. child of the var ref to the replacement expression. 530 // (We also pass through here when scanning assignments, but no harm is done, other 531 // than a slight wasted effort that seems hard to avoid. Lookup will be null) 532 n->fLeftChild = fSymbolTable->lookupNode(n->fText); 533 break; 534 535 case doCheckVarDef: 536 n = fNodeStack[fNodeStackPtr]; 537 if (n->fLeftChild == NULL) { 538 error(U_BRK_UNDEFINED_VARIABLE); 539 returnVal = FALSE; 540 } 541 break; 542 543 case doExprFinished: 544 break; 545 546 case doRuleErrorAssignExpr: 547 error(U_BRK_ASSIGN_ERROR); 548 returnVal = FALSE; 549 break; 550 551 case doExit: 552 returnVal = FALSE; 553 break; 554 555 case doScanUnicodeSet: 556 scanSet(); 557 break; 558 559 default: 560 error(U_BRK_INTERNAL_ERROR); 561 returnVal = FALSE; 562 break; 563 } 564 return returnVal; 565} 566 567 568 569 570//------------------------------------------------------------------------------ 571// 572// Error Report a rule parse error. 573// Only report it if no previous error has been recorded. 574// 575//------------------------------------------------------------------------------ 576void RBBIRuleScanner::error(UErrorCode e) { 577 if (U_SUCCESS(*fRB->fStatus)) { 578 *fRB->fStatus = e; 579 if (fRB->fParseError) { 580 fRB->fParseError->line = fLineNum; 581 fRB->fParseError->offset = fCharNum; 582 fRB->fParseError->preContext[0] = 0; 583 fRB->fParseError->preContext[0] = 0; 584 } 585 } 586} 587 588 589 590 591//------------------------------------------------------------------------------ 592// 593// fixOpStack The parse stack holds partially assembled chunks of the parse tree. 594// An entry on the stack may be as small as a single setRef node, 595// or as large as the parse tree 596// for an entire expression (this will be the one item left on the stack 597// when the parsing of an RBBI rule completes. 598// 599// This function is called when a binary operator is encountered. 600// It looks back up the stack for operators that are not yet associated 601// with a right operand, and if the precedence of the stacked operator >= 602// the precedence of the current operator, binds the operand left, 603// to the previously encountered operator. 604// 605//------------------------------------------------------------------------------ 606void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { 607 RBBINode *n; 608 // printNodeStack("entering fixOpStack()"); 609 for (;;) { 610 n = fNodeStack[fNodeStackPtr-1]; // an operator node 611 if (n->fPrecedence == 0) { 612 RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node"); 613 error(U_BRK_INTERNAL_ERROR); 614 return; 615 } 616 617 if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { 618 // The most recent operand goes with the current operator, 619 // not with the previously stacked one. 620 break; 621 } 622 // Stack operator is a binary op ( '|' or concatenation) 623 // TOS operand becomes right child of this operator. 624 // Resulting subexpression becomes the TOS operand. 625 n->fRightChild = fNodeStack[fNodeStackPtr]; 626 fNodeStack[fNodeStackPtr]->fParent = n; 627 fNodeStackPtr--; 628 // printNodeStack("looping in fixOpStack() "); 629 } 630 631 if (p <= RBBINode::precLParen) { 632 // Scan is at a right paren or end of expression. 633 // The scanned item must match the stack, or else there was an error. 634 // Discard the left paren (or start expr) node from the stack, 635 // leaving the completed (sub)expression as TOS. 636 if (n->fPrecedence != p) { 637 // Right paren encountered matched start of expression node, or 638 // end of expression matched with a left paren node. 639 error(U_BRK_MISMATCHED_PAREN); 640 } 641 fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; 642 fNodeStackPtr--; 643 // Delete the now-discarded LParen or Start node. 644 delete n; 645 } 646 // printNodeStack("leaving fixOpStack()"); 647} 648 649 650 651 652//------------------------------------------------------------------------------ 653// 654// findSetFor given a UnicodeString, 655// - find the corresponding Unicode Set (uset node) 656// (create one if necessary) 657// - Set fLeftChild of the caller's node (should be a setRef node) 658// to the uset node 659// Maintain a hash table of uset nodes, so the same one is always used 660// for the same string. 661// If a "to adopt" set is provided and we haven't seen this key before, 662// add the provided set to the hash table. 663// If the string is one (32 bit) char in length, the set contains 664// just one element which is the char in question. 665// If the string is "any", return a set containing all chars. 666// 667//------------------------------------------------------------------------------ 668void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { 669 670 RBBISetTableEl *el; 671 672 // First check whether we've already cached a set for this string. 673 // If so, just use the cached set in the new node. 674 // delete any set provided by the caller, since we own it. 675 el = (RBBISetTableEl *)uhash_get(fSetTable, &s); 676 if (el != NULL) { 677 delete setToAdopt; 678 node->fLeftChild = el->val; 679 U_ASSERT(node->fLeftChild->fType == RBBINode::uset); 680 return; 681 } 682 683 // Haven't seen this set before. 684 // If the caller didn't provide us with a prebuilt set, 685 // create a new UnicodeSet now. 686 if (setToAdopt == NULL) { 687 if (s.compare(kAny, -1) == 0) { 688 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 689 } else { 690 UChar32 c; 691 c = s.char32At(0); 692 setToAdopt = new UnicodeSet(c, c); 693 } 694 } 695 696 // 697 // Make a new uset node to refer to this UnicodeSet 698 // This new uset node becomes the child of the caller's setReference node. 699 // 700 RBBINode *usetNode = new RBBINode(RBBINode::uset); 701 if (usetNode == NULL) { 702 error(U_MEMORY_ALLOCATION_ERROR); 703 return; 704 } 705 usetNode->fInputSet = setToAdopt; 706 usetNode->fParent = node; 707 node->fLeftChild = usetNode; 708 usetNode->fText = s; 709 710 711 // 712 // Add the new uset node to the list of all uset nodes. 713 // 714 fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); 715 716 717 // 718 // Add the new set to the set hash table. 719 // 720 el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); 721 UnicodeString *tkey = new UnicodeString(s); 722 if (tkey == NULL || el == NULL || setToAdopt == NULL) { 723 // Delete to avoid memory leak 724 delete tkey; 725 tkey = NULL; 726 uprv_free(el); 727 el = NULL; 728 delete setToAdopt; 729 setToAdopt = NULL; 730 731 error(U_MEMORY_ALLOCATION_ERROR); 732 return; 733 } 734 el->key = tkey; 735 el->val = usetNode; 736 uhash_put(fSetTable, el->key, el, fRB->fStatus); 737 738 return; 739} 740 741 742 743// 744// Assorted Unicode character constants. 745// Numeric because there is no portable way to enter them as literals. 746// (Think EBCDIC). 747// 748static const UChar chCR = 0x0d; // New lines, for terminating comments. 749static const UChar chLF = 0x0a; 750static const UChar chNEL = 0x85; // NEL newline variant 751static const UChar chLS = 0x2028; // Unicode Line Separator 752static const UChar chApos = 0x27; // single quote, for quoted chars. 753static const UChar chPound = 0x23; // '#', introduces a comment. 754static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 755static const UChar chLParen = 0x28; 756static const UChar chRParen = 0x29; 757 758 759//------------------------------------------------------------------------------ 760// 761// stripRules Return a rules string without unnecessary 762// characters. 763// 764//------------------------------------------------------------------------------ 765UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { 766 UnicodeString strippedRules; 767 int rulesLength = rules.length(); 768 for (int idx = 0; idx < rulesLength; ) { 769 UChar ch = rules[idx++]; 770 if (ch == chPound) { 771 while (idx < rulesLength 772 && ch != chCR && ch != chLF && ch != chNEL) 773 { 774 ch = rules[idx++]; 775 } 776 } 777 if (!u_isISOControl(ch)) { 778 strippedRules.append(ch); 779 } 780 } 781 // strippedRules = strippedRules.unescape(); 782 return strippedRules; 783} 784 785 786//------------------------------------------------------------------------------ 787// 788// nextCharLL Low Level Next Char from rule input source. 789// Get a char from the input character iterator, 790// keep track of input position for error reporting. 791// 792//------------------------------------------------------------------------------ 793UChar32 RBBIRuleScanner::nextCharLL() { 794 UChar32 ch; 795 796 if (fNextIndex >= fRB->fRules.length()) { 797 return (UChar32)-1; 798 } 799 ch = fRB->fRules.char32At(fNextIndex); 800 fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); 801 802 if (ch == chCR || 803 ch == chNEL || 804 ch == chLS || 805 (ch == chLF && fLastChar != chCR)) { 806 // Character is starting a new line. Bump up the line number, and 807 // reset the column to 0. 808 fLineNum++; 809 fCharNum=0; 810 if (fQuoteMode) { 811 error(U_BRK_NEW_LINE_IN_QUOTED_STRING); 812 fQuoteMode = FALSE; 813 } 814 } 815 else { 816 // Character is not starting a new line. Except in the case of a 817 // LF following a CR, increment the column position. 818 if (ch != chLF) { 819 fCharNum++; 820 } 821 } 822 fLastChar = ch; 823 return ch; 824} 825 826 827//------------------------------------------------------------------------------ 828// 829// nextChar for rules scanning. At this level, we handle stripping 830// out comments and processing backslash character escapes. 831// The rest of the rules grammar is handled at the next level up. 832// 833//------------------------------------------------------------------------------ 834void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { 835 836 // Unicode Character constants needed for the processing done by nextChar(), 837 // in hex because literals wont work on EBCDIC machines. 838 839 fScanIndex = fNextIndex; 840 c.fChar = nextCharLL(); 841 c.fEscaped = FALSE; 842 843 // 844 // check for '' sequence. 845 // These are recognized in all contexts, whether in quoted text or not. 846 // 847 if (c.fChar == chApos) { 848 if (fRB->fRules.char32At(fNextIndex) == chApos) { 849 c.fChar = nextCharLL(); // get nextChar officially so character counts 850 c.fEscaped = TRUE; // stay correct. 851 } 852 else 853 { 854 // Single quote, by itself. 855 // Toggle quoting mode. 856 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 857 fQuoteMode = !fQuoteMode; 858 if (fQuoteMode == TRUE) { 859 c.fChar = chLParen; 860 } else { 861 c.fChar = chRParen; 862 } 863 c.fEscaped = FALSE; // The paren that we return is not escaped. 864 return; 865 } 866 } 867 868 if (fQuoteMode) { 869 c.fEscaped = TRUE; 870 } 871 else 872 { 873 // We are not in a 'quoted region' of the source. 874 // 875 if (c.fChar == chPound) { 876 // Start of a comment. Consume the rest of it. 877 // The new-line char that terminates the comment is always returned. 878 // It will be treated as white-space, and serves to break up anything 879 // that might otherwise incorrectly clump together with a comment in 880 // the middle (a variable name, for example.) 881 for (;;) { 882 c.fChar = nextCharLL(); 883 if (c.fChar == (UChar32)-1 || // EOF 884 c.fChar == chCR || 885 c.fChar == chLF || 886 c.fChar == chNEL || 887 c.fChar == chLS) {break;} 888 } 889 } 890 if (c.fChar == (UChar32)-1) { 891 return; 892 } 893 894 // 895 // check for backslash escaped characters. 896 // Use UnicodeString::unescapeAt() to handle them. 897 // 898 if (c.fChar == chBackSlash) { 899 c.fEscaped = TRUE; 900 int32_t startX = fNextIndex; 901 c.fChar = fRB->fRules.unescapeAt(fNextIndex); 902 if (fNextIndex == startX) { 903 error(U_BRK_HEX_DIGITS_EXPECTED); 904 } 905 fCharNum += fNextIndex-startX; 906 } 907 } 908 // putc(c.fChar, stdout); 909} 910 911//------------------------------------------------------------------------------ 912// 913// Parse RBBI rules. The state machine for rules parsing is here. 914// The state tables are hand-written in the file rbbirpt.txt, 915// and converted to the form used here by a perl 916// script rbbicst.pl 917// 918//------------------------------------------------------------------------------ 919void RBBIRuleScanner::parse() { 920 uint16_t state; 921 const RBBIRuleTableEl *tableEl; 922 923 if (U_FAILURE(*fRB->fStatus)) { 924 return; 925 } 926 927 state = 1; 928 nextChar(fC); 929 // 930 // Main loop for the rule parsing state machine. 931 // Runs once per state transition. 932 // Each time through optionally performs, depending on the state table, 933 // - an advance to the the next input char 934 // - an action to be performed. 935 // - pushing or popping a state to/from the local state return stack. 936 // 937 for (;;) { 938 // Bail out if anything has gone wrong. 939 // RBBI rule file parsing stops on the first error encountered. 940 if (U_FAILURE(*fRB->fStatus)) { 941 break; 942 } 943 944 // Quit if state == 0. This is the normal way to exit the state machine. 945 // 946 if (state == 0) { 947 break; 948 } 949 950 // Find the state table element that matches the input char from the rule, or the 951 // class of the input character. Start with the first table row for this 952 // state, then linearly scan forward until we find a row that matches the 953 // character. The last row for each state always matches all characters, so 954 // the search will stop there, if not before. 955 // 956 tableEl = &gRuleParseStateTable[state]; 957 #ifdef RBBI_DEBUG 958 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { 959 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ", 960 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); 961 } 962 #endif 963 964 for (;;) { 965 #ifdef RBBI_DEBUG 966 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");} 967 #endif 968 if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) { 969 // Table row specified an individual character, not a set, and 970 // the input character is not escaped, and 971 // the input character matched it. 972 break; 973 } 974 if (tableEl->fCharClass == 255) { 975 // Table row specified default, match anything character class. 976 break; 977 } 978 if (tableEl->fCharClass == 254 && fC.fEscaped) { 979 // Table row specified "escaped" and the char was escaped. 980 break; 981 } 982 if (tableEl->fCharClass == 253 && fC.fEscaped && 983 (fC.fChar == 0x50 || fC.fChar == 0x70 )) { 984 // Table row specified "escaped P" and the char is either 'p' or 'P'. 985 break; 986 } 987 if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) { 988 // Table row specified eof and we hit eof on the input. 989 break; 990 } 991 992 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 993 fC.fEscaped == FALSE && // char is not escaped && 994 fC.fChar != (UChar32)-1) { // char is not EOF 995 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 996 // Table row specified a character class, or set of characters, 997 // and the current char matches it. 998 break; 999 } 1000 } 1001 1002 // No match on this row, advance to the next row for this state, 1003 tableEl++; 1004 } 1005 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");} 1006 1007 // 1008 // We've found the row of the state table that matches the current input 1009 // character from the rules string. 1010 // Perform any action specified by this row in the state table. 1011 if (doParseActions((int32_t)tableEl->fAction) == FALSE) { 1012 // Break out of the state machine loop if the 1013 // the action signalled some kind of error, or 1014 // the action was to exit, occurs on normal end-of-rules-input. 1015 break; 1016 } 1017 1018 if (tableEl->fPushState != 0) { 1019 fStackPtr++; 1020 if (fStackPtr >= kStackSize) { 1021 error(U_BRK_INTERNAL_ERROR); 1022 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."); 1023 fStackPtr--; 1024 } 1025 fStack[fStackPtr] = tableEl->fPushState; 1026 } 1027 1028 if (tableEl->fNextChar) { 1029 nextChar(fC); 1030 } 1031 1032 // Get the next state from the table entry, or from the 1033 // state stack if the next state was specified as "pop". 1034 if (tableEl->fNextState != 255) { 1035 state = tableEl->fNextState; 1036 } else { 1037 state = fStack[fStackPtr]; 1038 fStackPtr--; 1039 if (fStackPtr < 0) { 1040 error(U_BRK_INTERNAL_ERROR); 1041 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow."); 1042 fStackPtr++; 1043 } 1044 } 1045 1046 } 1047 1048 // 1049 // If there were NO user specified reverse rules, set up the equivalent of ".*;" 1050 // 1051 if (fRB->fReverseTree == NULL) { 1052 fRB->fReverseTree = pushNewNode(RBBINode::opStar); 1053 RBBINode *operand = pushNewNode(RBBINode::setRef); 1054 findSetFor(kAny, operand); 1055 fRB->fReverseTree->fLeftChild = operand; 1056 operand->fParent = fRB->fReverseTree; 1057 fNodeStackPtr -= 2; 1058 } 1059 1060 1061 // 1062 // Parsing of the input RBBI rules is complete. 1063 // We now have a parse tree for the rule expressions 1064 // and a list of all UnicodeSets that are referenced. 1065 // 1066#ifdef RBBI_DEBUG 1067 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} 1068 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) 1069 { 1070 RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); 1071 fRB->fForwardTree->printTree(TRUE); 1072 RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); 1073 fRB->fReverseTree->printTree(TRUE); 1074 RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); 1075 fRB->fSafeFwdTree->printTree(TRUE); 1076 RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); 1077 fRB->fSafeRevTree->printTree(TRUE); 1078 } 1079#endif 1080} 1081 1082 1083//------------------------------------------------------------------------------ 1084// 1085// printNodeStack for debugging... 1086// 1087//------------------------------------------------------------------------------ 1088#ifdef RBBI_DEBUG 1089void RBBIRuleScanner::printNodeStack(const char *title) { 1090 int i; 1091 RBBIDebugPrintf("%s. Dumping node stack...\n", title); 1092 for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);} 1093} 1094#endif 1095 1096 1097 1098 1099//------------------------------------------------------------------------------ 1100// 1101// pushNewNode create a new RBBINode of the specified type and push it 1102// onto the stack of nodes. 1103// 1104//------------------------------------------------------------------------------ 1105RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { 1106 fNodeStackPtr++; 1107 if (fNodeStackPtr >= kStackSize) { 1108 error(U_BRK_INTERNAL_ERROR); 1109 RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow."); 1110 *fRB->fStatus = U_BRK_INTERNAL_ERROR; 1111 return NULL; 1112 } 1113 fNodeStack[fNodeStackPtr] = new RBBINode(t); 1114 if (fNodeStack[fNodeStackPtr] == NULL) { 1115 *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; 1116 } 1117 return fNodeStack[fNodeStackPtr]; 1118} 1119 1120 1121 1122//------------------------------------------------------------------------------ 1123// 1124// scanSet Construct a UnicodeSet from the text at the current scan 1125// position. Advance the scan position to the first character 1126// after the set. 1127// 1128// A new RBBI setref node referring to the set is pushed onto the node 1129// stack. 1130// 1131// The scan position is normally under the control of the state machine 1132// that controls rule parsing. UnicodeSets, however, are parsed by 1133// the UnicodeSet constructor, not by the RBBI rule parser. 1134// 1135//------------------------------------------------------------------------------ 1136void RBBIRuleScanner::scanSet() { 1137 UnicodeSet *uset; 1138 ParsePosition pos; 1139 int startPos; 1140 int i; 1141 1142 if (U_FAILURE(*fRB->fStatus)) { 1143 return; 1144 } 1145 1146 pos.setIndex(fScanIndex); 1147 startPos = fScanIndex; 1148 UErrorCode localStatus = U_ZERO_ERROR; 1149 uset = new UnicodeSet(fRB->fRules, pos, USET_IGNORE_SPACE, 1150 fSymbolTable, 1151 localStatus); 1152 if (uset == NULL) { 1153 localStatus = U_MEMORY_ALLOCATION_ERROR; 1154 } 1155 if (U_FAILURE(localStatus)) { 1156 // TODO: Get more accurate position of the error from UnicodeSet's return info. 1157 // UnicodeSet appears to not be reporting correctly at this time. 1158 #ifdef RBBI_DEBUG 1159 RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); 1160 #endif 1161 error(localStatus); 1162 delete uset; 1163 return; 1164 } 1165 1166 // Verify that the set contains at least one code point. 1167 // 1168 if (uset->isEmpty()) { 1169 // This set is empty. 1170 // Make it an error, because it almost certainly is not what the user wanted. 1171 // Also, avoids having to think about corner cases in the tree manipulation code 1172 // that occurs later on. 1173 error(U_BRK_RULE_EMPTY_SET); 1174 delete uset; 1175 return; 1176 } 1177 1178 1179 // Advance the RBBI parse postion over the UnicodeSet pattern. 1180 // Don't just set fScanIndex because the line/char positions maintained 1181 // for error reporting would be thrown off. 1182 i = pos.getIndex(); 1183 for (;;) { 1184 if (fNextIndex >= i) { 1185 break; 1186 } 1187 nextCharLL(); 1188 } 1189 1190 if (U_SUCCESS(*fRB->fStatus)) { 1191 RBBINode *n; 1192 1193 n = pushNewNode(RBBINode::setRef); 1194 n->fFirstPos = startPos; 1195 n->fLastPos = fNextIndex; 1196 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1197 // findSetFor() serves several purposes here: 1198 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1199 // - Mantains collection of all sets in use, needed later for establishing 1200 // character categories for run time engine. 1201 // - Eliminates mulitiple instances of the same set. 1202 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1203 findSetFor(n->fText, n, uset); 1204 } 1205 1206} 1207 1208U_NAMESPACE_END 1209 1210#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1211