1 2// 3// file: rbbiscan.cpp 4// 5// Copyright (C) 2002-2012, International Business Machines Corporation and others. 6// All Rights Reserved. 7// 8// This file contains the Rule Based Break Iterator Rule Builder functions for 9// scanning the rules and assembling a parse tree. This is the first phase 10// of compiling the rules. 11// 12// The overall of the rules is managed by class RBBIRuleBuilder, which will 13// create and use an instance of this class as part of the process. 14// 15 16#include "unicode/utypes.h" 17 18#if !UCONFIG_NO_BREAK_ITERATION 19 20#include "unicode/unistr.h" 21#include "unicode/uniset.h" 22#include "unicode/uchar.h" 23#include "unicode/uchriter.h" 24#include "unicode/parsepos.h" 25#include "unicode/parseerr.h" 26#include "cmemory.h" 27#include "cstring.h" 28 29#include "rbbirpt.h" // Contains state table for the rbbi rules parser. 30 // generated by a Perl script. 31#include "rbbirb.h" 32#include "rbbinode.h" 33#include "rbbiscan.h" 34#include "rbbitblb.h" 35 36#include "uassert.h" 37 38#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 39 40//------------------------------------------------------------------------------ 41// 42// Unicode Set init strings for each of the character classes needed for parsing a rule file. 43// (Initialized with hex values for portability to EBCDIC based machines. 44// Really ugly, but there's no good way to avoid it.) 45// 46// The sets are referred to by name in the rbbirpt.txt, which is the 47// source form of the state transition table for the RBBI rule parser. 48// 49//------------------------------------------------------------------------------ 50static const UChar gRuleSet_rule_char_pattern[] = { 51 // [ ^ [ \ p { Z } \ u 0 0 2 0 52 0x5b, 0x5e, 0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5c, 0x75, 0x30, 0x30, 0x32, 0x30, 53 // - \ u 0 0 7 f ] - [ \ p 54 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x37, 0x66, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 55 // { L } ] - [ \ p { N } ] ] 56 0x7b, 0x4c, 0x7d, 0x5d, 0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0x5d, 0}; 57 58static const UChar gRuleSet_name_char_pattern[] = { 59// [ _ \ p { L } \ p { N } ] 60 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0}; 61 62static const UChar gRuleSet_digit_char_pattern[] = { 63// [ 0 - 9 ] 64 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; 65 66static const UChar gRuleSet_name_start_char_pattern[] = { 67// [ _ \ p { L } ] 68 0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 }; 69 70static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any" 71 72 73U_CDECL_BEGIN 74static void U_CALLCONV RBBISetTable_deleter(void *p) { 75 icu::RBBISetTableEl *px = (icu::RBBISetTableEl *)p; 76 delete px->key; 77 // Note: px->val is owned by the linked list "fSetsListHead" in scanner. 78 // Don't delete the value nodes here. 79 uprv_free(px); 80} 81U_CDECL_END 82 83U_NAMESPACE_BEGIN 84 85//------------------------------------------------------------------------------ 86// 87// Constructor. 88// 89//------------------------------------------------------------------------------ 90RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb) 91{ 92 fRB = rb; 93 fStackPtr = 0; 94 fStack[fStackPtr] = 0; 95 fNodeStackPtr = 0; 96 fRuleNum = 0; 97 fNodeStack[0] = NULL; 98 99 fSymbolTable = NULL; 100 fSetTable = NULL; 101 102 fScanIndex = 0; 103 fNextIndex = 0; 104 105 fReverseRule = FALSE; 106 fLookAheadRule = FALSE; 107 108 fLineNum = 1; 109 fCharNum = 0; 110 fQuoteMode = FALSE; 111 112 // Do not check status until after all critical fields are sufficiently initialized 113 // that the destructor can run cleanly. 114 if (U_FAILURE(*rb->fStatus)) { 115 return; 116 } 117 118 // 119 // Set up the constant Unicode Sets. 120 // Note: These could be made static, lazily initialized, and shared among 121 // all instances of RBBIRuleScanners. BUT this is quite a bit simpler, 122 // and the time to build these few sets should be small compared to a 123 // full break iterator build. 124 fRuleSets[kRuleSet_rule_char-128] 125 = UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), *rb->fStatus); 126 // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:] 127 fRuleSets[kRuleSet_white_space-128]. 128 add(9, 0xd).add(0x20).add(0x85).add(0x200e, 0x200f).add(0x2028, 0x2029); 129 fRuleSets[kRuleSet_name_char-128] 130 = UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), *rb->fStatus); 131 fRuleSets[kRuleSet_name_start_char-128] 132 = UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), *rb->fStatus); 133 fRuleSets[kRuleSet_digit_char-128] 134 = UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), *rb->fStatus); 135 if (*rb->fStatus == U_ILLEGAL_ARGUMENT_ERROR) { 136 // This case happens if ICU's data is missing. UnicodeSet tries to look up property 137 // names from the init string, can't find them, and claims an illegal argument. 138 // Change the error so that the actual problem will be clearer to users. 139 *rb->fStatus = U_BRK_INIT_ERROR; 140 } 141 if (U_FAILURE(*rb->fStatus)) { 142 return; 143 } 144 145 fSymbolTable = new RBBISymbolTable(this, rb->fRules, *rb->fStatus); 146 if (fSymbolTable == NULL) { 147 *rb->fStatus = U_MEMORY_ALLOCATION_ERROR; 148 return; 149 } 150 fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, NULL, rb->fStatus); 151 if (U_FAILURE(*rb->fStatus)) { 152 return; 153 } 154 uhash_setValueDeleter(fSetTable, RBBISetTable_deleter); 155} 156 157 158 159//------------------------------------------------------------------------------ 160// 161// Destructor 162// 163//------------------------------------------------------------------------------ 164RBBIRuleScanner::~RBBIRuleScanner() { 165 delete fSymbolTable; 166 if (fSetTable != NULL) { 167 uhash_close(fSetTable); 168 fSetTable = NULL; 169 170 } 171 172 173 // Node Stack. 174 // Normally has one entry, which is the entire parse tree for the rules. 175 // If errors occured, there may be additional subtrees left on the stack. 176 while (fNodeStackPtr > 0) { 177 delete fNodeStack[fNodeStackPtr]; 178 fNodeStackPtr--; 179 } 180 181} 182 183//------------------------------------------------------------------------------ 184// 185// doParseAction Do some action during rule parsing. 186// Called by the parse state machine. 187// Actions build the parse tree and Unicode Sets, 188// and maintain the parse stack for nested expressions. 189// 190// TODO: unify EParseAction and RBBI_RuleParseAction enum types. 191// They represent exactly the same thing. They're separate 192// only to work around enum forward declaration restrictions 193// in some compilers, while at the same time avoiding multiple 194// definitions problems. I'm sure that there's a better way. 195// 196//------------------------------------------------------------------------------ 197UBool RBBIRuleScanner::doParseActions(int32_t action) 198{ 199 RBBINode *n = NULL; 200 201 UBool returnVal = TRUE; 202 203 switch (action) { 204 205 case doExprStart: 206 pushNewNode(RBBINode::opStart); 207 fRuleNum++; 208 break; 209 210 211 case doExprOrOperator: 212 { 213 fixOpStack(RBBINode::precOpCat); 214 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 215 RBBINode *orNode = pushNewNode(RBBINode::opOr); 216 orNode->fLeftChild = operandNode; 217 operandNode->fParent = orNode; 218 } 219 break; 220 221 case doExprCatOperator: 222 // concatenation operator. 223 // For the implicit concatenation of adjacent terms in an expression that are 224 // not separated by any other operator. Action is invoked between the 225 // actions for the two terms. 226 { 227 fixOpStack(RBBINode::precOpCat); 228 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 229 RBBINode *catNode = pushNewNode(RBBINode::opCat); 230 catNode->fLeftChild = operandNode; 231 operandNode->fParent = catNode; 232 } 233 break; 234 235 case doLParen: 236 // Open Paren. 237 // The openParen node is a dummy operation type with a low precedence, 238 // which has the affect of ensuring that any real binary op that 239 // follows within the parens binds more tightly to the operands than 240 // stuff outside of the parens. 241 pushNewNode(RBBINode::opLParen); 242 break; 243 244 case doExprRParen: 245 fixOpStack(RBBINode::precLParen); 246 break; 247 248 case doNOP: 249 break; 250 251 case doStartAssign: 252 // We've just scanned "$variable = " 253 // The top of the node stack has the $variable ref node. 254 255 // Save the start position of the RHS text in the StartExpression node 256 // that precedes the $variableReference node on the stack. 257 // This will eventually be used when saving the full $variable replacement 258 // text as a string. 259 n = fNodeStack[fNodeStackPtr-1]; 260 n->fFirstPos = fNextIndex; // move past the '=' 261 262 // Push a new start-of-expression node; needed to keep parse of the 263 // RHS expression happy. 264 pushNewNode(RBBINode::opStart); 265 break; 266 267 268 269 270 case doEndAssign: 271 { 272 // We have reached the end of an assignement statement. 273 // Current scan char is the ';' that terminates the assignment. 274 275 // Terminate expression, leaves expression parse tree rooted in TOS node. 276 fixOpStack(RBBINode::precStart); 277 278 RBBINode *startExprNode = fNodeStack[fNodeStackPtr-2]; 279 RBBINode *varRefNode = fNodeStack[fNodeStackPtr-1]; 280 RBBINode *RHSExprNode = fNodeStack[fNodeStackPtr]; 281 282 // Save original text of right side of assignment, excluding the terminating ';' 283 // in the root of the node for the right-hand-side expression. 284 RHSExprNode->fFirstPos = startExprNode->fFirstPos; 285 RHSExprNode->fLastPos = fScanIndex; 286 fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText); 287 288 // Expression parse tree becomes l. child of the $variable reference node. 289 varRefNode->fLeftChild = RHSExprNode; 290 RHSExprNode->fParent = varRefNode; 291 292 // Make a symbol table entry for the $variableRef node. 293 fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus); 294 if (U_FAILURE(*fRB->fStatus)) { 295 // This is a round-about way to get the parse position set 296 // so that duplicate symbols error messages include a line number. 297 UErrorCode t = *fRB->fStatus; 298 *fRB->fStatus = U_ZERO_ERROR; 299 error(t); 300 } 301 302 // Clean up the stack. 303 delete startExprNode; 304 fNodeStackPtr-=3; 305 break; 306 } 307 308 case doEndOfRule: 309 { 310 fixOpStack(RBBINode::precStart); // Terminate expression, leaves expression 311 if (U_FAILURE(*fRB->fStatus)) { // parse tree rooted in TOS node. 312 break; 313 } 314#ifdef RBBI_DEBUG 315 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");} 316#endif 317 U_ASSERT(fNodeStackPtr == 1); 318 319 // If this rule includes a look-ahead '/', add a endMark node to the 320 // expression tree. 321 if (fLookAheadRule) { 322 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 323 RBBINode *endNode = pushNewNode(RBBINode::endMark); 324 RBBINode *catNode = pushNewNode(RBBINode::opCat); 325 fNodeStackPtr -= 2; 326 catNode->fLeftChild = thisRule; 327 catNode->fRightChild = endNode; 328 fNodeStack[fNodeStackPtr] = catNode; 329 endNode->fVal = fRuleNum; 330 endNode->fLookAheadEnd = TRUE; 331 } 332 333 // All rule expressions are ORed together. 334 // The ';' that terminates an expression really just functions as a '|' with 335 // a low operator prededence. 336 // 337 // Each of the four sets of rules are collected separately. 338 // (forward, reverse, safe_forward, safe_reverse) 339 // OR this rule into the appropriate group of them. 340 // 341 RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree); 342 343 if (*destRules != NULL) { 344 // This is not the first rule encounted. 345 // OR previous stuff (from *destRules) 346 // with the current rule expression (on the Node Stack) 347 // with the resulting OR expression going to *destRules 348 // 349 RBBINode *thisRule = fNodeStack[fNodeStackPtr]; 350 RBBINode *prevRules = *destRules; 351 RBBINode *orNode = pushNewNode(RBBINode::opOr); 352 orNode->fLeftChild = prevRules; 353 prevRules->fParent = orNode; 354 orNode->fRightChild = thisRule; 355 thisRule->fParent = orNode; 356 *destRules = orNode; 357 } 358 else 359 { 360 // This is the first rule encountered (for this direction). 361 // Just move its parse tree from the stack to *destRules. 362 *destRules = fNodeStack[fNodeStackPtr]; 363 } 364 fReverseRule = FALSE; // in preparation for the next rule. 365 fLookAheadRule = FALSE; 366 fNodeStackPtr = 0; 367 } 368 break; 369 370 371 case doRuleError: 372 error(U_BRK_RULE_SYNTAX); 373 returnVal = FALSE; 374 break; 375 376 377 case doVariableNameExpectedErr: 378 error(U_BRK_RULE_SYNTAX); 379 break; 380 381 382 // 383 // Unary operands + ? * 384 // These all appear after the operand to which they apply. 385 // When we hit one, the operand (may be a whole sub expression) 386 // will be on the top of the stack. 387 // Unary Operator becomes TOS, with the old TOS as its one child. 388 case doUnaryOpPlus: 389 { 390 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 391 RBBINode *plusNode = pushNewNode(RBBINode::opPlus); 392 plusNode->fLeftChild = operandNode; 393 operandNode->fParent = plusNode; 394 } 395 break; 396 397 case doUnaryOpQuestion: 398 { 399 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 400 RBBINode *qNode = pushNewNode(RBBINode::opQuestion); 401 qNode->fLeftChild = operandNode; 402 operandNode->fParent = qNode; 403 } 404 break; 405 406 case doUnaryOpStar: 407 { 408 RBBINode *operandNode = fNodeStack[fNodeStackPtr--]; 409 RBBINode *starNode = pushNewNode(RBBINode::opStar); 410 starNode->fLeftChild = operandNode; 411 operandNode->fParent = starNode; 412 } 413 break; 414 415 case doRuleChar: 416 // A "Rule Character" is any single character that is a literal part 417 // of the regular expression. Like a, b and c in the expression "(abc*) | [:L:]" 418 // These are pretty uncommon in break rules; the terms are more commonly 419 // sets. To keep things uniform, treat these characters like as 420 // sets that just happen to contain only one character. 421 { 422 n = pushNewNode(RBBINode::setRef); 423 findSetFor(UnicodeString(fC.fChar), n); 424 n->fFirstPos = fScanIndex; 425 n->fLastPos = fNextIndex; 426 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 427 break; 428 } 429 430 case doDotAny: 431 // scanned a ".", meaning match any single character. 432 { 433 n = pushNewNode(RBBINode::setRef); 434 findSetFor(UnicodeString(TRUE, kAny, 3), n); 435 n->fFirstPos = fScanIndex; 436 n->fLastPos = fNextIndex; 437 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 438 break; 439 } 440 441 case doSlash: 442 // Scanned a '/', which identifies a look-ahead break position in a rule. 443 n = pushNewNode(RBBINode::lookAhead); 444 n->fVal = fRuleNum; 445 n->fFirstPos = fScanIndex; 446 n->fLastPos = fNextIndex; 447 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 448 fLookAheadRule = TRUE; 449 break; 450 451 452 case doStartTagValue: 453 // Scanned a '{', the opening delimiter for a tag value within a rule. 454 n = pushNewNode(RBBINode::tag); 455 n->fVal = 0; 456 n->fFirstPos = fScanIndex; 457 n->fLastPos = fNextIndex; 458 break; 459 460 case doTagDigit: 461 // Just scanned a decimal digit that's part of a tag value 462 { 463 n = fNodeStack[fNodeStackPtr]; 464 uint32_t v = u_charDigitValue(fC.fChar); 465 U_ASSERT(v < 10); 466 n->fVal = n->fVal*10 + v; 467 break; 468 } 469 470 case doTagValue: 471 n = fNodeStack[fNodeStackPtr]; 472 n->fLastPos = fNextIndex; 473 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 474 break; 475 476 case doTagExpectedError: 477 error(U_BRK_MALFORMED_RULE_TAG); 478 returnVal = FALSE; 479 break; 480 481 case doOptionStart: 482 // Scanning a !!option. At the start of string. 483 fOptionStart = fScanIndex; 484 break; 485 486 case doOptionEnd: 487 { 488 UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart); 489 if (opt == UNICODE_STRING("chain", 5)) { 490 fRB->fChainRules = TRUE; 491 } else if (opt == UNICODE_STRING("LBCMNoChain", 11)) { 492 fRB->fLBCMNoChain = TRUE; 493 } else if (opt == UNICODE_STRING("forward", 7)) { 494 fRB->fDefaultTree = &fRB->fForwardTree; 495 } else if (opt == UNICODE_STRING("reverse", 7)) { 496 fRB->fDefaultTree = &fRB->fReverseTree; 497 } else if (opt == UNICODE_STRING("safe_forward", 12)) { 498 fRB->fDefaultTree = &fRB->fSafeFwdTree; 499 } else if (opt == UNICODE_STRING("safe_reverse", 12)) { 500 fRB->fDefaultTree = &fRB->fSafeRevTree; 501 } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) { 502 fRB->fLookAheadHardBreak = TRUE; 503 } else { 504 error(U_BRK_UNRECOGNIZED_OPTION); 505 } 506 } 507 break; 508 509 case doReverseDir: 510 fReverseRule = TRUE; 511 break; 512 513 case doStartVariableName: 514 n = pushNewNode(RBBINode::varRef); 515 if (U_FAILURE(*fRB->fStatus)) { 516 break; 517 } 518 n->fFirstPos = fScanIndex; 519 break; 520 521 case doEndVariableName: 522 n = fNodeStack[fNodeStackPtr]; 523 if (n==NULL || n->fType != RBBINode::varRef) { 524 error(U_BRK_INTERNAL_ERROR); 525 break; 526 } 527 n->fLastPos = fScanIndex; 528 fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText); 529 // Look the newly scanned name up in the symbol table 530 // If there's an entry, set the l. child of the var ref to the replacement expression. 531 // (We also pass through here when scanning assignments, but no harm is done, other 532 // than a slight wasted effort that seems hard to avoid. Lookup will be null) 533 n->fLeftChild = fSymbolTable->lookupNode(n->fText); 534 break; 535 536 case doCheckVarDef: 537 n = fNodeStack[fNodeStackPtr]; 538 if (n->fLeftChild == NULL) { 539 error(U_BRK_UNDEFINED_VARIABLE); 540 returnVal = FALSE; 541 } 542 break; 543 544 case doExprFinished: 545 break; 546 547 case doRuleErrorAssignExpr: 548 error(U_BRK_ASSIGN_ERROR); 549 returnVal = FALSE; 550 break; 551 552 case doExit: 553 returnVal = FALSE; 554 break; 555 556 case doScanUnicodeSet: 557 scanSet(); 558 break; 559 560 default: 561 error(U_BRK_INTERNAL_ERROR); 562 returnVal = FALSE; 563 break; 564 } 565 return returnVal; 566} 567 568 569 570 571//------------------------------------------------------------------------------ 572// 573// Error Report a rule parse error. 574// Only report it if no previous error has been recorded. 575// 576//------------------------------------------------------------------------------ 577void RBBIRuleScanner::error(UErrorCode e) { 578 if (U_SUCCESS(*fRB->fStatus)) { 579 *fRB->fStatus = e; 580 if (fRB->fParseError) { 581 fRB->fParseError->line = fLineNum; 582 fRB->fParseError->offset = fCharNum; 583 fRB->fParseError->preContext[0] = 0; 584 fRB->fParseError->preContext[0] = 0; 585 } 586 } 587} 588 589 590 591 592//------------------------------------------------------------------------------ 593// 594// fixOpStack The parse stack holds partially assembled chunks of the parse tree. 595// An entry on the stack may be as small as a single setRef node, 596// or as large as the parse tree 597// for an entire expression (this will be the one item left on the stack 598// when the parsing of an RBBI rule completes. 599// 600// This function is called when a binary operator is encountered. 601// It looks back up the stack for operators that are not yet associated 602// with a right operand, and if the precedence of the stacked operator >= 603// the precedence of the current operator, binds the operand left, 604// to the previously encountered operator. 605// 606//------------------------------------------------------------------------------ 607void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) { 608 RBBINode *n; 609 // printNodeStack("entering fixOpStack()"); 610 for (;;) { 611 n = fNodeStack[fNodeStackPtr-1]; // an operator node 612 if (n->fPrecedence == 0) { 613 RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node"); 614 error(U_BRK_INTERNAL_ERROR); 615 return; 616 } 617 618 if (n->fPrecedence < p || n->fPrecedence <= RBBINode::precLParen) { 619 // The most recent operand goes with the current operator, 620 // not with the previously stacked one. 621 break; 622 } 623 // Stack operator is a binary op ( '|' or concatenation) 624 // TOS operand becomes right child of this operator. 625 // Resulting subexpression becomes the TOS operand. 626 n->fRightChild = fNodeStack[fNodeStackPtr]; 627 fNodeStack[fNodeStackPtr]->fParent = n; 628 fNodeStackPtr--; 629 // printNodeStack("looping in fixOpStack() "); 630 } 631 632 if (p <= RBBINode::precLParen) { 633 // Scan is at a right paren or end of expression. 634 // The scanned item must match the stack, or else there was an error. 635 // Discard the left paren (or start expr) node from the stack, 636 // leaving the completed (sub)expression as TOS. 637 if (n->fPrecedence != p) { 638 // Right paren encountered matched start of expression node, or 639 // end of expression matched with a left paren node. 640 error(U_BRK_MISMATCHED_PAREN); 641 } 642 fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr]; 643 fNodeStackPtr--; 644 // Delete the now-discarded LParen or Start node. 645 delete n; 646 } 647 // printNodeStack("leaving fixOpStack()"); 648} 649 650 651 652 653//------------------------------------------------------------------------------ 654// 655// findSetFor given a UnicodeString, 656// - find the corresponding Unicode Set (uset node) 657// (create one if necessary) 658// - Set fLeftChild of the caller's node (should be a setRef node) 659// to the uset node 660// Maintain a hash table of uset nodes, so the same one is always used 661// for the same string. 662// If a "to adopt" set is provided and we haven't seen this key before, 663// add the provided set to the hash table. 664// If the string is one (32 bit) char in length, the set contains 665// just one element which is the char in question. 666// If the string is "any", return a set containing all chars. 667// 668//------------------------------------------------------------------------------ 669void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) { 670 671 RBBISetTableEl *el; 672 673 // First check whether we've already cached a set for this string. 674 // If so, just use the cached set in the new node. 675 // delete any set provided by the caller, since we own it. 676 el = (RBBISetTableEl *)uhash_get(fSetTable, &s); 677 if (el != NULL) { 678 delete setToAdopt; 679 node->fLeftChild = el->val; 680 U_ASSERT(node->fLeftChild->fType == RBBINode::uset); 681 return; 682 } 683 684 // Haven't seen this set before. 685 // If the caller didn't provide us with a prebuilt set, 686 // create a new UnicodeSet now. 687 if (setToAdopt == NULL) { 688 if (s.compare(kAny, -1) == 0) { 689 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 690 } else { 691 UChar32 c; 692 c = s.char32At(0); 693 setToAdopt = new UnicodeSet(c, c); 694 } 695 } 696 697 // 698 // Make a new uset node to refer to this UnicodeSet 699 // This new uset node becomes the child of the caller's setReference node. 700 // 701 RBBINode *usetNode = new RBBINode(RBBINode::uset); 702 if (usetNode == NULL) { 703 error(U_MEMORY_ALLOCATION_ERROR); 704 return; 705 } 706 usetNode->fInputSet = setToAdopt; 707 usetNode->fParent = node; 708 node->fLeftChild = usetNode; 709 usetNode->fText = s; 710 711 712 // 713 // Add the new uset node to the list of all uset nodes. 714 // 715 fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus); 716 717 718 // 719 // Add the new set to the set hash table. 720 // 721 el = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl)); 722 UnicodeString *tkey = new UnicodeString(s); 723 if (tkey == NULL || el == NULL || setToAdopt == NULL) { 724 // Delete to avoid memory leak 725 delete tkey; 726 tkey = NULL; 727 uprv_free(el); 728 el = NULL; 729 delete setToAdopt; 730 setToAdopt = NULL; 731 732 error(U_MEMORY_ALLOCATION_ERROR); 733 return; 734 } 735 el->key = tkey; 736 el->val = usetNode; 737 uhash_put(fSetTable, el->key, el, fRB->fStatus); 738 739 return; 740} 741 742 743 744// 745// Assorted Unicode character constants. 746// Numeric because there is no portable way to enter them as literals. 747// (Think EBCDIC). 748// 749static const UChar chCR = 0x0d; // New lines, for terminating comments. 750static const UChar chLF = 0x0a; 751static const UChar chNEL = 0x85; // NEL newline variant 752static const UChar chLS = 0x2028; // Unicode Line Separator 753static const UChar chApos = 0x27; // single quote, for quoted chars. 754static const UChar chPound = 0x23; // '#', introduces a comment. 755static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 756static const UChar chLParen = 0x28; 757static const UChar chRParen = 0x29; 758 759 760//------------------------------------------------------------------------------ 761// 762// stripRules Return a rules string without unnecessary 763// characters. 764// 765//------------------------------------------------------------------------------ 766UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { 767 UnicodeString strippedRules; 768 int rulesLength = rules.length(); 769 for (int idx = 0; idx < rulesLength; ) { 770 UChar ch = rules[idx++]; 771 if (ch == chPound) { 772 while (idx < rulesLength 773 && ch != chCR && ch != chLF && ch != chNEL) 774 { 775 ch = rules[idx++]; 776 } 777 } 778 if (!u_isISOControl(ch)) { 779 strippedRules.append(ch); 780 } 781 } 782 // strippedRules = strippedRules.unescape(); 783 return strippedRules; 784} 785 786 787//------------------------------------------------------------------------------ 788// 789// nextCharLL Low Level Next Char from rule input source. 790// Get a char from the input character iterator, 791// keep track of input position for error reporting. 792// 793//------------------------------------------------------------------------------ 794UChar32 RBBIRuleScanner::nextCharLL() { 795 UChar32 ch; 796 797 if (fNextIndex >= fRB->fRules.length()) { 798 return (UChar32)-1; 799 } 800 ch = fRB->fRules.char32At(fNextIndex); 801 fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1); 802 803 if (ch == chCR || 804 ch == chNEL || 805 ch == chLS || 806 (ch == chLF && fLastChar != chCR)) { 807 // Character is starting a new line. Bump up the line number, and 808 // reset the column to 0. 809 fLineNum++; 810 fCharNum=0; 811 if (fQuoteMode) { 812 error(U_BRK_NEW_LINE_IN_QUOTED_STRING); 813 fQuoteMode = FALSE; 814 } 815 } 816 else { 817 // Character is not starting a new line. Except in the case of a 818 // LF following a CR, increment the column position. 819 if (ch != chLF) { 820 fCharNum++; 821 } 822 } 823 fLastChar = ch; 824 return ch; 825} 826 827 828//------------------------------------------------------------------------------ 829// 830// nextChar for rules scanning. At this level, we handle stripping 831// out comments and processing backslash character escapes. 832// The rest of the rules grammar is handled at the next level up. 833// 834//------------------------------------------------------------------------------ 835void RBBIRuleScanner::nextChar(RBBIRuleChar &c) { 836 837 // Unicode Character constants needed for the processing done by nextChar(), 838 // in hex because literals wont work on EBCDIC machines. 839 840 fScanIndex = fNextIndex; 841 c.fChar = nextCharLL(); 842 c.fEscaped = FALSE; 843 844 // 845 // check for '' sequence. 846 // These are recognized in all contexts, whether in quoted text or not. 847 // 848 if (c.fChar == chApos) { 849 if (fRB->fRules.char32At(fNextIndex) == chApos) { 850 c.fChar = nextCharLL(); // get nextChar officially so character counts 851 c.fEscaped = TRUE; // stay correct. 852 } 853 else 854 { 855 // Single quote, by itself. 856 // Toggle quoting mode. 857 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 858 fQuoteMode = !fQuoteMode; 859 if (fQuoteMode == TRUE) { 860 c.fChar = chLParen; 861 } else { 862 c.fChar = chRParen; 863 } 864 c.fEscaped = FALSE; // The paren that we return is not escaped. 865 return; 866 } 867 } 868 869 if (fQuoteMode) { 870 c.fEscaped = TRUE; 871 } 872 else 873 { 874 // We are not in a 'quoted region' of the source. 875 // 876 if (c.fChar == chPound) { 877 // Start of a comment. Consume the rest of it. 878 // The new-line char that terminates the comment is always returned. 879 // It will be treated as white-space, and serves to break up anything 880 // that might otherwise incorrectly clump together with a comment in 881 // the middle (a variable name, for example.) 882 for (;;) { 883 c.fChar = nextCharLL(); 884 if (c.fChar == (UChar32)-1 || // EOF 885 c.fChar == chCR || 886 c.fChar == chLF || 887 c.fChar == chNEL || 888 c.fChar == chLS) {break;} 889 } 890 } 891 if (c.fChar == (UChar32)-1) { 892 return; 893 } 894 895 // 896 // check for backslash escaped characters. 897 // Use UnicodeString::unescapeAt() to handle them. 898 // 899 if (c.fChar == chBackSlash) { 900 c.fEscaped = TRUE; 901 int32_t startX = fNextIndex; 902 c.fChar = fRB->fRules.unescapeAt(fNextIndex); 903 if (fNextIndex == startX) { 904 error(U_BRK_HEX_DIGITS_EXPECTED); 905 } 906 fCharNum += fNextIndex-startX; 907 } 908 } 909 // putc(c.fChar, stdout); 910} 911 912//------------------------------------------------------------------------------ 913// 914// Parse RBBI rules. The state machine for rules parsing is here. 915// The state tables are hand-written in the file rbbirpt.txt, 916// and converted to the form used here by a perl 917// script rbbicst.pl 918// 919//------------------------------------------------------------------------------ 920void RBBIRuleScanner::parse() { 921 uint16_t state; 922 const RBBIRuleTableEl *tableEl; 923 924 if (U_FAILURE(*fRB->fStatus)) { 925 return; 926 } 927 928 state = 1; 929 nextChar(fC); 930 // 931 // Main loop for the rule parsing state machine. 932 // Runs once per state transition. 933 // Each time through optionally performs, depending on the state table, 934 // - an advance to the the next input char 935 // - an action to be performed. 936 // - pushing or popping a state to/from the local state return stack. 937 // 938 for (;;) { 939 // Bail out if anything has gone wrong. 940 // RBBI rule file parsing stops on the first error encountered. 941 if (U_FAILURE(*fRB->fStatus)) { 942 break; 943 } 944 945 // Quit if state == 0. This is the normal way to exit the state machine. 946 // 947 if (state == 0) { 948 break; 949 } 950 951 // Find the state table element that matches the input char from the rule, or the 952 // class of the input character. Start with the first table row for this 953 // state, then linearly scan forward until we find a row that matches the 954 // character. The last row for each state always matches all characters, so 955 // the search will stop there, if not before. 956 // 957 tableEl = &gRuleParseStateTable[state]; 958 #ifdef RBBI_DEBUG 959 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { 960 RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d) state=%s ", 961 fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]); 962 } 963 #endif 964 965 for (;;) { 966 #ifdef RBBI_DEBUG 967 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf(".");} 968 #endif 969 if (tableEl->fCharClass < 127 && fC.fEscaped == FALSE && tableEl->fCharClass == fC.fChar) { 970 // Table row specified an individual character, not a set, and 971 // the input character is not escaped, and 972 // the input character matched it. 973 break; 974 } 975 if (tableEl->fCharClass == 255) { 976 // Table row specified default, match anything character class. 977 break; 978 } 979 if (tableEl->fCharClass == 254 && fC.fEscaped) { 980 // Table row specified "escaped" and the char was escaped. 981 break; 982 } 983 if (tableEl->fCharClass == 253 && fC.fEscaped && 984 (fC.fChar == 0x50 || fC.fChar == 0x70 )) { 985 // Table row specified "escaped P" and the char is either 'p' or 'P'. 986 break; 987 } 988 if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) { 989 // Table row specified eof and we hit eof on the input. 990 break; 991 } 992 993 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 994 fC.fEscaped == FALSE && // char is not escaped && 995 fC.fChar != (UChar32)-1) { // char is not EOF 996 U_ASSERT((tableEl->fCharClass-128) < LENGTHOF(fRuleSets)); 997 if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 998 // Table row specified a character class, or set of characters, 999 // and the current char matches it. 1000 break; 1001 } 1002 } 1003 1004 // No match on this row, advance to the next row for this state, 1005 tableEl++; 1006 } 1007 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");} 1008 1009 // 1010 // We've found the row of the state table that matches the current input 1011 // character from the rules string. 1012 // Perform any action specified by this row in the state table. 1013 if (doParseActions((int32_t)tableEl->fAction) == FALSE) { 1014 // Break out of the state machine loop if the 1015 // the action signalled some kind of error, or 1016 // the action was to exit, occurs on normal end-of-rules-input. 1017 break; 1018 } 1019 1020 if (tableEl->fPushState != 0) { 1021 fStackPtr++; 1022 if (fStackPtr >= kStackSize) { 1023 error(U_BRK_INTERNAL_ERROR); 1024 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow."); 1025 fStackPtr--; 1026 } 1027 fStack[fStackPtr] = tableEl->fPushState; 1028 } 1029 1030 if (tableEl->fNextChar) { 1031 nextChar(fC); 1032 } 1033 1034 // Get the next state from the table entry, or from the 1035 // state stack if the next state was specified as "pop". 1036 if (tableEl->fNextState != 255) { 1037 state = tableEl->fNextState; 1038 } else { 1039 state = fStack[fStackPtr]; 1040 fStackPtr--; 1041 if (fStackPtr < 0) { 1042 error(U_BRK_INTERNAL_ERROR); 1043 RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow."); 1044 fStackPtr++; 1045 } 1046 } 1047 1048 } 1049 1050 // 1051 // If there were NO user specified reverse rules, set up the equivalent of ".*;" 1052 // 1053 if (fRB->fReverseTree == NULL) { 1054 fRB->fReverseTree = pushNewNode(RBBINode::opStar); 1055 RBBINode *operand = pushNewNode(RBBINode::setRef); 1056 findSetFor(UnicodeString(TRUE, kAny, 3), operand); 1057 fRB->fReverseTree->fLeftChild = operand; 1058 operand->fParent = fRB->fReverseTree; 1059 fNodeStackPtr -= 2; 1060 } 1061 1062 1063 // 1064 // Parsing of the input RBBI rules is complete. 1065 // We now have a parse tree for the rule expressions 1066 // and a list of all UnicodeSets that are referenced. 1067 // 1068#ifdef RBBI_DEBUG 1069 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();} 1070 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) 1071 { 1072 RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n"); 1073 fRB->fForwardTree->printTree(TRUE); 1074 RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n"); 1075 fRB->fReverseTree->printTree(TRUE); 1076 RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n"); 1077 fRB->fSafeFwdTree->printTree(TRUE); 1078 RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n"); 1079 fRB->fSafeRevTree->printTree(TRUE); 1080 } 1081#endif 1082} 1083 1084 1085//------------------------------------------------------------------------------ 1086// 1087// printNodeStack for debugging... 1088// 1089//------------------------------------------------------------------------------ 1090#ifdef RBBI_DEBUG 1091void RBBIRuleScanner::printNodeStack(const char *title) { 1092 int i; 1093 RBBIDebugPrintf("%s. Dumping node stack...\n", title); 1094 for (i=fNodeStackPtr; i>0; i--) {fNodeStack[i]->printTree(TRUE);} 1095} 1096#endif 1097 1098 1099 1100 1101//------------------------------------------------------------------------------ 1102// 1103// pushNewNode create a new RBBINode of the specified type and push it 1104// onto the stack of nodes. 1105// 1106//------------------------------------------------------------------------------ 1107RBBINode *RBBIRuleScanner::pushNewNode(RBBINode::NodeType t) { 1108 fNodeStackPtr++; 1109 if (fNodeStackPtr >= kStackSize) { 1110 error(U_BRK_INTERNAL_ERROR); 1111 RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow."); 1112 *fRB->fStatus = U_BRK_INTERNAL_ERROR; 1113 return NULL; 1114 } 1115 fNodeStack[fNodeStackPtr] = new RBBINode(t); 1116 if (fNodeStack[fNodeStackPtr] == NULL) { 1117 *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR; 1118 } 1119 return fNodeStack[fNodeStackPtr]; 1120} 1121 1122 1123 1124//------------------------------------------------------------------------------ 1125// 1126// scanSet Construct a UnicodeSet from the text at the current scan 1127// position. Advance the scan position to the first character 1128// after the set. 1129// 1130// A new RBBI setref node referring to the set is pushed onto the node 1131// stack. 1132// 1133// The scan position is normally under the control of the state machine 1134// that controls rule parsing. UnicodeSets, however, are parsed by 1135// the UnicodeSet constructor, not by the RBBI rule parser. 1136// 1137//------------------------------------------------------------------------------ 1138void RBBIRuleScanner::scanSet() { 1139 UnicodeSet *uset; 1140 ParsePosition pos; 1141 int startPos; 1142 int i; 1143 1144 if (U_FAILURE(*fRB->fStatus)) { 1145 return; 1146 } 1147 1148 pos.setIndex(fScanIndex); 1149 startPos = fScanIndex; 1150 UErrorCode localStatus = U_ZERO_ERROR; 1151 uset = new UnicodeSet(); 1152 if (uset == NULL) { 1153 localStatus = U_MEMORY_ALLOCATION_ERROR; 1154 } else { 1155 uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus); 1156 } 1157 if (U_FAILURE(localStatus)) { 1158 // TODO: Get more accurate position of the error from UnicodeSet's return info. 1159 // UnicodeSet appears to not be reporting correctly at this time. 1160 #ifdef RBBI_DEBUG 1161 RBBIDebugPrintf("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()); 1162 #endif 1163 error(localStatus); 1164 delete uset; 1165 return; 1166 } 1167 1168 // Verify that the set contains at least one code point. 1169 // 1170 U_ASSERT(uset!=NULL); 1171 if (uset->isEmpty()) { 1172 // This set is empty. 1173 // Make it an error, because it almost certainly is not what the user wanted. 1174 // Also, avoids having to think about corner cases in the tree manipulation code 1175 // that occurs later on. 1176 error(U_BRK_RULE_EMPTY_SET); 1177 delete uset; 1178 return; 1179 } 1180 1181 1182 // Advance the RBBI parse postion over the UnicodeSet pattern. 1183 // Don't just set fScanIndex because the line/char positions maintained 1184 // for error reporting would be thrown off. 1185 i = pos.getIndex(); 1186 for (;;) { 1187 if (fNextIndex >= i) { 1188 break; 1189 } 1190 nextCharLL(); 1191 } 1192 1193 if (U_SUCCESS(*fRB->fStatus)) { 1194 RBBINode *n; 1195 1196 n = pushNewNode(RBBINode::setRef); 1197 n->fFirstPos = startPos; 1198 n->fLastPos = fNextIndex; 1199 fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText); 1200 // findSetFor() serves several purposes here: 1201 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1202 // - Mantains collection of all sets in use, needed later for establishing 1203 // character categories for run time engine. 1204 // - Eliminates mulitiple instances of the same set. 1205 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1206 findSetFor(n->fText, n, uset); 1207 } 1208 1209} 1210 1211U_NAMESPACE_END 1212 1213#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1214