1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5 ******************************************************************************* 6 * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10package android.icu.text; 11 12import java.text.ParsePosition; 13import java.util.HashMap; 14 15import android.icu.impl.Assert; 16import android.icu.impl.Utility; 17import android.icu.lang.UCharacter; 18 19/** 20 * This class is part of the Rule Based Break Iterator rule compiler. 21 * It scans the rules and builds the parse tree. 22 * There is no public API here. 23 */ 24class RBBIRuleScanner { 25 26 private final static int kStackSize = 100; // The size of the state stack for 27 // rules parsing. Corresponds roughly 28 // to the depth of parentheses nesting 29 // that is allowed in the rules. 30 31 static class RBBIRuleChar { 32 int fChar; 33 boolean fEscaped; 34 } 35 36 37 38 RBBIRuleBuilder fRB; // The rule builder that we are part of. 39 40 int fScanIndex; // Index of current character being processed 41 // in the rule input string. 42 int fNextIndex; // Index of the next character, which 43 // is the first character not yet scanned. 44 boolean fQuoteMode; // Scan is in a 'quoted region' 45 int fLineNum; // Line number in input file. 46 int fCharNum; // Char position within the line. 47 int fLastChar; // Previous char, needed to count CR-LF 48 // as a single line, not two. 49 50 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine 51 // processing. 52 53 54 short fStack[] = new short[kStackSize]; // State stack, holds state pushes 55 int fStackPtr; // and pops as specified in the state 56 // transition rules. 57 58 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created 59 // during the parse of a rule 60 int fNodeStackPtr; 61 62 63 boolean fReverseRule; // True if the rule currently being scanned 64 // is a reverse direction rule (if it 65 // starts with a '!') 66 67 boolean fLookAheadRule; // True if the rule includes a '/' 68 // somewhere within it. 69 70 boolean fNoChainInRule; // True if the current rule starts with a '^'. 71 72 73 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of 74 // $variable symbols. 75 76 HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to 77 // the sets created while parsing rules. 78 // The key is the string used for creating 79 // the set. 80 81 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during 82 // the scanning of RBBI rules. The 83 // indicies for these are assigned by the 84 // perl script that builds the state tables. 85 // See rbbirpt.h. 86 87 int fRuleNum; // Counts each rule as it is scanned. 88 89 int fOptionStart; // Input index of start of a !!option 90 // keyword, while being scanned. 91 92 93 94 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]"; 95 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]"; 96 static private String gRuleSet_digit_char_pattern = "[0-9]"; 97 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]"; 98 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]"; 99 static private String kAny = "any"; 100 101 102 103 104 //---------------------------------------------------------------------------------------- 105 // 106 // Constructor. 107 // 108 //---------------------------------------------------------------------------------------- 109 RBBIRuleScanner(RBBIRuleBuilder rb) { 110 fRB = rb; 111 fLineNum = 1; 112 113 // 114 // Set up the constant Unicode Sets. 115 // Note: These could be made static and shared among 116 // all instances of RBBIRuleScanners. 117 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern); 118 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern); 119 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern); 120 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern); 121 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern); 122 123 fSymbolTable = new RBBISymbolTable(this); 124 } 125 126 //---------------------------------------------------------------------------------------- 127 // 128 // doParseAction Do some action during rule parsing. 129 // Called by the parse state machine. 130 // Actions build the parse tree and Unicode Sets, 131 // and maintain the parse stack for nested expressions. 132 // 133 //---------------------------------------------------------------------------------------- 134 boolean doParseActions(int action) { 135 RBBINode n = null; 136 137 boolean returnVal = true; 138 139 switch (action) { 140 141 case RBBIRuleParseTable.doExprStart: 142 pushNewNode(RBBINode.opStart); 143 fRuleNum++; 144 break; 145 146 case RBBIRuleParseTable.doNoChain: 147 // Scanned a '^' while on the rule start state. 148 fNoChainInRule = true; 149 break; 150 151 152 case RBBIRuleParseTable.doExprOrOperator: { 153 fixOpStack(RBBINode.precOpCat); 154 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 155 RBBINode orNode = pushNewNode(RBBINode.opOr); 156 orNode.fLeftChild = operandNode; 157 operandNode.fParent = orNode; 158 } 159 break; 160 161 case RBBIRuleParseTable.doExprCatOperator: 162 // concatenation operator. 163 // For the implicit concatenation of adjacent terms in an expression 164 // that are 165 // not separated by any other operator. Action is invoked between the 166 // actions for the two terms. 167 { 168 fixOpStack(RBBINode.precOpCat); 169 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 170 RBBINode catNode = pushNewNode(RBBINode.opCat); 171 catNode.fLeftChild = operandNode; 172 operandNode.fParent = catNode; 173 } 174 break; 175 176 case RBBIRuleParseTable.doLParen: 177 // Open Paren. 178 // The openParen node is a dummy operation type with a low 179 // precedence, 180 // which has the affect of ensuring that any real binary op that 181 // follows within the parens binds more tightly to the operands than 182 // stuff outside of the parens. 183 pushNewNode(RBBINode.opLParen); 184 break; 185 186 case RBBIRuleParseTable.doExprRParen: 187 fixOpStack(RBBINode.precLParen); 188 break; 189 190 case RBBIRuleParseTable.doNOP: 191 break; 192 193 case RBBIRuleParseTable.doStartAssign: 194 // We've just scanned "$variable = " 195 // The top of the node stack has the $variable ref node. 196 197 // Save the start position of the RHS text in the StartExpression 198 // node 199 // that precedes the $variableReference node on the stack. 200 // This will eventually be used when saving the full $variable 201 // replacement 202 // text as a string. 203 n = fNodeStack[fNodeStackPtr - 1]; 204 n.fFirstPos = fNextIndex; // move past the '=' 205 206 // Push a new start-of-expression node; needed to keep parse of the 207 // RHS expression happy. 208 pushNewNode(RBBINode.opStart); 209 break; 210 211 case RBBIRuleParseTable.doEndAssign: { 212 // We have reached the end of an assignement statement. 213 // Current scan char is the ';' that terminates the assignment. 214 215 // Terminate expression, leaves expression parse tree rooted in TOS 216 // node. 217 fixOpStack(RBBINode.precStart); 218 219 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2]; 220 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1]; 221 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr]; 222 223 // Save original text of right side of assignment, excluding the 224 // terminating ';' 225 // in the root of the node for the right-hand-side expression. 226 RHSExprNode.fFirstPos = startExprNode.fFirstPos; 227 RHSExprNode.fLastPos = fScanIndex; 228 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, 229 // RHSExprNode.fLastPos, RHSExprNode.fText); 230 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos, 231 RHSExprNode.fLastPos); 232 233 // Expression parse tree becomes l. child of the $variable reference 234 // node. 235 varRefNode.fLeftChild = RHSExprNode; 236 RHSExprNode.fParent = varRefNode; 237 238 // Make a symbol table entry for the $variableRef node. 239 fSymbolTable.addEntry(varRefNode.fText, varRefNode); 240 241 // Clean up the stack. 242 fNodeStackPtr -= 3; 243 break; 244 } 245 246 case RBBIRuleParseTable.doEndOfRule: { 247 fixOpStack(RBBINode.precStart); // Terminate expression, leaves 248 // expression 249 250 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) { 251 printNodeStack("end of rule"); 252 } 253 Assert.assrt(fNodeStackPtr == 1); 254 RBBINode thisRule = fNodeStack[fNodeStackPtr]; 255 256 // If this rule includes a look-ahead '/', add a endMark node to the 257 // expression tree. 258 if (fLookAheadRule) { 259 RBBINode endNode = pushNewNode(RBBINode.endMark); 260 RBBINode catNode = pushNewNode(RBBINode.opCat); 261 fNodeStackPtr -= 2; 262 catNode.fLeftChild = thisRule; 263 catNode.fRightChild = endNode; 264 fNodeStack[fNodeStackPtr] = catNode; 265 endNode.fVal = fRuleNum; 266 endNode.fLookAheadEnd = true; 267 thisRule = catNode; 268 269 // TODO: Disable chaining out of look-ahead (hard break) rules. 270 // The break on rule match is forced, so there is no point in building up 271 // the state table to chain into another rule for a longer match. 272 } 273 274 // Mark this node as being the root of a rule. 275 thisRule.fRuleRoot = true; 276 277 // Flag if chaining into this rule is wanted. 278 // 279 if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain 280 !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule 281 thisRule.fChainIn = true; 282 } 283 284 285 // All rule expressions are ORed together. 286 // The ';' that terminates an expression really just functions as a 287 // '|' with 288 // a low operator prededence. 289 // 290 // Each of the four sets of rules are collected separately. 291 // (forward, reverse, safe_forward, safe_reverse) 292 // OR this rule into the appropriate group of them. 293 // 294 295 int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree); 296 297 if (fRB.fTreeRoots[destRules] != null) { 298 // This is not the first rule encountered. 299 // OR previous stuff (from *destRules) 300 // with the current rule expression (on the Node Stack) 301 // with the resulting OR expression going to *destRules 302 // 303 thisRule = fNodeStack[fNodeStackPtr]; 304 RBBINode prevRules = fRB.fTreeRoots[destRules]; 305 RBBINode orNode = pushNewNode(RBBINode.opOr); 306 orNode.fLeftChild = prevRules; 307 prevRules.fParent = orNode; 308 orNode.fRightChild = thisRule; 309 thisRule.fParent = orNode; 310 fRB.fTreeRoots[destRules] = orNode; 311 } else { 312 // This is the first rule encountered (for this direction). 313 // Just move its parse tree from the stack to *destRules. 314 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr]; 315 } 316 fReverseRule = false; // in preparation for the next rule. 317 fLookAheadRule = false; 318 fNoChainInRule = false; 319 fNodeStackPtr = 0; 320 } 321 break; 322 323 case RBBIRuleParseTable.doRuleError: 324 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 325 returnVal = false; 326 break; 327 328 case RBBIRuleParseTable.doVariableNameExpectedErr: 329 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 330 break; 331 332 // 333 // Unary operands + ? * 334 // These all appear after the operand to which they apply. 335 // When we hit one, the operand (may be a whole sub expression) 336 // will be on the top of the stack. 337 // Unary Operator becomes TOS, with the old TOS as its one child. 338 case RBBIRuleParseTable.doUnaryOpPlus: { 339 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 340 RBBINode plusNode = pushNewNode(RBBINode.opPlus); 341 plusNode.fLeftChild = operandNode; 342 operandNode.fParent = plusNode; 343 } 344 break; 345 346 case RBBIRuleParseTable.doUnaryOpQuestion: { 347 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 348 RBBINode qNode = pushNewNode(RBBINode.opQuestion); 349 qNode.fLeftChild = operandNode; 350 operandNode.fParent = qNode; 351 } 352 break; 353 354 case RBBIRuleParseTable.doUnaryOpStar: { 355 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 356 RBBINode starNode = pushNewNode(RBBINode.opStar); 357 starNode.fLeftChild = operandNode; 358 operandNode.fParent = starNode; 359 } 360 break; 361 362 case RBBIRuleParseTable.doRuleChar: 363 // A "Rule Character" is any single character that is a literal part 364 // of the regular expression. Like a, b and c in the expression "(abc*) 365 // | [:L:]" 366 // These are pretty uncommon in break rules; the terms are more commonly 367 // sets. To keep things uniform, treat these characters like as 368 // sets that just happen to contain only one character. 369 { 370 n = pushNewNode(RBBINode.setRef); 371 String s = String.valueOf((char)fC.fChar); 372 findSetFor(s, n, null); 373 n.fFirstPos = fScanIndex; 374 n.fLastPos = fNextIndex; 375 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 376 break; 377 } 378 379 case RBBIRuleParseTable.doDotAny: 380 // scanned a ".", meaning match any single character. 381 { 382 n = pushNewNode(RBBINode.setRef); 383 findSetFor(kAny, n, null); 384 n.fFirstPos = fScanIndex; 385 n.fLastPos = fNextIndex; 386 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 387 break; 388 } 389 390 case RBBIRuleParseTable.doSlash: 391 // Scanned a '/', which identifies a look-ahead break position in a 392 // rule. 393 n = pushNewNode(RBBINode.lookAhead); 394 n.fVal = fRuleNum; 395 n.fFirstPos = fScanIndex; 396 n.fLastPos = fNextIndex; 397 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 398 fLookAheadRule = true; 399 break; 400 401 case RBBIRuleParseTable.doStartTagValue: 402 // Scanned a '{', the opening delimiter for a tag value within a 403 // rule. 404 n = pushNewNode(RBBINode.tag); 405 n.fVal = 0; 406 n.fFirstPos = fScanIndex; 407 n.fLastPos = fNextIndex; 408 break; 409 410 case RBBIRuleParseTable.doTagDigit: 411 // Just scanned a decimal digit that's part of a tag value 412 { 413 n = fNodeStack[fNodeStackPtr]; 414 int v = UCharacter.digit((char) fC.fChar, 10); 415 n.fVal = n.fVal * 10 + v; 416 break; 417 } 418 419 case RBBIRuleParseTable.doTagValue: 420 n = fNodeStack[fNodeStackPtr]; 421 n.fLastPos = fNextIndex; 422 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 423 break; 424 425 case RBBIRuleParseTable.doTagExpectedError: 426 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG); 427 returnVal = false; 428 break; 429 430 case RBBIRuleParseTable.doOptionStart: 431 // Scanning a !!option. At the start of string. 432 fOptionStart = fScanIndex; 433 break; 434 435 case RBBIRuleParseTable.doOptionEnd: { 436 String opt = fRB.fRules.substring(fOptionStart, fScanIndex); 437 if (opt.equals("chain")) { 438 fRB.fChainRules = true; 439 } else if (opt.equals("LBCMNoChain")) { 440 fRB.fLBCMNoChain = true; 441 } else if (opt.equals("forward")) { 442 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree; 443 } else if (opt.equals("reverse")) { 444 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree; 445 } else if (opt.equals("safe_forward")) { 446 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree; 447 } else if (opt.equals("safe_reverse")) { 448 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree; 449 } else if (opt.equals("lookAheadHardBreak")) { 450 fRB.fLookAheadHardBreak = true; 451 } else { 452 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION); 453 } 454 break; 455 } 456 457 case RBBIRuleParseTable.doReverseDir: 458 fReverseRule = true; 459 break; 460 461 case RBBIRuleParseTable.doStartVariableName: 462 n = pushNewNode(RBBINode.varRef); 463 n.fFirstPos = fScanIndex; 464 break; 465 466 case RBBIRuleParseTable.doEndVariableName: 467 n = fNodeStack[fNodeStackPtr]; 468 if (n == null || n.fType != RBBINode.varRef) { 469 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 470 break; 471 } 472 n.fLastPos = fScanIndex; 473 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos); 474 // Look the newly scanned name up in the symbol table 475 // If there's an entry, set the l. child of the var ref to the 476 // replacement expression. 477 // (We also pass through here when scanning assignments, but no harm 478 // is done, other 479 // than a slight wasted effort that seems hard to avoid. Lookup will 480 // be null) 481 n.fLeftChild = fSymbolTable.lookupNode(n.fText); 482 break; 483 484 case RBBIRuleParseTable.doCheckVarDef: 485 n = fNodeStack[fNodeStackPtr]; 486 if (n.fLeftChild == null) { 487 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE); 488 returnVal = false; 489 } 490 break; 491 492 case RBBIRuleParseTable.doExprFinished: 493 break; 494 495 case RBBIRuleParseTable.doRuleErrorAssignExpr: 496 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR); 497 returnVal = false; 498 break; 499 500 case RBBIRuleParseTable.doExit: 501 returnVal = false; 502 break; 503 504 case RBBIRuleParseTable.doScanUnicodeSet: 505 scanSet(); 506 break; 507 508 default: 509 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 510 returnVal = false; 511 break; 512 } 513 return returnVal; 514 } 515 516 //---------------------------------------------------------------------------------------- 517 // 518 // Error Throw and IllegalArgumentException in response to a rule parse 519 // error. 520 // 521 //---------------------------------------------------------------------------------------- 522 void error(int e) { 523 String s = "Error " + e + " at line " + fLineNum + " column " 524 + fCharNum; 525 IllegalArgumentException ex = new IllegalArgumentException(s); 526 throw ex; 527 528 } 529 530 //---------------------------------------------------------------------------------------- 531 // 532 // fixOpStack The parse stack holds partially assembled chunks of the parse 533 // tree. 534 // An entry on the stack may be as small as a single setRef node, 535 // or as large as the parse tree 536 // for an entire expression (this will be the one item left on the stack 537 // when the parsing of an RBBI rule completes. 538 // 539 // This function is called when a binary operator is encountered. 540 // It looks back up the stack for operators that are not yet associated 541 // with a right operand, and if the precedence of the stacked operator >= 542 // the precedence of the current operator, binds the operand left, 543 // to the previously encountered operator. 544 // 545 //---------------------------------------------------------------------------------------- 546 void fixOpStack(int p) { 547 RBBINode n; 548 // printNodeStack("entering fixOpStack()"); 549 for (;;) { 550 n = fNodeStack[fNodeStackPtr - 1]; // an operator node 551 if (n.fPrecedence == 0) { 552 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node"); 553 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 554 return; 555 } 556 557 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) { 558 // The most recent operand goes with the current operator, 559 // not with the previously stacked one. 560 break; 561 } 562 // Stack operator is a binary op ( '|' or concatenation) 563 // TOS operand becomes right child of this operator. 564 // Resulting subexpression becomes the TOS operand. 565 n.fRightChild = fNodeStack[fNodeStackPtr]; 566 fNodeStack[fNodeStackPtr].fParent = n; 567 fNodeStackPtr--; 568 // printNodeStack("looping in fixOpStack() "); 569 } 570 571 if (p <= RBBINode.precLParen) { 572 // Scan is at a right paren or end of expression. 573 // The scanned item must match the stack, or else there was an 574 // error. 575 // Discard the left paren (or start expr) node from the stack, 576 // leaving the completed (sub)expression as TOS. 577 if (n.fPrecedence != p) { 578 // Right paren encountered matched start of expression node, or 579 // end of expression matched with a left paren node. 580 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN); 581 } 582 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr]; 583 fNodeStackPtr--; 584 // Delete the now-discarded LParen or Start node. 585 // delete n; 586 } 587 // printNodeStack("leaving fixOpStack()"); 588 } 589 590 //---------------------------------------------------------------------------- 591 // 592 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 593 // been encountered. The val Node will be of nodetype uset 594 // and contain pointers to the actual UnicodeSets. 595 // The Key is the source string for initializing the set. 596 // 597 // The hash table is used to avoid creating duplicate 598 // unnamed (not $var references) UnicodeSets. 599 // 600 //---------------------------------------------------------------------------- 601 static class RBBISetTableEl { 602 String key; 603 604 RBBINode val; 605 } 606 607 608 //---------------------------------------------------------------------------------------- 609 // 610 // findSetFor given a String, 611 // - find the corresponding Unicode Set (uset node) 612 // (create one if necessary) 613 // - Set fLeftChild of the caller's node (should be a setRef node) 614 // to the uset node 615 // Maintain a hash table of uset nodes, so the same one is always used 616 // for the same string. 617 // If a "to adopt" set is provided and we haven't seen this key before, 618 // add the provided set to the hash table. 619 // If the string is one (32 bit) char in length, the set contains 620 // just one element which is the char in question. 621 // If the string is "any", return a set containing all chars. 622 // 623 //---------------------------------------------------------------------------------------- 624 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) { 625 626 RBBISetTableEl el; 627 628 // First check whether we've already cached a set for this string. 629 // If so, just use the cached set in the new node. 630 // delete any set provided by the caller, since we own it. 631 el = fSetTable.get(s); 632 if (el != null) { 633 node.fLeftChild = el.val; 634 Assert.assrt(node.fLeftChild.fType == RBBINode.uset); 635 return; 636 } 637 638 // Haven't seen this set before. 639 // If the caller didn't provide us with a prebuilt set, 640 // create a new UnicodeSet now. 641 if (setToAdopt == null) { 642 if (s.equals(kAny)) { 643 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 644 } else { 645 int c; 646 c = UTF16.charAt(s, 0); 647 setToAdopt = new UnicodeSet(c, c); 648 } 649 } 650 651 // 652 // Make a new uset node to refer to this UnicodeSet 653 // This new uset node becomes the child of the caller's setReference 654 // node. 655 // 656 RBBINode usetNode = new RBBINode(RBBINode.uset); 657 usetNode.fInputSet = setToAdopt; 658 usetNode.fParent = node; 659 node.fLeftChild = usetNode; 660 usetNode.fText = s; 661 662 // 663 // Add the new uset node to the list of all uset nodes. 664 // 665 fRB.fUSetNodes.add(usetNode); 666 667 // 668 // Add the new set to the set hash table. 669 // 670 el = new RBBISetTableEl(); 671 el.key = s; 672 el.val = usetNode; 673 fSetTable.put(el.key, el); 674 675 return; 676 } 677 678 // 679 // Assorted Unicode character constants. 680 // Numeric because there is no portable way to enter them as literals. 681 // (Think EBCDIC). 682 // 683 static final int chNEL = 0x85; // NEL newline variant 684 685 static final int chLS = 0x2028; // Unicode Line Separator 686 687 //---------------------------------------------------------------------------------------- 688 // 689 // stripRules Return a rules string without unnecessary 690 // characters. 691 // 692 //---------------------------------------------------------------------------------------- 693 static String stripRules(String rules) { 694 StringBuilder strippedRules = new StringBuilder(); 695 int rulesLength = rules.length(); 696 for (int idx = 0; idx < rulesLength;) { 697 char ch = rules.charAt(idx++); 698 if (ch == '#') { 699 while (idx < rulesLength 700 && ch != '\r' && ch != '\n' && ch != chNEL) { 701 ch = rules.charAt(idx++); 702 } 703 } 704 if (!UCharacter.isISOControl(ch)) { 705 strippedRules.append(ch); 706 } 707 } 708 return strippedRules.toString(); 709 } 710 711 //---------------------------------------------------------------------------------------- 712 // 713 // nextCharLL Low Level Next Char from rule input source. 714 // Get a char from the input character iterator, 715 // keep track of input position for error reporting. 716 // 717 //---------------------------------------------------------------------------------------- 718 int nextCharLL() { 719 int ch; 720 721 if (fNextIndex >= fRB.fRules.length()) { 722 return -1; 723 } 724 ch = UTF16.charAt(fRB.fRules, fNextIndex); 725 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1); 726 727 if (ch == '\r' || 728 ch == chNEL || 729 ch == chLS || 730 ch == '\n' && fLastChar != '\r') { 731 // Character is starting a new line. Bump up the line number, and 732 // reset the column to 0. 733 fLineNum++; 734 fCharNum = 0; 735 if (fQuoteMode) { 736 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING); 737 fQuoteMode = false; 738 } 739 } else { 740 // Character is not starting a new line. Except in the case of a 741 // LF following a CR, increment the column position. 742 if (ch != '\n') { 743 fCharNum++; 744 } 745 } 746 fLastChar = ch; 747 return ch; 748 } 749 750 //--------------------------------------------------------------------------------- 751 // 752 // nextChar for rules scanning. At this level, we handle stripping 753 // out comments and processing backslash character escapes. 754 // The rest of the rules grammar is handled at the next level up. 755 // 756 //--------------------------------------------------------------------------------- 757 void nextChar(RBBIRuleChar c) { 758 759 // Unicode Character constants needed for the processing done by nextChar(), 760 // in hex because literals wont work on EBCDIC machines. 761 762 fScanIndex = fNextIndex; 763 c.fChar = nextCharLL(); 764 c.fEscaped = false; 765 766 // 767 // check for '' sequence. 768 // These are recognized in all contexts, whether in quoted text or not. 769 // 770 if (c.fChar == '\'') { 771 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') { 772 c.fChar = nextCharLL(); // get nextChar officially so character counts 773 c.fEscaped = true; // stay correct. 774 } else { 775 // Single quote, by itself. 776 // Toggle quoting mode. 777 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 778 fQuoteMode = !fQuoteMode; 779 if (fQuoteMode == true) { 780 c.fChar = '('; 781 } else { 782 c.fChar = ')'; 783 } 784 c.fEscaped = false; // The paren that we return is not escaped. 785 return; 786 } 787 } 788 789 if (fQuoteMode) { 790 c.fEscaped = true; 791 } else { 792 // We are not in a 'quoted region' of the source. 793 // 794 if (c.fChar == '#') { 795 // Start of a comment. Consume the rest of it. 796 // The new-line char that terminates the comment is always returned. 797 // It will be treated as white-space, and serves to break up anything 798 // that might otherwise incorrectly clump together with a comment in 799 // the middle (a variable name, for example.) 800 for (;;) { 801 c.fChar = nextCharLL(); 802 if (c.fChar == -1 || // EOF 803 c.fChar == '\r' || 804 c.fChar == '\n' || 805 c.fChar == chNEL || 806 c.fChar == chLS) 807 { 808 break; 809 } 810 } 811 } 812 if (c.fChar == -1) { 813 return; 814 } 815 816 // 817 // check for backslash escaped characters. 818 // Use String.unescapeAt() to handle them. 819 // 820 if (c.fChar == '\\') { 821 c.fEscaped = true; 822 int[] unescapeIndex = new int[1]; 823 unescapeIndex[0] = fNextIndex; 824 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex); 825 if (unescapeIndex[0] == fNextIndex) { 826 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED); 827 } 828 829 fCharNum += unescapeIndex[0] - fNextIndex; 830 fNextIndex = unescapeIndex[0]; 831 } 832 } 833 // putc(c.fChar, stdout); 834 } 835 836 //--------------------------------------------------------------------------------- 837 // 838 // Parse RBBI rules. The state machine for rules parsing is here. 839 // The state tables are hand-written in the file rbbirpt.txt, 840 // and converted to the form used here by a perl 841 // script rbbicst.pl 842 // 843 //--------------------------------------------------------------------------------- 844 void parse() { 845 int state; 846 RBBIRuleParseTable.RBBIRuleTableElement tableEl; 847 848 state = 1; 849 nextChar(fC); 850 // 851 // Main loop for the rule parsing state machine. 852 // Runs once per state transition. 853 // Each time through optionally performs, depending on the state table, 854 // - an advance to the the next input char 855 // - an action to be performed. 856 // - pushing or popping a state to/from the local state return stack. 857 // 858 for (;;) { 859 // Quit if state == 0. This is the normal way to exit the state machine. 860 // 861 if (state == 0) { 862 break; 863 } 864 865 // Find the state table element that matches the input char from the rule, or the 866 // class of the input character. Start with the first table row for this 867 // state, then linearly scan forward until we find a row that matches the 868 // character. The last row for each state always matches all characters, so 869 // the search will stop there, if not before. 870 // 871 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state]; 872 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 873 System.out.println("char, line, col = (\'" + (char) fC.fChar 874 + "\', " + fLineNum + ", " + fCharNum + " state = " 875 + tableEl.fStateName); 876 } 877 878 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state. 879 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow]; 880 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 881 System.out.print("."); 882 } 883 if (tableEl.fCharClass < 127 && fC.fEscaped == false 884 && tableEl.fCharClass == fC.fChar) { 885 // Table row specified an individual character, not a set, and 886 // the input character is not escaped, and 887 // the input character matched it. 888 break; 889 } 890 if (tableEl.fCharClass == 255) { 891 // Table row specified default, match anything character class. 892 break; 893 } 894 if (tableEl.fCharClass == 254 && fC.fEscaped) { 895 // Table row specified "escaped" and the char was escaped. 896 break; 897 } 898 if (tableEl.fCharClass == 253 && fC.fEscaped 899 && (fC.fChar == 0x50 || fC.fChar == 0x70)) { 900 // Table row specified "escaped P" and the char is either 'p' or 'P'. 901 break; 902 } 903 if (tableEl.fCharClass == 252 && fC.fChar == -1) { 904 // Table row specified eof and we hit eof on the input. 905 break; 906 } 907 908 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class && 909 fC.fEscaped == false && // char is not escaped && 910 fC.fChar != -1) { // char is not EOF 911 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128]; 912 if (uniset.contains(fC.fChar)) { 913 // Table row specified a character class, or set of characters, 914 // and the current char matches it. 915 break; 916 } 917 } 918 } 919 920 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 921 System.out.println(""); 922 } 923 // 924 // We've found the row of the state table that matches the current input 925 // character from the rules string. 926 // Perform any action specified by this row in the state table. 927 if (doParseActions(tableEl.fAction) == false) { 928 // Break out of the state machine loop if the 929 // the action signalled some kind of error, or 930 // the action was to exit, occurs on normal end-of-rules-input. 931 break; 932 } 933 934 if (tableEl.fPushState != 0) { 935 fStackPtr++; 936 if (fStackPtr >= kStackSize) { 937 System.out.println("RBBIRuleScanner.parse() - state stack overflow."); 938 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 939 } 940 fStack[fStackPtr] = tableEl.fPushState; 941 } 942 943 if (tableEl.fNextChar) { 944 nextChar(fC); 945 } 946 947 // Get the next state from the table entry, or from the 948 // state stack if the next state was specified as "pop". 949 if (tableEl.fNextState != 255) { 950 state = tableEl.fNextState; 951 } else { 952 state = fStack[fStackPtr]; 953 fStackPtr--; 954 if (fStackPtr < 0) { 955 System.out.println("RBBIRuleScanner.parse() - state stack underflow."); 956 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 957 } 958 } 959 960 } 961 962 // If there are no forward rules throw an error. 963 // 964 if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) { 965 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 966 } 967 968 // 969 // If there were NO user specified reverse rules, set up the equivalent of ".*;" 970 // 971 if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) { 972 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar); 973 RBBINode operand = pushNewNode(RBBINode.setRef); 974 findSetFor(kAny, operand, null); 975 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand; 976 operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree]; 977 fNodeStackPtr -= 2; 978 } 979 980 // 981 // Parsing of the input RBBI rules is complete. 982 // We now have a parse tree for the rule expressions 983 // and a list of all UnicodeSets that are referenced. 984 // 985 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) { 986 fSymbolTable.rbbiSymtablePrint(); 987 } 988 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) { 989 System.out.println("Completed Forward Rules Parse Tree..."); 990 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true); 991 System.out.println("\nCompleted Reverse Rules Parse Tree..."); 992 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true); 993 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree..."); 994 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) { 995 System.out.println(" -- null -- "); 996 } else { 997 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true); 998 } 999 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree..."); 1000 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) { 1001 System.out.println(" -- null -- "); 1002 } else { 1003 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true); 1004 } 1005 } 1006 } 1007 1008 //--------------------------------------------------------------------------------- 1009 // 1010 // printNodeStack for debugging... 1011 // 1012 //--------------------------------------------------------------------------------- 1013 ///CLOVER:OFF 1014 void printNodeStack(String title) { 1015 int i; 1016 System.out.println(title + ". Dumping node stack...\n"); 1017 for (i = fNodeStackPtr; i > 0; i--) { 1018 fNodeStack[i].printTree(true); 1019 } 1020 } 1021 ///CLOVER:ON 1022 1023 //--------------------------------------------------------------------------------- 1024 // 1025 // pushNewNode create a new RBBINode of the specified type and push it 1026 // onto the stack of nodes. 1027 // 1028 //--------------------------------------------------------------------------------- 1029 RBBINode pushNewNode(int nodeType) { 1030 fNodeStackPtr++; 1031 if (fNodeStackPtr >= kStackSize) { 1032 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow."); 1033 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 1034 } 1035 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType); 1036 return fNodeStack[fNodeStackPtr]; 1037 } 1038 1039 //--------------------------------------------------------------------------------- 1040 // 1041 // scanSet Construct a UnicodeSet from the text at the current scan 1042 // position. Advance the scan position to the first character 1043 // after the set. 1044 // 1045 // A new RBBI setref node referring to the set is pushed onto the node 1046 // stack. 1047 // 1048 // The scan position is normally under the control of the state machine 1049 // that controls rule parsing. UnicodeSets, however, are parsed by 1050 // the UnicodeSet constructor, not by the RBBI rule parser. 1051 // 1052 //--------------------------------------------------------------------------------- 1053 void scanSet() { 1054 UnicodeSet uset = null; 1055 int startPos; 1056 ParsePosition pos = new ParsePosition(fScanIndex); 1057 int i; 1058 1059 startPos = fScanIndex; 1060 try { 1061 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE); 1062 } catch (Exception e) { // TODO: catch fewer exception types. 1063 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info. 1064 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET); 1065 } 1066 1067 // Verify that the set contains at least one code point. 1068 // 1069 if (uset.isEmpty()) { 1070 // This set is empty. 1071 // Make it an error, because it almost certainly is not what the user wanted. 1072 // Also, avoids having to think about corner cases in the tree manipulation code 1073 // that occurs later on. 1074 // TODO: this shouldn't be an error; it does happen. 1075 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET); 1076 } 1077 1078 // Advance the RBBI parse postion over the UnicodeSet pattern. 1079 // Don't just set fScanIndex because the line/char positions maintained 1080 // for error reporting would be thrown off. 1081 i = pos.getIndex(); 1082 for (;;) { 1083 if (fNextIndex >= i) { 1084 break; 1085 } 1086 nextCharLL(); 1087 } 1088 1089 RBBINode n; 1090 1091 n = pushNewNode(RBBINode.setRef); 1092 n.fFirstPos = startPos; 1093 n.fLastPos = fNextIndex; 1094 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 1095 // findSetFor() serves several purposes here: 1096 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1097 // - Mantains collection of all sets in use, needed later for establishing 1098 // character categories for run time engine. 1099 // - Eliminates mulitiple instances of the same set. 1100 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1101 findSetFor(n.fText, n, uset); 1102 } 1103 1104} 1105 1106