1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4 ******************************************************************************* 5 * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved. 6 ******************************************************************************* 7 */ 8 9package com.ibm.icu.text; 10 11import java.text.ParsePosition; 12import java.util.HashMap; 13 14import com.ibm.icu.impl.Assert; 15import com.ibm.icu.impl.Utility; 16import com.ibm.icu.lang.UCharacter; 17 18/** 19 * This class is part of the Rule Based Break Iterator rule compiler. 20 * It scans the rules and builds the parse tree. 21 * There is no public API here. 22 */ 23class RBBIRuleScanner { 24 25 private final static int kStackSize = 100; // The size of the state stack for 26 // rules parsing. Corresponds roughly 27 // to the depth of parentheses nesting 28 // that is allowed in the rules. 29 30 static class RBBIRuleChar { 31 int fChar; 32 boolean fEscaped; 33 } 34 35 36 37 RBBIRuleBuilder fRB; // The rule builder that we are part of. 38 39 int fScanIndex; // Index of current character being processed 40 // in the rule input string. 41 int fNextIndex; // Index of the next character, which 42 // is the first character not yet scanned. 43 boolean fQuoteMode; // Scan is in a 'quoted region' 44 int fLineNum; // Line number in input file. 45 int fCharNum; // Char position within the line. 46 int fLastChar; // Previous char, needed to count CR-LF 47 // as a single line, not two. 48 49 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine 50 // processing. 51 52 53 short fStack[] = new short[kStackSize]; // State stack, holds state pushes 54 int fStackPtr; // and pops as specified in the state 55 // transition rules. 56 57 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created 58 // during the parse of a rule 59 int fNodeStackPtr; 60 61 62 boolean fReverseRule; // True if the rule currently being scanned 63 // is a reverse direction rule (if it 64 // starts with a '!') 65 66 boolean fLookAheadRule; // True if the rule includes a '/' 67 // somewhere within it. 68 69 boolean fNoChainInRule; // True if the current rule starts with a '^'. 70 71 72 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of 73 // $variable symbols. 74 75 HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to 76 // the sets created while parsing rules. 77 // The key is the string used for creating 78 // the set. 79 80 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during 81 // the scanning of RBBI rules. The 82 // Indices for these are assigned by the 83 // perl script that builds the state tables. 84 // See rbbirpt.h. 85 86 int fRuleNum; // Counts each rule as it is scanned. 87 88 int fOptionStart; // Input index of start of a !!option 89 // keyword, while being scanned. 90 91 92 // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without escaping or quoting. 93 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]"; 94 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]"; 95 static private String gRuleSet_digit_char_pattern = "[0-9]"; 96 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]"; 97 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]"; 98 static private String kAny = "any"; 99 100 101 102 103 //---------------------------------------------------------------------------------------- 104 // 105 // Constructor. 106 // 107 //---------------------------------------------------------------------------------------- 108 RBBIRuleScanner(RBBIRuleBuilder rb) { 109 fRB = rb; 110 fLineNum = 1; 111 112 // 113 // Set up the constant Unicode Sets. 114 // Note: These could be made static and shared among 115 // all instances of RBBIRuleScanners. 116 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern); 117 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern); 118 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern); 119 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern); 120 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern); 121 122 fSymbolTable = new RBBISymbolTable(this); 123 } 124 125 //---------------------------------------------------------------------------------------- 126 // 127 // doParseAction Do some action during rule parsing. 128 // Called by the parse state machine. 129 // Actions build the parse tree and Unicode Sets, 130 // and maintain the parse stack for nested expressions. 131 // 132 //---------------------------------------------------------------------------------------- 133 boolean doParseActions(int action) { 134 RBBINode n = null; 135 136 boolean returnVal = true; 137 138 switch (action) { 139 140 case RBBIRuleParseTable.doExprStart: 141 pushNewNode(RBBINode.opStart); 142 fRuleNum++; 143 break; 144 145 case RBBIRuleParseTable.doNoChain: 146 // Scanned a '^' while on the rule start state. 147 fNoChainInRule = true; 148 break; 149 150 151 case RBBIRuleParseTable.doExprOrOperator: { 152 fixOpStack(RBBINode.precOpCat); 153 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 154 RBBINode orNode = pushNewNode(RBBINode.opOr); 155 orNode.fLeftChild = operandNode; 156 operandNode.fParent = orNode; 157 } 158 break; 159 160 case RBBIRuleParseTable.doExprCatOperator: 161 // concatenation operator. 162 // For the implicit concatenation of adjacent terms in an expression 163 // that are 164 // not separated by any other operator. Action is invoked between the 165 // actions for the two terms. 166 { 167 fixOpStack(RBBINode.precOpCat); 168 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 169 RBBINode catNode = pushNewNode(RBBINode.opCat); 170 catNode.fLeftChild = operandNode; 171 operandNode.fParent = catNode; 172 } 173 break; 174 175 case RBBIRuleParseTable.doLParen: 176 // Open Paren. 177 // The openParen node is a dummy operation type with a low 178 // precedence, 179 // which has the affect of ensuring that any real binary op that 180 // follows within the parens binds more tightly to the operands than 181 // stuff outside of the parens. 182 pushNewNode(RBBINode.opLParen); 183 break; 184 185 case RBBIRuleParseTable.doExprRParen: 186 fixOpStack(RBBINode.precLParen); 187 break; 188 189 case RBBIRuleParseTable.doNOP: 190 break; 191 192 case RBBIRuleParseTable.doStartAssign: 193 // We've just scanned "$variable = " 194 // The top of the node stack has the $variable ref node. 195 196 // Save the start position of the RHS text in the StartExpression 197 // node 198 // that precedes the $variableReference node on the stack. 199 // This will eventually be used when saving the full $variable 200 // replacement 201 // text as a string. 202 n = fNodeStack[fNodeStackPtr - 1]; 203 n.fFirstPos = fNextIndex; // move past the '=' 204 205 // Push a new start-of-expression node; needed to keep parse of the 206 // RHS expression happy. 207 pushNewNode(RBBINode.opStart); 208 break; 209 210 case RBBIRuleParseTable.doEndAssign: { 211 // We have reached the end of an assignement statement. 212 // Current scan char is the ';' that terminates the assignment. 213 214 // Terminate expression, leaves expression parse tree rooted in TOS 215 // node. 216 fixOpStack(RBBINode.precStart); 217 218 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2]; 219 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1]; 220 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr]; 221 222 // Save original text of right side of assignment, excluding the 223 // terminating ';' 224 // in the root of the node for the right-hand-side expression. 225 RHSExprNode.fFirstPos = startExprNode.fFirstPos; 226 RHSExprNode.fLastPos = fScanIndex; 227 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, 228 // RHSExprNode.fLastPos, RHSExprNode.fText); 229 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos, 230 RHSExprNode.fLastPos); 231 232 // Expression parse tree becomes l. child of the $variable reference 233 // node. 234 varRefNode.fLeftChild = RHSExprNode; 235 RHSExprNode.fParent = varRefNode; 236 237 // Make a symbol table entry for the $variableRef node. 238 fSymbolTable.addEntry(varRefNode.fText, varRefNode); 239 240 // Clean up the stack. 241 fNodeStackPtr -= 3; 242 break; 243 } 244 245 case RBBIRuleParseTable.doEndOfRule: { 246 fixOpStack(RBBINode.precStart); // Terminate expression, leaves 247 // expression 248 249 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) { 250 printNodeStack("end of rule"); 251 } 252 Assert.assrt(fNodeStackPtr == 1); 253 RBBINode thisRule = fNodeStack[fNodeStackPtr]; 254 255 // If this rule includes a look-ahead '/', add a endMark node to the 256 // expression tree. 257 if (fLookAheadRule) { 258 RBBINode endNode = pushNewNode(RBBINode.endMark); 259 RBBINode catNode = pushNewNode(RBBINode.opCat); 260 fNodeStackPtr -= 2; 261 catNode.fLeftChild = thisRule; 262 catNode.fRightChild = endNode; 263 fNodeStack[fNodeStackPtr] = catNode; 264 endNode.fVal = fRuleNum; 265 endNode.fLookAheadEnd = true; 266 thisRule = catNode; 267 268 // TODO: Disable chaining out of look-ahead (hard break) rules. 269 // The break on rule match is forced, so there is no point in building up 270 // the state table to chain into another rule for a longer match. 271 } 272 273 // Mark this node as being the root of a rule. 274 thisRule.fRuleRoot = true; 275 276 // Flag if chaining into this rule is wanted. 277 // 278 if (fRB.fChainRules && // If rule chaining is enabled globally via !!chain 279 !fNoChainInRule) { // and no '^' chain-in inhibit was on this rule 280 thisRule.fChainIn = true; 281 } 282 283 284 // All rule expressions are ORed together. 285 // The ';' that terminates an expression really just functions as a 286 // '|' with 287 // a low operator prededence. 288 // 289 // Each of the four sets of rules are collected separately. 290 // (forward, reverse, safe_forward, safe_reverse) 291 // OR this rule into the appropriate group of them. 292 // 293 294 int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree); 295 296 if (fRB.fTreeRoots[destRules] != null) { 297 // This is not the first rule encountered. 298 // OR previous stuff (from *destRules) 299 // with the current rule expression (on the Node Stack) 300 // with the resulting OR expression going to *destRules 301 // 302 thisRule = fNodeStack[fNodeStackPtr]; 303 RBBINode prevRules = fRB.fTreeRoots[destRules]; 304 RBBINode orNode = pushNewNode(RBBINode.opOr); 305 orNode.fLeftChild = prevRules; 306 prevRules.fParent = orNode; 307 orNode.fRightChild = thisRule; 308 thisRule.fParent = orNode; 309 fRB.fTreeRoots[destRules] = orNode; 310 } else { 311 // This is the first rule encountered (for this direction). 312 // Just move its parse tree from the stack to *destRules. 313 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr]; 314 } 315 fReverseRule = false; // in preparation for the next rule. 316 fLookAheadRule = false; 317 fNoChainInRule = false; 318 fNodeStackPtr = 0; 319 } 320 break; 321 322 case RBBIRuleParseTable.doRuleError: 323 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 324 returnVal = false; 325 break; 326 327 case RBBIRuleParseTable.doVariableNameExpectedErr: 328 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 329 break; 330 331 // 332 // Unary operands + ? * 333 // These all appear after the operand to which they apply. 334 // When we hit one, the operand (may be a whole sub expression) 335 // will be on the top of the stack. 336 // Unary Operator becomes TOS, with the old TOS as its one child. 337 case RBBIRuleParseTable.doUnaryOpPlus: { 338 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 339 RBBINode plusNode = pushNewNode(RBBINode.opPlus); 340 plusNode.fLeftChild = operandNode; 341 operandNode.fParent = plusNode; 342 } 343 break; 344 345 case RBBIRuleParseTable.doUnaryOpQuestion: { 346 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 347 RBBINode qNode = pushNewNode(RBBINode.opQuestion); 348 qNode.fLeftChild = operandNode; 349 operandNode.fParent = qNode; 350 } 351 break; 352 353 case RBBIRuleParseTable.doUnaryOpStar: { 354 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 355 RBBINode starNode = pushNewNode(RBBINode.opStar); 356 starNode.fLeftChild = operandNode; 357 operandNode.fParent = starNode; 358 } 359 break; 360 361 case RBBIRuleParseTable.doRuleChar: 362 // A "Rule Character" is any single character that is a literal part 363 // of the regular expression. Like a, b and c in the expression "(abc*) 364 // | [:L:]" 365 // These are pretty uncommon in break rules; the terms are more commonly 366 // sets. To keep things uniform, treat these characters like as 367 // sets that just happen to contain only one character. 368 { 369 n = pushNewNode(RBBINode.setRef); 370 String s = String.valueOf((char)fC.fChar); 371 findSetFor(s, n, null); 372 n.fFirstPos = fScanIndex; 373 n.fLastPos = fNextIndex; 374 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 375 break; 376 } 377 378 case RBBIRuleParseTable.doDotAny: 379 // scanned a ".", meaning match any single character. 380 { 381 n = pushNewNode(RBBINode.setRef); 382 findSetFor(kAny, n, null); 383 n.fFirstPos = fScanIndex; 384 n.fLastPos = fNextIndex; 385 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 386 break; 387 } 388 389 case RBBIRuleParseTable.doSlash: 390 // Scanned a '/', which identifies a look-ahead break position in a 391 // rule. 392 n = pushNewNode(RBBINode.lookAhead); 393 n.fVal = fRuleNum; 394 n.fFirstPos = fScanIndex; 395 n.fLastPos = fNextIndex; 396 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 397 fLookAheadRule = true; 398 break; 399 400 case RBBIRuleParseTable.doStartTagValue: 401 // Scanned a '{', the opening delimiter for a tag value within a 402 // rule. 403 n = pushNewNode(RBBINode.tag); 404 n.fVal = 0; 405 n.fFirstPos = fScanIndex; 406 n.fLastPos = fNextIndex; 407 break; 408 409 case RBBIRuleParseTable.doTagDigit: 410 // Just scanned a decimal digit that's part of a tag value 411 { 412 n = fNodeStack[fNodeStackPtr]; 413 int v = UCharacter.digit((char) fC.fChar, 10); 414 n.fVal = n.fVal * 10 + v; 415 break; 416 } 417 418 case RBBIRuleParseTable.doTagValue: 419 n = fNodeStack[fNodeStackPtr]; 420 n.fLastPos = fNextIndex; 421 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 422 break; 423 424 case RBBIRuleParseTable.doTagExpectedError: 425 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG); 426 returnVal = false; 427 break; 428 429 case RBBIRuleParseTable.doOptionStart: 430 // Scanning a !!option. At the start of string. 431 fOptionStart = fScanIndex; 432 break; 433 434 case RBBIRuleParseTable.doOptionEnd: { 435 String opt = fRB.fRules.substring(fOptionStart, fScanIndex); 436 if (opt.equals("chain")) { 437 fRB.fChainRules = true; 438 } else if (opt.equals("LBCMNoChain")) { 439 fRB.fLBCMNoChain = true; 440 } else if (opt.equals("forward")) { 441 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree; 442 } else if (opt.equals("reverse")) { 443 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree; 444 } else if (opt.equals("safe_forward")) { 445 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree; 446 } else if (opt.equals("safe_reverse")) { 447 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree; 448 } else if (opt.equals("lookAheadHardBreak")) { 449 fRB.fLookAheadHardBreak = true; 450 } else if (opt.equals("quoted_literals_only")) { 451 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear(); 452 } else if (opt.equals("unquoted_literals")) { 453 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(gRuleSet_rule_char_pattern); 454 } else { 455 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION); 456 } 457 break; 458 } 459 460 case RBBIRuleParseTable.doReverseDir: 461 fReverseRule = true; 462 break; 463 464 case RBBIRuleParseTable.doStartVariableName: 465 n = pushNewNode(RBBINode.varRef); 466 n.fFirstPos = fScanIndex; 467 break; 468 469 case RBBIRuleParseTable.doEndVariableName: 470 n = fNodeStack[fNodeStackPtr]; 471 if (n == null || n.fType != RBBINode.varRef) { 472 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 473 break; 474 } 475 n.fLastPos = fScanIndex; 476 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos); 477 // Look the newly scanned name up in the symbol table 478 // If there's an entry, set the l. child of the var ref to the 479 // replacement expression. 480 // (We also pass through here when scanning assignments, but no harm 481 // is done, other 482 // than a slight wasted effort that seems hard to avoid. Lookup will 483 // be null) 484 n.fLeftChild = fSymbolTable.lookupNode(n.fText); 485 break; 486 487 case RBBIRuleParseTable.doCheckVarDef: 488 n = fNodeStack[fNodeStackPtr]; 489 if (n.fLeftChild == null) { 490 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE); 491 returnVal = false; 492 } 493 break; 494 495 case RBBIRuleParseTable.doExprFinished: 496 break; 497 498 case RBBIRuleParseTable.doRuleErrorAssignExpr: 499 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR); 500 returnVal = false; 501 break; 502 503 case RBBIRuleParseTable.doExit: 504 returnVal = false; 505 break; 506 507 case RBBIRuleParseTable.doScanUnicodeSet: 508 scanSet(); 509 break; 510 511 default: 512 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 513 returnVal = false; 514 break; 515 } 516 return returnVal; 517 } 518 519 //---------------------------------------------------------------------------------------- 520 // 521 // Error Throw and IllegalArgumentException in response to a rule parse 522 // error. 523 // 524 //---------------------------------------------------------------------------------------- 525 void error(int e) { 526 String s = "Error " + e + " at line " + fLineNum + " column " 527 + fCharNum; 528 IllegalArgumentException ex = new IllegalArgumentException(s); 529 throw ex; 530 531 } 532 533 //---------------------------------------------------------------------------------------- 534 // 535 // fixOpStack The parse stack holds partially assembled chunks of the parse 536 // tree. 537 // An entry on the stack may be as small as a single setRef node, 538 // or as large as the parse tree 539 // for an entire expression (this will be the one item left on the stack 540 // when the parsing of an RBBI rule completes. 541 // 542 // This function is called when a binary operator is encountered. 543 // It looks back up the stack for operators that are not yet associated 544 // with a right operand, and if the precedence of the stacked operator >= 545 // the precedence of the current operator, binds the operand left, 546 // to the previously encountered operator. 547 // 548 //---------------------------------------------------------------------------------------- 549 void fixOpStack(int p) { 550 RBBINode n; 551 // printNodeStack("entering fixOpStack()"); 552 for (;;) { 553 n = fNodeStack[fNodeStackPtr - 1]; // an operator node 554 if (n.fPrecedence == 0) { 555 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node"); 556 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 557 return; 558 } 559 560 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) { 561 // The most recent operand goes with the current operator, 562 // not with the previously stacked one. 563 break; 564 } 565 // Stack operator is a binary op ( '|' or concatenation) 566 // TOS operand becomes right child of this operator. 567 // Resulting subexpression becomes the TOS operand. 568 n.fRightChild = fNodeStack[fNodeStackPtr]; 569 fNodeStack[fNodeStackPtr].fParent = n; 570 fNodeStackPtr--; 571 // printNodeStack("looping in fixOpStack() "); 572 } 573 574 if (p <= RBBINode.precLParen) { 575 // Scan is at a right paren or end of expression. 576 // The scanned item must match the stack, or else there was an 577 // error. 578 // Discard the left paren (or start expr) node from the stack, 579 // leaving the completed (sub)expression as TOS. 580 if (n.fPrecedence != p) { 581 // Right paren encountered matched start of expression node, or 582 // end of expression matched with a left paren node. 583 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN); 584 } 585 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr]; 586 fNodeStackPtr--; 587 // Delete the now-discarded LParen or Start node. 588 // delete n; 589 } 590 // printNodeStack("leaving fixOpStack()"); 591 } 592 593 //---------------------------------------------------------------------------- 594 // 595 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 596 // been encountered. The val Node will be of nodetype uset 597 // and contain pointers to the actual UnicodeSets. 598 // The Key is the source string for initializing the set. 599 // 600 // The hash table is used to avoid creating duplicate 601 // unnamed (not $var references) UnicodeSets. 602 // 603 //---------------------------------------------------------------------------- 604 static class RBBISetTableEl { 605 String key; 606 607 RBBINode val; 608 } 609 610 611 //---------------------------------------------------------------------------------------- 612 // 613 // findSetFor given a String, 614 // - find the corresponding Unicode Set (uset node) 615 // (create one if necessary) 616 // - Set fLeftChild of the caller's node (should be a setRef node) 617 // to the uset node 618 // Maintain a hash table of uset nodes, so the same one is always used 619 // for the same string. 620 // If a "to adopt" set is provided and we haven't seen this key before, 621 // add the provided set to the hash table. 622 // If the string is one (32 bit) char in length, the set contains 623 // just one element which is the char in question. 624 // If the string is "any", return a set containing all chars. 625 // 626 //---------------------------------------------------------------------------------------- 627 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) { 628 629 RBBISetTableEl el; 630 631 // First check whether we've already cached a set for this string. 632 // If so, just use the cached set in the new node. 633 // delete any set provided by the caller, since we own it. 634 el = fSetTable.get(s); 635 if (el != null) { 636 node.fLeftChild = el.val; 637 Assert.assrt(node.fLeftChild.fType == RBBINode.uset); 638 return; 639 } 640 641 // Haven't seen this set before. 642 // If the caller didn't provide us with a prebuilt set, 643 // create a new UnicodeSet now. 644 if (setToAdopt == null) { 645 if (s.equals(kAny)) { 646 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 647 } else { 648 int c; 649 c = UTF16.charAt(s, 0); 650 setToAdopt = new UnicodeSet(c, c); 651 } 652 } 653 654 // 655 // Make a new uset node to refer to this UnicodeSet 656 // This new uset node becomes the child of the caller's setReference 657 // node. 658 // 659 RBBINode usetNode = new RBBINode(RBBINode.uset); 660 usetNode.fInputSet = setToAdopt; 661 usetNode.fParent = node; 662 node.fLeftChild = usetNode; 663 usetNode.fText = s; 664 665 // 666 // Add the new uset node to the list of all uset nodes. 667 // 668 fRB.fUSetNodes.add(usetNode); 669 670 // 671 // Add the new set to the set hash table. 672 // 673 el = new RBBISetTableEl(); 674 el.key = s; 675 el.val = usetNode; 676 fSetTable.put(el.key, el); 677 678 return; 679 } 680 681 // 682 // Assorted Unicode character constants. 683 // Numeric because there is no portable way to enter them as literals. 684 // (Think EBCDIC). 685 // 686 static final int chNEL = 0x85; // NEL newline variant 687 688 static final int chLS = 0x2028; // Unicode Line Separator 689 690 //---------------------------------------------------------------------------------------- 691 // 692 // stripRules Return a rules string without unnecessary 693 // characters. 694 // 695 //---------------------------------------------------------------------------------------- 696 static String stripRules(String rules) { 697 StringBuilder strippedRules = new StringBuilder(); 698 int rulesLength = rules.length(); 699 for (int idx = 0; idx < rulesLength;) { 700 char ch = rules.charAt(idx++); 701 if (ch == '#') { 702 while (idx < rulesLength 703 && ch != '\r' && ch != '\n' && ch != chNEL) { 704 ch = rules.charAt(idx++); 705 } 706 } 707 if (!UCharacter.isISOControl(ch)) { 708 strippedRules.append(ch); 709 } 710 } 711 return strippedRules.toString(); 712 } 713 714 //---------------------------------------------------------------------------------------- 715 // 716 // nextCharLL Low Level Next Char from rule input source. 717 // Get a char from the input character iterator, 718 // keep track of input position for error reporting. 719 // 720 //---------------------------------------------------------------------------------------- 721 int nextCharLL() { 722 int ch; 723 724 if (fNextIndex >= fRB.fRules.length()) { 725 return -1; 726 } 727 ch = UTF16.charAt(fRB.fRules, fNextIndex); 728 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1); 729 730 if (ch == '\r' || 731 ch == chNEL || 732 ch == chLS || 733 ch == '\n' && fLastChar != '\r') { 734 // Character is starting a new line. Bump up the line number, and 735 // reset the column to 0. 736 fLineNum++; 737 fCharNum = 0; 738 if (fQuoteMode) { 739 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING); 740 fQuoteMode = false; 741 } 742 } else { 743 // Character is not starting a new line. Except in the case of a 744 // LF following a CR, increment the column position. 745 if (ch != '\n') { 746 fCharNum++; 747 } 748 } 749 fLastChar = ch; 750 return ch; 751 } 752 753 //--------------------------------------------------------------------------------- 754 // 755 // nextChar for rules scanning. At this level, we handle stripping 756 // out comments and processing backslash character escapes. 757 // The rest of the rules grammar is handled at the next level up. 758 // 759 //--------------------------------------------------------------------------------- 760 void nextChar(RBBIRuleChar c) { 761 762 // Unicode Character constants needed for the processing done by nextChar(), 763 // in hex because literals wont work on EBCDIC machines. 764 765 fScanIndex = fNextIndex; 766 c.fChar = nextCharLL(); 767 c.fEscaped = false; 768 769 // 770 // check for '' sequence. 771 // These are recognized in all contexts, whether in quoted text or not. 772 // 773 if (c.fChar == '\'') { 774 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') { 775 c.fChar = nextCharLL(); // get nextChar officially so character counts 776 c.fEscaped = true; // stay correct. 777 } else { 778 // Single quote, by itself. 779 // Toggle quoting mode. 780 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 781 fQuoteMode = !fQuoteMode; 782 if (fQuoteMode == true) { 783 c.fChar = '('; 784 } else { 785 c.fChar = ')'; 786 } 787 c.fEscaped = false; // The paren that we return is not escaped. 788 return; 789 } 790 } 791 792 if (fQuoteMode) { 793 c.fEscaped = true; 794 } else { 795 // We are not in a 'quoted region' of the source. 796 // 797 if (c.fChar == '#') { 798 // Start of a comment. Consume the rest of it. 799 // The new-line char that terminates the comment is always returned. 800 // It will be treated as white-space, and serves to break up anything 801 // that might otherwise incorrectly clump together with a comment in 802 // the middle (a variable name, for example.) 803 for (;;) { 804 c.fChar = nextCharLL(); 805 if (c.fChar == -1 || // EOF 806 c.fChar == '\r' || 807 c.fChar == '\n' || 808 c.fChar == chNEL || 809 c.fChar == chLS) 810 { 811 break; 812 } 813 } 814 } 815 if (c.fChar == -1) { 816 return; 817 } 818 819 // 820 // check for backslash escaped characters. 821 // Use String.unescapeAt() to handle them. 822 // 823 if (c.fChar == '\\') { 824 c.fEscaped = true; 825 int[] unescapeIndex = new int[1]; 826 unescapeIndex[0] = fNextIndex; 827 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex); 828 if (unescapeIndex[0] == fNextIndex) { 829 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED); 830 } 831 832 fCharNum += unescapeIndex[0] - fNextIndex; 833 fNextIndex = unescapeIndex[0]; 834 } 835 } 836 // putc(c.fChar, stdout); 837 } 838 839 //--------------------------------------------------------------------------------- 840 // 841 // Parse RBBI rules. The state machine for rules parsing is here. 842 // The state tables are hand-written in the file rbbirpt.txt, 843 // and converted to the form used here by a perl 844 // script rbbicst.pl 845 // 846 //--------------------------------------------------------------------------------- 847 void parse() { 848 int state; 849 RBBIRuleParseTable.RBBIRuleTableElement tableEl; 850 851 state = 1; 852 nextChar(fC); 853 // 854 // Main loop for the rule parsing state machine. 855 // Runs once per state transition. 856 // Each time through optionally performs, depending on the state table, 857 // - an advance to the the next input char 858 // - an action to be performed. 859 // - pushing or popping a state to/from the local state return stack. 860 // 861 for (;;) { 862 // Quit if state == 0. This is the normal way to exit the state machine. 863 // 864 if (state == 0) { 865 break; 866 } 867 868 // Find the state table element that matches the input char from the rule, or the 869 // class of the input character. Start with the first table row for this 870 // state, then linearly scan forward until we find a row that matches the 871 // character. The last row for each state always matches all characters, so 872 // the search will stop there, if not before. 873 // 874 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state]; 875 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 876 System.out.println("char, line, col = (\'" + (char) fC.fChar 877 + "\', " + fLineNum + ", " + fCharNum + " state = " 878 + tableEl.fStateName); 879 } 880 881 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state. 882 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow]; 883 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 884 System.out.print("."); 885 } 886 if (tableEl.fCharClass < 127 && fC.fEscaped == false 887 && tableEl.fCharClass == fC.fChar) { 888 // Table row specified an individual character, not a set, and 889 // the input character is not escaped, and 890 // the input character matched it. 891 break; 892 } 893 if (tableEl.fCharClass == 255) { 894 // Table row specified default, match anything character class. 895 break; 896 } 897 if (tableEl.fCharClass == 254 && fC.fEscaped) { 898 // Table row specified "escaped" and the char was escaped. 899 break; 900 } 901 if (tableEl.fCharClass == 253 && fC.fEscaped 902 && (fC.fChar == 0x50 || fC.fChar == 0x70)) { 903 // Table row specified "escaped P" and the char is either 'p' or 'P'. 904 break; 905 } 906 if (tableEl.fCharClass == 252 && fC.fChar == -1) { 907 // Table row specified eof and we hit eof on the input. 908 break; 909 } 910 911 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class && 912 fC.fEscaped == false && // char is not escaped && 913 fC.fChar != -1) { // char is not EOF 914 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128]; 915 if (uniset.contains(fC.fChar)) { 916 // Table row specified a character class, or set of characters, 917 // and the current char matches it. 918 break; 919 } 920 } 921 } 922 923 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 924 System.out.println(""); 925 } 926 // 927 // We've found the row of the state table that matches the current input 928 // character from the rules string. 929 // Perform any action specified by this row in the state table. 930 if (doParseActions(tableEl.fAction) == false) { 931 // Break out of the state machine loop if the 932 // the action signalled some kind of error, or 933 // the action was to exit, occurs on normal end-of-rules-input. 934 break; 935 } 936 937 if (tableEl.fPushState != 0) { 938 fStackPtr++; 939 if (fStackPtr >= kStackSize) { 940 System.out.println("RBBIRuleScanner.parse() - state stack overflow."); 941 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 942 } 943 fStack[fStackPtr] = tableEl.fPushState; 944 } 945 946 if (tableEl.fNextChar) { 947 nextChar(fC); 948 } 949 950 // Get the next state from the table entry, or from the 951 // state stack if the next state was specified as "pop". 952 if (tableEl.fNextState != 255) { 953 state = tableEl.fNextState; 954 } else { 955 state = fStack[fStackPtr]; 956 fStackPtr--; 957 if (fStackPtr < 0) { 958 System.out.println("RBBIRuleScanner.parse() - state stack underflow."); 959 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 960 } 961 } 962 963 } 964 965 // If there are no forward rules throw an error. 966 // 967 if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) { 968 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 969 } 970 971 // 972 // If there were NO user specified reverse rules, set up the equivalent of ".*;" 973 // 974 if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) { 975 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar); 976 RBBINode operand = pushNewNode(RBBINode.setRef); 977 findSetFor(kAny, operand, null); 978 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand; 979 operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree]; 980 fNodeStackPtr -= 2; 981 } 982 983 // 984 // Parsing of the input RBBI rules is complete. 985 // We now have a parse tree for the rule expressions 986 // and a list of all UnicodeSets that are referenced. 987 // 988 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) { 989 fSymbolTable.rbbiSymtablePrint(); 990 } 991 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) { 992 System.out.println("Completed Forward Rules Parse Tree..."); 993 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true); 994 System.out.println("\nCompleted Reverse Rules Parse Tree..."); 995 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true); 996 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree..."); 997 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) { 998 System.out.println(" -- null -- "); 999 } else { 1000 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true); 1001 } 1002 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree..."); 1003 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) { 1004 System.out.println(" -- null -- "); 1005 } else { 1006 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true); 1007 } 1008 } 1009 } 1010 1011 //--------------------------------------------------------------------------------- 1012 // 1013 // printNodeStack for debugging... 1014 // 1015 //--------------------------------------------------------------------------------- 1016 ///CLOVER:OFF 1017 void printNodeStack(String title) { 1018 int i; 1019 System.out.println(title + ". Dumping node stack...\n"); 1020 for (i = fNodeStackPtr; i > 0; i--) { 1021 fNodeStack[i].printTree(true); 1022 } 1023 } 1024 ///CLOVER:ON 1025 1026 //--------------------------------------------------------------------------------- 1027 // 1028 // pushNewNode create a new RBBINode of the specified type and push it 1029 // onto the stack of nodes. 1030 // 1031 //--------------------------------------------------------------------------------- 1032 RBBINode pushNewNode(int nodeType) { 1033 fNodeStackPtr++; 1034 if (fNodeStackPtr >= kStackSize) { 1035 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow."); 1036 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 1037 } 1038 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType); 1039 return fNodeStack[fNodeStackPtr]; 1040 } 1041 1042 //--------------------------------------------------------------------------------- 1043 // 1044 // scanSet Construct a UnicodeSet from the text at the current scan 1045 // position. Advance the scan position to the first character 1046 // after the set. 1047 // 1048 // A new RBBI setref node referring to the set is pushed onto the node 1049 // stack. 1050 // 1051 // The scan position is normally under the control of the state machine 1052 // that controls rule parsing. UnicodeSets, however, are parsed by 1053 // the UnicodeSet constructor, not by the RBBI rule parser. 1054 // 1055 //--------------------------------------------------------------------------------- 1056 void scanSet() { 1057 UnicodeSet uset = null; 1058 int startPos; 1059 ParsePosition pos = new ParsePosition(fScanIndex); 1060 int i; 1061 1062 startPos = fScanIndex; 1063 try { 1064 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE); 1065 } catch (Exception e) { // TODO: catch fewer exception types. 1066 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info. 1067 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET); 1068 } 1069 1070 // Verify that the set contains at least one code point. 1071 // 1072 if (uset.isEmpty()) { 1073 // This set is empty. 1074 // Make it an error, because it almost certainly is not what the user wanted. 1075 // Also, avoids having to think about corner cases in the tree manipulation code 1076 // that occurs later on. 1077 // TODO: this shouldn't be an error; it does happen. 1078 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET); 1079 } 1080 1081 // Advance the RBBI parse postion over the UnicodeSet pattern. 1082 // Don't just set fScanIndex because the line/char positions maintained 1083 // for error reporting would be thrown off. 1084 i = pos.getIndex(); 1085 for (;;) { 1086 if (fNextIndex >= i) { 1087 break; 1088 } 1089 nextCharLL(); 1090 } 1091 1092 RBBINode n; 1093 1094 n = pushNewNode(RBBINode.setRef); 1095 n.fFirstPos = startPos; 1096 n.fLastPos = fNextIndex; 1097 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 1098 // findSetFor() serves several purposes here: 1099 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1100 // - Mantains collection of all sets in use, needed later for establishing 1101 // character categories for run time engine. 1102 // - Eliminates mulitiple instances of the same set. 1103 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1104 findSetFor(n.fText, n, uset); 1105 } 1106 1107} 1108 1109