1/* 2 ******************************************************************************* 3 * Copyright (C) 2003-2011, International Business Machines Corporation and others. All Rights Reserved. 4 ******************************************************************************* 5 */ 6 7package com.ibm.icu.text; 8 9import java.text.ParsePosition; 10import java.util.HashMap; 11 12import com.ibm.icu.impl.Assert; 13import com.ibm.icu.impl.Utility; 14import com.ibm.icu.lang.UCharacter; 15 16/** 17 * This class is part of the Rule Based Break Iterator rule compiler. 18 * It scans the rules and builds the parse tree. 19 * There is no public API here. 20 */ 21class RBBIRuleScanner { 22 23 private final static int kStackSize = 100; // The size of the state stack for 24 // rules parsing. Corresponds roughly 25 // to the depth of parentheses nesting 26 // that is allowed in the rules. 27 28 static class RBBIRuleChar { 29 int fChar; 30 boolean fEscaped; 31 } 32 33 34 35 RBBIRuleBuilder fRB; // The rule builder that we are part of. 36 37 int fScanIndex; // Index of current character being processed 38 // in the rule input string. 39 int fNextIndex; // Index of the next character, which 40 // is the first character not yet scanned. 41 boolean fQuoteMode; // Scan is in a 'quoted region' 42 int fLineNum; // Line number in input file. 43 int fCharNum; // Char position within the line. 44 int fLastChar; // Previous char, needed to count CR-LF 45 // as a single line, not two. 46 47 RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine 48 // processing. 49 String fVarName; // $variableName, valid when we've just 50 // scanned one. 51 52 53 short fStack[] = new short[kStackSize]; // State stack, holds state pushes 54 int fStackPtr; // and pops as specified in the state 55 // transition rules. 56 57 RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created 58 // during the parse of a rule 59 int fNodeStackPtr; 60 61 62 boolean fReverseRule; // True if the rule currently being scanned 63 // is a reverse direction rule (if it 64 // starts with a '!') 65 66 boolean fLookAheadRule; // True if the rule includes a '/' 67 // somewhere within it. 68 69 RBBISymbolTable fSymbolTable; // symbol table, holds definitions of 70 // $variable symbols. 71 72 HashMap<String, RBBISetTableEl> fSetTable = new HashMap<String, RBBISetTableEl>(); // UnicocodeSet hash table, holds indexes to 73 // the sets created while parsing rules. 74 // The key is the string used for creating 75 // the set. 76 77 UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during 78 // the scanning of RBBI rules. The 79 // indicies for these are assigned by the 80 // perl script that builds the state tables. 81 // See rbbirpt.h. 82 83 int fRuleNum; // Counts each rule as it is scanned. 84 85 int fOptionStart; // Input index of start of a !!option 86 // keyword, while being scanned. 87 88 89 90 static private String gRuleSet_rule_char_pattern = "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]"; 91 static private String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]"; 92 static private String gRuleSet_digit_char_pattern = "[0-9]"; 93 static private String gRuleSet_name_start_char_pattern = "[_\\p{L}]"; 94 static private String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]"; 95 static private String kAny = "any"; 96 97 98 99 100 //---------------------------------------------------------------------------------------- 101 // 102 // Constructor. 103 // 104 //---------------------------------------------------------------------------------------- 105 RBBIRuleScanner(RBBIRuleBuilder rb) { 106 fRB = rb; 107 fLineNum = 1; 108 109 // 110 // Set up the constant Unicode Sets. 111 // Note: These could be made static and shared among 112 // all instances of RBBIRuleScanners. 113 fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] = new UnicodeSet(gRuleSet_rule_char_pattern); 114 fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] = new UnicodeSet(gRuleSet_white_space_pattern); 115 fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] = new UnicodeSet(gRuleSet_name_char_pattern); 116 fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] = new UnicodeSet(gRuleSet_name_start_char_pattern); 117 fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] = new UnicodeSet(gRuleSet_digit_char_pattern); 118 119 fSymbolTable = new RBBISymbolTable(this, rb.fRules); 120 } 121 122 //---------------------------------------------------------------------------------------- 123 // 124 // doParseAction Do some action during rule parsing. 125 // Called by the parse state machine. 126 // Actions build the parse tree and Unicode Sets, 127 // and maintain the parse stack for nested expressions. 128 // 129 //---------------------------------------------------------------------------------------- 130 boolean doParseActions(int action) { 131 RBBINode n = null; 132 133 boolean returnVal = true; 134 135 switch (action) { 136 137 case RBBIRuleParseTable.doExprStart: 138 pushNewNode(RBBINode.opStart); 139 fRuleNum++; 140 break; 141 142 case RBBIRuleParseTable.doExprOrOperator: { 143 fixOpStack(RBBINode.precOpCat); 144 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 145 RBBINode orNode = pushNewNode(RBBINode.opOr); 146 orNode.fLeftChild = operandNode; 147 operandNode.fParent = orNode; 148 } 149 break; 150 151 case RBBIRuleParseTable.doExprCatOperator: 152 // concatenation operator. 153 // For the implicit concatenation of adjacent terms in an expression 154 // that are 155 // not separated by any other operator. Action is invoked between the 156 // actions for the two terms. 157 { 158 fixOpStack(RBBINode.precOpCat); 159 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 160 RBBINode catNode = pushNewNode(RBBINode.opCat); 161 catNode.fLeftChild = operandNode; 162 operandNode.fParent = catNode; 163 } 164 break; 165 166 case RBBIRuleParseTable.doLParen: 167 // Open Paren. 168 // The openParen node is a dummy operation type with a low 169 // precedence, 170 // which has the affect of ensuring that any real binary op that 171 // follows within the parens binds more tightly to the operands than 172 // stuff outside of the parens. 173 pushNewNode(RBBINode.opLParen); 174 break; 175 176 case RBBIRuleParseTable.doExprRParen: 177 fixOpStack(RBBINode.precLParen); 178 break; 179 180 case RBBIRuleParseTable.doNOP: 181 break; 182 183 case RBBIRuleParseTable.doStartAssign: 184 // We've just scanned "$variable = " 185 // The top of the node stack has the $variable ref node. 186 187 // Save the start position of the RHS text in the StartExpression 188 // node 189 // that precedes the $variableReference node on the stack. 190 // This will eventually be used when saving the full $variable 191 // replacement 192 // text as a string. 193 n = fNodeStack[fNodeStackPtr - 1]; 194 n.fFirstPos = fNextIndex; // move past the '=' 195 196 // Push a new start-of-expression node; needed to keep parse of the 197 // RHS expression happy. 198 pushNewNode(RBBINode.opStart); 199 break; 200 201 case RBBIRuleParseTable.doEndAssign: { 202 // We have reached the end of an assignement statement. 203 // Current scan char is the ';' that terminates the assignment. 204 205 // Terminate expression, leaves expression parse tree rooted in TOS 206 // node. 207 fixOpStack(RBBINode.precStart); 208 209 RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2]; 210 RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1]; 211 RBBINode RHSExprNode = fNodeStack[fNodeStackPtr]; 212 213 // Save original text of right side of assignment, excluding the 214 // terminating ';' 215 // in the root of the node for the right-hand-side expression. 216 RHSExprNode.fFirstPos = startExprNode.fFirstPos; 217 RHSExprNode.fLastPos = fScanIndex; 218 // fRB.fRules.extractBetween(RHSExprNode.fFirstPos, 219 // RHSExprNode.fLastPos, RHSExprNode.fText); 220 RHSExprNode.fText = fRB.fRules.substring(RHSExprNode.fFirstPos, 221 RHSExprNode.fLastPos); 222 223 // Expression parse tree becomes l. child of the $variable reference 224 // node. 225 varRefNode.fLeftChild = RHSExprNode; 226 RHSExprNode.fParent = varRefNode; 227 228 // Make a symbol table entry for the $variableRef node. 229 fSymbolTable.addEntry(varRefNode.fText, varRefNode); 230 231 // Clean up the stack. 232 fNodeStackPtr -= 3; 233 break; 234 } 235 236 case RBBIRuleParseTable.doEndOfRule: { 237 fixOpStack(RBBINode.precStart); // Terminate expression, leaves 238 // expression 239 240 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) { 241 printNodeStack("end of rule"); 242 } 243 Assert.assrt(fNodeStackPtr == 1); 244 245 // If this rule includes a look-ahead '/', add a endMark node to the 246 // expression tree. 247 if (fLookAheadRule) { 248 RBBINode thisRule = fNodeStack[fNodeStackPtr]; 249 RBBINode endNode = pushNewNode(RBBINode.endMark); 250 RBBINode catNode = pushNewNode(RBBINode.opCat); 251 fNodeStackPtr -= 2; 252 catNode.fLeftChild = thisRule; 253 catNode.fRightChild = endNode; 254 fNodeStack[fNodeStackPtr] = catNode; 255 endNode.fVal = fRuleNum; 256 endNode.fLookAheadEnd = true; 257 } 258 259 // All rule expressions are ORed together. 260 // The ';' that terminates an expression really just functions as a 261 // '|' with 262 // a low operator prededence. 263 // 264 // Each of the four sets of rules are collected separately. 265 // (forward, reverse, safe_forward, safe_reverse) 266 // OR this rule into the appropriate group of them. 267 // 268 269 int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree); 270 271 if (fRB.fTreeRoots[destRules] != null) { 272 // This is not the first rule encounted. 273 // OR previous stuff (from *destRules) 274 // with the current rule expression (on the Node Stack) 275 // with the resulting OR expression going to *destRules 276 // 277 RBBINode thisRule = fNodeStack[fNodeStackPtr]; 278 RBBINode prevRules = fRB.fTreeRoots[destRules]; 279 RBBINode orNode = pushNewNode(RBBINode.opOr); 280 orNode.fLeftChild = prevRules; 281 prevRules.fParent = orNode; 282 orNode.fRightChild = thisRule; 283 thisRule.fParent = orNode; 284 fRB.fTreeRoots[destRules] = orNode; 285 } else { 286 // This is the first rule encountered (for this direction). 287 // Just move its parse tree from the stack to *destRules. 288 fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr]; 289 } 290 fReverseRule = false; // in preparation for the next rule. 291 fLookAheadRule = false; 292 fNodeStackPtr = 0; 293 } 294 break; 295 296 case RBBIRuleParseTable.doRuleError: 297 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 298 returnVal = false; 299 break; 300 301 case RBBIRuleParseTable.doVariableNameExpectedErr: 302 error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX); 303 break; 304 305 // 306 // Unary operands + ? * 307 // These all appear after the operand to which they apply. 308 // When we hit one, the operand (may be a whole sub expression) 309 // will be on the top of the stack. 310 // Unary Operator becomes TOS, with the old TOS as its one child. 311 case RBBIRuleParseTable.doUnaryOpPlus: { 312 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 313 RBBINode plusNode = pushNewNode(RBBINode.opPlus); 314 plusNode.fLeftChild = operandNode; 315 operandNode.fParent = plusNode; 316 } 317 break; 318 319 case RBBIRuleParseTable.doUnaryOpQuestion: { 320 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 321 RBBINode qNode = pushNewNode(RBBINode.opQuestion); 322 qNode.fLeftChild = operandNode; 323 operandNode.fParent = qNode; 324 } 325 break; 326 327 case RBBIRuleParseTable.doUnaryOpStar: { 328 RBBINode operandNode = fNodeStack[fNodeStackPtr--]; 329 RBBINode starNode = pushNewNode(RBBINode.opStar); 330 starNode.fLeftChild = operandNode; 331 operandNode.fParent = starNode; 332 } 333 break; 334 335 case RBBIRuleParseTable.doRuleChar: 336 // A "Rule Character" is any single character that is a literal part 337 // of the regular expression. Like a, b and c in the expression "(abc*) 338 // | [:L:]" 339 // These are pretty uncommon in break rules; the terms are more commonly 340 // sets. To keep things uniform, treat these characters like as 341 // sets that just happen to contain only one character. 342 { 343 n = pushNewNode(RBBINode.setRef); 344 String s = String.valueOf((char)fC.fChar); 345 findSetFor(s, n, null); 346 n.fFirstPos = fScanIndex; 347 n.fLastPos = fNextIndex; 348 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 349 break; 350 } 351 352 case RBBIRuleParseTable.doDotAny: 353 // scanned a ".", meaning match any single character. 354 { 355 n = pushNewNode(RBBINode.setRef); 356 findSetFor(kAny, n, null); 357 n.fFirstPos = fScanIndex; 358 n.fLastPos = fNextIndex; 359 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 360 break; 361 } 362 363 case RBBIRuleParseTable.doSlash: 364 // Scanned a '/', which identifies a look-ahead break position in a 365 // rule. 366 n = pushNewNode(RBBINode.lookAhead); 367 n.fVal = fRuleNum; 368 n.fFirstPos = fScanIndex; 369 n.fLastPos = fNextIndex; 370 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 371 fLookAheadRule = true; 372 break; 373 374 case RBBIRuleParseTable.doStartTagValue: 375 // Scanned a '{', the opening delimiter for a tag value within a 376 // rule. 377 n = pushNewNode(RBBINode.tag); 378 n.fVal = 0; 379 n.fFirstPos = fScanIndex; 380 n.fLastPos = fNextIndex; 381 break; 382 383 case RBBIRuleParseTable.doTagDigit: 384 // Just scanned a decimal digit that's part of a tag value 385 { 386 n = fNodeStack[fNodeStackPtr]; 387 int v = UCharacter.digit((char) fC.fChar, 10); 388 n.fVal = n.fVal * 10 + v; 389 break; 390 } 391 392 case RBBIRuleParseTable.doTagValue: 393 n = fNodeStack[fNodeStackPtr]; 394 n.fLastPos = fNextIndex; 395 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 396 break; 397 398 case RBBIRuleParseTable.doTagExpectedError: 399 error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG); 400 returnVal = false; 401 break; 402 403 case RBBIRuleParseTable.doOptionStart: 404 // Scanning a !!option. At the start of string. 405 fOptionStart = fScanIndex; 406 break; 407 408 case RBBIRuleParseTable.doOptionEnd: { 409 String opt = fRB.fRules.substring(fOptionStart, fScanIndex); 410 if (opt.equals("chain")) { 411 fRB.fChainRules = true; 412 } else if (opt.equals("LBCMNoChain")) { 413 fRB.fLBCMNoChain = true; 414 } else if (opt.equals("forward")) { 415 fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree; 416 } else if (opt.equals("reverse")) { 417 fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree; 418 } else if (opt.equals("safe_forward")) { 419 fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree; 420 } else if (opt.equals("safe_reverse")) { 421 fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree; 422 } else if (opt.equals("lookAheadHardBreak")) { 423 fRB.fLookAheadHardBreak = true; 424 } else { 425 error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION); 426 } 427 break; 428 } 429 430 case RBBIRuleParseTable.doReverseDir: 431 fReverseRule = true; 432 break; 433 434 case RBBIRuleParseTable.doStartVariableName: 435 n = pushNewNode(RBBINode.varRef); 436 n.fFirstPos = fScanIndex; 437 break; 438 439 case RBBIRuleParseTable.doEndVariableName: 440 n = fNodeStack[fNodeStackPtr]; 441 if (n == null || n.fType != RBBINode.varRef) { 442 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 443 break; 444 } 445 n.fLastPos = fScanIndex; 446 n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos); 447 // Look the newly scanned name up in the symbol table 448 // If there's an entry, set the l. child of the var ref to the 449 // replacement expression. 450 // (We also pass through here when scanning assignments, but no harm 451 // is done, other 452 // than a slight wasted effort that seems hard to avoid. Lookup will 453 // be null) 454 n.fLeftChild = fSymbolTable.lookupNode(n.fText); 455 break; 456 457 case RBBIRuleParseTable.doCheckVarDef: 458 n = fNodeStack[fNodeStackPtr]; 459 if (n.fLeftChild == null) { 460 error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE); 461 returnVal = false; 462 } 463 break; 464 465 case RBBIRuleParseTable.doExprFinished: 466 break; 467 468 case RBBIRuleParseTable.doRuleErrorAssignExpr: 469 error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR); 470 returnVal = false; 471 break; 472 473 case RBBIRuleParseTable.doExit: 474 returnVal = false; 475 break; 476 477 case RBBIRuleParseTable.doScanUnicodeSet: 478 scanSet(); 479 break; 480 481 default: 482 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 483 returnVal = false; 484 break; 485 } 486 return returnVal; 487 } 488 489 //---------------------------------------------------------------------------------------- 490 // 491 // Error Throw and IllegalArgumentException in response to a rule parse 492 // error. 493 // 494 //---------------------------------------------------------------------------------------- 495 void error(int e) { 496 String s = "Error " + e + " at line " + fLineNum + " column " 497 + fCharNum; 498 IllegalArgumentException ex = new IllegalArgumentException(s); 499 throw ex; 500 501 } 502 503 //---------------------------------------------------------------------------------------- 504 // 505 // fixOpStack The parse stack holds partially assembled chunks of the parse 506 // tree. 507 // An entry on the stack may be as small as a single setRef node, 508 // or as large as the parse tree 509 // for an entire expression (this will be the one item left on the stack 510 // when the parsing of an RBBI rule completes. 511 // 512 // This function is called when a binary operator is encountered. 513 // It looks back up the stack for operators that are not yet associated 514 // with a right operand, and if the precedence of the stacked operator >= 515 // the precedence of the current operator, binds the operand left, 516 // to the previously encountered operator. 517 // 518 //---------------------------------------------------------------------------------------- 519 void fixOpStack(int p) { 520 RBBINode n; 521 // printNodeStack("entering fixOpStack()"); 522 for (;;) { 523 n = fNodeStack[fNodeStackPtr - 1]; // an operator node 524 if (n.fPrecedence == 0) { 525 System.out.print("RBBIRuleScanner.fixOpStack, bad operator node"); 526 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 527 return; 528 } 529 530 if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) { 531 // The most recent operand goes with the current operator, 532 // not with the previously stacked one. 533 break; 534 } 535 // Stack operator is a binary op ( '|' or concatenation) 536 // TOS operand becomes right child of this operator. 537 // Resulting subexpression becomes the TOS operand. 538 n.fRightChild = fNodeStack[fNodeStackPtr]; 539 fNodeStack[fNodeStackPtr].fParent = n; 540 fNodeStackPtr--; 541 // printNodeStack("looping in fixOpStack() "); 542 } 543 544 if (p <= RBBINode.precLParen) { 545 // Scan is at a right paren or end of expression. 546 // The scanned item must match the stack, or else there was an 547 // error. 548 // Discard the left paren (or start expr) node from the stack, 549 // leaving the completed (sub)expression as TOS. 550 if (n.fPrecedence != p) { 551 // Right paren encountered matched start of expression node, or 552 // end of expression matched with a left paren node. 553 error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN); 554 } 555 fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr]; 556 fNodeStackPtr--; 557 // Delete the now-discarded LParen or Start node. 558 // delete n; 559 } 560 // printNodeStack("leaving fixOpStack()"); 561 } 562 563 //---------------------------------------------------------------------------- 564 // 565 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have 566 // been encountered. The val Node will be of nodetype uset 567 // and contain pointers to the actual UnicodeSets. 568 // The Key is the source string for initializing the set. 569 // 570 // The hash table is used to avoid creating duplicate 571 // unnamed (not $var references) UnicodeSets. 572 // 573 //---------------------------------------------------------------------------- 574 static class RBBISetTableEl { 575 String key; 576 577 RBBINode val; 578 } 579 580 581 //---------------------------------------------------------------------------------------- 582 // 583 // findSetFor given a String, 584 // - find the corresponding Unicode Set (uset node) 585 // (create one if necessary) 586 // - Set fLeftChild of the caller's node (should be a setRef node) 587 // to the uset node 588 // Maintain a hash table of uset nodes, so the same one is always used 589 // for the same string. 590 // If a "to adopt" set is provided and we haven't seen this key before, 591 // add the provided set to the hash table. 592 // If the string is one (32 bit) char in length, the set contains 593 // just one element which is the char in question. 594 // If the string is "any", return a set containing all chars. 595 // 596 //---------------------------------------------------------------------------------------- 597 void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) { 598 599 RBBISetTableEl el; 600 601 // First check whether we've already cached a set for this string. 602 // If so, just use the cached set in the new node. 603 // delete any set provided by the caller, since we own it. 604 el = fSetTable.get(s); 605 if (el != null) { 606 node.fLeftChild = el.val; 607 Assert.assrt(node.fLeftChild.fType == RBBINode.uset); 608 return; 609 } 610 611 // Haven't seen this set before. 612 // If the caller didn't provide us with a prebuilt set, 613 // create a new UnicodeSet now. 614 if (setToAdopt == null) { 615 if (s.equals(kAny)) { 616 setToAdopt = new UnicodeSet(0x000000, 0x10ffff); 617 } else { 618 int c; 619 c = UTF16.charAt(s, 0); 620 setToAdopt = new UnicodeSet(c, c); 621 } 622 } 623 624 // 625 // Make a new uset node to refer to this UnicodeSet 626 // This new uset node becomes the child of the caller's setReference 627 // node. 628 // 629 RBBINode usetNode = new RBBINode(RBBINode.uset); 630 usetNode.fInputSet = setToAdopt; 631 usetNode.fParent = node; 632 node.fLeftChild = usetNode; 633 usetNode.fText = s; 634 635 // 636 // Add the new uset node to the list of all uset nodes. 637 // 638 fRB.fUSetNodes.add(usetNode); 639 640 // 641 // Add the new set to the set hash table. 642 // 643 el = new RBBISetTableEl(); 644 el.key = s; 645 el.val = usetNode; 646 fSetTable.put(el.key, el); 647 648 return; 649 } 650 651 // 652 // Assorted Unicode character constants. 653 // Numeric because there is no portable way to enter them as literals. 654 // (Think EBCDIC). 655 // 656 static final int chNEL = 0x85; // NEL newline variant 657 658 static final int chLS = 0x2028; // Unicode Line Separator 659 660 //---------------------------------------------------------------------------------------- 661 // 662 // stripRules Return a rules string without unnecessary 663 // characters. 664 // 665 //---------------------------------------------------------------------------------------- 666 static String stripRules(String rules) { 667 StringBuilder strippedRules = new StringBuilder(); 668 int rulesLength = rules.length(); 669 for (int idx = 0; idx < rulesLength;) { 670 char ch = rules.charAt(idx++); 671 if (ch == '#') { 672 while (idx < rulesLength 673 && ch != '\r' && ch != '\n' && ch != chNEL) { 674 ch = rules.charAt(idx++); 675 } 676 } 677 if (!UCharacter.isISOControl(ch)) { 678 strippedRules.append(ch); 679 } 680 } 681 return strippedRules.toString(); 682 } 683 684 //---------------------------------------------------------------------------------------- 685 // 686 // nextCharLL Low Level Next Char from rule input source. 687 // Get a char from the input character iterator, 688 // keep track of input position for error reporting. 689 // 690 //---------------------------------------------------------------------------------------- 691 int nextCharLL() { 692 int ch; 693 694 if (fNextIndex >= fRB.fRules.length()) { 695 return -1; 696 } 697 ch = UTF16.charAt(fRB.fRules, fNextIndex); 698 fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1); 699 700 if (ch == '\r' || 701 ch == chNEL || 702 ch == chLS || 703 ch == '\n' && fLastChar != '\r') { 704 // Character is starting a new line. Bump up the line number, and 705 // reset the column to 0. 706 fLineNum++; 707 fCharNum = 0; 708 if (fQuoteMode) { 709 error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING); 710 fQuoteMode = false; 711 } 712 } else { 713 // Character is not starting a new line. Except in the case of a 714 // LF following a CR, increment the column position. 715 if (ch != '\n') { 716 fCharNum++; 717 } 718 } 719 fLastChar = ch; 720 return ch; 721 } 722 723 //--------------------------------------------------------------------------------- 724 // 725 // nextChar for rules scanning. At this level, we handle stripping 726 // out comments and processing backslash character escapes. 727 // The rest of the rules grammar is handled at the next level up. 728 // 729 //--------------------------------------------------------------------------------- 730 void nextChar(RBBIRuleChar c) { 731 732 // Unicode Character constants needed for the processing done by nextChar(), 733 // in hex because literals wont work on EBCDIC machines. 734 735 fScanIndex = fNextIndex; 736 c.fChar = nextCharLL(); 737 c.fEscaped = false; 738 739 // 740 // check for '' sequence. 741 // These are recognized in all contexts, whether in quoted text or not. 742 // 743 if (c.fChar == '\'') { 744 if (UTF16.charAt(fRB.fRules, fNextIndex) == '\'') { 745 c.fChar = nextCharLL(); // get nextChar officially so character counts 746 c.fEscaped = true; // stay correct. 747 } else { 748 // Single quote, by itself. 749 // Toggle quoting mode. 750 // Return either '(' or ')', because quotes cause a grouping of the quoted text. 751 fQuoteMode = !fQuoteMode; 752 if (fQuoteMode == true) { 753 c.fChar = '('; 754 } else { 755 c.fChar = ')'; 756 } 757 c.fEscaped = false; // The paren that we return is not escaped. 758 return; 759 } 760 } 761 762 if (fQuoteMode) { 763 c.fEscaped = true; 764 } else { 765 // We are not in a 'quoted region' of the source. 766 // 767 if (c.fChar == '#') { 768 // Start of a comment. Consume the rest of it. 769 // The new-line char that terminates the comment is always returned. 770 // It will be treated as white-space, and serves to break up anything 771 // that might otherwise incorrectly clump together with a comment in 772 // the middle (a variable name, for example.) 773 for (;;) { 774 c.fChar = nextCharLL(); 775 if (c.fChar == -1 || // EOF 776 c.fChar == '\r' || 777 c.fChar == '\n' || 778 c.fChar == chNEL || 779 c.fChar == chLS) 780 { 781 break; 782 } 783 } 784 } 785 if (c.fChar == -1) { 786 return; 787 } 788 789 // 790 // check for backslash escaped characters. 791 // Use String.unescapeAt() to handle them. 792 // 793 if (c.fChar == '\\') { 794 c.fEscaped = true; 795 int[] unescapeIndex = new int[1]; 796 unescapeIndex[0] = fNextIndex; 797 c.fChar = Utility.unescapeAt(fRB.fRules, unescapeIndex); 798 if (unescapeIndex[0] == fNextIndex) { 799 error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED); 800 } 801 802 fCharNum += unescapeIndex[0] - fNextIndex; 803 fNextIndex = unescapeIndex[0]; 804 } 805 } 806 // putc(c.fChar, stdout); 807 } 808 809 //--------------------------------------------------------------------------------- 810 // 811 // Parse RBBI rules. The state machine for rules parsing is here. 812 // The state tables are hand-written in the file rbbirpt.txt, 813 // and converted to the form used here by a perl 814 // script rbbicst.pl 815 // 816 //--------------------------------------------------------------------------------- 817 void parse() { 818 int state; 819 RBBIRuleParseTable.RBBIRuleTableElement tableEl; 820 821 state = 1; 822 nextChar(fC); 823 // 824 // Main loop for the rule parsing state machine. 825 // Runs once per state transition. 826 // Each time through optionally performs, depending on the state table, 827 // - an advance to the the next input char 828 // - an action to be performed. 829 // - pushing or popping a state to/from the local state return stack. 830 // 831 for (;;) { 832 // Quit if state == 0. This is the normal way to exit the state machine. 833 // 834 if (state == 0) { 835 break; 836 } 837 838 // Find the state table element that matches the input char from the rule, or the 839 // class of the input character. Start with the first table row for this 840 // state, then linearly scan forward until we find a row that matches the 841 // character. The last row for each state always matches all characters, so 842 // the search will stop there, if not before. 843 // 844 tableEl = RBBIRuleParseTable.gRuleParseStateTable[state]; 845 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 846 System.out.println("char, line, col = (\'" + (char) fC.fChar 847 + "\', " + fLineNum + ", " + fCharNum + " state = " 848 + tableEl.fStateName); 849 } 850 851 for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state. 852 tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow]; 853 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 854 System.out.print("."); 855 } 856 if (tableEl.fCharClass < 127 && fC.fEscaped == false 857 && tableEl.fCharClass == fC.fChar) { 858 // Table row specified an individual character, not a set, and 859 // the input character is not escaped, and 860 // the input character matched it. 861 break; 862 } 863 if (tableEl.fCharClass == 255) { 864 // Table row specified default, match anything character class. 865 break; 866 } 867 if (tableEl.fCharClass == 254 && fC.fEscaped) { 868 // Table row specified "escaped" and the char was escaped. 869 break; 870 } 871 if (tableEl.fCharClass == 253 && fC.fEscaped 872 && (fC.fChar == 0x50 || fC.fChar == 0x70)) { 873 // Table row specified "escaped P" and the char is either 'p' or 'P'. 874 break; 875 } 876 if (tableEl.fCharClass == 252 && fC.fChar == -1) { 877 // Table row specified eof and we hit eof on the input. 878 break; 879 } 880 881 if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class && 882 fC.fEscaped == false && // char is not escaped && 883 fC.fChar != -1) { // char is not EOF 884 UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128]; 885 if (uniset.contains(fC.fChar)) { 886 // Table row specified a character class, or set of characters, 887 // and the current char matches it. 888 break; 889 } 890 } 891 } 892 893 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) { 894 System.out.println(""); 895 } 896 // 897 // We've found the row of the state table that matches the current input 898 // character from the rules string. 899 // Perform any action specified by this row in the state table. 900 if (doParseActions(tableEl.fAction) == false) { 901 // Break out of the state machine loop if the 902 // the action signalled some kind of error, or 903 // the action was to exit, occurs on normal end-of-rules-input. 904 break; 905 } 906 907 if (tableEl.fPushState != 0) { 908 fStackPtr++; 909 if (fStackPtr >= kStackSize) { 910 System.out.println("RBBIRuleScanner.parse() - state stack overflow."); 911 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 912 } 913 fStack[fStackPtr] = tableEl.fPushState; 914 } 915 916 if (tableEl.fNextChar) { 917 nextChar(fC); 918 } 919 920 // Get the next state from the table entry, or from the 921 // state stack if the next state was specified as "pop". 922 if (tableEl.fNextState != 255) { 923 state = tableEl.fNextState; 924 } else { 925 state = fStack[fStackPtr]; 926 fStackPtr--; 927 if (fStackPtr < 0) { 928 System.out.println("RBBIRuleScanner.parse() - state stack underflow."); 929 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 930 } 931 } 932 933 } 934 935 // 936 // If there were NO user specified reverse rules, set up the equivalent of ".*;" 937 // 938 if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) { 939 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar); 940 RBBINode operand = pushNewNode(RBBINode.setRef); 941 findSetFor(kAny, operand, null); 942 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand; 943 operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree]; 944 fNodeStackPtr -= 2; 945 } 946 947 // 948 // Parsing of the input RBBI rules is complete. 949 // We now have a parse tree for the rule expressions 950 // and a list of all UnicodeSets that are referenced. 951 // 952 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) { 953 fSymbolTable.rbbiSymtablePrint(); 954 } 955 if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) { 956 System.out.println("Completed Forward Rules Parse Tree..."); 957 fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true); 958 System.out.println("\nCompleted Reverse Rules Parse Tree..."); 959 fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true); 960 System.out.println("\nCompleted Safe Point Forward Rules Parse Tree..."); 961 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) { 962 System.out.println(" -- null -- "); 963 } else { 964 fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true); 965 } 966 System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree..."); 967 if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) { 968 System.out.println(" -- null -- "); 969 } else { 970 fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true); 971 } 972 } 973 } 974 975 //--------------------------------------------------------------------------------- 976 // 977 // printNodeStack for debugging... 978 // 979 //--------------------------------------------------------------------------------- 980 ///CLOVER:OFF 981 void printNodeStack(String title) { 982 int i; 983 System.out.println(title + ". Dumping node stack...\n"); 984 for (i = fNodeStackPtr; i > 0; i--) { 985 fNodeStack[i].printTree(true); 986 } 987 } 988 ///CLOVER:ON 989 990 //--------------------------------------------------------------------------------- 991 // 992 // pushNewNode create a new RBBINode of the specified type and push it 993 // onto the stack of nodes. 994 // 995 //--------------------------------------------------------------------------------- 996 RBBINode pushNewNode(int nodeType) { 997 fNodeStackPtr++; 998 if (fNodeStackPtr >= kStackSize) { 999 System.out.println("RBBIRuleScanner.pushNewNode - stack overflow."); 1000 error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR); 1001 } 1002 fNodeStack[fNodeStackPtr] = new RBBINode(nodeType); 1003 return fNodeStack[fNodeStackPtr]; 1004 } 1005 1006 //--------------------------------------------------------------------------------- 1007 // 1008 // scanSet Construct a UnicodeSet from the text at the current scan 1009 // position. Advance the scan position to the first character 1010 // after the set. 1011 // 1012 // A new RBBI setref node referring to the set is pushed onto the node 1013 // stack. 1014 // 1015 // The scan position is normally under the control of the state machine 1016 // that controls rule parsing. UnicodeSets, however, are parsed by 1017 // the UnicodeSet constructor, not by the RBBI rule parser. 1018 // 1019 //--------------------------------------------------------------------------------- 1020 void scanSet() { 1021 UnicodeSet uset = null; 1022 int startPos; 1023 ParsePosition pos = new ParsePosition(fScanIndex); 1024 int i; 1025 1026 startPos = fScanIndex; 1027 try { 1028 uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE); 1029 } catch (Exception e) { // TODO: catch fewer exception types. 1030 // Repackage UnicodeSet errors as RBBI rule builder errors, with location info. 1031 error(RBBIRuleBuilder.U_BRK_MALFORMED_SET); 1032 } 1033 1034 // Verify that the set contains at least one code point. 1035 // 1036 if (uset.isEmpty()) { 1037 // This set is empty. 1038 // Make it an error, because it almost certainly is not what the user wanted. 1039 // Also, avoids having to think about corner cases in the tree manipulation code 1040 // that occurs later on. 1041 // TODO: this shouldn't be an error; it does happen. 1042 error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET); 1043 } 1044 1045 // Advance the RBBI parse postion over the UnicodeSet pattern. 1046 // Don't just set fScanIndex because the line/char positions maintained 1047 // for error reporting would be thrown off. 1048 i = pos.getIndex(); 1049 for (;;) { 1050 if (fNextIndex >= i) { 1051 break; 1052 } 1053 nextCharLL(); 1054 } 1055 1056 RBBINode n; 1057 1058 n = pushNewNode(RBBINode.setRef); 1059 n.fFirstPos = startPos; 1060 n.fLastPos = fNextIndex; 1061 n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos); 1062 // findSetFor() serves several purposes here: 1063 // - Adopts storage for the UnicodeSet, will be responsible for deleting. 1064 // - Mantains collection of all sets in use, needed later for establishing 1065 // character categories for run time engine. 1066 // - Eliminates mulitiple instances of the same set. 1067 // - Creates a new uset node if necessary (if this isn't a duplicate.) 1068 findSetFor(n.fText, n, uset); 1069 } 1070 1071} 1072 1073