1/* 2********************************************************************** 3* Copyright (c) 2001-2011, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6*/ 7package com.ibm.icu.text; 8 9import java.text.ParsePosition; 10import java.util.ArrayList; 11import java.util.HashMap; 12import java.util.List; 13import java.util.Map; 14 15import com.ibm.icu.impl.IllegalIcuArgumentException; 16import com.ibm.icu.impl.PatternProps; 17import com.ibm.icu.impl.Utility; 18import com.ibm.icu.lang.UCharacter; 19import com.ibm.icu.text.RuleBasedTransliterator.Data; 20 21class TransliteratorParser { 22 23 //---------------------------------------------------------------------- 24 // Data members 25 //---------------------------------------------------------------------- 26 27 /** 28 * PUBLIC data member. 29 * A Vector of RuleBasedTransliterator.Data objects, one for each discrete group 30 * of rules in the rule set 31 */ 32 public List<Data> dataVector; 33 34 /** 35 * PUBLIC data member. 36 * A Vector of Strings containing all of the ID blocks in the rule set 37 */ 38 public List<String> idBlockVector; 39 40 /** 41 * The current data object for which we are parsing rules 42 */ 43 private Data curData; 44 45 /** 46 * PUBLIC data member containing the parsed compound filter, if any. 47 */ 48 public UnicodeSet compoundFilter; 49 50 51 private int direction; 52 53 /** 54 * Temporary symbol table used during parsing. 55 */ 56 private ParseData parseData; 57 58 /** 59 * Temporary vector of set variables. When parsing is complete, this 60 * is copied into the array data.variables. As with data.variables, 61 * element 0 corresponds to character data.variablesBase. 62 */ 63 private List<Object> variablesVector; 64 65 /** 66 * Temporary table of variable names. When parsing is complete, this is 67 * copied into data.variableNames. 
68 */ 69 private Map<String, char[]> variableNames; 70 71 /** 72 * String of standins for segments. Used during the parsing of a single 73 * rule. segmentStandins.charAt(0) is the standin for "$1" and corresponds 74 * to StringMatcher object segmentObjects.elementAt(0), etc. 75 */ 76 private StringBuffer segmentStandins; 77 78 /** 79 * Vector of StringMatcher objects for segments. Used during the 80 * parsing of a single rule. 81 * segmentStandins.charAt(0) is the standin for "$1" and corresponds 82 * to StringMatcher object segmentObjects.elementAt(0), etc. 83 */ 84 private List<StringMatcher> segmentObjects; 85 86 /** 87 * The next available stand-in for variables. This starts at some point in 88 * the private use area (discovered dynamically) and increments up toward 89 * <code>variableLimit</code>. At any point during parsing, available 90 * variables are <code>variableNext..variableLimit-1</code>. 91 */ 92 private char variableNext; 93 94 /** 95 * The last available stand-in for variables. This is discovered 96 * dynamically. At any point during parsing, available variables are 97 * <code>variableNext..variableLimit-1</code>. During variable definition 98 * we use the special value variableLimit-1 as a placeholder. 99 */ 100 private char variableLimit; 101 102 /** 103 * When we encounter an undefined variable, we do not immediately signal 104 * an error, in case we are defining this variable, e.g., "$a = [a-z];". 105 * Instead, we save the name of the undefined variable, and substitute 106 * in the placeholder char variableLimit - 1, and decrement 107 * variableLimit. 108 */ 109 private String undefinedVariableName; 110 111 /** 112 * The stand-in character for the 'dot' set, represented by '.' in 113 * patterns. This is allocated the first time it is needed, and 114 * reused thereafter. 
115 */ 116 private int dotStandIn = -1; 117 118 //---------------------------------------------------------------------- 119 // Constants 120 //---------------------------------------------------------------------- 121 122 // Indicator for ID blocks 123 private static final String ID_TOKEN = "::"; 124 private static final int ID_TOKEN_LEN = 2; 125 126/* 127(reserved for future expansion) 128 // markers for beginning and end of rule groups 129 private static final String BEGIN_TOKEN = "BEGIN"; 130 private static final String END_TOKEN = "END"; 131*/ 132 133 // Operators 134 private static final char VARIABLE_DEF_OP = '='; 135 private static final char FORWARD_RULE_OP = '>'; 136 private static final char REVERSE_RULE_OP = '<'; 137 private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op 138 139 private static final String OPERATORS = "=><\u2190\u2192\u2194"; 140 private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;"; 141 142 // Other special characters 143 private static final char QUOTE = '\''; 144 private static final char ESCAPE = '\\'; 145 private static final char END_OF_RULE = ';'; 146 private static final char RULE_COMMENT_CHAR = '#'; 147 148 private static final char CONTEXT_ANTE = '{'; // ante{key 149 private static final char CONTEXT_POST = '}'; // key}post 150 private static final char CURSOR_POS = '|'; 151 private static final char CURSOR_OFFSET = '@'; 152 private static final char ANCHOR_START = '^'; 153 154 private static final char KLEENE_STAR = '*'; 155 private static final char ONE_OR_MORE = '+'; 156 private static final char ZERO_OR_ONE = '?'; 157 158 private static final char DOT = '.'; 159 private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]"; 160 161 // By definition, the ANCHOR_END special character is a 162 // trailing SymbolTable.SYMBOL_REF character. 163 // private static final char ANCHOR_END = '$'; 164 165 // Segments of the input string are delimited by "(" and ")". 
In the 166 // output string these segments are referenced as "$1", "$2", etc. 167 private static final char SEGMENT_OPEN = '('; 168 private static final char SEGMENT_CLOSE = ')'; 169 170 // A function is denoted &Source-Target/Variant(text) 171 private static final char FUNCTION = '&'; 172 173 // Aliases for some of the syntax characters. These are provided so 174 // transliteration rules can be expressed in XML without clashing with 175 // XML syntax characters '<', '>', and '&'. 176 private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow 177 private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow 178 private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow 179 private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta) 180 181 // Special characters disallowed at the top level 182 private static UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]"); 183 184 // Special characters disallowed within a segment 185 private static UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]"); 186 187 // Special characters disallowed within a function argument 188 private static UnicodeSet ILLEGAL_FUNC = new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]"); 189 190 //---------------------------------------------------------------------- 191 // class ParseData 192 //---------------------------------------------------------------------- 193 194 /** 195 * This class implements the SymbolTable interface. It is used 196 * during parsing to give UnicodeSet access to variables that 197 * have been defined so far. Note that it uses variablesVector, 198 * _not_ data.variables. 199 */ 200 private class ParseData implements SymbolTable { 201 202 /** 203 * Implement SymbolTable API. 204 */ 205 public char[] lookup(String name) { 206 return variableNames.get(name); 207 } 208 209 /** 210 * Implement SymbolTable API. 
211 */ 212 public UnicodeMatcher lookupMatcher(int ch) { 213 // Note that we cannot use data.lookup() because the 214 // set array has not been constructed yet. 215 int i = ch - curData.variablesBase; 216 if (i >= 0 && i < variablesVector.size()) { 217 return (UnicodeMatcher) variablesVector.get(i); 218 } 219 return null; 220 } 221 222 /** 223 * Implement SymbolTable API. Parse out a symbol reference 224 * name. 225 */ 226 public String parseReference(String text, ParsePosition pos, int limit) { 227 int start = pos.getIndex(); 228 int i = start; 229 while (i < limit) { 230 char c = text.charAt(i); 231 if ((i==start && !UCharacter.isUnicodeIdentifierStart(c)) || 232 !UCharacter.isUnicodeIdentifierPart(c)) { 233 break; 234 } 235 ++i; 236 } 237 if (i == start) { // No valid name chars 238 return null; 239 } 240 pos.setIndex(i); 241 return text.substring(start, i); 242 } 243 244 /** 245 * Return true if the given character is a matcher standin or a plain 246 * character (non standin). 247 */ 248 public boolean isMatcher(int ch) { 249 // Note that we cannot use data.lookup() because the 250 // set array has not been constructed yet. 251 int i = ch - curData.variablesBase; 252 if (i >= 0 && i < variablesVector.size()) { 253 return variablesVector.get(i) instanceof UnicodeMatcher; 254 } 255 return true; 256 } 257 258 /** 259 * Return true if the given character is a replacer standin or a plain 260 * character (non standin). 261 */ 262 public boolean isReplacer(int ch) { 263 // Note that we cannot use data.lookup() because the 264 // set array has not been constructed yet. 
265 int i = ch - curData.variablesBase; 266 if (i >= 0 && i < variablesVector.size()) { 267 return variablesVector.get(i) instanceof UnicodeReplacer; 268 } 269 return true; 270 } 271 } 272 273 //---------------------------------------------------------------------- 274 // classes RuleBody, RuleArray, and RuleReader 275 //---------------------------------------------------------------------- 276 277 /** 278 * A private abstract class representing the interface to rule 279 * source code that is broken up into lines. Handles the 280 * folding of lines terminated by a backslash. This folding 281 * is limited; it does not account for comments, quotes, or 282 * escapes, so its use to be limited. 283 */ 284 private static abstract class RuleBody { 285 286 /** 287 * Retrieve the next line of the source, or return null if 288 * none. Folds lines terminated by a backslash into the 289 * next line, without regard for comments, quotes, or 290 * escapes. 291 */ 292 String nextLine() { 293 String s = handleNextLine(); 294 if (s != null && 295 s.length() > 0 && 296 s.charAt(s.length() - 1) == '\\') { 297 StringBuilder b = new StringBuilder(s); 298 do { 299 b.deleteCharAt(b.length()-1); 300 s = handleNextLine(); 301 if (s == null) { 302 break; 303 } 304 b.append(s); 305 } while (s.length() > 0 && 306 s.charAt(s.length() - 1) == '\\'); 307 s = b.toString(); 308 } 309 return s; 310 } 311 312 /** 313 * Reset to the first line of the source. 314 */ 315 abstract void reset(); 316 317 /** 318 * Subclass method to return the next line of the source. 319 */ 320 abstract String handleNextLine(); 321 } 322 323 /** 324 * RuleBody subclass for a String[] array. 325 */ 326 private static class RuleArray extends RuleBody { 327 String[] array; 328 int i; 329 public RuleArray(String[] array) { this.array = array; i = 0; } 330 public String handleNextLine() { 331 return (i < array.length) ? 
/**
 * A class representing one side of a rule.  This class knows how to
 * parse half of a rule.  It is tightly coupled to the method
 * TransliteratorParser.parseRule().
 *
 * After parse() returns, <code>text</code> holds the accumulated pattern
 * (literal characters plus stand-in characters obtained from the parser),
 * and the public fields record where the context markers, the cursor and
 * the anchors were seen.
 */
private static class RuleHalf {

    // Pattern text accumulated by parse(); null until parse() has run.
    public String text;

    // Each position is an offset into text (the value of buf.length() at
    // the moment the marker was seen), or -1 if the marker did not occur.
    public int cursor = -1; // position of cursor in text
    public int ante = -1;   // position of ante context marker '{' in text
    public int post = -1;   // position of post context marker '}' in text

    // Record the offset to the cursor either to the left or to the
    // right of the key.  This is indicated by characters on the output
    // side that allow the cursor to be positioned arbitrarily within
    // the matching text.  For example, abc{def} > | @@@ xyz; changes
    // def to xyz and moves the cursor to before abc.  Offset characters
    // must be at the start or end, and they cannot move the cursor past
    // the ante- or postcontext text.  Placeholders are only valid in
    // output text.  The length of the ante and post context is
    // determined at runtime, because of supplementals and quantifiers.
    //
    // As maintained by parseSection(): this is -1 for |@, -2 for |@@,
    // etc., and 1 for @|, 2 for @@|, etc.
    public int cursorOffset = 0; // only nonzero on output side

    // Value of buf.length() when the first CURSOR_OFFSET of a positive
    // run (one that precedes the '|') was seen.  parse() requires the
    // cursor to end up at this same position.
    private int cursorOffsetPos = 0;

    // Set when '^' is seen at the very start of the half.
    public boolean anchorStart = false;
    // Set when a '$' that is not followed by a valid name is seen; any
    // further non-whitespace text before the half ends is then an error.
    public boolean anchorEnd   = false;

    /**
     * The segment number from 1..n of the next '(' we see
     * during parsing; 1-based.
     */
    private int nextSegmentNumber = 1;

    /**
     * Parse one side of a rule, stopping at either the limit,
     * the END_OF_RULE character, or an operator.  The result is
     * stored in <code>text</code>.
     * @param rule the rule string being parsed
     * @param pos index in rule at which to start
     * @param limit index in rule at which to stop
     * @param parser the owning parser, used for stand-in allocation,
     * variable lookup and range checks
     * @return the index after the terminating character, or
     * if limit was reached, limit
     */
    public int parse(String rule, int pos, int limit,
                     TransliteratorParser parser) {
        int start = pos;
        StringBuffer buf = new StringBuffer();
        pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_TOP, false);
        text = buf.toString();

        // A positive offset means '@' characters were seen before any
        // '|'; since '@' adds nothing to buf, the '|' must have been
        // recorded at the same buffer position as the first '@'.
        if (cursorOffset > 0 && cursor != cursorOffsetPos) {
            syntaxError("Misplaced " + CURSOR_POS, rule, start);
        }

        return pos;
    }

    /**
     * Parse a section of one side of a rule, stopping at either
     * the limit, the END_OF_RULE character, an operator, or a
     * segment close character.  This method parses both a
     * top-level rule half and a segment within such a rule half.
     * It calls itself recursively to parse segments and nested
     * segments.
     * @param rule the rule string being parsed
     * @param pos index in rule at which to start
     * @param limit index in rule at which to stop
     * @param parser the owning parser, used for stand-in allocation,
     * variable lookup and range checks
     * @param buf buffer into which to accumulate the rule pattern
     * characters, either literal characters from the rule or
     * standins for UnicodeMatcher objects including segments.
     * @param illegal the set of special characters that is illegal during
     * this parse.
     * @param isSegment if true, then we've already seen a '(' and
     * pos on entry points right after it.  Accumulate everything
     * up to the closing ')', put it in a segment matcher object,
     * generate a standin for it, and add the standin to buf.  As
     * a side effect, update the segments vector with a reference
     * to the segment matcher.  This works recursively for nested
     * segments.  If isSegment is false, just accumulate
     * characters into buf.
     * @return the index after the terminating character, or
     * if limit was reached, limit
     */
    private int parseSection(String rule, int pos, int limit,
                             TransliteratorParser parser,
                             StringBuffer buf,
                             UnicodeSet illegal,
                             boolean isSegment) {
        int start = pos;
        ParsePosition pp = null;
        // The quote*/var* pairs remember the span in buf of the most
        // recent quoted string / variable reference, so that a following
        // quantifier can apply to the whole span.
        int quoteStart = -1; // Most recent 'single quoted string'
        int quoteLimit = -1;
        int varStart = -1; // Most recent $variableReference
        int varLimit = -1;
        int[] iref = new int[1]; // in/out position holder for Utility calls
        int bufStart = buf.length();

    main:
        while (pos < limit) {
            // Since all syntax characters are in the BMP, fetching
            // 16-bit code units suffices here.
            char c = rule.charAt(pos++);
            if (PatternProps.isWhiteSpace(c)) {
                continue;
            }
            // HALF_ENDERS is all chars that end a rule half: the
            // operators (including their arrow aliases) and ';'
            if (HALF_ENDERS.indexOf(c) >= 0) {
                ///CLOVER:OFF
                // isSegment is always false
                // NOTE(review): parseSection is called with isSegment ==
                // true for segments and functions below, so the remark
                // above looks stale -- confirm before relying on it.
                if (isSegment) {
                    syntaxError("Unclosed segment", rule, start);
                }
                ///CLOVER:ON
                break main;
            }
            if (anchorEnd) {
                // Text after a presumed end anchor is a syntax err
                syntaxError("Malformed variable reference", rule, start);
            }
            if (UnicodeSet.resemblesPattern(rule, pos-1)) {
                if (pp == null) {
                    pp = new ParsePosition(0);
                }
                pp.setIndex(pos-1); // Backup to opening '['
                buf.append(parser.parseSet(rule, pp));
                pos = pp.getIndex();
                continue;
            }
            // Handle escapes
            if (c == ESCAPE) {
                if (pos == limit) {
                    syntaxError("Trailing backslash", rule, start);
                }
                iref[0] = pos;
                int escaped = Utility.unescapeAt(rule, iref);
                pos = iref[0];
                if (escaped == -1) {
                    syntaxError("Malformed escape", rule, start);
                }
                parser.checkVariableRange(escaped, rule, start);
                UTF16.append(buf, escaped);
                continue;
            }
            // Handle quoted matter
            if (c == QUOTE) {
                int iq = rule.indexOf(QUOTE, pos);
                if (iq == pos) {
                    buf.append(c); // Parse [''] outside quotes as [']
                    ++pos;
                } else {
                    /* This loop picks up a run of quoted text of the
                     * form 'aaaa' each time through.  If this run
                     * hasn't really ended ('aaaa''bbbb') then it keeps
                     * looping, each time adding on a new run.  When it
                     * reaches the final quote it breaks.
                     */
                    quoteStart = buf.length();
                    for (;;) {
                        if (iq < 0) {
                            syntaxError("Unterminated quote", rule, start);
                        }
                        buf.append(rule.substring(pos, iq));
                        pos = iq+1;
                        if (pos < limit && rule.charAt(pos) == QUOTE) {
                            // Parse [''] inside quotes as [']
                            // pos is left on the second quote, so the
                            // next substring(pos, iq) begins with a
                            // literal quote character.
                            iq = rule.indexOf(QUOTE, pos+1);
                            // Continue looping
                        } else {
                            break;
                        }
                    }
                    quoteLimit = buf.length();

                    // Range-check every character that came from the quote
                    for (iq=quoteStart; iq<quoteLimit; ++iq) {
                        parser.checkVariableRange(buf.charAt(iq), rule, start);
                    }
                }
                continue;
            }

            parser.checkVariableRange(c, rule, start);

            if (illegal.contains(c)) {
                syntaxError("Illegal character '" + c + '\'', rule, start);
            }

            switch (c) {

            //------------------------------------------------------
            // Elements allowed within and out of segments
            //------------------------------------------------------
            case ANCHOR_START:
                // Only legal once, and only before anything is in buf
                if (buf.length() == 0 && !anchorStart) {
                    anchorStart = true;
                } else {
                    syntaxError("Misplaced anchor start",
                                rule, start);
                }
                break;
            case SEGMENT_OPEN:
                {
                    // bufSegStart is the offset in buf to the first
                    // character of the segment we are parsing.
                    int bufSegStart = buf.length();

                    // Record segment number now, since nextSegmentNumber
                    // will be incremented during the call to parseSection
                    // if there are nested segments.
                    int segmentNumber = nextSegmentNumber++; // 1-based

                    // Parse the segment
                    pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);

                    // After parsing a segment, the relevant characters are
                    // in buf, starting at offset bufSegStart.  Extract them
                    // into a string matcher, and replace them with a
                    // standin for that matcher.
                    StringMatcher m =
                        new StringMatcher(buf.substring(bufSegStart),
                                          segmentNumber, parser.curData);

                    // Record and associate object and segment number
                    parser.setSegmentObject(segmentNumber, m);
                    buf.setLength(bufSegStart);
                    buf.append(parser.getSegmentStandin(segmentNumber));
                }
                break;
            case FUNCTION:
            case ALT_FUNCTION:
                {
                    iref[0] = pos;
                    TransliteratorIDParser.SingleID single = TransliteratorIDParser.parseFilterID(rule, iref);
                    // The next character MUST be a segment open
                    if (single == null ||
                        !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
                        syntaxError("Invalid function", rule, start);
                    }

                    Transliterator t = single.getInstance();
                    if (t == null) {
                        syntaxError("Invalid function ID", rule, start);
                    }

                    // bufSegStart is the offset in buf to the first
                    // character of the segment we are parsing.
                    int bufSegStart = buf.length();

                    // Parse the segment
                    pos = parseSection(rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);

                    // After parsing a segment, the relevant characters are
                    // in buf, starting at offset bufSegStart.
                    FunctionReplacer r =
                        new FunctionReplacer(t,
                            new StringReplacer(buf.substring(bufSegStart), parser.curData));

                    // Replace the buffer contents with a stand-in
                    buf.setLength(bufSegStart);
                    buf.append(parser.generateStandInFor(r));
                }
                break;
            case SymbolTable.SYMBOL_REF:
                // Handle variable references and segment references "$1" .. "$9"
                {
                    // A variable reference must be followed immediately
                    // by a Unicode identifier start and zero or more
                    // Unicode identifier part characters, or by a digit
                    // 1..9 if it is a segment reference.
                    if (pos == limit) {
                        // A variable ref character at the end acts as
                        // an anchor to the context limit, as in perl.
                        anchorEnd = true;
                        break;
                    }
                    // Parse "$1" "$2" .. "$9" .. (no upper limit)
                    c = rule.charAt(pos);
                    int r = UCharacter.digit(c, 10);
                    if (r >= 1 && r <= 9) {
                        iref[0] = pos;
                        r = Utility.parseNumber(rule, iref, 10);
                        if (r < 0) {
                            syntaxError("Undefined segment reference",
                                        rule, start);
                        }
                        pos = iref[0];
                        buf.append(parser.getSegmentStandin(r));
                    } else {
                        if (pp == null) { // Lazy create
                            pp = new ParsePosition(0);
                        }
                        pp.setIndex(pos);
                        String name = parser.parseData.
                            parseReference(rule, pp, limit);
                        if (name == null) {
                            // This means the '$' was not followed by a
                            // valid name.  Try to interpret it as an
                            // end anchor then.  If this also doesn't work
                            // (if we see a following character) then signal
                            // an error.
                            anchorEnd = true;
                            break;
                        }
                        pos = pp.getIndex();
                        // If this is a variable definition statement,
                        // then the LHS variable will be undefined.  In
                        // that case appendVariableDef() will append the
                        // special placeholder char variableLimit-1.
                        varStart = buf.length();
                        parser.appendVariableDef(name, buf);
                        varLimit = buf.length();
                    }
                }
                break;
            case DOT:
                buf.append(parser.getDotStandIn());
                break;
            case KLEENE_STAR:
            case ONE_OR_MORE:
            case ZERO_OR_ONE:
                // Quantifiers.  We handle single characters, quoted strings,
                // variable references, and segments.
                //  a+      matches  aaa
                //  'foo'+  matches  foofoofoo
                //  $v+     matches  xyxyxy if $v == xy
                //  (seg)+  matches  segsegseg
                {
                    ///CLOVER:OFF
                    // isSegment is always false
                    // NOTE(review): as above, this remark looks stale.
                    if (isSegment && buf.length() == bufStart) {
                        // The */+ immediately follows '('
                        syntaxError("Misplaced quantifier", rule, start);
                        break;
                    }
                    ///CLOVER:ON

                    int qstart, qlimit;
                    // The */+ follows an isolated character or quote
                    // or variable reference
                    if (buf.length() == quoteLimit) {
                        // The */+ follows a 'quoted string'
                        qstart = quoteStart;
                        qlimit = quoteLimit;
                    } else if (buf.length() == varLimit) {
                        // The */+ follows a $variableReference
                        qstart = varStart;
                        qlimit = varLimit;
                    } else {
                        // The */+ follows a single character, possibly
                        // a segment standin
                        qstart = buf.length() - 1;
                        qlimit = qstart + 1;
                    }

                    UnicodeMatcher m;
                    try {
                        m = new StringMatcher(buf.toString(), qstart, qlimit,
                                          0, parser.curData);
                    } catch (RuntimeException e) {
                        // Re-throw with up to 50 characters of rule text on
                        // either side of the failure point, marked by "$$$".
                        final String precontext = pos < 50 ? rule.substring(0, pos) : "..." + rule.substring(pos - 50, pos);
                        final String postContext = limit-pos <= 50 ? rule.substring(pos, limit) : rule.substring(pos, pos+50) + "...";
                        throw (RuntimeException)
                            new IllegalIcuArgumentException("Failure in rule: " + precontext + "$$$"
                                    + postContext).initCause(e);
                    }
                    int min = 0;
                    int max = Quantifier.MAX;
                    switch (c) {
                    case ONE_OR_MORE:
                        min = 1;
                        break;
                    case ZERO_OR_ONE:
                        min = 0;
                        max = 1;
                        break;
                        // case KLEENE_STAR:
                        //    do nothing -- min, max already set
                    }
                    m = new Quantifier(m, min, max);
                    // Replace the quantified span in buf with one stand-in
                    buf.setLength(qstart);
                    buf.append(parser.generateStandInFor(m));
                }
                break;

            //------------------------------------------------------
            // Elements allowed ONLY WITHIN segments
            //------------------------------------------------------
            case SEGMENT_CLOSE:
                // assert(isSegment);
                // We're done parsing a segment.
                break main;

            //------------------------------------------------------
            // Elements allowed ONLY OUTSIDE segments
            //------------------------------------------------------
            case CONTEXT_ANTE:
                if (ante >= 0) {
                    syntaxError("Multiple ante contexts", rule, start);
                }
                ante = buf.length();
                break;
            case CONTEXT_POST:
                if (post >= 0) {
                    syntaxError("Multiple post contexts", rule, start);
                }
                post = buf.length();
                break;
            case CURSOR_POS:
                if (cursor >= 0) {
                    syntaxError("Multiple cursors", rule, start);
                }
                cursor = buf.length();
                break;
            case CURSOR_OFFSET:
                if (cursorOffset < 0) {
                    // Continuing a "|@@..." run: nothing may have been
                    // added to buf yet.
                    if (buf.length() > 0) {
                        syntaxError("Misplaced " + c, rule, start);
                    }
                    --cursorOffset;
                } else if (cursorOffset > 0) {
                    // Continuing a "...@@|" run: buf must not have grown
                    // since the first '@', and no '|' may have been seen.
                    if (buf.length() != cursorOffsetPos || cursor >= 0) {
                        syntaxError("Misplaced " + c, rule, start);
                    }
                    ++cursorOffset;
                } else {
                    // First '@' of this half
                    if (cursor == 0 && buf.length() == 0) {
                        cursorOffset = -1;
                    } else if (cursor < 0) {
                        cursorOffsetPos = buf.length();
                        cursorOffset = 1;
                    } else {
                        syntaxError("Misplaced " + c, rule, start);
                    }
                }
                break;

            //------------------------------------------------------
            // Non-special characters
            //------------------------------------------------------
            default:
                // Disallow unquoted characters other than [0-9A-Za-z]
                // in the printable ASCII range.  These characters are
                // reserved for possible future use.
                if (c >= 0x0021 && c <= 0x007E &&
                    !((c >= '0' && c <= '9') ||
                      (c >= 'A' && c <= 'Z') ||
                      (c >= 'a' && c <= 'z'))) {
                    syntaxError("Unquoted " + c, rule, start);
                }
                buf.append(c);
                break;
            }
        }
        return pos;
    }

    /**
     * Remove context: trim text down to the span between the ante and
     * post markers (or the corresponding end of text when a marker is
     * absent), then clear the markers and both anchor flags.
     */
    void removeContext() {
        text = text.substring(ante < 0 ? 0 : ante,
                              post < 0 ? text.length() : post);
        ante = post = -1;
        anchorStart = anchorEnd = false;
    }

    /**
     * Return true if this half looks like valid output, that is, does not
     * contain quantifiers or other special input-only elements.
     * Every code point of text must satisfy parser.parseData.isReplacer().
     */
    public boolean isValidOutput(TransliteratorParser parser) {
        for (int i=0; i<text.length(); ) {
            int c = UTF16.charAt(text, i);
            i += UTF16.getCharCount(c);
            if (!parser.parseData.isReplacer(c)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Return true if this half looks like valid input, that is, does not
     * contain functions or other special output-only elements.
     * Every code point of text must satisfy parser.parseData.isMatcher().
     */
    public boolean isValidInput(TransliteratorParser parser) {
        for (int i=0; i<text.length(); ) {
            int c = UTF16.charAt(text, i);
            i += UTF16.getCharCount(c);
            if (!parser.parseData.isMatcher(c)) {
                return false;
            }
        }
        return true;
    }
}
After the parse completes, examine the public 864 * data members for results. 865 */ 866 public void parse(String rules, int dir) { 867 parseRules(new RuleArray(new String[] { rules }), dir); 868 } 869 870 /* 871 * Parse a set of rules. After the parse completes, examine the public 872 * data members for results. 873 */ 874/* public void parse(ResourceReader rules, int direction) { 875 parseRules(new RuleReader(rules), direction); 876 }*/ 877 878 //---------------------------------------------------------------------- 879 // PRIVATE methods 880 //---------------------------------------------------------------------- 881 882 /** 883 * Parse an array of zero or more rules. The strings in the array are 884 * treated as if they were concatenated together, with rule terminators 885 * inserted between array elements if not present already. 886 * 887 * Any previous rules are discarded. Typically this method is called exactly 888 * once, during construction. 889 * 890 * The member this.data will be set to null if there are no rules. 891 * 892 * @exception IllegalIcuArgumentException if there is a syntax error in the 893 * rules 894 */ 895 void parseRules(RuleBody ruleArray, int dir) { 896 boolean parsingIDs = true; 897 int ruleCount = 0; 898 899 dataVector = new ArrayList<Data>(); 900 idBlockVector = new ArrayList<String>(); 901 curData = null; 902 direction = dir; 903 compoundFilter = null; 904 variablesVector = new ArrayList<Object>(); 905 variableNames = new HashMap<String, char[]>(); 906 parseData = new ParseData(); 907 908 List<RuntimeException> errors = new ArrayList<RuntimeException>(); 909 int errorCount = 0; 910 911 ruleArray.reset(); 912 913 StringBuilder idBlockResult = new StringBuilder(); 914 915 // The compound filter offset is an index into idBlockResult. 916 // If it is 0, then the compound filter occurred at the start, 917 // and it is the offset to the _start_ of the compound filter 918 // pattern. 
Otherwise it is the offset to the _limit_ of the 919 // compound filter pattern within idBlockResult. 920 this.compoundFilter = null; 921 int compoundFilterOffset = -1; 922 923 main: 924 for (;;) { 925 String rule = ruleArray.nextLine(); 926 if (rule == null) { 927 break; 928 } 929 int pos = 0; 930 int limit = rule.length(); 931 while (pos < limit) { 932 char c = rule.charAt(pos++); 933 if (PatternProps.isWhiteSpace(c)) { 934 continue; 935 } 936 // Skip lines starting with the comment character 937 if (c == RULE_COMMENT_CHAR) { 938 pos = rule.indexOf("\n", pos) + 1; 939 if (pos == 0) { 940 break; // No "\n" found; rest of rule is a commnet 941 } 942 continue; // Either fall out or restart with next line 943 } 944 945 // skip empty rules 946 if (c == END_OF_RULE) 947 continue; 948 949 // Often a rule file contains multiple errors. It's 950 // convenient to the rule author if these are all reported 951 // at once. We keep parsing rules even after a failure, up 952 // to a specified limit, and report all errors at once. 953 try { 954 ++ruleCount; 955 956 // We've found the start of a rule or ID. c is its first 957 // character, and pos points past c. 958 --pos; 959 // Look for an ID token. Must have at least ID_TOKEN_LEN + 1 960 // chars left. 961 if ((pos + ID_TOKEN_LEN + 1) <= limit && 962 rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) { 963 pos += ID_TOKEN_LEN; 964 c = rule.charAt(pos); 965 while (PatternProps.isWhiteSpace(c) && pos < limit) { 966 ++pos; 967 c = rule.charAt(pos); 968 } 969 int[] p = new int[] { pos }; 970 971 if (!parsingIDs) { 972 if (curData != null) { 973 if (direction == Transliterator.FORWARD) 974 dataVector.add(curData); 975 else 976 dataVector.add(0, curData); 977 curData = null; 978 } 979 parsingIDs = true; 980 } 981 982 TransliteratorIDParser.SingleID id = 983 TransliteratorIDParser.parseSingleID( 984 rule, p, direction); 985 if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) { 986 // Successful ::ID parse. 
987 988 if (direction == Transliterator.FORWARD) { 989 idBlockResult.append(id.canonID).append(END_OF_RULE); 990 } else { 991 idBlockResult.insert(0, id.canonID + END_OF_RULE); 992 } 993 994 } else { 995 // Couldn't parse an ID. Try to parse a global filter 996 int[] withParens = new int[] { -1 }; 997 UnicodeSet f = TransliteratorIDParser.parseGlobalFilter(rule, p, direction, withParens, null); 998 if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) { 999 if ((direction == Transliterator.FORWARD) == 1000 (withParens[0] == 0)) { 1001 if (compoundFilter != null) { 1002 // Multiple compound filters 1003 syntaxError("Multiple global filters", rule, pos); 1004 } 1005 compoundFilter = f; 1006 compoundFilterOffset = ruleCount; 1007 } 1008 } else { 1009 // Invalid ::id 1010 // Can be parsed as neither an ID nor a global filter 1011 syntaxError("Invalid ::ID", rule, pos); 1012 } 1013 } 1014 1015 pos = p[0]; 1016 } else { 1017 if (parsingIDs) { 1018 if (direction == Transliterator.FORWARD) 1019 idBlockVector.add(idBlockResult.toString()); 1020 else 1021 idBlockVector.add(0, idBlockResult.toString()); 1022 idBlockResult.delete(0, idBlockResult.length()); 1023 parsingIDs = false; 1024 curData = new RuleBasedTransliterator.Data(); 1025 1026 // By default, rules use part of the private use area 1027 // E000..F8FF for variables and other stand-ins. Currently 1028 // the range F000..F8FF is typically sufficient. The 'use 1029 // variable range' pragma allows rule sets to modify this. 
1030 setVariableRange(0xF000, 0xF8FF); 1031 } 1032 1033 if (resemblesPragma(rule, pos, limit)) { 1034 int ppp = parsePragma(rule, pos, limit); 1035 if (ppp < 0) { 1036 syntaxError("Unrecognized pragma", rule, pos); 1037 } 1038 pos = ppp; 1039 // Parse a rule 1040 } else { 1041 pos = parseRule(rule, pos, limit); 1042 } 1043 } 1044 } catch (IllegalArgumentException e) { 1045 if (errorCount == 30) { 1046 IllegalIcuArgumentException icuEx = new IllegalIcuArgumentException("\nMore than 30 errors; further messages squelched"); 1047 icuEx.initCause(e); 1048 errors.add(icuEx); 1049 break main; 1050 } 1051 e.fillInStackTrace(); 1052 errors.add(e); 1053 ++errorCount; 1054 pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';' 1055 } 1056 } 1057 } 1058 if (parsingIDs && idBlockResult.length() > 0) { 1059 if (direction == Transliterator.FORWARD) 1060 idBlockVector.add(idBlockResult.toString()); 1061 else 1062 idBlockVector.add(0, idBlockResult.toString()); 1063 } 1064 else if (!parsingIDs && curData != null) { 1065 if (direction == Transliterator.FORWARD) 1066 dataVector.add(curData); 1067 else 1068 dataVector.add(0, curData); 1069 } 1070 1071 // Convert the set vector to an array 1072 for (int i = 0; i < dataVector.size(); i++) { 1073 Data data = dataVector.get(i); 1074 data.variables = new Object[variablesVector.size()]; 1075 variablesVector.toArray(data.variables); 1076 data.variableNames = new HashMap<String, char[]>(); 1077 data.variableNames.putAll(variableNames); 1078 } 1079 variablesVector = null; 1080 1081 // Do more syntax checking and index the rules 1082 try { 1083 if (compoundFilter != null) { 1084 if ((direction == Transliterator.FORWARD && 1085 compoundFilterOffset != 1) || 1086 (direction == Transliterator.REVERSE && 1087 compoundFilterOffset != ruleCount)) { 1088 throw new IllegalIcuArgumentException("Compound filters misplaced"); 1089 } 1090 } 1091 1092 for (int i = 0; i < dataVector.size(); i++) { 1093 Data data = dataVector.get(i); 1094 
data.ruleSet.freeze(); 1095 } 1096 1097 if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0) 1098 idBlockVector.remove(0); 1099 1100 } catch (IllegalArgumentException e) { 1101 e.fillInStackTrace(); 1102 errors.add(e); 1103 } 1104 1105 if (errors.size() != 0) { 1106 for (int i = errors.size()-1; i > 0; --i) { 1107 RuntimeException previous = errors.get(i-1); 1108 while (previous.getCause() != null) { 1109 previous = (RuntimeException) previous.getCause(); // chain specially 1110 } 1111 previous.initCause(errors.get(i)); 1112 } 1113 throw errors.get(0); 1114 // if initCause not supported: throw new IllegalArgumentException(errors.toString()); 1115 } 1116 } 1117 1118 /** 1119 * MAIN PARSER. Parse the next rule in the given rule string, starting 1120 * at pos. Return the index after the last character parsed. Do not 1121 * parse characters at or after limit. 1122 * 1123 * Important: The character at pos must be a non-whitespace character 1124 * that is not the comment character. 1125 * 1126 * This method handles quoting, escaping, and whitespace removal. It 1127 * parses the end-of-rule character. It recognizes context and cursor 1128 * indicators. Once it does a lexical breakdown of the rule at pos, it 1129 * creates a rule object and adds it to our rule list. 1130 * 1131 * This method is tightly coupled to the inner class RuleHalf. 
1132 */ 1133 private int parseRule(String rule, int pos, int limit) { 1134 // Locate the left side, operator, and right side 1135 int start = pos; 1136 char operator = 0; 1137 1138 // Set up segments data 1139 segmentStandins = new StringBuffer(); 1140 segmentObjects = new ArrayList<StringMatcher>(); 1141 1142 RuleHalf left = new RuleHalf(); 1143 RuleHalf right = new RuleHalf(); 1144 1145 undefinedVariableName = null; 1146 pos = left.parse(rule, pos, limit, this); 1147 1148 if (pos == limit || 1149 OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) { 1150 syntaxError("No operator pos=" + pos, rule, start); 1151 } 1152 ++pos; 1153 1154 // Found an operator char. Check for forward-reverse operator. 1155 if (operator == REVERSE_RULE_OP && 1156 (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) { 1157 ++pos; 1158 operator = FWDREV_RULE_OP; 1159 } 1160 1161 // Translate alternate op characters. 1162 switch (operator) { 1163 case ALT_FORWARD_RULE_OP: 1164 operator = FORWARD_RULE_OP; 1165 break; 1166 case ALT_REVERSE_RULE_OP: 1167 operator = REVERSE_RULE_OP; 1168 break; 1169 case ALT_FWDREV_RULE_OP: 1170 operator = FWDREV_RULE_OP; 1171 break; 1172 } 1173 1174 pos = right.parse(rule, pos, limit, this); 1175 1176 if (pos < limit) { 1177 if (rule.charAt(--pos) == END_OF_RULE) { 1178 ++pos; 1179 } else { 1180 // RuleHalf parser must have terminated at an operator 1181 syntaxError("Unquoted operator", rule, start); 1182 } 1183 } 1184 1185 if (operator == VARIABLE_DEF_OP) { 1186 // LHS is the name. RHS is a single character, either a literal 1187 // or a set (already parsed). If RHS is longer than one 1188 // character, it is either a multi-character string, or multiple 1189 // sets, or a mixture of chars and sets -- syntax error. 1190 1191 // We expect to see a single undefined variable (the one being 1192 // defined). 
1193 if (undefinedVariableName == null) { 1194 syntaxError("Missing '$' or duplicate definition", rule, start); 1195 } 1196 if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) { 1197 syntaxError("Malformed LHS", rule, start); 1198 } 1199 if (left.anchorStart || left.anchorEnd || 1200 right.anchorStart || right.anchorEnd) { 1201 syntaxError("Malformed variable def", rule, start); 1202 } 1203 // We allow anything on the right, including an empty string. 1204 int n = right.text.length(); 1205 char[] value = new char[n]; 1206 right.text.getChars(0, n, value, 0); 1207 variableNames.put(undefinedVariableName, value); 1208 1209 ++variableLimit; 1210 return pos; 1211 } 1212 1213 // If this is not a variable definition rule, we shouldn't have 1214 // any undefined variable names. 1215 if (undefinedVariableName != null) { 1216 syntaxError("Undefined variable $" + undefinedVariableName, 1217 rule, start); 1218 } 1219 1220 // Verify segments 1221 if (segmentStandins.length() > segmentObjects.size()) { 1222 syntaxError("Undefined segment reference", rule, start); 1223 } 1224 for (int i=0; i<segmentStandins.length(); ++i) { 1225 if (segmentStandins.charAt(i) == 0) { 1226 syntaxError("Internal error", rule, start); // will never happen 1227 } 1228 } 1229 for (int i=0; i<segmentObjects.size(); ++i) { 1230 if (segmentObjects.get(i) == null) { 1231 syntaxError("Internal error", rule, start); // will never happen 1232 } 1233 } 1234 1235 // If the direction we want doesn't match the rule 1236 // direction, do nothing. 1237 if (operator != FWDREV_RULE_OP && 1238 ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) { 1239 return pos; 1240 } 1241 1242 // Transform the rule into a forward rule by swapping the 1243 // sides if necessary. 1244 if (direction == Transliterator.REVERSE) { 1245 RuleHalf temp = left; 1246 left = right; 1247 right = temp; 1248 } 1249 1250 // Remove non-applicable elements in forward-reverse 1251 // rules. 
Bidirectional rules ignore elements that do not 1252 // apply. 1253 if (operator == FWDREV_RULE_OP) { 1254 right.removeContext(); 1255 left.cursor = -1; 1256 left.cursorOffset = 0; 1257 } 1258 1259 // Normalize context 1260 if (left.ante < 0) { 1261 left.ante = 0; 1262 } 1263 if (left.post < 0) { 1264 left.post = left.text.length(); 1265 } 1266 1267 // Context is only allowed on the input side. Cursors are only 1268 // allowed on the output side. Segment delimiters can only appear 1269 // on the left, and references on the right. Cursor offset 1270 // cannot appear without an explicit cursor. Cursor offset 1271 // cannot place the cursor outside the limits of the context. 1272 // Anchors are only allowed on the input side. 1273 if (right.ante >= 0 || right.post >= 0 || left.cursor >= 0 || 1274 (right.cursorOffset != 0 && right.cursor < 0) || 1275 // - The following two checks were used to ensure that the 1276 // - the cursor offset stayed within the ante- or postcontext. 1277 // - However, with the addition of quantifiers, we have to 1278 // - allow arbitrary cursor offsets and do runtime checking. 1279 //(right.cursorOffset > (left.text.length() - left.post)) || 1280 //(-right.cursorOffset > left.ante) || 1281 right.anchorStart || right.anchorEnd || 1282 !left.isValidInput(this) || !right.isValidOutput(this) || 1283 left.ante > left.post) { 1284 syntaxError("Malformed rule", rule, start); 1285 } 1286 1287 // Flatten segment objects vector to an array 1288 UnicodeMatcher[] segmentsArray = null; 1289 if (segmentObjects.size() > 0) { 1290 segmentsArray = new UnicodeMatcher[segmentObjects.size()]; 1291 segmentObjects.toArray(segmentsArray); 1292 } 1293 1294 curData.ruleSet.addRule(new TransliterationRule( 1295 left.text, left.ante, left.post, 1296 right.text, right.cursor, right.cursorOffset, 1297 segmentsArray, 1298 left.anchorStart, left.anchorEnd, 1299 curData)); 1300 1301 return pos; 1302 } 1303 1304 /** 1305 * Set the variable range to [start, end] (inclusive). 
1306 */ 1307 private void setVariableRange(int start, int end) { 1308 if (start > end || start < 0 || end > 0xFFFF) { 1309 throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end); 1310 } 1311 1312 curData.variablesBase = (char) start; // first private use 1313 1314 if (dataVector.size() == 0) { 1315 variableNext = (char) start; 1316 variableLimit = (char) (end + 1); 1317 } 1318 } 1319 1320 /** 1321 * Assert that the given character is NOT within the variable range. 1322 * If it is, signal an error. This is neccesary to ensure that the 1323 * variable range does not overlap characters used in a rule. 1324 */ 1325 private void checkVariableRange(int ch, String rule, int start) { 1326 if (ch >= curData.variablesBase && ch < variableLimit) { 1327 syntaxError("Variable range character in rule", rule, start); 1328 } 1329 } 1330 1331 // (The following method is part of an unimplemented feature. 1332 // Remove this clover pragma after the feature is implemented. 1333 // 2003-06-11 ICU 2.6 Alan) 1334 ///CLOVER:OFF 1335 /** 1336 * Set the maximum backup to 'backup', in response to a pragma 1337 * statement. 1338 */ 1339 private void pragmaMaximumBackup(int backup) { 1340 //TODO Finish 1341 throw new IllegalIcuArgumentException("use maximum backup pragma not implemented yet"); 1342 } 1343 ///CLOVER:ON 1344 1345 // (The following method is part of an unimplemented feature. 1346 // Remove this clover pragma after the feature is implemented. 1347 // 2003-06-11 ICU 2.6 Alan) 1348 ///CLOVER:OFF 1349 /** 1350 * Begin normalizing all rules using the given mode, in response 1351 * to a pragma statement. 1352 */ 1353 private void pragmaNormalizeRules(Normalizer.Mode mode) { 1354 //TODO Finish 1355 throw new IllegalIcuArgumentException("use normalize rules pragma not implemented yet"); 1356 } 1357 ///CLOVER:ON 1358 1359 /** 1360 * Return true if the given rule looks like a pragma. 
1361 * @param pos offset to the first non-whitespace character 1362 * of the rule. 1363 * @param limit pointer past the last character of the rule. 1364 */ 1365 static boolean resemblesPragma(String rule, int pos, int limit) { 1366 // Must start with /use\s/i 1367 return Utility.parsePattern(rule, pos, limit, "use ", null) >= 0; 1368 } 1369 1370 /** 1371 * Parse a pragma. This method assumes resemblesPragma() has 1372 * already returned true. 1373 * @param pos offset to the first non-whitespace character 1374 * of the rule. 1375 * @param limit pointer past the last character of the rule. 1376 * @return the position index after the final ';' of the pragma, 1377 * or -1 on failure. 1378 */ 1379 private int parsePragma(String rule, int pos, int limit) { 1380 int[] array = new int[2]; 1381 1382 // resemblesPragma() has already returned true, so we 1383 // know that pos points to /use\s/i; we can skip 4 characters 1384 // immediately 1385 pos += 4; 1386 1387 // Here are the pragmas we recognize: 1388 // use variable range 0xE000 0xEFFF; 1389 // use maximum backup 16; 1390 // use nfd rules; 1391 int p = Utility.parsePattern(rule, pos, limit, "~variable range # #~;", array); 1392 if (p >= 0) { 1393 setVariableRange(array[0], array[1]); 1394 return p; 1395 } 1396 1397 p = Utility.parsePattern(rule, pos, limit, "~maximum backup #~;", array); 1398 if (p >= 0) { 1399 pragmaMaximumBackup(array[0]); 1400 return p; 1401 } 1402 1403 p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null); 1404 if (p >= 0) { 1405 pragmaNormalizeRules(Normalizer.NFD); 1406 return p; 1407 } 1408 1409 p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null); 1410 if (p >= 0) { 1411 pragmaNormalizeRules(Normalizer.NFC); 1412 return p; 1413 } 1414 1415 // Syntax error: unable to parse pragma 1416 return -1; 1417 } 1418 1419 /** 1420 * Throw an exception indicating a syntax error. Search the rule string 1421 * for the probable end of the rule. 
Of course, if the error is that 1422 * the end of rule marker is missing, then the rule end will not be found. 1423 * In any case the rule start will be correctly reported. 1424 * @param msg error description 1425 * @param rule pattern string 1426 * @param start position of first character of current rule 1427 */ 1428 static final void syntaxError(String msg, String rule, int start) { 1429 int end = ruleEnd(rule, start, rule.length()); 1430 throw new IllegalIcuArgumentException(msg + " in \"" + 1431 Utility.escape(rule.substring(start, end)) + '"'); 1432 } 1433 1434 static final int ruleEnd(String rule, int start, int limit) { 1435 int end = Utility.quotedIndexOf(rule, start, limit, ";"); 1436 if (end < 0) { 1437 end = limit; 1438 } 1439 return end; 1440 } 1441 1442 /** 1443 * Parse a UnicodeSet out, store it, and return the stand-in character 1444 * used to represent it. 1445 */ 1446 private final char parseSet(String rule, ParsePosition pos) { 1447 UnicodeSet set = new UnicodeSet(rule, pos, parseData); 1448 if (variableNext >= variableLimit) { 1449 throw new RuntimeException("Private use variables exhausted"); 1450 } 1451 set.compact(); 1452 return generateStandInFor(set); 1453 } 1454 1455 /** 1456 * Generate and return a stand-in for a new UnicodeMatcher or UnicodeReplacer. 1457 * Store the object. 1458 */ 1459 char generateStandInFor(Object obj) { 1460 // assert(obj != null); 1461 1462 // Look up previous stand-in, if any. This is a short list 1463 // (typical n is 0, 1, or 2); linear search is optimal. 1464 for (int i=0; i<variablesVector.size(); ++i) { 1465 if (variablesVector.get(i) == obj) { // [sic] pointer comparison 1466 return (char) (curData.variablesBase + i); 1467 } 1468 } 1469 1470 if (variableNext >= variableLimit) { 1471 throw new RuntimeException("Variable range exhausted"); 1472 } 1473 variablesVector.add(obj); 1474 return variableNext++; 1475 } 1476 1477 /** 1478 * Return the standin for segment seg (1-based). 
1479 */ 1480 public char getSegmentStandin(int seg) { 1481 if (segmentStandins.length() < seg) { 1482 segmentStandins.setLength(seg); 1483 } 1484 char c = segmentStandins.charAt(seg-1); 1485 if (c == 0) { 1486 if (variableNext >= variableLimit) { 1487 throw new RuntimeException("Variable range exhausted"); 1488 } 1489 c = variableNext++; 1490 // Set a placeholder in the master variables vector that will be 1491 // filled in later by setSegmentObject(). We know that we will get 1492 // called first because setSegmentObject() will call us. 1493 variablesVector.add(null); 1494 segmentStandins.setCharAt(seg-1, c); 1495 } 1496 return c; 1497 } 1498 1499 /** 1500 * Set the object for segment seg (1-based). 1501 */ 1502 public void setSegmentObject(int seg, StringMatcher obj) { 1503 // Since we call parseSection() recursively, nested 1504 // segments will result in segment i+1 getting parsed 1505 // and stored before segment i; be careful with the 1506 // vector handling here. 1507 while (segmentObjects.size() < seg) { 1508 segmentObjects.add(null); 1509 } 1510 int index = getSegmentStandin(seg) - curData.variablesBase; 1511 if (segmentObjects.get(seg-1) != null || 1512 variablesVector.get(index) != null) { 1513 throw new RuntimeException(); // should never happen 1514 } 1515 segmentObjects.set(seg-1, obj); 1516 variablesVector.set(index, obj); 1517 } 1518 1519 /** 1520 * Return the stand-in for the dot set. It is allocated the first 1521 * time and reused thereafter. 1522 */ 1523 char getDotStandIn() { 1524 if (dotStandIn == -1) { 1525 dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET)); 1526 } 1527 return (char) dotStandIn; 1528 } 1529 1530 /** 1531 * Append the value of the given variable name to the given 1532 * StringBuffer. 1533 * @exception IllegalIcuArgumentException if the name is unknown. 
*/
    private void appendVariableDef(String name, StringBuffer buf) {
        char[] definition = variableNames.get(name);
        if (definition != null) {
            // Known variable: splice in its stored value.
            buf.append(definition);
            return;
        }

        // We allow one undefined variable so that variable definition
        // statements work.  A second undefined name is an error.
        if (undefinedVariableName != null) {
            throw new IllegalIcuArgumentException("Undefined variable $"
                    + name);
        }

        // For the first undefined variable, save the variable name and
        // append the special placeholder variableLimit-1, shrinking the
        // available stand-in range by one.
        undefinedVariableName = name;
        if (variableNext >= variableLimit) {
            throw new RuntimeException("Private use variables exhausted");
        }
        buf.append(--variableLimit);
    }
}

//eof