/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationRuleParser.java, ported from collationruleparser.h/.cpp
*
* C++ version created on: 2013apr10
* created by: Markus W. Scherer
*/

package com.ibm.icu.impl.coll;

import java.text.ParseException;
import java.util.ArrayList;

import com.ibm.icu.impl.IllegalIcuArgumentException;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

/**
 * Parser for collation tailoring rule strings.
 * Resets and relations are reported to a {@link Sink};
 * settings/options are written into a {@link CollationSettings} object.
 */
public final class CollationRuleParser {
    /** Special reset positions. */
    enum Position {
        FIRST_TERTIARY_IGNORABLE,
        LAST_TERTIARY_IGNORABLE,
        FIRST_SECONDARY_IGNORABLE,
        LAST_SECONDARY_IGNORABLE,
        FIRST_PRIMARY_IGNORABLE,
        LAST_PRIMARY_IGNORABLE,
        FIRST_VARIABLE,
        LAST_VARIABLE,
        FIRST_REGULAR,
        LAST_REGULAR,
        FIRST_IMPLICIT,
        LAST_IMPLICIT,
        FIRST_TRAILING,
        LAST_TRAILING
    }
    static final Position[] POSITION_VALUES = Position.values();

    /**
     * First character of contractions that encode special reset positions.
     * U+FFFE cannot be tailored via rule syntax.
     *
     * The second contraction character is POS_BASE + Position.
     */
    static final char POS_LEAD = 0xfffe;
    /**
     * Base for the second character of contractions that encode special reset positions.
     * Braille characters U+28xx are printable and normalization-inert.
     * @see #POS_LEAD
     */
    static final char POS_BASE = 0x2800;

    /** Receiver for the resets, relations and set-valued options found by the parser. */
    static abstract class Sink {
        /**
         * Adds a reset.
         * strength=UCOL_IDENTICAL for &str.
         * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
         */
        abstract void addReset(int strength, CharSequence str);
        /**
         * Adds a relation with strength and prefix | str / extension.
         */
        abstract void addRelation(int strength, CharSequence prefix,
                CharSequence str, CharSequence extension);

        /** Called for [suppressContractions set]; does nothing by default. */
        void suppressContractions(UnicodeSet set) {}

        /** Called for [optimize set]; does nothing by default. */
        void optimize(UnicodeSet set) {}
    }

    /** Supplies the rule string for an [import langTag] option. */
    interface Importer {
        String getRules(String localeID, String collationType);
    }

    /**
     * Constructor.
     * The Sink must be set before parsing.
     * The Importer can be set, otherwise [import locale] syntax is not supported.
     */
    CollationRuleParser(CollationData base) {
        baseData = base;
    }

    /**
     * Sets the pointer to a Sink object.
     * The pointer is aliased: Pointer copy without cloning or taking ownership.
     */
    void setSink(Sink sinkAlias) {
        sink = sinkAlias;
    }

    /**
     * Sets the pointer to an Importer object.
     * The pointer is aliased: Pointer copy without cloning or taking ownership.
     */
    void setImporter(Importer importerAlias) {
        importer = importerAlias;
    }

    /**
     * Parses the rule string, writing options into outSettings
     * and reporting resets and relations to the Sink.
     *
     * @param ruleString the collation rules to parse
     * @param outSettings the settings object that receives parsed options
     * @throws ParseException if the rule string is not valid
     */
    void parse(String ruleString, CollationSettings outSettings) throws ParseException {
        settings = outSettings;
        parse(ruleString);
    }

    private static final int UCOL_DEFAULT = -1;
    private static final int UCOL_OFF = 0;
    private static final int UCOL_ON = 1;

    /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
    private static final int STRENGTH_MASK = 0xf;
    /** Flag in a parseRelationOperator() result: the operator was followed by '*'. */
    private static final int STARRED_FLAG = 0x10;
    /** Shift for the operator length in a parseRelationOperator() result. */
    private static final int OFFSET_SHIFT = 8;

    private static final String BEFORE = "[before";

    // In C++, we parse into temporary UnicodeString objects named "raw" or "str".
    // In Java, we reuse this StringBuilder.
    private final StringBuilder rawBuilder = new StringBuilder();

    /**
     * Parses a whole rule string from its start.
     * Replaces the current rules and resets ruleIndex to 0;
     * callers that nest a parse (see [import]) must save and restore both.
     *
     * @throws ParseException if the rule string is not valid
     */
    private void parse(String ruleString) throws ParseException {
        rules = ruleString;
        ruleIndex = 0;

        while(ruleIndex < rules.length()) {
            char c = rules.charAt(ruleIndex);
            if(PatternProps.isWhiteSpace(c)) {
                ++ruleIndex;
                continue;
            }
            switch(c) {
            case 0x26:  // '&'
                parseRuleChain();
                break;
            case 0x5b:  // '['
                parseSetting();
                break;
            case 0x23:  // '#' starts a comment, until the end of the line
                ruleIndex = skipComment(ruleIndex + 1);
                break;
            case 0x40:  // '@' is equivalent to [backwards 2]
                settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
                ++ruleIndex;
                break;
            case 0x21:  // '!' used to turn on Thai/Lao character reversal
                // Accept but ignore. The root collator has contractions
                // that are equivalent to the character reversal, where appropriate.
                ++ruleIndex;
                break;
            default:
                setParseError("expected a reset or setting or comment");
                break;
            }
        }
    }

    /**
     * Parses one rule chain: a reset followed by one or more relations.
     * Comments between relations are skipped.
     * The chain ends at the first text that is not a relation operator.
     *
     * @throws ParseException on a syntax error, or if the reset is not followed by a relation
     */
    private void parseRuleChain() throws ParseException {
        int resetStrength = parseResetAndPosition();
        boolean isFirstRelation = true;
        for(;;) {
            int result = parseRelationOperator();
            if(result < 0) {
                if(ruleIndex < rules.length() && rules.charAt(ruleIndex) == 0x23) {
                    // '#' starts a comment, until the end of the line
                    ruleIndex = skipComment(ruleIndex + 1);
                    continue;
                }
                if(isFirstRelation) {
                    setParseError("reset not followed by a relation");
                }
                return;
            }
            int strength = result & STRENGTH_MASK;
            if(resetStrength < Collator.IDENTICAL) {
                // reset-before rule chain
                if(isFirstRelation) {
                    if(strength != resetStrength) {
                        setParseError("reset-before strength differs from its first relation");
                        return;
                    }
                } else {
                    if(strength < resetStrength) {
                        setParseError("reset-before strength followed by a stronger relation");
                        return;
                    }
                }
            }
            int i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
            if((result & STARRED_FLAG) == 0) {
                parseRelationStrings(strength, i);
            } else {
                parseStarredCharacters(strength, i);
            }
            isFirstRelation = false;
        }
    }

    /**
     * Parses the reset at ruleIndex (which is at the '&'), including an optional
     * [before n] and either a special position or a tailoring string,
     * and passes it to the Sink. On success, ruleIndex is moved past the reset.
     *
     * @return the reset strength: Collator.IDENTICAL for a plain reset,
     *         or PRIMARY/SECONDARY/TERTIARY for [before 1/2/3]
     * @throws ParseException on a syntax error or if the Sink rejects the reset
     */
    private int parseResetAndPosition() throws ParseException {
        int i = skipWhiteSpace(ruleIndex + 1);
        int j;
        char c;
        int resetStrength;
        if(rules.regionMatches(i, BEFORE, 0, BEFORE.length()) &&
                (j = i + BEFORE.length()) < rules.length() &&
                PatternProps.isWhiteSpace(rules.charAt(j)) &&
                ((j = skipWhiteSpace(j + 1)) + 1) < rules.length() &&
                0x31 <= (c = rules.charAt(j)) && c <= 0x33 &&
                rules.charAt(j + 1) == 0x5d) {
            // &[before n] with n=1 or 2 or 3
            resetStrength = Collator.PRIMARY + (c - 0x31);
            i = skipWhiteSpace(j + 2);
        } else {
            resetStrength = Collator.IDENTICAL;
        }
        if(i >= rules.length()) {
            setParseError("reset without position");
            return UCOL_DEFAULT;
        }
        if(rules.charAt(i) == 0x5b) {  // '['
            i = parseSpecialPosition(i, rawBuilder);
        } else {
            i = parseTailoringString(i, rawBuilder);
        }
        try {
            sink.addReset(resetStrength, rawBuilder);
        } catch(Exception e) {
            setParseError("adding reset failed", e);
            return UCOL_DEFAULT;
        }
        ruleIndex = i;
        return resetStrength;
    }

    /**
     * Skips white space at ruleIndex and recognizes a relation operator there,
     * without moving ruleIndex past the operator.
     *
     * @return UCOL_DEFAULT (negative) if there is no relation operator;
     *         otherwise (operator length &lt;&lt; OFFSET_SHIFT) | strength,
     *         with STARRED_FLAG set if the operator is followed by '*'
     */
    private int parseRelationOperator() {
        ruleIndex = skipWhiteSpace(ruleIndex);
        if(ruleIndex >= rules.length()) { return UCOL_DEFAULT; }
        int strength;
        int i = ruleIndex;
        char c = rules.charAt(i++);
        switch(c) {
        case 0x3c:  // '<'
            if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<
                ++i;
                if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<
                    ++i;
                    if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<<
                        ++i;
                        strength = Collator.QUATERNARY;
                    } else {
                        strength = Collator.TERTIARY;
                    }
                } else {
                    strength = Collator.SECONDARY;
                }
            } else {
                strength = Collator.PRIMARY;
            }
            if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
                ++i;
                strength |= STARRED_FLAG;
            }
            break;
        case 0x3b:  // ';' same as <<
            strength = Collator.SECONDARY;
            break;
        case 0x2c:  // ',' same as <<<
            strength = Collator.TERTIARY;
            break;
        case 0x3d:  // '='
            strength = Collator.IDENTICAL;
            if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
                ++i;
                strength |= STARRED_FLAG;
            }
            break;
        default:
            return UCOL_DEFAULT;
        }
        return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
    }

    /**
     * Parses the strings of a non-starred relation starting at index i
     * and passes them to the Sink. On success, ruleIndex is moved past the relation.
     *
     * @param strength the relation strength
     * @param i rule index just after the relation operator
     * @throws ParseException on a syntax error or if the Sink rejects the relation
     */
    private void parseRelationStrings(int strength, int i) throws ParseException {
        // Parse
        //     prefix | str / extension
        // where prefix and extension are optional.
        String prefix = "";
        CharSequence extension = "";
        i = parseTailoringString(i, rawBuilder);
        char next = (i < rules.length()) ? rules.charAt(i) : 0;
        if(next == 0x7c) {  // '|' separates the context prefix from the string.
            prefix = rawBuilder.toString();
            i = parseTailoringString(i + 1, rawBuilder);
            next = (i < rules.length()) ? rules.charAt(i) : 0;
        }
        // str = rawBuilder (do not modify rawBuilder any more in this function)
        if(next == 0x2f) {  // '/' separates the string from the extension.
            StringBuilder extBuilder = new StringBuilder();
            i = parseTailoringString(i + 1, extBuilder);
            extension = extBuilder;
        }
        if(prefix.length() != 0) {
            int prefix0 = prefix.codePointAt(0);
            int c = rawBuilder.codePointAt(0);
            if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
                setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary");
                return;
            }
        }
        try {
            sink.addRelation(strength, prefix, rawBuilder, extension);
        } catch(Exception e) {
            setParseError("adding relation failed", e);
            return;
        }
        ruleIndex = i;
    }

    /**
     * Parses the characters and ranges of a starred relation starting at index i
     * and adds one relation per code point to the Sink.
     * On success, ruleIndex is moved past the relation and following white space.
     *
     * @param strength the relation strength
     * @param i rule index just after the starred relation operator
     * @throws ParseException on a syntax error, for code points that are not NFD-inert,
     *         for invalid ranges, or if the Sink rejects a relation
     */
    private void parseStarredCharacters(int strength, int i) throws ParseException {
        String empty = "";
        i = parseString(skipWhiteSpace(i), rawBuilder);
        if(rawBuilder.length() == 0) {
            setParseError("missing starred-relation string");
            return;
        }
        int prev = -1;
        int j = 0;
        for(;;) {
            while(j < rawBuilder.length()) {
                int c = rawBuilder.codePointAt(j);
                if(!nfd.isInert(c)) {
                    setParseError("starred-relation string is not all NFD-inert");
                    return;
                }
                try {
                    sink.addRelation(strength, empty, UTF16.valueOf(c), empty);
                } catch(Exception e) {
                    setParseError("adding relation failed", e);
                    return;
                }
                j += Character.charCount(c);
                prev = c;
            }
            if(i >= rules.length() || rules.charAt(i) != 0x2d) {  // '-'
                break;
            }
            if(prev < 0) {
                setParseError("range without start in starred-relation string");
                return;
            }
            i = parseString(i + 1, rawBuilder);
            if(rawBuilder.length() == 0) {
                setParseError("range without end in starred-relation string");
                return;
            }
            int c = rawBuilder.codePointAt(0);
            if(c < prev) {
                setParseError("range start greater than end in starred-relation string");
                return;
            }
            // range prev-c
            while(++prev <= c) {
                if(!nfd.isInert(prev)) {
                    setParseError("starred-relation string range is not all NFD-inert");
                    return;
                }
                if(isSurrogate(prev)) {
                    setParseError("starred-relation string range contains a surrogate");
                    return;
                }
                if(0xfffd <= prev && prev <= 0xffff) {
                    setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF");
                    return;
                }
                try {
                    sink.addRelation(strength, empty, UTF16.valueOf(prev), empty);
                } catch(Exception e) {
                    setParseError("adding relation failed", e);
                    return;
                }
            }
            prev = -1;
            // The range end c has been added already; continue after it.
            j = Character.charCount(c);
        }
        ruleIndex = skipWhiteSpace(i);
    }

    /**
     * Skips white space, parses a non-empty string into raw, and skips trailing white space.
     *
     * @return the rule index after the string and following white space
     * @throws ParseException if the string is empty or malformed
     */
    private int parseTailoringString(int i, StringBuilder raw) throws ParseException {
        i = parseString(skipWhiteSpace(i), raw);
        if(raw.length() == 0) {
            setParseError("missing relation string");
        }
        return skipWhiteSpace(i);
    }

    /**
     * Parses a string starting at index i into raw (which is cleared first),
     * handling apostrophe quoting and backslash escapes.
     * The string ends at unquoted white space or at any other unquoted syntax character.
     *
     * @return the rule index after the string
     * @throws ParseException for an unterminated quote, a trailing backslash,
     *         an unpaired surrogate, or U+FFFD..U+FFFF in the string
     */
    private int parseString(int i, StringBuilder raw) throws ParseException {
        raw.setLength(0);
        while(i < rules.length()) {
            char c = rules.charAt(i++);
            if(isSyntaxChar(c)) {
                if(c == 0x27) {  // apostrophe
                    if(i < rules.length() && rules.charAt(i) == 0x27) {
                        // Double apostrophe, encodes a single one.
                        raw.append((char)0x27);
                        ++i;
                        continue;
                    }
                    // Quote literal text until the next single apostrophe.
                    for(;;) {
                        if(i == rules.length()) {
                            setParseError("quoted literal text missing terminating apostrophe");
                            return i;
                        }
                        c = rules.charAt(i++);
                        if(c == 0x27) {
                            if(i < rules.length() && rules.charAt(i) == 0x27) {
                                // Double apostrophe inside quoted literal text,
                                // still encodes a single apostrophe.
                                ++i;
                            } else {
                                break;
                            }
                        }
                        raw.append(c);
                    }
                } else if(c == 0x5c) {  // backslash
                    if(i == rules.length()) {
                        setParseError("backslash escape at the end of the rule string");
                        return i;
                    }
                    int cp = rules.codePointAt(i);
                    raw.appendCodePoint(cp);
                    i += Character.charCount(cp);
                } else {
                    // Any other syntax character terminates a string.
                    --i;
                    break;
                }
            } else if(PatternProps.isWhiteSpace(c)) {
                // Unquoted white space terminates a string.
                --i;
                break;
            } else {
                raw.append(c);
            }
        }
        for(int j = 0; j < raw.length();) {
            int c = raw.codePointAt(j);
            if(isSurrogate(c)) {
                setParseError("string contains an unpaired surrogate");
                return i;
            }
            if(0xfffd <= c && c <= 0xffff) {
                setParseError("string contains U+FFFD, U+FFFE or U+FFFF");
                return i;
            }
            j += Character.charCount(c);
        }
        return i;
    }

    // TODO: Widen UTF16.isSurrogate(char16) to take an int.
    /** Returns true if c is in the surrogate code point range U+D800..U+DFFF. */
    private static final boolean isSurrogate(int c) {
        return (c & 0xfffff800) == 0xd800;
    }

    /** Names of the special reset positions, in the same order as the Position constants. */
    private static final String[] positions = {
        "first tertiary ignorable",
        "last tertiary ignorable",
        "first secondary ignorable",
        "last secondary ignorable",
        "first primary ignorable",
        "last primary ignorable",
        "first variable",
        "last variable",
        "first regular",
        "last regular",
        "first implicit",
        "last implicit",
        "first trailing",
        "last trailing"
    };

    /**
     * Sets str to a contraction of U+FFFE and (U+2800 + Position).
     * @return rule index after the special reset position
     * @throws ParseException
     */
    private int parseSpecialPosition(int i, StringBuilder str) throws ParseException {
        int j = readWords(i + 1, rawBuilder);
        if(j > i && rules.charAt(j) == 0x5d && rawBuilder.length() != 0) {  // words end with ]
            ++j;
            // Copy the words before clearing str, which may be the same object as rawBuilder.
            String raw = rawBuilder.toString();
            str.setLength(0);
            for(int pos = 0; pos < positions.length; ++pos) {
                if(raw.equals(positions[pos])) {
                    str.append(POS_LEAD).append((char)(POS_BASE + pos));
                    return j;
                }
            }
            if(raw.equals("top")) {
                str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_REGULAR.ordinal()));
                return j;
            }
            if(raw.equals("variable top")) {
                str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_VARIABLE.ordinal()));
                return j;
            }
        }
        setParseError("not a valid special reset position");
        return i;
    }

    /**
     * Parses the [setting] or [option] at ruleIndex (which is at the '[')
     * and applies it to the settings or passes it to the Sink.
     * On success, ruleIndex is moved past the closing ']'.
     *
     * @throws ParseException if the text is not a valid, supported setting/option
     */
    private void parseSetting() throws ParseException {
        int i = ruleIndex + 1;
        int j = readWords(i, rawBuilder);
        if(j <= i || rawBuilder.length() == 0) {
            setParseError("expected a setting/option at '['");
        }
        // startsWith() etc. are available for String but not CharSequence/StringBuilder.
        String raw = rawBuilder.toString();
        if(rules.charAt(j) == 0x5d) {  // words end with ]
            ++j;
            if(raw.startsWith("reorder") &&
                    (raw.length() == 7 || raw.charAt(7) == 0x20)) {
                parseReordering(raw);
                ruleIndex = j;
                return;
            }
            if(raw.equals("backwards 2")) {
                settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
                ruleIndex = j;
                return;
            }
            // Split "name value" at the last space.
            String v;
            int valueIndex = raw.lastIndexOf(0x20);
            if(valueIndex >= 0) {
                v = raw.substring(valueIndex + 1);
                raw = raw.substring(0, valueIndex);
            } else {
                v = "";
            }
            if(raw.equals("strength") && v.length() == 1) {
                int value = UCOL_DEFAULT;
                char c = v.charAt(0);
                if(0x31 <= c && c <= 0x34) {  // 1..4
                    value = Collator.PRIMARY + (c - 0x31);
                } else if(c == 0x49) {  // 'I'
                    value = Collator.IDENTICAL;
                }
                if(value != UCOL_DEFAULT) {
                    settings.setStrength(value);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("alternate")) {
                int value = UCOL_DEFAULT;
                if(v.equals("non-ignorable")) {
                    value = 0;  // UCOL_NON_IGNORABLE
                } else if(v.equals("shifted")) {
                    value = 1;  // UCOL_SHIFTED
                }
                if(value != UCOL_DEFAULT) {
                    settings.setAlternateHandlingShifted(value > 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("maxVariable")) {
                int value = UCOL_DEFAULT;
                if(v.equals("space")) {
                    value = CollationSettings.MAX_VAR_SPACE;
                } else if(v.equals("punct")) {
                    value = CollationSettings.MAX_VAR_PUNCT;
                } else if(v.equals("symbol")) {
                    value = CollationSettings.MAX_VAR_SYMBOL;
                } else if(v.equals("currency")) {
                    value = CollationSettings.MAX_VAR_CURRENCY;
                }
                if(value != UCOL_DEFAULT) {
                    settings.setMaxVariable(value, 0);
                    settings.variableTop = baseData.getLastPrimaryForGroup(
                            Collator.ReorderCodes.FIRST + value);
                    assert(settings.variableTop != 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("caseFirst")) {
                int value = UCOL_DEFAULT;
                if(v.equals("off")) {
                    value = UCOL_OFF;
                } else if(v.equals("lower")) {
                    value = CollationSettings.CASE_FIRST;  // UCOL_LOWER_FIRST
                } else if(v.equals("upper")) {
                    value = CollationSettings.CASE_FIRST_AND_UPPER_MASK;  // UCOL_UPPER_FIRST
                }
                if(value != UCOL_DEFAULT) {
                    settings.setCaseFirst(value);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("caseLevel")) {
                int value = getOnOffValue(v);
                if(value != UCOL_DEFAULT) {
                    settings.setFlag(CollationSettings.CASE_LEVEL, value > 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("normalization")) {
                int value = getOnOffValue(v);
                if(value != UCOL_DEFAULT) {
                    settings.setFlag(CollationSettings.CHECK_FCD, value > 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("numericOrdering")) {
                int value = getOnOffValue(v);
                if(value != UCOL_DEFAULT) {
                    settings.setFlag(CollationSettings.NUMERIC, value > 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("hiraganaQ")) {
                int value = getOnOffValue(v);
                if(value != UCOL_DEFAULT) {
                    if(value == UCOL_ON) {
                        setParseError("[hiraganaQ on] is not supported");
                    }
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("import")) {
                // BCP 47 language tag -> ICU locale ID
                ULocale localeID;
                try {
                    localeID = new ULocale.Builder().setLanguageTag(v).build();
                } catch(Exception e) {
                    setParseError("expected language tag in [import langTag]", e);
                    return;
                }
                // localeID minus all keywords
                String baseID = localeID.getBaseName();
                // @collation=type, or length=0 if not specified
                String collationType = localeID.getKeywordValue("collation");
                if(importer == null) {
                    setParseError("[import langTag] is not supported");
                } else {
                    String importedRules;
                    try {
                        importedRules =
                            importer.getRules(baseID,
                                    collationType != null ? collationType : "standard");
                    } catch(Exception e) {
                        setParseError("[import langTag] failed", e);
                        return;
                    }
                    String outerRules = rules;
                    int outerRuleIndex = ruleIndex;
                    try {
                        parse(importedRules);
                    } catch(Exception e) {
                        // Restore both the outer rule string and the outer index
                        // for error reporting. The nested parse() replaced rules;
                        // the error context must be taken from the string
                        // that ruleIndex refers to, or it would be wrong
                        // or even out of bounds.
                        rules = outerRules;
                        ruleIndex = outerRuleIndex;
                        setParseError("parsing imported rules failed", e);
                    }
                    rules = outerRules;
                    ruleIndex = j;
                }
                return;
            }
        } else if(rules.charAt(j) == 0x5b) {  // words end with [
            UnicodeSet set = new UnicodeSet();
            j = parseUnicodeSet(j, set);
            if(raw.equals("optimize")) {
                try {
                    sink.optimize(set);
                } catch(Exception e) {
                    setParseError("[optimize set] failed", e);
                }
                ruleIndex = j;
                return;
            } else if(raw.equals("suppressContractions")) {
                try {
                    sink.suppressContractions(set);
                } catch(Exception e) {
                    setParseError("[suppressContractions set] failed", e);
                }
                ruleIndex = j;
                return;
            }
        }
        setParseError("not a valid setting/option");
    }

    /**
     * Parses the words of a [reorder ...] option and applies them to the settings.
     * An option without codes resets the reordering.
     *
     * @param raw the option words starting with "reorder", separated by single spaces
     * @throws ParseException if a word is not a known script or reorder code
     */
    private void parseReordering(CharSequence raw) throws ParseException {
        int i = 7;  // after "reorder"
        if(i == raw.length()) {
            // empty [reorder] with no codes
            settings.resetReordering();
            return;
        }
        // Parse the codes in [reorder aa bb cc].
        ArrayList<Integer> reorderCodes = new ArrayList<Integer>();
        while(i < raw.length()) {
            ++i;  // skip the word-separating space
            int limit = i;
            while(limit < raw.length() && raw.charAt(limit) != ' ') { ++limit; }
            String word = raw.subSequence(i, limit).toString();
            int code = getReorderCode(word);
            if(code < 0) {
                setParseError("unknown script or reorder code");
                return;
            }
            reorderCodes.add(code);
            i = limit;
        }
        if(reorderCodes.isEmpty()) {
            settings.resetReordering();
        } else {
            int[] codes = new int[reorderCodes.size()];
            int j = 0;
            for(Integer code : reorderCodes) { codes[j++] = code; }
            settings.setReordering(baseData, codes);
        }
    }

    /** Names of the special reorder groups, in the order of their codes from ReorderCodes.FIRST. */
    private static final String[] gSpecialReorderCodes = {
        "space", "punct", "symbol", "currency", "digit"
    };

    /**
     * Gets a script or reorder code from its string representation.
     * @return the script/reorder code, or
     * -1 if not recognized
     */
    public static int getReorderCode(String word) {
        for(int i = 0; i < gSpecialReorderCodes.length; ++i) {
            if(word.equalsIgnoreCase(gSpecialReorderCodes[i])) {
                return Collator.ReorderCodes.FIRST + i;
            }
        }
        try {
            int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, word);
            if(script >= 0) {
                return script;
            }
        } catch (IllegalIcuArgumentException e) {
            // fall through
        }
        if(word.equalsIgnoreCase("others")) {
            return Collator.ReorderCodes.OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
        }
        return -1;
    }

    /** Returns UCOL_ON for "on", UCOL_OFF for "off", and UCOL_DEFAULT for anything else. */
    private static int getOnOffValue(String s) {
        if(s.equals("on")) {
            return UCOL_ON;
        } else if(s.equals("off")) {
            return UCOL_OFF;
        } else {
            return UCOL_DEFAULT;
        }
    }

    /**
     * Parses a UnicodeSet pattern that starts at index i (at its '['),
     * followed by the ']' that terminates the enclosing option.
     *
     * @param i rule index of the first '[' of the set pattern
     * @param set receives the parsed pattern
     * @return the rule index after the option-terminating ']'
     * @throws ParseException if the brackets are unbalanced, the pattern is invalid,
     *         or the option-terminating ']' is missing
     */
    private int parseUnicodeSet(int i, UnicodeSet set) throws ParseException {
        // Collect a UnicodeSet pattern between a balanced pair of [brackets].
        int level = 0;
        int j = i;
        for(;;) {
            if(j == rules.length()) {
                setParseError("unbalanced UnicodeSet pattern brackets");
                return j;
            }
            char c = rules.charAt(j++);
            if(c == 0x5b) {  // '['
                ++level;
            } else if(c == 0x5d) {  // ']'
                if(--level == 0) { break; }
            }
        }
        try {
            set.applyPattern(rules.substring(i, j));
        } catch(Exception e) {
            // Same message text as before (reason + ": " + e.getMessage()),
            // but now the original exception is chained as the cause.
            setParseError("not a valid UnicodeSet pattern", e);
        }
        j = skipWhiteSpace(j);
        if(j == rules.length() || rules.charAt(j) != 0x5d) {
            setParseError("missing option-terminating ']' after UnicodeSet pattern");
            return j;
        }
        return ++j;
    }

    /**
     * Reads words starting at index i into raw (which is cleared first),
     * up to the next syntax character other than '-' and '_'.
     * Leading white space is skipped, each later run of white space
     * becomes a single space, and one trailing space is removed.
     *
     * @return the rule index of the terminating syntax character,
     *         or 0 if the end of the rules is reached first
     */
    private int readWords(int i, StringBuilder raw) {
        raw.setLength(0);
        i = skipWhiteSpace(i);
        for(;;) {
            if(i >= rules.length()) { return 0; }
            char c = rules.charAt(i);
            if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
                if(raw.length() == 0) { return i; }
                int lastIndex = raw.length() - 1;
                if(raw.charAt(lastIndex) == ' ') {  // remove trailing space
                    raw.setLength(lastIndex);
                }
                return i;
            }
            if(PatternProps.isWhiteSpace(c)) {
                raw.append(' ');
                i = skipWhiteSpace(i + 1);
            } else {
                raw.append(c);
                ++i;
            }
        }
    }

    /**
     * Skips a comment whose text starts at index i.
     *
     * @return the rule index after the line terminator that ends the comment,
     *         or rules.length() if there is none
     */
    private int skipComment(int i) {
        // skip to past the newline
        while(i < rules.length()) {
            char c = rules.charAt(i++);
            // LF or FF or CR or NEL or LS or PS
            if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
                // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
                // NLF (new line function) = CR or LF or CR+LF or NEL.
                // No need to collect all of CR+LF because a following LF will be ignored anyway.
                break;
            }
        }
        return i;
    }

    /**
     * Always throws a ParseException with the reason and the rule context at ruleIndex.
     */
    private void setParseError(String reason) throws ParseException {
        throw makeParseException(reason);
    }

    /**
     * Always throws a ParseException with the reason, the message of e,
     * and the rule context at ruleIndex; e is set as the cause.
     */
    private void setParseError(String reason, Exception e) throws ParseException {
        ParseException newExc = makeParseException(reason + ": " + e.getMessage());
        newExc.initCause(e);
        throw newExc;
    }

    /** Creates a ParseException for the current ruleIndex, with rule context in the message. */
    private ParseException makeParseException(String reason) {
        return new ParseException(appendErrorContext(reason), ruleIndex);
    }

    private static final int U_PARSE_CONTEXT_LEN = 16;

    // C++ setErrorContext()
    /**
     * Returns the reason followed by the ruleIndex and by up to
     * U_PARSE_CONTEXT_LEN-1 chars of rule text on each side of ruleIndex,
     * with a '!' marking the position of ruleIndex.
     */
    private String appendErrorContext(String reason) {
        // Note: This relies on the calling code maintaining the ruleIndex
        // at a position that is useful for debugging.
        // For example, at the beginning of a reset or relation etc.
        StringBuilder msg = new StringBuilder(reason);
        msg.append(" at index ").append(ruleIndex);
        // We are not counting line numbers.

        msg.append(" near \"");
        // before ruleIndex
        int start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
        if(start < 0) {
            start = 0;
        } else if(start > 0 && Character.isLowSurrogate(rules.charAt(start))) {
            // Do not start in the middle of a surrogate pair.
            ++start;
        }
        msg.append(rules, start, ruleIndex);

        msg.append('!');
        // starting from ruleIndex
        int length = rules.length() - ruleIndex;
        if(length >= U_PARSE_CONTEXT_LEN) {
            length = U_PARSE_CONTEXT_LEN - 1;
            if(Character.isHighSurrogate(rules.charAt(ruleIndex + length - 1))) {
                // Do not end in the middle of a surrogate pair.
                --length;
            }
        }
        msg.append(rules, ruleIndex, ruleIndex + length);
        return msg.append('\"').toString();
    }

    /**
     * ASCII [:P:] and [:S:]:
     * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
     */
    private static boolean isSyntaxChar(int c) {
        return 0x21 <= c && c <= 0x7e &&
                (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
                (0x5b <= c && c <= 0x60) || (0x7b <= c));
    }

    /** Returns the index of the first non-white-space char at or after i, or rules.length(). */
    private int skipWhiteSpace(int i) {
        while(i < rules.length() && PatternProps.isWhiteSpace(rules.charAt(i))) {
            ++i;
        }
        return i;
    }

    private final Normalizer2 nfd = Normalizer2.getNFDInstance();
    private final Normalizer2 nfc = Normalizer2.getNFCInstance();

    private String rules;
    private final CollationData baseData;
    private CollationSettings settings;

    private Sink sink;
    private Importer importer;

    private int ruleIndex;
}