// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* CollationRuleParser.java, ported from collationruleparser.h/.cpp
*
* C++ version created on: 2013apr10
* created by: Markus W. Scherer
*/

package com.ibm.icu.impl.coll;

import java.text.ParseException;
import java.util.ArrayList;

import com.ibm.icu.impl.IllegalIcuArgumentException;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

/**
 * Parses a collation rule string.
 * Resets and relations are reported to a {@link Sink};
 * settings/options are written into a {@link CollationSettings} object.
 * All syntax errors are reported by throwing a {@link ParseException}.
 */
public final class CollationRuleParser {
    /** Special reset positions. */
    enum Position {
        FIRST_TERTIARY_IGNORABLE,
        LAST_TERTIARY_IGNORABLE,
        FIRST_SECONDARY_IGNORABLE,
        LAST_SECONDARY_IGNORABLE,
        FIRST_PRIMARY_IGNORABLE,
        LAST_PRIMARY_IGNORABLE,
        FIRST_VARIABLE,
        LAST_VARIABLE,
        FIRST_REGULAR,
        LAST_REGULAR,
        FIRST_IMPLICIT,
        LAST_IMPLICIT,
        FIRST_TRAILING,
        LAST_TRAILING
    }
    static final Position[] POSITION_VALUES = Position.values();

    /**
     * First character of contractions that encode special reset positions.
     * U+FFFE cannot be tailored via rule syntax.
     *
     * The second contraction character is POS_BASE + Position.
     */
    static final char POS_LEAD = 0xfffe;
    /**
     * Base for the second character of contractions that encode special reset positions.
     * Braille characters U+28xx are printable and normalization-inert.
     * @see #POS_LEAD
     */
    static final char POS_BASE = 0x2800;

    /** Receives the resets, relations and set-valued options found by the parser. */
    static abstract class Sink {
        /**
         * Adds a reset.
         * strength=UCOL_IDENTICAL for &amp;str.
         * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &amp;[before n]str where n=1/2/3.
         */
        abstract void addReset(int strength, CharSequence str);
        /**
         * Adds a relation with strength and prefix | str / extension.
         */
        abstract void addRelation(int strength, CharSequence prefix,
                CharSequence str, CharSequence extension);

        /** Called for [suppressContractions set]. The default implementation does nothing. */
        void suppressContractions(UnicodeSet set) {}

        /** Called for [optimize set]. The default implementation does nothing. */
        void optimize(UnicodeSet set) {}
    }

    /** Supplies the rule string for an [import langTag] option. */
    interface Importer {
        String getRules(String localeID, String collationType);
    }

    /**
     * Constructor.
     * The Sink must be set before parsing.
     * The Importer can be set, otherwise [import locale] syntax is not supported.
     */
    CollationRuleParser(CollationData base) {
        baseData = base;
    }

    /**
     * Sets the pointer to a Sink object.
     * The pointer is aliased: Pointer copy without cloning or taking ownership.
     */
    void setSink(Sink sinkAlias) {
        sink = sinkAlias;
    }

    /**
     * Sets the pointer to an Importer object.
     * The pointer is aliased: Pointer copy without cloning or taking ownership.
     */
    void setImporter(Importer importerAlias) {
        importer = importerAlias;
    }

    /**
     * Parses the rule string.
     * Settings found in the rules are written into outSettings;
     * resets and relations are passed to the Sink.
     *
     * @param ruleString the collation rules
     * @param outSettings the settings object that is modified by [option] syntax
     * @throws ParseException if the rule string is not well-formed or the Sink rejects an item
     */
    void parse(String ruleString, CollationSettings outSettings) throws ParseException {
        settings = outSettings;
        parse(ruleString);
    }

    private static final int UCOL_DEFAULT = -1;
    private static final int UCOL_OFF = 0;
    private static final int UCOL_ON = 1;

    /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
    private static final int STRENGTH_MASK = 0xf;
    private static final int STARRED_FLAG = 0x10;
    private static final int OFFSET_SHIFT = 8;

    private static final String BEFORE = "[before";

    // In C++, we parse into temporary UnicodeString objects named "raw" or "str".
    // In Java, we reuse this StringBuilder.
    private final StringBuilder rawBuilder = new StringBuilder();

    /**
     * Top-level parse loop: makes ruleString the current rule text, starts at index 0,
     * and dispatches on the first non-white-space character of each item.
     * Also used recursively for imported rules; the caller saves and restores
     * {@code rules} and {@code ruleIndex} in that case.
     */
    private void parse(String ruleString) throws ParseException {
        rules = ruleString;
        ruleIndex = 0;

        while(ruleIndex < rules.length()) {
            char c = rules.charAt(ruleIndex);
            if(PatternProps.isWhiteSpace(c)) {
                ++ruleIndex;
                continue;
            }
            switch(c) {
            case 0x26:  // '&'
                parseRuleChain();
                break;
            case 0x5b:  // '['
                parseSetting();
                break;
            case 0x23:  // '#' starts a comment, until the end of the line
                ruleIndex = skipComment(ruleIndex + 1);
                break;
            case 0x40:  // '@' is equivalent to [backwards 2]
                settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
                ++ruleIndex;
                break;
            case 0x21:  // '!' used to turn on Thai/Lao character reversal
                // Accept but ignore. The root collator has contractions
                // that are equivalent to the character reversal, where appropriate.
                ++ruleIndex;
                break;
            default:
                setParseError("expected a reset or setting or comment");
                break;
            }
        }
    }

    /**
     * Parses one rule chain: a reset followed by one or more relations.
     * Comments between relations are skipped.
     */
    private void parseRuleChain() throws ParseException {
        int resetStrength = parseResetAndPosition();
        boolean isFirstRelation = true;
        for(;;) {
            int result = parseRelationOperator();
            if(result < 0) {
                if(ruleIndex < rules.length() && rules.charAt(ruleIndex) == 0x23) {
                    // '#' starts a comment, until the end of the line
                    ruleIndex = skipComment(ruleIndex + 1);
                    continue;
                }
                if(isFirstRelation) {
                    setParseError("reset not followed by a relation");
                }
                return;
            }
            int strength = result & STRENGTH_MASK;
            if(resetStrength < Collator.IDENTICAL) {
                // reset-before rule chain
                if(isFirstRelation) {
                    if(strength != resetStrength) {
                        setParseError("reset-before strength differs from its first relation");
                        return;
                    }
                } else {
                    if(strength < resetStrength) {
                        setParseError("reset-before strength followed by a stronger relation");
                        return;
                    }
                }
            }
            int i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
            if((result & STARRED_FLAG) == 0) {
                parseRelationStrings(strength, i);
            } else {
                parseStarredCharacters(strength, i);
            }
            isFirstRelation = false;
        }
    }

    /**
     * Parses a reset at ruleIndex (which points at the '&amp;'), with an optional
     * [before n] and either a special [position] or a tailoring string,
     * and passes it to the Sink.
     *
     * @return the reset strength: Collator.IDENTICAL for a plain reset,
     *         or PRIMARY/SECONDARY/TERTIARY for [before 1/2/3]
     */
    private int parseResetAndPosition() throws ParseException {
        int i = skipWhiteSpace(ruleIndex + 1);
        int j;
        char c;
        int resetStrength;
        if(rules.regionMatches(i, BEFORE, 0, BEFORE.length()) &&
                (j = i + BEFORE.length()) < rules.length() &&
                PatternProps.isWhiteSpace(rules.charAt(j)) &&
                ((j = skipWhiteSpace(j + 1)) + 1) < rules.length() &&
                0x31 <= (c = rules.charAt(j)) && c <= 0x33 &&
                rules.charAt(j + 1) == 0x5d) {
            // &[before n] with n=1 or 2 or 3
            resetStrength = Collator.PRIMARY + (c - 0x31);
            i = skipWhiteSpace(j + 2);
        } else {
            resetStrength = Collator.IDENTICAL;
        }
        if(i >= rules.length()) {
            setParseError("reset without position");
            return UCOL_DEFAULT;
        }
        if(rules.charAt(i) == 0x5b) {  // '['
            i = parseSpecialPosition(i, rawBuilder);
        } else {
            i = parseTailoringString(i, rawBuilder);
        }
        try {
            sink.addReset(resetStrength, rawBuilder);
        } catch(Exception e) {
            setParseError("adding reset failed", e);
            return UCOL_DEFAULT;
        }
        ruleIndex = i;
        return resetStrength;
    }

    /**
     * Skips white space and recognizes a relation operator at ruleIndex.
     * Does not advance ruleIndex past the operator.
     *
     * @return UCOL_DEFAULT (negative) if there is no relation operator; otherwise
     *         the strength in the low bits (STRENGTH_MASK), optionally STARRED_FLAG,
     *         and the operator length shifted left by OFFSET_SHIFT
     */
    private int parseRelationOperator() {
        ruleIndex = skipWhiteSpace(ruleIndex);
        if(ruleIndex >= rules.length()) { return UCOL_DEFAULT; }
        int strength;
        int i = ruleIndex;
        char c = rules.charAt(i++);
        switch(c) {
        case 0x3c:  // '<'
            if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<
                ++i;
                if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<
                    ++i;
                    if(i < rules.length() && rules.charAt(i) == 0x3c) {  // <<<<
                        ++i;
                        strength = Collator.QUATERNARY;
                    } else {
                        strength = Collator.TERTIARY;
                    }
                } else {
                    strength = Collator.SECONDARY;
                }
            } else {
                strength = Collator.PRIMARY;
            }
            if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
                ++i;
                strength |= STARRED_FLAG;
            }
            break;
        case 0x3b:  // ';' same as <<
            strength = Collator.SECONDARY;
            break;
        case 0x2c:  // ',' same as <<<
            strength = Collator.TERTIARY;
            break;
        case 0x3d:  // '='
            strength = Collator.IDENTICAL;
            if(i < rules.length() && rules.charAt(i) == 0x2a) {  // '*'
                ++i;
                strength |= STARRED_FLAG;
            }
            break;
        default:
            return UCOL_DEFAULT;
        }
        return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
    }

    /**
     * Parses the strings of a non-starred relation, starting at index i,
     * passes them to the Sink, and advances ruleIndex past them.
     */
    private void parseRelationStrings(int strength, int i) throws ParseException {
        // Parse
        //     prefix | str / extension
        // where prefix and extension are optional.
        String prefix = "";
        CharSequence extension = "";
        i = parseTailoringString(i, rawBuilder);
        char next = (i < rules.length()) ? rules.charAt(i) : 0;
        if(next == 0x7c) {  // '|' separates the context prefix from the string.
            prefix = rawBuilder.toString();
            i = parseTailoringString(i + 1, rawBuilder);
            next = (i < rules.length()) ? rules.charAt(i) : 0;
        }
        // str = rawBuilder (do not modify rawBuilder any more in this function)
        if(next == 0x2f) {  // '/' separates the string from the extension.
            StringBuilder extBuilder = new StringBuilder();
            i = parseTailoringString(i + 1, extBuilder);
            extension = extBuilder;
        }
        if(prefix.length() != 0) {
            int prefix0 = prefix.codePointAt(0);
            int c = rawBuilder.codePointAt(0);
            if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
                setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary");
                return;
            }
        }
        try {
            sink.addRelation(strength, prefix, rawBuilder, extension);
        } catch(Exception e) {
            setParseError("adding relation failed", e);
            return;
        }
        ruleIndex = i;
    }

    /**
     * Parses the characters and ranges of a starred relation, starting at index i.
     * Each code point (listed or inside a start-end range) is passed to the Sink
     * as its own relation with the given strength, without prefix or extension.
     * Advances ruleIndex past the string and trailing white space.
     */
    private void parseStarredCharacters(int strength, int i) throws ParseException {
        String empty = "";
        i = parseString(skipWhiteSpace(i), rawBuilder);
        if(rawBuilder.length() == 0) {
            setParseError("missing starred-relation string");
            return;
        }
        int prev = -1;
        int j = 0;
        for(;;) {
            while(j < rawBuilder.length()) {
                int c = rawBuilder.codePointAt(j);
                if(!nfd.isInert(c)) {
                    setParseError("starred-relation string is not all NFD-inert");
                    return;
                }
                try {
                    sink.addRelation(strength, empty, UTF16.valueOf(c), empty);
                } catch(Exception e) {
                    setParseError("adding relation failed", e);
                    return;
                }
                j += Character.charCount(c);
                prev = c;
            }
            if(i >= rules.length() || rules.charAt(i) != 0x2d) {  // '-'
                break;
            }
            if(prev < 0) {
                setParseError("range without start in starred-relation string");
                return;
            }
            i = parseString(i + 1, rawBuilder);
            if(rawBuilder.length() == 0) {
                setParseError("range without end in starred-relation string");
                return;
            }
            int c = rawBuilder.codePointAt(0);
            if(c < prev) {
                setParseError("range start greater than end in starred-relation string");
                return;
            }
            // range prev-c
            while(++prev <= c) {
                if(!nfd.isInert(prev)) {
                    setParseError("starred-relation string range is not all NFD-inert");
                    return;
                }
                if(isSurrogate(prev)) {
                    setParseError("starred-relation string range contains a surrogate");
                    return;
                }
                if(0xfffd <= prev && prev <= 0xffff) {
                    setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF");
                    return;
                }
                try {
                    sink.addRelation(strength, empty, UTF16.valueOf(prev), empty);
                } catch(Exception e) {
                    setParseError("adding relation failed", e);
                    return;
                }
            }
            // The range end has been added already; continue after it,
            // and require a new start character before another '-'.
            prev = -1;
            j = Character.charCount(c);
        }
        ruleIndex = skipWhiteSpace(i);
    }

    /**
     * Skips white space, parses a non-empty string into raw, and skips trailing white space.
     *
     * @return the rule index after the string and following white space
     * @throws ParseException if the string is empty or malformed
     */
    private int parseTailoringString(int i, StringBuilder raw) throws ParseException {
        i = parseString(skipWhiteSpace(i), raw);
        if(raw.length() == 0) {
            setParseError("missing relation string");
        }
        return skipWhiteSpace(i);
    }

    /**
     * Parses a string starting at index i into raw (which is cleared first),
     * handling apostrophe quoting and backslash escapes.
     * Stops before unquoted white space or any other syntax character.
     *
     * @return the rule index after the string
     * @throws ParseException for an unterminated quote, a trailing backslash,
     *         an unpaired surrogate, or U+FFFD..U+FFFF in the string
     */
    private int parseString(int i, StringBuilder raw) throws ParseException {
        raw.setLength(0);
        while(i < rules.length()) {
            char c = rules.charAt(i++);
            if(isSyntaxChar(c)) {
                if(c == 0x27) {  // apostrophe
                    if(i < rules.length() && rules.charAt(i) == 0x27) {
                        // Double apostrophe, encodes a single one.
                        raw.append((char)0x27);
                        ++i;
                        continue;
                    }
                    // Quote literal text until the next single apostrophe.
                    for(;;) {
                        if(i == rules.length()) {
                            setParseError("quoted literal text missing terminating apostrophe");
                            return i;
                        }
                        c = rules.charAt(i++);
                        if(c == 0x27) {
                            if(i < rules.length() && rules.charAt(i) == 0x27) {
                                // Double apostrophe inside quoted literal text,
                                // still encodes a single apostrophe.
                                ++i;
                            } else {
                                break;
                            }
                        }
                        raw.append(c);
                    }
                } else if(c == 0x5c) {  // backslash
                    if(i == rules.length()) {
                        setParseError("backslash escape at the end of the rule string");
                        return i;
                    }
                    int cp = rules.codePointAt(i);
                    raw.appendCodePoint(cp);
                    i += Character.charCount(cp);
                } else {
                    // Any other syntax character terminates a string.
                    --i;
                    break;
                }
            } else if(PatternProps.isWhiteSpace(c)) {
                // Unquoted white space terminates a string.
                --i;
                break;
            } else {
                raw.append(c);
            }
        }
        for(int j = 0; j < raw.length();) {
            int c = raw.codePointAt(j);
            if(isSurrogate(c)) {
                setParseError("string contains an unpaired surrogate");
                return i;
            }
            if(0xfffd <= c && c <= 0xffff) {
                setParseError("string contains U+FFFD, U+FFFE or U+FFFF");
                return i;
            }
            j += Character.charCount(c);
        }
        return i;
    }

    // TODO: Widen UTF16.isSurrogate(char16) to take an int.
    /** Returns true if the code point c is in the surrogate range U+D800..U+DFFF. */
    private static final boolean isSurrogate(int c) {
        return (c & 0xfffff800) == 0xd800;
    }

    /** Names of the special reset positions, in the same order as the Position constants. */
    private static final String[] positions = {
        "first tertiary ignorable",
        "last tertiary ignorable",
        "first secondary ignorable",
        "last secondary ignorable",
        "first primary ignorable",
        "last primary ignorable",
        "first variable",
        "last variable",
        "first regular",
        "last regular",
        "first implicit",
        "last implicit",
        "first trailing",
        "last trailing"
    };

    /**
     * Sets str to a contraction of U+FFFE and (U+2800 + Position).
     * @return rule index after the special reset position
     * @throws ParseException
     */
    private int parseSpecialPosition(int i, StringBuilder str) throws ParseException {
        int j = readWords(i + 1, rawBuilder);
        if(j > i && rules.charAt(j) == 0x5d && rawBuilder.length() != 0) {  // words end with ]
            ++j;
            // Copy the words before clearing str, which may be the same object as rawBuilder.
            String raw = rawBuilder.toString();
            str.setLength(0);
            for(int pos = 0; pos < positions.length; ++pos) {
                if(raw.equals(positions[pos])) {
                    str.append(POS_LEAD).append((char)(POS_BASE + pos));
                    return j;
                }
            }
            if(raw.equals("top")) {
                str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_REGULAR.ordinal()));
                return j;
            }
            if(raw.equals("variable top")) {
                str.append(POS_LEAD).append((char)(POS_BASE + Position.LAST_VARIABLE.ordinal()));
                return j;
            }
        }
        setParseError("not a valid special reset position");
        return i;
    }

    /**
     * Parses a [setting] or [option] at ruleIndex (which points at the '['),
     * applies it to the settings or the Sink, and advances ruleIndex past it.
     *
     * @throws ParseException if the option is unknown, has an invalid value,
     *         or (for [import]) the imported rules cannot be fetched or parsed
     */
    private void parseSetting() throws ParseException {
        int i = ruleIndex + 1;
        int j = readWords(i, rawBuilder);
        if(j <= i || rawBuilder.length() == 0) {
            setParseError("expected a setting/option at '['");
        }
        // startsWith() etc. are available for String but not CharSequence/StringBuilder.
        String raw = rawBuilder.toString();
        if(rules.charAt(j) == 0x5d) {  // words end with ]
            ++j;
            if(raw.startsWith("reorder") &&
                    (raw.length() == 7 || raw.charAt(7) == 0x20)) {
                parseReordering(raw);
                ruleIndex = j;
                return;
            }
            if(raw.equals("backwards 2")) {
                settings.setFlag(CollationSettings.BACKWARD_SECONDARY, true);
                ruleIndex = j;
                return;
            }
            // Split "name value" at the last space.
            String v;
            int valueIndex = raw.lastIndexOf(0x20);
            if(valueIndex >= 0) {
                v = raw.substring(valueIndex + 1);
                raw = raw.substring(0, valueIndex);
            } else {
                v = "";
            }
            if(raw.equals("strength") && v.length() == 1) {
                int value = UCOL_DEFAULT;
                char c = v.charAt(0);
                if(0x31 <= c && c <= 0x34) {  // 1..4
                    value = Collator.PRIMARY + (c - 0x31);
                } else if(c == 0x49) {  // 'I'
                    value = Collator.IDENTICAL;
                }
                if(value != UCOL_DEFAULT) {
                    settings.setStrength(value);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("alternate")) {
                int value = UCOL_DEFAULT;
                if(v.equals("non-ignorable")) {
                    value = 0;  // UCOL_NON_IGNORABLE
                } else if(v.equals("shifted")) {
                    value = 1;  // UCOL_SHIFTED
                }
                if(value != UCOL_DEFAULT) {
                    settings.setAlternateHandlingShifted(value > 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("maxVariable")) {
                int value = UCOL_DEFAULT;
                if(v.equals("space")) {
                    value = CollationSettings.MAX_VAR_SPACE;
                } else if(v.equals("punct")) {
                    value = CollationSettings.MAX_VAR_PUNCT;
                } else if(v.equals("symbol")) {
                    value = CollationSettings.MAX_VAR_SYMBOL;
                } else if(v.equals("currency")) {
                    value = CollationSettings.MAX_VAR_CURRENCY;
                }
                if(value != UCOL_DEFAULT) {
                    settings.setMaxVariable(value, 0);
                    settings.variableTop = baseData.getLastPrimaryForGroup(
                            Collator.ReorderCodes.FIRST + value);
                    assert(settings.variableTop != 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("caseFirst")) {
                int value = UCOL_DEFAULT;
                if(v.equals("off")) {
                    value = UCOL_OFF;
                } else if(v.equals("lower")) {
                    value = CollationSettings.CASE_FIRST;  // UCOL_LOWER_FIRST
                } else if(v.equals("upper")) {
                    value = CollationSettings.CASE_FIRST_AND_UPPER_MASK;  // UCOL_UPPER_FIRST
                }
                if(value != UCOL_DEFAULT) {
                    settings.setCaseFirst(value);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("caseLevel")) {
                int value = getOnOffValue(v);
                if(value != UCOL_DEFAULT) {
                    settings.setFlag(CollationSettings.CASE_LEVEL, value > 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("normalization")) {
                int value = getOnOffValue(v);
                if(value != UCOL_DEFAULT) {
                    settings.setFlag(CollationSettings.CHECK_FCD, value > 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("numericOrdering")) {
                int value = getOnOffValue(v);
                if(value != UCOL_DEFAULT) {
                    settings.setFlag(CollationSettings.NUMERIC, value > 0);
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("hiraganaQ")) {
                int value = getOnOffValue(v);
                if(value != UCOL_DEFAULT) {
                    if(value == UCOL_ON) {
                        setParseError("[hiraganaQ on] is not supported");
                    }
                    ruleIndex = j;
                    return;
                }
            } else if(raw.equals("import")) {
                // BCP 47 language tag -> ICU locale ID
                ULocale localeID;
                try {
                    localeID = new ULocale.Builder().setLanguageTag(v).build();
                } catch(Exception e) {
                    setParseError("expected language tag in [import langTag]", e);
                    return;
                }
                // localeID minus all keywords
                String baseID = localeID.getBaseName();
                // @collation=type, or length=0 if not specified
                String collationType = localeID.getKeywordValue("collation");
                if(importer == null) {
                    setParseError("[import langTag] is not supported");
                } else {
                    String importedRules;
                    try {
                        importedRules =
                            importer.getRules(baseID,
                                    collationType != null ? collationType : "standard");
                    } catch(Exception e) {
                        setParseError("[import langTag] failed", e);
                        return;
                    }
                    String outerRules = rules;
                    int outerRuleIndex = ruleIndex;
                    try {
                        parse(importedRules);
                    } catch(Exception e) {
                        // Restore both the original rule string and the original index
                        // for error reporting. The error context is built from
                        // rules and ruleIndex together; pairing the outer index with
                        // the imported rule string could index past the end of that string.
                        rules = outerRules;
                        ruleIndex = outerRuleIndex;
                        setParseError("parsing imported rules failed", e);
                    }
                    rules = outerRules;
                    ruleIndex = j;
                }
                return;
            }
        } else if(rules.charAt(j) == 0x5b) {  // words end with [
            UnicodeSet set = new UnicodeSet();
            j = parseUnicodeSet(j, set);
            if(raw.equals("optimize")) {
                try {
                    sink.optimize(set);
                } catch(Exception e) {
                    setParseError("[optimize set] failed", e);
                }
                ruleIndex = j;
                return;
            } else if(raw.equals("suppressContractions")) {
                try {
                    sink.suppressContractions(set);
                } catch(Exception e) {
                    setParseError("[suppressContractions set] failed", e);
                }
                ruleIndex = j;
                return;
            }
        }
        setParseError("not a valid setting/option");
    }

    /**
     * Applies a [reorder ...] option to the settings.
     *
     * @param raw the option words starting with "reorder", with single-space separators
     * @throws ParseException if a word is not a known script or reorder code
     */
    private void parseReordering(CharSequence raw) throws ParseException {
        int i = 7;  // after "reorder"
        if(i == raw.length()) {
            // empty [reorder] with no codes
            settings.resetReordering();
            return;
        }
        // Parse the codes in [reorder aa bb cc].
        ArrayList<Integer> reorderCodes = new ArrayList<Integer>();
        while(i < raw.length()) {
            ++i;  // skip the word-separating space
            int limit = i;
            while(limit < raw.length() && raw.charAt(limit) != ' ') { ++limit; }
            String word = raw.subSequence(i, limit).toString();
            int code = getReorderCode(word);
            if(code < 0) {
                setParseError("unknown script or reorder code");
                return;
            }
            reorderCodes.add(code);
            i = limit;
        }
        if(reorderCodes.isEmpty()) {
            settings.resetReordering();
        } else {
            int[] codes = new int[reorderCodes.size()];
            int j = 0;
            for(Integer code : reorderCodes) { codes[j++] = code; }
            settings.setReordering(baseData, codes);
        }
    }

    /** Names of the special reorder groups; the array index is the offset from ReorderCodes.FIRST. */
    private static final String[] gSpecialReorderCodes = {
        "space", "punct", "symbol", "currency", "digit"
    };

    /**
     * Gets a script or reorder code from its string representation.
     * @return the script/reorder code, or
     * -1 if not recognized
     */
    public static int getReorderCode(String word) {
        for(int i = 0; i < gSpecialReorderCodes.length; ++i) {
            if(word.equalsIgnoreCase(gSpecialReorderCodes[i])) {
                return Collator.ReorderCodes.FIRST + i;
            }
        }
        try {
            int script = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, word);
            if(script >= 0) {
                return script;
            }
        } catch (IllegalIcuArgumentException e) {
            // fall through
        }
        if(word.equalsIgnoreCase("others")) {
            return Collator.ReorderCodes.OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
        }
        return -1;
    }

    /** Maps "on"/"off" to UCOL_ON/UCOL_OFF; returns UCOL_DEFAULT for any other string. */
    private static int getOnOffValue(String s) {
        if(s.equals("on")) {
            return UCOL_ON;
        } else if(s.equals("off")) {
            return UCOL_OFF;
        } else {
            return UCOL_DEFAULT;
        }
    }

    /**
     * Parses a UnicodeSet pattern that starts at index i (at its opening '['),
     * applies it to set, and expects the option-terminating ']' after it.
     *
     * @return the rule index after the option-terminating ']'
     * @throws ParseException if brackets are unbalanced, the pattern is invalid,
     *         or the terminating ']' is missing
     */
    private int parseUnicodeSet(int i, UnicodeSet set) throws ParseException {
        // Collect a UnicodeSet pattern between a balanced pair of [brackets].
        int level = 0;
        int j = i;
        for(;;) {
            if(j == rules.length()) {
                setParseError("unbalanced UnicodeSet pattern brackets");
                return j;
            }
            char c = rules.charAt(j++);
            if(c == 0x5b) {  // '['
                ++level;
            } else if(c == 0x5d) {  // ']'
                if(--level == 0) { break; }
            }
        }
        try {
            set.applyPattern(rules.substring(i, j));
        } catch(Exception e) {
            // Same message text as reason + ": " + e.getMessage(), but keeps e as the cause.
            setParseError("not a valid UnicodeSet pattern", e);
        }
        j = skipWhiteSpace(j);
        if(j == rules.length() || rules.charAt(j) != 0x5d) {
            setParseError("missing option-terminating ']' after UnicodeSet pattern");
            return j;
        }
        return ++j;
    }

    /**
     * Reads words starting at index i into raw (which is cleared first),
     * collapsing each run of white space into a single space and
     * dropping leading and trailing white space.
     * Stops at the first syntax character other than '-' and '_'.
     *
     * @return the index of the terminating syntax character,
     *         or 0 if the end of the rules is reached first
     */
    private int readWords(int i, StringBuilder raw) {
        raw.setLength(0);
        i = skipWhiteSpace(i);
        for(;;) {
            if(i >= rules.length()) { return 0; }
            char c = rules.charAt(i);
            if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
                if(raw.length() == 0) { return i; }
                int lastIndex = raw.length() - 1;
                if(raw.charAt(lastIndex) == ' ') {  // remove trailing space
                    raw.setLength(lastIndex);
                }
                return i;
            }
            if(PatternProps.isWhiteSpace(c)) {
                raw.append(' ');
                i = skipWhiteSpace(i + 1);
            } else {
                raw.append(c);
                ++i;
            }
        }
    }

    /**
     * Skips a comment whose text starts at index i.
     * @return the index after the line terminator, or the end of the rules
     */
    private int skipComment(int i) {
        // skip to past the newline
        while(i < rules.length()) {
            char c = rules.charAt(i++);
            // LF or FF or CR or NEL or LS or PS
            if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
                // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
                // NLF (new line function) = CR or LF or CR+LF or NEL.
                // No need to collect all of CR+LF because a following LF will be ignored anyway.
                break;
            }
        }
        return i;
    }

    /** Always throws a ParseException with the reason and the current error context. */
    private void setParseError(String reason) throws ParseException {
        throw makeParseException(reason);
    }

    /**
     * Always throws a ParseException whose message is the reason, ": ", the message of e,
     * and the current error context, with e as its cause.
     */
    private void setParseError(String reason, Exception e) throws ParseException {
        ParseException newExc = makeParseException(reason + ": " + e.getMessage());
        newExc.initCause(e);
        throw newExc;
    }

    /** Creates a ParseException for the current ruleIndex, with error context in the message. */
    private ParseException makeParseException(String reason) {
        return new ParseException(appendErrorContext(reason), ruleIndex);
    }

    private static final int U_PARSE_CONTEXT_LEN = 16;

    // C++ setErrorContext()
    /**
     * Returns the reason followed by the rule index and a snippet of up to
     * U_PARSE_CONTEXT_LEN - 1 chars of rule text on each side of ruleIndex,
     * with '!' marking the position of ruleIndex.
     */
    private String appendErrorContext(String reason) {
        // Note: This relies on the calling code maintaining the ruleIndex
        // at a position that is useful for debugging.
        // For example, at the beginning of a reset or relation etc.
        StringBuilder msg = new StringBuilder(reason);
        msg.append(" at index ").append(ruleIndex);
        // We are not counting line numbers.

        msg.append(" near \"");
        // before ruleIndex
        int start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
        if(start < 0) {
            start = 0;
        } else if(start > 0 && Character.isLowSurrogate(rules.charAt(start))) {
            // Do not start the snippet in the middle of a surrogate pair.
            ++start;
        }
        msg.append(rules, start, ruleIndex);

        msg.append('!');
        // starting from ruleIndex
        int length = rules.length() - ruleIndex;
        if(length >= U_PARSE_CONTEXT_LEN) {
            length = U_PARSE_CONTEXT_LEN - 1;
            if(Character.isHighSurrogate(rules.charAt(ruleIndex + length - 1))) {
                // Do not end the snippet in the middle of a surrogate pair.
                --length;
            }
        }
        msg.append(rules, ruleIndex, ruleIndex + length);
        return msg.append('\"').toString();
    }

    /**
     * ASCII [:P:] and [:S:]:
     * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
     */
    private static boolean isSyntaxChar(int c) {
        return 0x21 <= c && c <= 0x7e &&
                (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
                (0x5b <= c && c <= 0x60) || (0x7b <= c));
    }

    /** Returns the index of the first non-white-space char at or after i, or the rules length. */
    private int skipWhiteSpace(int i) {
        while(i < rules.length() && PatternProps.isWhiteSpace(rules.charAt(i))) {
            ++i;
        }
        return i;
    }

    private final Normalizer2 nfd = Normalizer2.getNFDInstance();
    private final Normalizer2 nfc = Normalizer2.getNFCInstance();

    private String rules;
    private final CollationData baseData;
    private CollationSettings settings;

    private Sink sink;
    private Importer importer;

    private int ruleIndex;
}