1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/** 4 ******************************************************************************* 5 * Copyright (C) 2001-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 * CollationTest.java, ported from collationtest.cpp 9 * C++ version created on: 2012apr27 10 * created by: Markus W. Scherer 11 */ 12package com.ibm.icu.dev.test.collator; 13 14import java.io.BufferedReader; 15import java.io.IOException; 16import java.text.ParseException; 17import java.util.HashSet; 18import java.util.Set; 19 20import org.junit.Test; 21 22import com.ibm.icu.dev.test.TestFmwk; 23import com.ibm.icu.dev.test.TestUtil; 24import com.ibm.icu.impl.Norm2AllModes; 25import com.ibm.icu.impl.Utility; 26import com.ibm.icu.impl.coll.Collation; 27import com.ibm.icu.impl.coll.CollationData; 28import com.ibm.icu.impl.coll.CollationFCD; 29import com.ibm.icu.impl.coll.CollationIterator; 30import com.ibm.icu.impl.coll.CollationRoot; 31import com.ibm.icu.impl.coll.CollationRootElements; 32import com.ibm.icu.impl.coll.CollationRuleParser; 33import com.ibm.icu.impl.coll.CollationWeights; 34import com.ibm.icu.impl.coll.FCDIterCollationIterator; 35import com.ibm.icu.impl.coll.FCDUTF16CollationIterator; 36import com.ibm.icu.impl.coll.UTF16CollationIterator; 37import com.ibm.icu.impl.coll.UVector32; 38import com.ibm.icu.text.CollationElementIterator; 39import com.ibm.icu.text.CollationKey; 40import com.ibm.icu.text.Collator; 41import com.ibm.icu.text.Collator.ReorderCodes; 42import com.ibm.icu.text.Normalizer2; 43import com.ibm.icu.text.RawCollationKey; 44import com.ibm.icu.text.RuleBasedCollator; 45import com.ibm.icu.text.UCharacterIterator; 46import com.ibm.icu.text.UTF16; 47import com.ibm.icu.text.UnicodeSet; 48import com.ibm.icu.text.UnicodeSetIterator; 49import com.ibm.icu.util.IllformedLocaleException; 50import com.ibm.icu.util.Output; 51import com.ibm.icu.util.ULocale; 52 53public class CollationTest extends TestFmwk { 54 public CollationTest() { 55 } 56 57 // Fields 58 Normalizer2 fcd, nfd; 59 Collator coll; 60 String fileLine; 61 int fileLineNumber; 62 String fileTestName; 63 64 // package private methods ---------------------------------------------- 65 66 static void doTest(TestFmwk test, RuleBasedCollator col, String source, 67 String target, int result) 68 { 69 doTestVariant(test, col, source, target, result); 70 if (result == -1) { 71 doTestVariant(test, col, target, source, 1); 72 } 73 else if (result == 1) { 74 doTestVariant(test, col, target, source, -1); 75 } 76 else { 77 doTestVariant(test, col, target, source, 0); 78 } 79 80 CollationElementIterator iter = col.getCollationElementIterator(source); 81 backAndForth(test, iter); 82 iter.setText(target); 83 backAndForth(test, iter); 84 } 85 86 /** 87 * Return an integer array containing all of the collation orders 88 * returned by calls to next on the specified iterator 89 */ 90 static int[] getOrders(CollationElementIterator iter) 91 { 92 int maxSize = 100; 93 int size = 0; 94 int[] orders = new int[maxSize]; 95 96 int order; 97 while ((order = iter.next()) != CollationElementIterator.NULLORDER) { 98 if (size == maxSize) { 99 maxSize *= 2; 100 int[] temp = new int[maxSize]; 101 System.arraycopy(orders, 0, temp, 0, size); 102 orders = temp; 103 } 104 orders[size++] = order; 105 } 106 107 if (maxSize > size) { 108 int[] temp = new int[size]; 109 System.arraycopy(orders, 0, temp, 0, size); 110 orders = temp; 111 } 112 return orders; 113 } 114 115 static void backAndForth(TestFmwk test, CollationElementIterator iter) 116 { 117 // Run through the iterator forwards and stick it into an array 118 iter.reset(); 119 int[] orders = getOrders(iter); 120 121 // Now go through it backwards and make sure we get the same values 122 int index = orders.length; 123 int o; 124 125 // reset the iterator 126 iter.reset(); 127 128 while ((o = iter.previous()) != CollationElementIterator.NULLORDER) { 129 if (o != orders[--index]) { 130 if (o == 0) { 131 index ++; 132 } else { 133 while (index > 0 && orders[index] == 0) { 134 index --; 135 } 136 if (o != orders[index]) { 137 TestFmwk.errln("Mismatch at index " + index + ": 0x" 138 + Utility.hex(orders[index]) + " vs 0x" + Utility.hex(o)); 139 break; 140 } 141 } 142 } 143 } 144 145 while (index != 0 && orders[index - 1] == 0) { 146 index --; 147 } 148 149 if (index != 0) { 150 String msg = "Didn't get back to beginning - index is "; 151 TestFmwk.errln(msg + index); 152 153 iter.reset(); 154 TestFmwk.err("next: "); 155 while ((o = iter.next()) != CollationElementIterator.NULLORDER) { 156 String hexString = "0x" + Utility.hex(o) + " "; 157 TestFmwk.err(hexString); 158 } 159 TestFmwk.errln(""); 160 TestFmwk.err("prev: "); 161 while ((o = iter.previous()) != CollationElementIterator.NULLORDER) { 162 String hexString = "0x" + Utility.hex(o) + " "; 163 TestFmwk.err(hexString); 164 } 165 TestFmwk.errln(""); 166 } 167 } 168 169 static final String appendCompareResult(int result, String target){ 170 if (result == -1) { 171 target += "LESS"; 172 } else if (result == 0) { 173 target += "EQUAL"; 174 } else if (result == 1) { 175 target += "GREATER"; 176 } else { 177 String huh = "?"; 178 target += huh + result; 179 } 180 return target; 181 } 182 183 static final String prettify(CollationKey key) { 184 byte[] bytes = key.toByteArray(); 185 return prettify(bytes, bytes.length); 186 } 187 188 static final String prettify(RawCollationKey key) { 189 return prettify(key.bytes, key.size); 190 } 191 192 static final String prettify(byte[] skBytes, int length) { 193 StringBuilder target = new StringBuilder(length * 3 + 2).append('['); 194 195 for (int i = 0; i < length; i++) { 196 String numStr = Integer.toHexString(skBytes[i] & 0xff); 197 if (numStr.length() < 2) { 198 target.append('0'); 199 } 200 target.append(numStr).append(' '); 201 } 202 target.append(']'); 203 return target.toString(); 204 } 205 206 private static void doTestVariant(TestFmwk test, 207 RuleBasedCollator myCollation, 208 String source, String target, int result) 209 { 210 int compareResult = myCollation.compare(source, target); 211 if (compareResult != result) { 212 213 // !!! if not mod build, error, else nothing. 214 // warnln if not build, error, else always print warning. 215 // do we need a 'quiet warning?' (err or log). Hmmm, 216 // would it work to have the 'verbose' flag let you 217 // suppress warnings? Are there ever some warnings you 218 // want to suppress, and others you don't? 219 TestFmwk.errln("Comparing \"" + Utility.hex(source) + "\" with \"" 220 + Utility.hex(target) + "\" expected " + result 221 + " but got " + compareResult); 222 } 223 CollationKey ssk = myCollation.getCollationKey(source); 224 CollationKey tsk = myCollation.getCollationKey(target); 225 compareResult = ssk.compareTo(tsk); 226 if (compareResult != result) { 227 TestFmwk.errln("Comparing CollationKeys of \"" + Utility.hex(source) 228 + "\" with \"" + Utility.hex(target) 229 + "\" expected " + result + " but got " 230 + compareResult); 231 } 232 RawCollationKey srsk = new RawCollationKey(); 233 myCollation.getRawCollationKey(source, srsk); 234 RawCollationKey trsk = new RawCollationKey(); 235 myCollation.getRawCollationKey(target, trsk); 236 compareResult = ssk.compareTo(tsk); 237 if (compareResult != result) { 238 TestFmwk.errln("Comparing RawCollationKeys of \"" 239 + Utility.hex(source) 240 + "\" with \"" + Utility.hex(target) 241 + "\" expected " + result + " but got " 242 + compareResult); 243 } 244 } 245 246 @Test 247 public void TestMinMax() { 248 setRootCollator(); 249 RuleBasedCollator rbc = (RuleBasedCollator)coll; 250 251 final String s = "\uFFFE\uFFFF"; 252 long[] ces; 253 254 ces = rbc.internalGetCEs(s); 255 if (ces.length != 2) { 256 errln("expected 2 CEs for <FFFE, FFFF>, got " + ces.length); 257 return; 258 } 259 260 long ce = ces[0]; 261 long expected = Collation.makeCE(Collation.MERGE_SEPARATOR_PRIMARY); 262 if (ce != expected) { 263 errln("CE(U+fffe)=0x" + Utility.hex(ce) + " != 02.."); 264 } 265 266 ce = ces[1]; 267 expected = Collation.makeCE(Collation.MAX_PRIMARY); 268 if (ce != expected) { 269 errln("CE(U+ffff)=0x" + Utility.hex(ce) + " != max.."); 270 } 271 } 272 273 @Test 274 public void TestImplicits() { 275 CollationData cd = CollationRoot.getData(); 276 277 // Implicit primary weights should be assigned for the following sets, 278 // and sort in ascending order by set and then code point. 279 // See http://www.unicode.org/reports/tr10/#Implicit_Weights 280 // core Han Unified Ideographs 281 UnicodeSet coreHan = new UnicodeSet("[\\p{unified_ideograph}&" 282 + "[\\p{Block=CJK_Unified_Ideographs}" 283 + "\\p{Block=CJK_Compatibility_Ideographs}]]"); 284 // all other Unified Han ideographs 285 UnicodeSet otherHan = new UnicodeSet("[\\p{unified ideograph}-" 286 + "[\\p{Block=CJK_Unified_Ideographs}" 287 + "\\p{Block=CJK_Compatibility_Ideographs}]]"); 288 289 UnicodeSet unassigned = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]"); 290 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings. 291 292 // Starting with CLDR 26/ICU 54, the root Han order may instead be 293 // the Unihan radical-stroke order. 294 // The tests should pass either way, so we only test the order of a small set of Han characters 295 // whose radical-stroke order is the same as their code point order. 296 UnicodeSet someHanInCPOrder = new UnicodeSet( 297 "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" + 298 "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]"); 299 UnicodeSet inOrder = new UnicodeSet(someHanInCPOrder); 300 inOrder.addAll(unassigned).freeze(); 301 302 UnicodeSet[] sets = { coreHan, otherHan, unassigned }; 303 int prev = 0; 304 long prevPrimary = 0; 305 UTF16CollationIterator ci = new UTF16CollationIterator(cd, false, "", 0); 306 for (int i = 0; i < sets.length; ++i) { 307 UnicodeSetIterator iter = new UnicodeSetIterator(sets[i]); 308 while (iter.next()) { 309 String s = iter.getString(); 310 int c = s.codePointAt(0); 311 ci.setText(false, s, 0); 312 long ce = ci.nextCE(); 313 long ce2 = ci.nextCE(); 314 if (ce == Collation.NO_CE || ce2 != Collation.NO_CE) { 315 errln("CollationIterator.nextCE(0x" + Utility.hex(c) 316 + ") did not yield exactly one CE"); 317 continue; 318 319 } 320 if ((ce & 0xffffffffL) != Collation.COMMON_SEC_AND_TER_CE) { 321 errln("CollationIterator.nextCE(U+" + Utility.hex(c, 4) 322 + ") has non-common sec/ter weights: 0x" + Utility.hex(ce & 0xffffffffL, 8)); 323 continue; 324 } 325 long primary = ce >>> 32; 326 if (!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) { 327 errln("CE(U+" + Utility.hex(c) + ")=0x" + Utility.hex(primary) 328 + ".. not greater than CE(U+" + Utility.hex(prev) 329 + ")=0x" + Utility.hex(prevPrimary) + ".."); 330 331 } 332 prev = c; 333 prevPrimary = primary; 334 } 335 } 336 } 337 338 // ICU4C: TestNulTerminated / renamed for ICU4J 339 @Test 340 public void TestSubSequence() { 341 CollationData data = CollationRoot.getData(); 342 final String s = "abab"; // { 0x61, 0x62, 0x61, 0x62 } 343 344 UTF16CollationIterator ci1 = new UTF16CollationIterator(data, false, s, 0); 345 UTF16CollationIterator ci2 = new UTF16CollationIterator(data, false, s, 2); 346 347 for (int i = 0; i < 2; ++i) { 348 long ce1 = ci1.nextCE(); 349 long ce2 = ci2.nextCE(); 350 351 if (ce1 != ce2) { 352 errln("CollationIterator.nextCE(with start position at 0) != " 353 + "nextCE(with start position at 2) at CE " + i); 354 } 355 } 356 } 357 358 359 // ICU4C: TestIllegalUTF8 / not applicable to ICU4J 360 361 362 private static void addLeadSurrogatesForSupplementary(UnicodeSet src, UnicodeSet dest) { 363 for(int c = 0x10000; c < 0x110000;) { 364 int next = c + 0x400; 365 if(src.containsSome(c, next - 1)) { 366 dest.add(UTF16.getLeadSurrogate(c)); 367 } 368 c = next; 369 } 370 } 371 372 @Test 373 public void TestShortFCDData() { 374 UnicodeSet expectedLccc = new UnicodeSet("[:^lccc=0:]"); 375 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates 376 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc); 377 378 UnicodeSet lccc = new UnicodeSet(); // actual 379 for (int c = 0; c <= 0xffff; ++c) { 380 if (CollationFCD.hasLccc(c)) { 381 lccc.add(c); 382 } 383 } 384 385 UnicodeSet diff = new UnicodeSet(expectedLccc); 386 diff.removeAll(lccc); 387 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP 388 389 String empty = "[]"; 390 String diffString; 391 392 diffString = diff.toPattern(true); 393 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString); 394 395 diff = lccc; 396 diff.removeAll(expectedLccc); 397 diffString = diff.toPattern(true); 398 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString); 399 400 UnicodeSet expectedTccc = new UnicodeSet("[:^tccc=0:]"); 401 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc); 402 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc); 403 404 UnicodeSet tccc = new UnicodeSet(); // actual 405 for(int c = 0; c <= 0xffff; ++c) { 406 if (CollationFCD.hasTccc(c)) { 407 tccc.add(c); 408 } 409 } 410 411 diff = new UnicodeSet(expectedTccc); 412 diff.removeAll(tccc); 413 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP 414 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString); 415 416 diff = tccc; 417 diff.removeAll(expectedTccc); 418 diffString = diff.toPattern(true); 419 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString); 420 } 421 422 private static class CodePointIterator { 423 int[] cp; 424 int length; 425 int pos; 426 427 CodePointIterator(int[] cp) { 428 this.cp = cp; 429 this.length = cp.length; 430 this.pos = 0; 431 } 432 433 void resetToStart() { 434 pos = 0; 435 } 436 437 int next() { 438 return (pos < length) ? cp[pos++] : Collation.SENTINEL_CP; 439 } 440 441 int previous() { 442 return (pos > 0) ? cp[--pos] : Collation.SENTINEL_CP; 443 } 444 445 int getLength() { 446 return length; 447 } 448 449 int getIndex() { 450 return pos; 451 } 452 } 453 454 private void checkFCD(String name, CollationIterator ci, CodePointIterator cpi) { 455 // Iterate forward to the limit. 456 for (;;) { 457 int c1 = ci.nextCodePoint(); 458 int c2 = cpi.next(); 459 if (c1 != c2) { 460 errln(name + ".nextCodePoint(to limit, 1st pass) = U+" + Utility.hex(c1) 461 + " != U+" + Utility.hex(c1) + " at " + cpi.getIndex()); 462 return; 463 } 464 if (c1 < 0) { 465 break; 466 } 467 } 468 469 // Iterate backward most of the way. 470 for (int n = (cpi.getLength() * 2) / 3; n > 0; --n) { 471 int c1 = ci.previousCodePoint(); 472 int c2 = cpi.previous(); 473 if (c1 != c2) { 474 errln(name + ".previousCodePoint() = U+" + Utility.hex(c1) + 475 " != U+" + Utility.hex(c2) + " at " + cpi.getIndex()); 476 return; 477 } 478 } 479 480 // Forward again. 481 for (;;) { 482 int c1 = ci.nextCodePoint(); 483 int c2 = cpi.next(); 484 if (c1 != c2) { 485 errln(name + ".nextCodePoint(to limit again) = U+" + Utility.hex(c1) 486 + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex()); 487 return; 488 } 489 if (c1 < 0) { 490 break; 491 } 492 } 493 494 // Iterate backward to the start. 495 for (;;) { 496 int c1 = ci.previousCodePoint(); 497 int c2 = cpi.previous(); 498 if (c1 != c2) { 499 errln(name + ".nextCodePoint(to start) = U+" + Utility.hex(c1) 500 + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex()); 501 return; 502 } 503 if (c1 < 0) { 504 break; 505 } 506 } 507 } 508 509 @Test 510 public void TestFCD() { 511 CollationData data = CollationRoot.getData(); 512 513 // Input string, not FCD. 514 StringBuilder buf = new StringBuilder(); 515 buf.append("\u0308\u00e1\u0062\u0301\u0327\u0430\u0062") 516 .appendCodePoint(0x1D15F) // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216 517 .append("\u0327\u0308") // ccc=202, 230 518 .appendCodePoint(0x1D16D) // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226 519 .appendCodePoint(0x1D15F) 520 .appendCodePoint(0x1D16D) 521 .append("\uac01") 522 .append("\u00e7") // Character with tccc!=0 decomposed together with mis-ordered sequence. 523 .appendCodePoint(0x1D16D).appendCodePoint(0x1D165) 524 .append("\u00e1") // Character with tccc!=0 decomposed together with decomposed sequence. 525 .append("\u0f73\u0f75") // Tibetan composite vowels must be decomposed. 526 .append("\u4e00\u0f81"); 527 String s = buf.toString(); 528 529 // Expected code points. 530 int[] cp = { 531 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62, 532 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308, 533 0x1D15F, 0x1D16D, 534 0xac01, 535 0x63, 0x327, 0x1D165, 0x1D16D, 536 0x61, 537 0xf71, 0xf71, 0xf72, 0xf74, 0x301, 538 0x4e00, 0xf71, 0xf80 539 }; 540 541 FCDUTF16CollationIterator u16ci = new FCDUTF16CollationIterator(data, false, s, 0); 542 CodePointIterator cpi = new CodePointIterator(cp); 543 checkFCD("FCDUTF16CollationIterator", u16ci, cpi); 544 545 cpi.resetToStart(); 546 UCharacterIterator iter = UCharacterIterator.getInstance(s); 547 FCDIterCollationIterator uici = new FCDIterCollationIterator(data, false, iter, 0); 548 checkFCD("FCDIterCollationIterator", uici, cpi); 549 } 550 551 private void checkAllocWeights(CollationWeights cw, long lowerLimit, long upperLimit, 552 int n, int someLength, int minCount) { 553 554 if (!cw.allocWeights(lowerLimit, upperLimit, n)) { 555 errln("CollationWeights::allocWeights(0x" 556 + Utility.hex(lowerLimit) + ",0x" 557 + Utility.hex(upperLimit) + "," 558 + n + ") = false"); 559 return; 560 } 561 long previous = lowerLimit; 562 int count = 0; // number of weights that have someLength 563 for (int i = 0; i < n; ++i) { 564 long w = cw.nextWeight(); 565 if (w == 0xffffffffL) { 566 errln("CollationWeights::allocWeights(0x" 567 + Utility.hex(lowerLimit) + ",0x" 568 + Utility.hex(upperLimit) + ",0x" 569 + n + ").nextWeight() returns only " 570 + i + " weights"); 571 return; 572 } 573 if (!(previous < w && w < upperLimit)) { 574 errln("CollationWeights::allocWeights(0x" 575 + Utility.hex(lowerLimit) + ",0x" 576 + Utility.hex(upperLimit) + "," 577 + n + ").nextWeight() number " 578 + (i + 1) + " -> 0x" + Utility.hex(w) 579 + " not between " 580 + Utility.hex(previous) + " and " 581 + Utility.hex(upperLimit)); 582 return; 583 } 584 if (CollationWeights.lengthOfWeight(w) == someLength) { 585 ++count; 586 } 587 } 588 if (count < minCount) { 589 errln("CollationWeights::allocWeights(0x" 590 + Utility.hex(lowerLimit) + ",0x" 591 + Utility.hex(upperLimit) + "," 592 + n + ").nextWeight() returns only " 593 + count + " < " + minCount + " weights of length " 594 + someLength); 595 596 } 597 } 598 599 @Test 600 public void TestCollationWeights() { 601 CollationWeights cw = new CollationWeights(); 602 603 // Non-compressible primaries use 254 second bytes 02..FF. 604 logln("CollationWeights.initForPrimary(non-compressible)"); 605 cw.initForPrimary(false); 606 // Expect 1 weight 11 and 254 weights 12xx. 607 checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 1, 1); 608 checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 2, 254); 609 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202. 610 checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 260, 2, 255); 611 // Expect 254 two-byte weights from the ranges 10ff and 11xx. 612 checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 600, 2, 254); 613 // Expect 254^2=64516 three-byte weights. 614 // During computation, there should be 3 three-byte ranges 615 // 10ffff, 11xxxx, 120202. 616 // The middle one should be split 64515:1, 617 // and the newly-split-off range and the last ranged lengthened. 618 checkAllocWeights(cw, 0x10fffe00L, 0x12020300L, 1 + 64516 + 254 + 1, 3, 64516); 619 // Expect weights 1102 & 1103. 620 checkAllocWeights(cw, 0x10ff0000L, 0x11040000L, 2, 2, 2); 621 // Expect weights 102102 & 102103. 622 checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2); 623 624 // Compressible primaries use 251 second bytes 04..FE. 625 logln("CollationWeights.initForPrimary(compressible)"); 626 cw.initForPrimary(true); 627 // Expect 1 weight 11 and 251 weights 12xx. 628 checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 1, 1); 629 checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 2, 251); 630 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204. 631 checkAllocWeights(cw, 0x10fdfe40L, 0x12050300L, 260, 2, 252); 632 // Expect weights 1104 & 1105. 633 checkAllocWeights(cw, 0x10fe0000L, 0x11060000L, 2, 2, 2); 634 // Expect weights 102102 & 102103. 635 checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2); 636 637 // Secondary and tertiary weights use only bytes 3 & 4. 638 logln("CollationWeights.initForSecondary()"); 639 cw.initForSecondary(); 640 // Expect weights fbxx and all four fc..ff. 641 checkAllocWeights(cw, 0xfb20L, 0x10000L, 20, 3, 4); 642 643 logln("CollationWeights.initForTertiary()"); 644 cw.initForTertiary(); 645 // Expect weights 3dxx and both 3e & 3f. 646 checkAllocWeights(cw, 0x3d02L, 0x4000L, 10, 3, 2); 647 } 648 649 private static boolean isValidCE(CollationRootElements re, CollationData data, long p, long s, long ctq) { 650 long p1 = p >>> 24; 651 long p2 = (p >>> 16) & 0xff; 652 long p3 = (p >>> 8) & 0xff; 653 long p4 = p & 0xff; 654 long s1 = s >>> 8; 655 long s2 = s & 0xff; 656 // ctq = Case, Tertiary, Quaternary 657 long c = (ctq & Collation.CASE_MASK) >>> 14; 658 long t = ctq & Collation.ONLY_TERTIARY_MASK; 659 long t1 = t >>> 8; 660 long t2 = t & 0xff; 661 long q = ctq & Collation.QUATERNARY_MASK; 662 // No leading zero bytes. 663 if ((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) { 664 return false; 665 } 666 // No intermediate zero bytes. 667 if (p1 != 0 && p2 == 0 && (p & 0xffff) != 0) { 668 return false; 669 } 670 if (p2 != 0 && p3 == 0 && p4 != 0) { 671 return false; 672 } 673 // Minimum & maximum lead bytes. 674 if ((p1 != 0 && p1 <= Collation.MERGE_SEPARATOR_BYTE) 675 || s1 == Collation.LEVEL_SEPARATOR_BYTE 676 || t1 == Collation.LEVEL_SEPARATOR_BYTE || t1 > 0x3f) { 677 return false; 678 } 679 if (c > 2) { 680 return false; 681 } 682 // The valid byte range for the second primary byte depends on compressibility. 683 if (p2 != 0) { 684 if (data.isCompressibleLeadByte((int)p1)) { 685 if (p2 <= Collation.PRIMARY_COMPRESSION_LOW_BYTE 686 || Collation.PRIMARY_COMPRESSION_HIGH_BYTE <= p2) { 687 return false; 688 } 689 } else { 690 if (p2 <= Collation.LEVEL_SEPARATOR_BYTE) { 691 return false; 692 } 693 } 694 } 695 // Other bytes just need to avoid the level separator. 696 // Trailing zeros are ok. 697 // assert (Collation.LEVEL_SEPARATOR_BYTE == 1); 698 if (p3 == Collation.LEVEL_SEPARATOR_BYTE || p4 == Collation.LEVEL_SEPARATOR_BYTE 699 || s2 == Collation.LEVEL_SEPARATOR_BYTE || t2 == Collation.LEVEL_SEPARATOR_BYTE) { 700 return false; 701 } 702 // Well-formed CEs. 703 if (p == 0) { 704 if (s == 0) { 705 if (t == 0) { 706 // Completely ignorable CE. 707 // Quaternary CEs are not supported. 708 if (c != 0 || q != 0) { 709 return false; 710 } 711 } else { 712 // Tertiary CE. 713 if (t < re.getTertiaryBoundary() || c != 2) { 714 return false; 715 } 716 } 717 } else { 718 // Secondary CE. 719 if (s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) { 720 return false; 721 } 722 } 723 } else { 724 // Primary CE. 725 if (s == 0 || (Collation.COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) 726 || s >= re.getSecondaryBoundary()) { 727 return false; 728 } 729 if (t == 0 || t >= re.getTertiaryBoundary()) { 730 return false; 731 } 732 } 733 return true; 734 } 735 736 private static boolean isValidCE(CollationRootElements re, CollationData data, long ce) { 737 long p = ce >>> 32; 738 long secTer = ce & 0xffffffffL; 739 return isValidCE(re, data, p, secTer >>> 16, secTer & 0xffff); 740 } 741 742 private static class RootElementsIterator { 743 CollationData data; 744 long[] elements; 745 int length; 746 747 long pri; 748 long secTer; 749 int index; 750 751 RootElementsIterator(CollationData root) { 752 data = root; 753 elements = root.rootElements; 754 length = elements.length; 755 pri = 0; 756 secTer = 0; 757 index = (int)elements[CollationRootElements.IX_FIRST_TERTIARY_INDEX]; 758 } 759 760 boolean next() { 761 if (index >= length) { 762 return false; 763 } 764 long p = elements[index]; 765 if (p == CollationRootElements.PRIMARY_SENTINEL) { 766 return false; 767 } 768 if ((p & CollationRootElements.SEC_TER_DELTA_FLAG) != 0) { 769 ++index; 770 secTer = p & ~CollationRootElements.SEC_TER_DELTA_FLAG; 771 return true; 772 } 773 if ((p & CollationRootElements.PRIMARY_STEP_MASK) != 0) { 774 // End of a range, enumerate the primaries in the range. 775 int step = (int)p & CollationRootElements.PRIMARY_STEP_MASK; 776 p &= 0xffffff00; 777 if (pri == p) { 778 // Finished the range, return the next CE after it. 779 ++index; 780 return next(); 781 } 782 assert (pri < p); 783 // Return the next primary in this range. 784 boolean isCompressible = data.isCompressiblePrimary(pri); 785 if ((pri & 0xffff) == 0) { 786 pri = Collation.incTwoBytePrimaryByOffset(pri, isCompressible, step); 787 } else { 788 pri = Collation.incThreeBytePrimaryByOffset(pri, isCompressible, step); 789 } 790 return true; 791 } 792 // Simple primary CE. 793 ++index; 794 pri = p; 795 // Does this have an explicit below-common sec/ter unit, 796 // or does it imply a common one? 797 if(index == length) { 798 secTer = Collation.COMMON_SEC_AND_TER_CE; 799 } else { 800 secTer = elements[index]; 801 if((secTer & CollationRootElements.SEC_TER_DELTA_FLAG) == 0) { 802 // No sec/ter delta. 803 secTer = Collation.COMMON_SEC_AND_TER_CE; 804 } else { 805 secTer &= ~CollationRootElements.SEC_TER_DELTA_FLAG; 806 if(secTer > Collation.COMMON_SEC_AND_TER_CE) { 807 // Implied sec/ter. 808 secTer = Collation.COMMON_SEC_AND_TER_CE; 809 } else { 810 // Explicit sec/ter below common/common. 811 ++index; 812 } 813 } 814 } 815 return true; 816 } 817 818 long getPrimary() { 819 return pri; 820 } 821 822 long getSecTer() { 823 return secTer; 824 } 825 } 826 827 @Test 828 public void TestRootElements() { 829 CollationData root = CollationRoot.getData(); 830 831 CollationRootElements rootElements = new CollationRootElements(root.rootElements); 832 RootElementsIterator iter = new RootElementsIterator(root); 833 834 // We check each root CE for validity, 835 // and we also verify that there is a tailoring gap between each two CEs. 836 CollationWeights cw1c = new CollationWeights(); // compressible primary weights 837 CollationWeights cw1u = new CollationWeights(); // uncompressible primary weights 838 CollationWeights cw2 = new CollationWeights(); 839 CollationWeights cw3 = new CollationWeights(); 840 841 cw1c.initForPrimary(true); 842 cw1u.initForPrimary(false); 843 cw2.initForSecondary(); 844 cw3.initForTertiary(); 845 846 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs, 847 // nor the special merge-separator CE for U+FFFE. 848 long prevPri = 0; 849 long prevSec = 0; 850 long prevTer = 0; 851 852 while (iter.next()) { 853 long pri = iter.getPrimary(); 854 long secTer = iter.getSecTer(); 855 // CollationRootElements CEs must have 0 case and quaternary bits. 856 if ((secTer & Collation.CASE_AND_QUATERNARY_MASK) != 0) { 857 errln("CollationRootElements CE has non-zero case and/or quaternary bits: " 858 + "0x" + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8)); 859 } 860 long sec = secTer >>> 16; 861 long ter = secTer & Collation.ONLY_TERTIARY_MASK; 862 long ctq = ter; 863 if (pri == 0 && sec == 0 && ter != 0) { 864 // Tertiary CEs must have uppercase bits, 865 // but they are not stored in the CollationRootElements. 866 ctq |= 0x8000; 867 } 868 if (!isValidCE(rootElements, root, pri, sec, ctq)) { 869 errln("invalid root CE 0x" 870 + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8)); 871 } else { 872 if (pri != prevPri) { 873 long newWeight = 0; 874 if (prevPri == 0 || prevPri >= Collation.FFFD_PRIMARY) { 875 // There is currently no tailoring gap after primary ignorables, 876 // and we forbid tailoring after U+FFFD and U+FFFF. 877 } else if (root.isCompressiblePrimary(prevPri)) { 878 if (!cw1c.allocWeights(prevPri, pri, 1)) { 879 errln("no primary/compressible tailoring gap between " 880 + "0x" + Utility.hex(prevPri, 8) 881 + " and 0x" + Utility.hex(pri, 8)); 882 } else { 883 newWeight = cw1c.nextWeight(); 884 } 885 } else { 886 if (!cw1u.allocWeights(prevPri, pri, 1)) { 887 errln("no primary/uncompressible tailoring gap between " 888 + "0x" + Utility.hex(prevPri, 8) 889 + " and 0x" + Utility.hex(pri, 8)); 890 } else { 891 newWeight = cw1u.nextWeight(); 892 } 893 } 894 if (newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) { 895 errln("mis-allocated primary weight, should get " 896 + "0x" + Utility.hex(prevPri, 8) 897 + " < 0x" + Utility.hex(newWeight, 8) 898 + " < 0x" + Utility.hex(pri, 8)); 899 } 900 } else if (sec != prevSec) { 901 long lowerLimit = prevSec == 0 ? 902 rootElements.getSecondaryBoundary() - 0x100 : prevSec; 903 if (!cw2.allocWeights(lowerLimit, sec, 1)) { 904 errln("no secondary tailoring gap between " 905 + "0x" + Utility.hex(lowerLimit) 906 + " and 0x" + Utility.hex(sec)); 907 } else { 908 long newWeight = cw2.nextWeight(); 909 if (!(prevSec < newWeight && newWeight < sec)) { 910 errln("mis-allocated secondary weight, should get " 911 + "0x" + Utility.hex(lowerLimit) 912 + " < 0x" + Utility.hex(newWeight) 913 + " < 0x" + Utility.hex(sec)); 914 } 915 } 916 } else if (ter != prevTer) { 917 long lowerLimit = prevTer == 0 ? 918 rootElements.getTertiaryBoundary() - 0x100 : prevTer; 919 if (!cw3.allocWeights(lowerLimit, ter, 1)) { 920 errln("no tertiary tailoring gap between " 921 + "0x" + Utility.hex(lowerLimit) 922 + " and 0x" + Utility.hex(ter)); 923 } else { 924 long newWeight = cw3.nextWeight(); 925 if (!(prevTer < newWeight && newWeight < ter)) { 926 errln("mis-allocated tertiary weight, should get " 927 + "0x" + Utility.hex(lowerLimit) 928 + " < 0x" + Utility.hex(newWeight) 929 + " < 0x" + Utility.hex(ter)); 930 } 931 } 932 } else { 933 errln("duplicate root CE 0x" 934 + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8)); 935 } 936 } 937 prevPri = pri; 938 prevSec = sec; 939 prevTer = ter; 940 } 941 } 942 943 @Test 944 public void TestTailoredElements() { 945 CollationData root = CollationRoot.getData(); 946 CollationRootElements rootElements = new CollationRootElements(root.rootElements); 947 948 Set<String> prevLocales = new HashSet<String>(); 949 prevLocales.add(""); 950 prevLocales.add("root"); 951 prevLocales.add("root@collation=standard"); 952 953 long[] ces; 954 ULocale[] locales = Collator.getAvailableULocales(); 955 String localeID = "root"; 956 int locIdx = 0; 957 958 for (; locIdx < locales.length; localeID = locales[locIdx++].getName()) { 959 ULocale locale = new ULocale(localeID); 960 String[] types = Collator.getKeywordValuesForLocale("collation", locale, false); 961 for (int typeIdx = 0; typeIdx < types.length; ++typeIdx) { 962 String type = types[typeIdx]; // first: default type 963 if (type.startsWith("private-")) { 964 errln("Collator.getKeywordValuesForLocale(" + localeID + 965 ") returns private collation keyword: " + type); 966 } 967 ULocale localeWithType = locale.setKeywordValue("collation", type); 968 Collator coll = Collator.getInstance(localeWithType); 969 ULocale actual = coll.getLocale(ULocale.ACTUAL_LOCALE); 970 if (prevLocales.contains(actual.getName())) { 971 continue; 972 } 973 prevLocales.add(actual.getName()); 974 logln("TestTailoredElements(): requested " + localeWithType.getName() 975 + " -> actual " + actual.getName()); 976 if (!(coll instanceof RuleBasedCollator)) { 977 continue; 978 } 979 RuleBasedCollator rbc = (RuleBasedCollator) coll; 980 981 // Note: It would be better to get tailored strings such that we can 982 // identify the prefix, and only get the CEs for the prefix+string, 983 // not also for the prefix. 984 // There is currently no API for that. 985 // It would help in an unusual case where a contraction starting in the prefix 986 // extends past its end, and we do not see the intended mapping. 987 // For example, for a mapping p|st, if there is also a contraction ps, 988 // then we get CEs(ps)+CEs(t), rather than CEs(p|st). 989 UnicodeSet tailored = coll.getTailoredSet(); 990 UnicodeSetIterator iter = new UnicodeSetIterator(tailored); 991 while (iter.next()) { 992 String s = iter.getString(); 993 ces = rbc.internalGetCEs(s); 994 for (int i = 0; i < ces.length; ++i) { 995 long ce = ces[i]; 996 if (!isValidCE(rootElements, root, ce)) { 997 logln(prettify(s)); 998 errln("invalid tailored CE 0x" + Utility.hex(ce, 16) 999 + " at CE index " + i + " from string:"); 1000 } 1001 } 1002 } 1003 } 1004 } 1005 } 1006 1007 private static boolean isSpace(char c) { 1008 return (c == 0x09 || c == 0x20 || c == 0x3000); 1009 } 1010 1011 private static boolean isSectionStarter(char c) { 1012 return (c == '%' || c == '*' || c == '@'); 1013 } 1014 1015 private int skipSpaces(int i) { 1016 while (isSpace(fileLine.charAt(i))) { 1017 ++i; 1018 } 1019 return i; 1020 } 1021 1022 private String printSortKey(byte[] p) { 1023 StringBuilder s = new StringBuilder(); 1024 for (int i = 0; i < p.length; ++i) { 1025 if (i > 0) { 1026 s.append(' '); 1027 } 1028 byte b = p[i]; 1029 if (b == 0) { 1030 s.append('.'); 1031 } else if (b == 1) { 1032 s.append('|'); 1033 } else { 1034 s.append(String.format("%02x", b & 0xff)); 1035 } 1036 } 1037 return s.toString(); 1038 } 1039 1040 private String printCollationKey(CollationKey key) { 1041 byte[] p = key.toByteArray(); 1042 return printSortKey(p); 1043 } 1044 1045 private boolean readNonEmptyLine(BufferedReader in) throws IOException { 1046 for (;;) { 1047 String line = in.readLine(); 1048 if (line == null) { 1049 fileLine = null; 1050 return false; 1051 } 1052 if (fileLineNumber == 0 && line.length() != 0 && line.charAt(0) == '\uFEFF') { 1053 line = line.substring(1); // Remove the BOM. 1054 } 1055 ++fileLineNumber; 1056 // Strip trailing comments and spaces 1057 int idx = line.indexOf('#'); 1058 if (idx < 0) { 1059 idx = line.length(); 1060 } 1061 while (idx > 0 && isSpace(line.charAt(idx - 1))) { 1062 --idx; 1063 } 1064 if (idx != 0) { 1065 fileLine = idx < line.length() ? line.substring(0, idx) : line; 1066 return true; 1067 } 1068 // Empty line, continue. 1069 } 1070 } 1071 1072 private int parseString(int start, Output<String> prefix, Output<String> s) throws ParseException { 1073 int length = fileLine.length(); 1074 int i; 1075 for (i = start; i < length && !isSpace(fileLine.charAt(i)); ++i) { 1076 } 1077 int pipeIndex = fileLine.indexOf('|', start); 1078 if (pipeIndex >= 0 && pipeIndex < i) { 1079 String tmpPrefix = Utility.unescape(fileLine.substring(start, pipeIndex)); 1080 if (tmpPrefix.length() == 0) { 1081 prefix.value = null; 1082 logln(fileLine); 1083 throw new ParseException("empty prefix on line " + fileLineNumber, fileLineNumber); 1084 } 1085 prefix.value = tmpPrefix; 1086 start = pipeIndex + 1; 1087 } else { 1088 prefix.value = null; 1089 } 1090 1091 String tmp = Utility.unescape(fileLine.substring(start, i)); 1092 if (tmp.length() == 0) { 1093 s.value = null; 1094 logln(fileLine); 1095 throw new ParseException("empty string on line " + fileLineNumber, fileLineNumber); 1096 } 1097 s.value = tmp; 1098 return i; 1099 } 1100 1101 private int parseRelationAndString(Output<String> s) throws ParseException { 1102 int relation = Collation.NO_LEVEL; 1103 int start; 1104 if (fileLine.charAt(0) == '<') { 1105 char second = fileLine.charAt(1); 1106 start = 2; 1107 switch(second) { 1108 case 0x31: // <1 1109 relation = Collation.PRIMARY_LEVEL; 1110 break; 1111 case 0x32: // <2 1112 relation = Collation.SECONDARY_LEVEL; 1113 break; 1114 case 0x33: // <3 1115 relation = Collation.TERTIARY_LEVEL; 1116 break; 1117 case 0x34: // <4 1118 relation = Collation.QUATERNARY_LEVEL; 1119 break; 1120 case 0x63: // <c 1121 relation = Collation.CASE_LEVEL; 1122 break; 1123 case 0x69: // <i 1124 relation = Collation.IDENTICAL_LEVEL; 1125 break; 1126 default: // just < 1127 relation = Collation.NO_LEVEL; 1128 start = 1; 1129 break; 1130 } 1131 } else if (fileLine.charAt(0) == '=') { 1132 relation = Collation.ZERO_LEVEL; 1133 start = 1; 1134 } else { 1135 start = 0; 1136 } 1137 1138 if (start == 0 || !isSpace(fileLine.charAt(start))) { 1139 logln(fileLine); 1140 throw new ParseException("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line " 1141 + fileLineNumber, fileLineNumber); 1142 } 1143 1144 start = skipSpaces(start); 1145 Output<String> prefixOut = new Output<String>(); 1146 start = parseString(start, prefixOut, s); 1147 if (prefixOut.value != null) { 1148 logln(fileLine); 1149 throw new ParseException("prefix string not allowed for test string: on line " 1150 + fileLineNumber, fileLineNumber); 1151 } 1152 if (start < fileLine.length()) { 1153 logln(fileLine); 1154 throw new ParseException("unexpected line contents after test string on line " 1155 + fileLineNumber, fileLineNumber); 1156 } 1157 1158 return relation; 1159 } 1160 1161 private void parseAndSetAttribute() throws ParseException { 1162 // Parse attributes even if the Collator could not be created, 1163 // in order to report syntax errors. 1164 int start = skipSpaces(1); 1165 int equalPos = fileLine.indexOf('='); 1166 if (equalPos < 0) { 1167 if (fileLine.regionMatches(start, "reorder", 0, 7)) { 1168 parseAndSetReorderCodes(start + 7); 1169 return; 1170 } 1171 logln(fileLine); 1172 throw new ParseException("missing '=' on line " + fileLineNumber, fileLineNumber); 1173 } 1174 1175 String attrString = fileLine.substring(start, equalPos); 1176 String valueString = fileLine.substring(equalPos + 1); 1177 if (attrString.equals("maxVariable")) { 1178 int max; 1179 if (valueString.equals("space")) { 1180 max = ReorderCodes.SPACE; 1181 } else if(valueString.equals("punct")) { 1182 max = ReorderCodes.PUNCTUATION; 1183 } else if(valueString.equals("symbol")) { 1184 max = ReorderCodes.SYMBOL; 1185 } else if(valueString.equals("currency")) { 1186 max = ReorderCodes.CURRENCY; 1187 } else { 1188 logln(fileLine); 1189 throw new ParseException("invalid attribute value name on line " 1190 + fileLineNumber, fileLineNumber); 1191 } 1192 if (coll != null) { 1193 coll.setMaxVariable(max); 1194 } 1195 fileLine = null; 1196 return; 1197 } 1198 1199 boolean parsed = true; 1200 RuleBasedCollator rbc = (RuleBasedCollator)coll; 1201 if (attrString.equals("backwards")) { 1202 if (valueString.equals("on")) { 1203 if (rbc != null) rbc.setFrenchCollation(true); 1204 } else if (valueString.equals("off")) { 1205 if (rbc != null) rbc.setFrenchCollation(false); 1206 } else if (valueString.equals("default")) { 1207 if (rbc != null) rbc.setFrenchCollationDefault(); 1208 } else { 1209 parsed = false; 1210 } 1211 } else if (attrString.equals("alternate")) { 1212 if (valueString.equals("non-ignorable")) { 1213 if (rbc != null) rbc.setAlternateHandlingShifted(false); 1214 } else if (valueString.equals("shifted")) { 1215 if (rbc != null) rbc.setAlternateHandlingShifted(true); 1216 } else if (valueString.equals("default")) { 1217 if (rbc != null) rbc.setAlternateHandlingDefault(); 1218 } else { 1219 parsed = false; 1220 } 1221 } else if (attrString.equals("caseFirst")) { 1222 if (valueString.equals("upper")) { 1223 if (rbc != null) rbc.setUpperCaseFirst(true); 1224 } else if (valueString.equals("lower")) { 1225 if (rbc != null) rbc.setLowerCaseFirst(true); 1226 } else if (valueString.equals("default")) { 1227 if (rbc != null) rbc.setCaseFirstDefault(); 1228 } else { 1229 parsed = false; 1230 } 1231 } else if (attrString.equals("caseLevel")) { 1232 if (valueString.equals("on")) { 1233 if (rbc != null) rbc.setCaseLevel(true); 1234 } else if (valueString.equals("off")) { 1235 if (rbc != null) rbc.setCaseLevel(false); 1236 } else if (valueString.equals("default")) { 1237 if (rbc != null) rbc.setCaseLevelDefault(); 1238 } else { 1239 parsed = false; 1240 } 1241 } else if (attrString.equals("strength")) { 1242 if (valueString.equals("primary")) { 1243 if (rbc != null) rbc.setStrength(Collator.PRIMARY); 1244 } else if (valueString.equals("secondary")) { 1245 if (rbc != null) rbc.setStrength(Collator.SECONDARY); 1246 } else if (valueString.equals("tertiary")) { 1247 if (rbc != null) rbc.setStrength(Collator.TERTIARY); 1248 } else if (valueString.equals("quaternary")) { 1249 if (rbc != null) rbc.setStrength(Collator.QUATERNARY); 1250 } else if (valueString.equals("identical")) { 1251 if (rbc != null) rbc.setStrength(Collator.IDENTICAL); 1252 } else if (valueString.equals("default")) { 1253 if (rbc != null) rbc.setStrengthDefault(); 1254 } else { 1255 parsed = false; 1256 } 1257 } else if (attrString.equals("numeric")) { 1258 if (valueString.equals("on")) { 1259 if (rbc != null) rbc.setNumericCollation(true); 1260 } else if (valueString.equals("off")) { 1261 if (rbc != null) rbc.setNumericCollation(false); 1262 } else if (valueString.equals("default")) { 1263 if (rbc != null) rbc.setNumericCollationDefault(); 1264 } else { 1265 parsed = false; 1266 } 1267 } else { 1268 logln(fileLine); 1269 throw new ParseException("invalid attribute name on line " 1270 + fileLineNumber, fileLineNumber); 1271 } 1272 if (!parsed) { 1273 logln(fileLine); 1274 throw new ParseException( 1275 "invalid attribute value name or attribute=value combination on line " 1276 + fileLineNumber, fileLineNumber); 1277 } 1278 1279 fileLine = null; 1280 } 1281 1282 private void parseAndSetReorderCodes(int start) throws ParseException { 1283 UVector32 reorderCodes = new UVector32(); 1284 while (start < fileLine.length()) { 1285 start = skipSpaces(start); 1286 int limit = start; 1287 while (limit < fileLine.length() && !isSpace(fileLine.charAt(limit))) { 1288 ++limit; 1289 } 1290 String name = fileLine.substring(start, limit); 1291 int code = CollationRuleParser.getReorderCode(name); 1292 if (code < -1) { 1293 if (name.equalsIgnoreCase("default")) { 1294 code = ReorderCodes.DEFAULT; // -1 1295 } else { 1296 logln(fileLine); 1297 throw new ParseException("invalid reorder code '" + name + "' on line " 1298 + fileLineNumber, fileLineNumber); 1299 } 1300 } 1301 reorderCodes.addElement(code); 1302 start = limit; 1303 } 1304 if (coll != null) { 1305 int[] reorderCodesArray = new int[reorderCodes.size()]; 1306 System.arraycopy(reorderCodes.getBuffer(), 0, 1307 reorderCodesArray, 0, reorderCodes.size()); 1308 coll.setReorderCodes(reorderCodesArray); 1309 } 1310 1311 fileLine = null; 1312 } 1313 1314 private void buildTailoring(BufferedReader in) throws IOException { 1315 StringBuilder rules = new StringBuilder(); 1316 while (readNonEmptyLine(in) && !isSectionStarter(fileLine.charAt(0))) { 1317 rules.append(Utility.unescape(fileLine)); 1318 } 1319 1320 try { 1321 coll = new RuleBasedCollator(rules.toString()); 1322 } catch (Exception e) { 1323 logln(rules.toString()); 1324 // Android patch: Add --omitCollationRules to genrb. 1325 logln("RuleBasedCollator(rules) failed - " + e.getMessage()); 1326 // Android patch end. 1327 coll = null; 1328 } 1329 } 1330 1331 private void setRootCollator() { 1332 coll = Collator.getInstance(ULocale.ROOT); 1333 } 1334 1335 private void setLocaleCollator() { 1336 coll = null; 1337 ULocale locale = null; 1338 if (fileLine.length() > 9) { 1339 String localeID = fileLine.substring(9); // "@ locale <langTag>" 1340 try { 1341 locale = new ULocale(localeID); // either locale ID or language tag 1342 } catch (IllformedLocaleException e) { 1343 locale = null; 1344 } 1345 } 1346 if (locale == null) { 1347 logln(fileLine); 1348 errln("invalid language tag on line " + fileLineNumber); 1349 return; 1350 } 1351 1352 logln("creating a collator for locale ID " + locale.getName()); 1353 try { 1354 coll = Collator.getInstance(locale); 1355 } catch (Exception e) { 1356 errln("unable to create a collator for locale " + locale + 1357 " on line " + fileLineNumber + " - " + e); 1358 } 1359 } 1360 1361 private boolean needsNormalization(String s) { 1362 if (!fcd.isNormalized(s)) { 1363 return true; 1364 } 1365 // In some sequences with Tibetan composite vowel signs, 1366 // even if the string passes the FCD check, 1367 // those composites must be decomposed. 1368 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81. 1369 int index = 0; 1370 while((index = s.indexOf(0xf71, index)) >= 0) { 1371 if (++index < s.length()) { 1372 char c = s.charAt(index); 1373 if (c == 0xf73 || c == 0xf75 || c == 0xf81) { 1374 return true; 1375 } 1376 } 1377 } 1378 return false; 1379 } 1380 1381 private boolean getCollationKey(String norm, String line, String s, Output<CollationKey> keyOut) { 1382 CollationKey key = coll.getCollationKey(s); 1383 keyOut.value = key; 1384 1385 byte[] keyBytes = key.toByteArray(); 1386 if (keyBytes.length == 0 || keyBytes[keyBytes.length - 1] != 0) { 1387 logln(fileTestName); 1388 logln(line); 1389 logln(printCollationKey(key)); 1390 errln("Collator(" + norm + ").getCollationKey() wrote an empty or unterminated key"); 1391 return false; 1392 } 1393 1394 int numLevels = coll.getStrength(); 1395 if (numLevels < Collator.IDENTICAL) { 1396 ++numLevels; 1397 } else { 1398 numLevels = 5; 1399 } 1400 if (((RuleBasedCollator)coll).isCaseLevel()) { 1401 ++numLevels; 1402 } 1403 int numLevelSeparators = 0; 1404 for (int i = 0; i < (keyBytes.length - 1); ++i) { 1405 byte b = keyBytes[i]; 1406 if (b == 0) { 1407 logln(fileTestName); 1408 logln(line); 1409 logln(printCollationKey(key)); 1410 errln("Collator(" + norm + ").getCollationKey() contains a 00 byte"); 1411 return false; 1412 } 1413 if (b == 1) { 1414 ++numLevelSeparators; 1415 } 1416 } 1417 if (numLevelSeparators != (numLevels - 1)) { 1418 logln(fileTestName); 1419 logln(line); 1420 logln(printCollationKey(key)); 1421 errln("Collator(" + norm + ").getCollationKey() has " 1422 + numLevelSeparators + " level separators for " 1423 + numLevels + " levels"); 1424 return false; 1425 } 1426 1427 // No nextSortKeyPart support in ICU4J 1428 1429 return true; 1430 } 1431 1432 /** 1433 * Changes the key to the merged segments of the U+FFFE-separated substrings of s. 1434 * Leaves key unchanged if s does not contain U+FFFE. 1435 * @return true if the key was successfully changed 1436 */ 1437 private boolean getMergedCollationKey(String s, Output<CollationKey> key) { 1438 CollationKey mergedKey = null; 1439 int sLength = s.length(); 1440 int segmentStart = 0; 1441 for (int i = 0;;) { 1442 if (i == sLength) { 1443 if (segmentStart == 0) { 1444 // s does not contain any U+FFFE. 1445 return false; 1446 } 1447 } else if (s.charAt(i) != '\uFFFE') { 1448 ++i; 1449 continue; 1450 } 1451 // Get the sort key for another segment and merge it into mergedKey. 1452 CollationKey tmpKey = coll.getCollationKey(s.substring(segmentStart, i)); 1453 if (mergedKey == null) { 1454 mergedKey = tmpKey; 1455 } else { 1456 mergedKey = mergedKey.merge(tmpKey); 1457 } 1458 if (i == sLength) { 1459 break; 1460 } 1461 segmentStart = ++i; 1462 } 1463 key.value = mergedKey; 1464 return true; 1465 } 1466 1467 private static int getDifferenceLevel(CollationKey prevKey, CollationKey key, 1468 int order, boolean collHasCaseLevel) { 1469 if (order == Collation.EQUAL) { 1470 return Collation.NO_LEVEL; 1471 } 1472 byte[] prevBytes = prevKey.toByteArray(); 1473 byte[] bytes = key.toByteArray(); 1474 int level = Collation.PRIMARY_LEVEL; 1475 for (int i = 0;; ++i) { 1476 byte b = prevBytes[i]; 1477 if (b != bytes[i]) { 1478 break; 1479 } 1480 if ((int)b == Collation.LEVEL_SEPARATOR_BYTE) { 1481 ++level; 1482 if (level == Collation.CASE_LEVEL && !collHasCaseLevel) { 1483 ++level; 1484 } 1485 } 1486 } 1487 return level; 1488 } 1489 1490 private boolean checkCompareTwo(String norm, String prevFileLine, String prevString, String s, 1491 int expectedOrder, int expectedLevel) { 1492 // Get the sort keys first, for error debug output. 1493 Output<CollationKey> prevKeyOut = new Output<CollationKey>(); 1494 CollationKey prevKey; 1495 if (!getCollationKey(norm, fileLine, prevString, prevKeyOut)) { 1496 return false; 1497 } 1498 prevKey = prevKeyOut.value; 1499 1500 Output<CollationKey> keyOut = new Output<CollationKey>(); 1501 CollationKey key; 1502 if (!getCollationKey(norm, fileLine, s, keyOut)) { 1503 return false; 1504 } 1505 key = keyOut.value; 1506 1507 int order = coll.compare(prevString, s); 1508 if (order != expectedOrder) { 1509 logln(fileTestName); 1510 logln(prevFileLine); 1511 logln(fileLine); 1512 logln(printCollationKey(prevKey)); 1513 logln(printCollationKey(key)); 1514 errln("line " + fileLineNumber 1515 + " Collator(" + norm + ").compare(previous, current) wrong order: " 1516 + order + " != " + expectedOrder); 1517 return false; 1518 } 1519 order = coll.compare(s, prevString); 1520 if (order != -expectedOrder) { 1521 logln(fileTestName); 1522 logln(prevFileLine); 1523 logln(fileLine); 1524 logln(printCollationKey(prevKey)); 1525 logln(printCollationKey(key)); 1526 errln("line " + fileLineNumber 1527 + " Collator(" + norm + ").compare(current, previous) wrong order: " 1528 + order + " != " + -expectedOrder); 1529 return false; 1530 } 1531 1532 order = prevKey.compareTo(key); 1533 if (order != expectedOrder) { 1534 logln(fileTestName); 1535 logln(prevFileLine); 1536 logln(fileLine); 1537 logln(printCollationKey(prevKey)); 1538 logln(printCollationKey(key)); 1539 errln("line " + fileLineNumber 1540 + " Collator(" + norm + ").getCollationKey(previous, current).compareTo() wrong order: " 1541 + order + " != " + expectedOrder); 1542 return false; 1543 } 1544 boolean collHasCaseLevel = ((RuleBasedCollator)coll).isCaseLevel(); 1545 int level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); 1546 if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) { 1547 if (level != expectedLevel) { 1548 logln(fileTestName); 1549 logln(prevFileLine); 1550 logln(fileLine); 1551 logln(printCollationKey(prevKey)); 1552 logln(printCollationKey(key)); 1553 errln("line " + fileLineNumber 1554 + " Collator(" + norm + ").getCollationKey(previous, current).compareTo()=" 1555 + order + " wrong level: " + level + " != " + expectedLevel); 1556 return false; 1557 } 1558 } 1559 1560 // If either string contains U+FFFE, then their sort keys must compare the same as 1561 // the merged sort keys of each string's between-FFFE segments. 1562 // 1563 // It is not required that 1564 // sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2)) 1565 // only that those two methods yield the same order. 1566 // 1567 // Use bit-wise OR so that getMergedCollationKey() is always called for both strings. 1568 Output<CollationKey> outPrevKey = new Output<CollationKey>(prevKey); 1569 Output<CollationKey> outKey = new Output<CollationKey>(key); 1570 if (getMergedCollationKey(prevString, outPrevKey) | getMergedCollationKey(s, outKey)) { 1571 prevKey = outPrevKey.value; 1572 key = outKey.value; 1573 order = prevKey.compareTo(key); 1574 if (order != expectedOrder) { 1575 logln(fileTestName); 1576 errln("line " + fileLineNumber 1577 + " Collator(" + norm + ").getCollationKey" 1578 + "(previous, current segments between U+FFFE)).merge().compareTo() wrong order: " 1579 + order + " != " + expectedOrder); 1580 logln(prevFileLine); 1581 logln(fileLine); 1582 logln(printCollationKey(prevKey)); 1583 logln(printCollationKey(key)); 1584 return false; 1585 } 1586 int mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel); 1587 if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) { 1588 if(mergedLevel != level) { 1589 logln(fileTestName); 1590 errln("line " + fileLineNumber 1591 + " Collator(" + norm + ").getCollationKey" 1592 + "(previous, current segments between U+FFFE)).merge().compareTo()=" 1593 + order + " wrong level: " + mergedLevel + " != " + level); 1594 logln(prevFileLine); 1595 logln(fileLine); 1596 logln(printCollationKey(prevKey)); 1597 logln(printCollationKey(key)); 1598 return false; 1599 } 1600 } 1601 } 1602 return true; 1603 } 1604 1605 private void checkCompareStrings(BufferedReader in) throws IOException { 1606 String prevFileLine = "(none)"; 1607 String prevString = ""; 1608 Output<String> sOut = new Output<String>(); 1609 while (readNonEmptyLine(in) && !isSectionStarter(fileLine.charAt(0))) { 1610 // Parse the line even if it will be ignored (when we do not have a Collator) 1611 // in order to report syntax issues. 1612 int relation; 1613 try { 1614 relation = parseRelationAndString(sOut); 1615 } catch (ParseException pe) { 1616 errln(pe.toString()); 1617 break; 1618 } 1619 if(coll == null) { 1620 // We were unable to create the Collator but continue with tests. 1621 // Ignore test data for this Collator. 1622 // The next Collator creation might work. 1623 continue; 1624 } 1625 String s = sOut.value; 1626 int expectedOrder = (relation == Collation.ZERO_LEVEL) ? Collation.EQUAL : Collation.LESS; 1627 int expectedLevel = relation; 1628 boolean isOk = true; 1629 if (!needsNormalization(prevString) && !needsNormalization(s)) { 1630 coll.setDecomposition(Collator.NO_DECOMPOSITION); 1631 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s, 1632 expectedOrder, expectedLevel); 1633 } 1634 if (isOk) { 1635 coll.setDecomposition(Collator.CANONICAL_DECOMPOSITION); 1636 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s, 1637 expectedOrder, expectedLevel); 1638 } 1639 if (isOk && (!nfd.isNormalized(prevString) || !nfd.isNormalized(s))) { 1640 String pn = nfd.normalize(prevString); 1641 String n = nfd.normalize(s); 1642 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n, 1643 expectedOrder, expectedLevel); 1644 } 1645 prevFileLine = fileLine; 1646 prevString = s; 1647 } 1648 } 1649 1650 @Test 1651 public void TestDataDriven() { 1652 nfd = Normalizer2.getNFDInstance(); 1653 fcd = Norm2AllModes.getFCDNormalizer2(); 1654 1655 BufferedReader in = null; 1656 1657 try { 1658 in = TestUtil.getDataReader("collationtest.txt", "UTF-8"); 1659 1660 // Read a new line if necessary. 1661 // Sub-parsers leave the first line set that they do not handle. 1662 while (fileLine != null || readNonEmptyLine(in)) { 1663 if (!isSectionStarter(fileLine.charAt(0))) { 1664 logln(fileLine); 1665 errln("syntax error on line " + fileLineNumber); 1666 return; 1667 } 1668 if (fileLine.startsWith("** test: ")) { 1669 fileTestName = fileLine; 1670 logln(fileLine); 1671 fileLine = null; 1672 } else if (fileLine.equals("@ root")) { 1673 setRootCollator(); 1674 fileLine = null; 1675 } else if (fileLine.startsWith("@ locale ")) { 1676 setLocaleCollator(); 1677 fileLine = null; 1678 } else if (fileLine.equals("@ rules")) { 1679 buildTailoring(in); 1680 } else if (fileLine.charAt(0) == '%' 1681 && fileLine.length() > 1 && isSpace(fileLine.charAt(1))) { 1682 parseAndSetAttribute(); 1683 } else if (fileLine.equals("* compare")) { 1684 checkCompareStrings(in); 1685 } else { 1686 logln(fileLine); 1687 errln("syntax error on line " + fileLineNumber); 1688 return; 1689 } 1690 } 1691 } catch (ParseException pe) { 1692 errln(pe.toString()); 1693 } catch (IOException e) { 1694 errln(e.getMessage()); 1695 } finally { 1696 try { 1697 if (in != null) { 1698 in.close(); 1699 } 1700 } catch (IOException e) { 1701 e.printStackTrace(); 1702 } 1703 } 1704 } 1705} 1706