1/* 2******************************************************************************* 3* Copyright (C) 2012-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* collationtest.cpp 7* 8* created on: 2012apr27 9* created by: Markus W. Scherer 10*/ 11 12#include "unicode/utypes.h" 13 14#if !UCONFIG_NO_COLLATION 15 16#include "unicode/coll.h" 17#include "unicode/errorcode.h" 18#include "unicode/localpointer.h" 19#include "unicode/normalizer2.h" 20#include "unicode/sortkey.h" 21#include "unicode/std_string.h" 22#include "unicode/strenum.h" 23#include "unicode/tblcoll.h" 24#include "unicode/uiter.h" 25#include "unicode/uniset.h" 26#include "unicode/unistr.h" 27#include "unicode/usetiter.h" 28#include "unicode/ustring.h" 29#include "charstr.h" 30#include "cmemory.h" 31#include "collation.h" 32#include "collationdata.h" 33#include "collationfcd.h" 34#include "collationiterator.h" 35#include "collationroot.h" 36#include "collationrootelements.h" 37#include "collationruleparser.h" 38#include "collationweights.h" 39#include "cstring.h" 40#include "intltest.h" 41#include "normalizer2impl.h" 42#include "ucbuf.h" 43#include "uhash.h" 44#include "uitercollationiterator.h" 45#include "utf16collationiterator.h" 46#include "utf8collationiterator.h" 47#include "uvectr32.h" 48#include "uvectr64.h" 49#include "writesrc.h" 50 51#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 52 53// TODO: Move to ucbuf.h 54U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close); 55 56class CodePointIterator; 57 58// TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey) 59 60class CollationTest : public IntlTest { 61public: 62 CollationTest() 63 : fcd(NULL), nfd(NULL), 64 fileLineNumber(0), 65 coll(NULL) {} 66 67 ~CollationTest() { 68 delete coll; 69 } 70 71 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL); 72 73 void TestMinMax(); 74 void TestImplicits(); 75 void TestNulTerminated(); 76 void TestIllegalUTF8(); 77 void TestShortFCDData(); 78 void TestFCD(); 79 void TestCollationWeights(); 80 void TestRootElements(); 81 void TestTailoredElements(); 82 void TestDataDriven(); 83 84private: 85 void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi); 86 void checkAllocWeights(CollationWeights &cw, 87 uint32_t lowerLimit, uint32_t upperLimit, int32_t n, 88 int32_t someLength, int32_t minCount); 89 90 static UnicodeString printSortKey(const uint8_t *p, int32_t length); 91 static UnicodeString printCollationKey(const CollationKey &key); 92 93 // Helpers & fields for data-driven test. 94 static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; } 95 static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; } 96 static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; } // %*@ 97 int32_t skipSpaces(int32_t i) { 98 while(isSpace(fileLine[i])) { ++i; } 99 return i; 100 } 101 102 UBool readLine(UCHARBUF *f, IcuTestErrorCode &errorCode); 103 void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode); 104 Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode); 105 void parseAndSetAttribute(IcuTestErrorCode &errorCode); 106 void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode); 107 void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode); 108 void setRootCollator(IcuTestErrorCode &errorCode); 109 void setLocaleCollator(IcuTestErrorCode &errorCode); 110 111 UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const; 112 113 UBool getSortKeyParts(const UChar *s, int32_t length, 114 CharString &dest, int32_t partSize, 115 IcuTestErrorCode &errorCode); 116 UBool getCollationKey(const char *norm, const UnicodeString &line, 117 const UChar *s, int32_t length, 118 CollationKey &key, IcuTestErrorCode &errorCode); 119 UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine, 120 const UnicodeString &prevString, const UnicodeString &s, 121 UCollationResult expectedOrder, Collation::Level expectedLevel, 122 IcuTestErrorCode &errorCode); 123 void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode); 124 125 const Normalizer2 *fcd, *nfd; 126 UnicodeString fileLine; 127 int32_t fileLineNumber; 128 UnicodeString fileTestName; 129 Collator *coll; 130}; 131 132extern IntlTest *createCollationTest() { 133 return new CollationTest(); 134} 135 136void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) { 137 if(exec) { 138 logln("TestSuite CollationTest: "); 139 } 140 TESTCASE_AUTO_BEGIN; 141 TESTCASE_AUTO(TestMinMax); 142 TESTCASE_AUTO(TestImplicits); 143 TESTCASE_AUTO(TestNulTerminated); 144 TESTCASE_AUTO(TestIllegalUTF8); 145 TESTCASE_AUTO(TestShortFCDData); 146 TESTCASE_AUTO(TestFCD); 147 TESTCASE_AUTO(TestCollationWeights); 148 TESTCASE_AUTO(TestRootElements); 149 TESTCASE_AUTO(TestTailoredElements); 150 TESTCASE_AUTO(TestDataDriven); 151 TESTCASE_AUTO_END; 152} 153 154void CollationTest::TestMinMax() { 155 IcuTestErrorCode errorCode(*this, "TestMinMax"); 156 157 setRootCollator(errorCode); 158 if(errorCode.isFailure()) { 159 errorCode.reset(); 160 return; 161 } 162 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll); 163 if(rbc == NULL) { 164 errln("the root collator is not a RuleBasedCollator"); 165 return; 166 } 167 168 static const UChar s[2] = { 0xfffe, 0xffff }; 169 UVector64 ces(errorCode); 170 rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode); 171 errorCode.assertSuccess(); 172 if(ces.size() != 2) { 173 errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size()); 174 return; 175 } 176 int64_t ce = ces.elementAti(0); 177 int64_t expected = 178 ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) | 179 Collation::MERGE_SEPARATOR_LOWER32; 180 if(ce != expected) { 181 errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce); 182 } 183 184 ce = ces.elementAti(1); 185 expected = Collation::makeCE(Collation::MAX_PRIMARY); 186 if(ce != expected) { 187 errln("CE(U+ffff)=%04lx != max..", (long)ce); 188 } 189} 190 191void CollationTest::TestImplicits() { 192 IcuTestErrorCode errorCode(*this, "TestImplicits"); 193 194 const CollationData *cd = CollationRoot::getData(errorCode); 195 if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) { 196 return; 197 } 198 199 // Implicit primary weights should be assigned for the following sets, 200 // and sort in ascending order by set and then code point. 201 // See http://www.unicode.org/reports/tr10/#Implicit_Weights 202 // core Han Unified Ideographs 203 UnicodeSet coreHan("[\\p{unified_ideograph}&" 204 "[\\p{Block=CJK_Unified_Ideographs}" 205 "\\p{Block=CJK_Compatibility_Ideographs}]]", 206 errorCode); 207 // all other Unified Han ideographs 208 UnicodeSet otherHan("[\\p{unified ideograph}-" 209 "[\\p{Block=CJK_Unified_Ideographs}" 210 "\\p{Block=CJK_Compatibility_Ideographs}]]", 211 errorCode); 212 UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode); 213 unassigned.remove(0xfffe, 0xffff); // These have special CLDR root mappings. 214 if(errorCode.logIfFailureAndReset("UnicodeSet")) { 215 return; 216 } 217 const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned }; 218 UChar32 prev = 0; 219 uint32_t prevPrimary = 0; 220 UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL); 221 for(int32_t i = 0; i < LENGTHOF(sets); ++i) { 222 LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i])); 223 while(iter->next()) { 224 UChar32 c = iter->getCodepoint(); 225 UnicodeString s(c); 226 ci.setText(s.getBuffer(), s.getBuffer() + s.length()); 227 int64_t ce = ci.nextCE(errorCode); 228 int64_t ce2 = ci.nextCE(errorCode); 229 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) { 230 return; 231 } 232 if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) { 233 errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c); 234 continue; 235 } 236 if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) { 237 errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx", 238 (long)c, (long)(ce & 0xffffffff)); 239 continue; 240 } 241 uint32_t primary = (uint32_t)(ce >> 32); 242 if(!(primary > prevPrimary)) { 243 errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..", 244 (long)c, (long)primary, (long)prev, (long)prevPrimary); 245 } 246 prev = c; 247 prevPrimary = primary; 248 } 249 } 250} 251 252void CollationTest::TestNulTerminated() { 253 IcuTestErrorCode errorCode(*this, "TestNulTerminated"); 254 const CollationData *data = CollationRoot::getData(errorCode); 255 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { 256 return; 257 } 258 259 static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 }; 260 261 UTF16CollationIterator ci1(data, FALSE, s, s, s + 2); 262 UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL); 263 for(int32_t i = 0;; ++i) { 264 int64_t ce1 = ci1.nextCE(errorCode); 265 int64_t ce2 = ci2.nextCE(errorCode); 266 if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) { 267 return; 268 } 269 if(ce1 != ce2) { 270 errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i); 271 break; 272 } 273 if(ce1 == Collation::NO_CE) { break; } 274 } 275} 276 277void CollationTest::TestIllegalUTF8() { 278 IcuTestErrorCode errorCode(*this, "TestIllegalUTF8"); 279 280 setRootCollator(errorCode); 281 if(errorCode.isFailure()) { 282 errorCode.reset(); 283 return; 284 } 285 coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode); 286 287 static const char *strings[] = { 288 // U+FFFD 289 "a\xef\xbf\xbdz", 290 // illegal byte sequences 291 "a\x80z", // trail byte 292 "a\xc1\x81z", // non-shortest form 293 "a\xe0\x82\x83z", // non-shortest form 294 "a\xed\xa0\x80z", // lead surrogate: would be U+D800 295 "a\xed\xbf\xbfz", // trail surrogate: would be U+DFFF 296 "a\xf0\x8f\xbf\xbfz", // non-shortest form 297 "a\xf4\x90\x80\x80z" // out of range: would be U+110000 298 }; 299 300 StringPiece fffd(strings[0]); 301 for(int32_t i = 1; i < LENGTHOF(strings); ++i) { 302 StringPiece illegal(strings[i]); 303 UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode); 304 if(order != UCOL_EQUAL) { 305 errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL", 306 (int)i, order); 307 } 308 } 309} 310 311namespace { 312 313void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) { 314 for(UChar32 c = 0x10000; c < 0x110000;) { 315 UChar32 next = c + 0x400; 316 if(src.containsSome(c, next - 1)) { 317 dest.add(U16_LEAD(c)); 318 } 319 c = next; 320 } 321} 322 323} // namespace 324 325void CollationTest::TestShortFCDData() { 326 // See CollationFCD class comments. 327 IcuTestErrorCode errorCode(*this, "TestShortFCDData"); 328 UnicodeSet expectedLccc("[:^lccc=0:]", errorCode); 329 errorCode.assertSuccess(); 330 expectedLccc.add(0xdc00, 0xdfff); // add all trail surrogates 331 addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc); 332 UnicodeSet lccc; // actual 333 for(UChar32 c = 0; c <= 0xffff; ++c) { 334 if(CollationFCD::hasLccc(c)) { lccc.add(c); } 335 } 336 UnicodeSet diff(expectedLccc); 337 diff.removeAll(lccc); 338 diff.remove(0x10000, 0x10ffff); // hasLccc() only works for the BMP 339 UnicodeString empty("[]"); 340 UnicodeString diffString; 341 diff.toPattern(diffString, TRUE); 342 assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString); 343 diff = lccc; 344 diff.removeAll(expectedLccc); 345 diff.toPattern(diffString, TRUE); 346 assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE); 347 348 UnicodeSet expectedTccc("[:^tccc=0:]", errorCode); 349 if (errorCode.isSuccess()) { 350 addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc); 351 addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc); 352 UnicodeSet tccc; // actual 353 for(UChar32 c = 0; c <= 0xffff; ++c) { 354 if(CollationFCD::hasTccc(c)) { tccc.add(c); } 355 } 356 diff = expectedTccc; 357 diff.removeAll(tccc); 358 diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP 359 assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString); 360 diff = tccc; 361 diff.removeAll(expectedTccc); 362 diff.toPattern(diffString, TRUE); 363 assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString); 364 } 365} 366 367class CodePointIterator { 368public: 369 CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {} 370 void resetToStart() { pos = 0; } 371 UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; } 372 UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; } 373 int32_t getLength() const { return length; } 374 int getIndex() const { return (int)pos; } 375private: 376 const UChar32 *cp; 377 int32_t length; 378 int32_t pos; 379}; 380 381void CollationTest::checkFCD(const char *name, 382 CollationIterator &ci, CodePointIterator &cpi) { 383 IcuTestErrorCode errorCode(*this, "checkFCD"); 384 385 // Iterate forward to the limit. 386 for(;;) { 387 UChar32 c1 = ci.nextCodePoint(errorCode); 388 UChar32 c2 = cpi.next(); 389 if(c1 != c2) { 390 errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d", 391 name, (long)c1, (long)c2, cpi.getIndex()); 392 return; 393 } 394 if(c1 < 0) { break; } 395 } 396 397 // Iterate backward most of the way. 398 for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) { 399 UChar32 c1 = ci.previousCodePoint(errorCode); 400 UChar32 c2 = cpi.previous(); 401 if(c1 != c2) { 402 errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d", 403 name, (long)c1, (long)c2, cpi.getIndex()); 404 return; 405 } 406 } 407 408 // Forward again. 409 for(;;) { 410 UChar32 c1 = ci.nextCodePoint(errorCode); 411 UChar32 c2 = cpi.next(); 412 if(c1 != c2) { 413 errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d", 414 name, (long)c1, (long)c2, cpi.getIndex()); 415 return; 416 } 417 if(c1 < 0) { break; } 418 } 419 420 // Iterate backward to the start. 421 for(;;) { 422 UChar32 c1 = ci.previousCodePoint(errorCode); 423 UChar32 c2 = cpi.previous(); 424 if(c1 != c2) { 425 errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d", 426 name, (long)c1, (long)c2, cpi.getIndex()); 427 return; 428 } 429 if(c1 < 0) { break; } 430 } 431} 432 433void CollationTest::TestFCD() { 434 IcuTestErrorCode errorCode(*this, "TestFCD"); 435 const CollationData *data = CollationRoot::getData(errorCode); 436 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { 437 return; 438 } 439 440 // Input string, not FCD, NUL-terminated. 441 static const UChar s[] = { 442 0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62, 443 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216 444 0x327, 0x308, // ccc=202, 230 445 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226 446 U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F), 447 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), 448 0xac01, 449 0xe7, // Character with tccc!=0 decomposed together with mis-ordered sequence. 450 U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165), 451 0xe1, // Character with tccc!=0 decomposed together with decomposed sequence. 452 0xf73, 0xf75, // Tibetan composite vowels must be decomposed. 453 0x4e00, 0xf81, 454 0 455 }; 456 // Expected code points. 457 static const UChar32 cp[] = { 458 0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62, 459 0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308, 460 0x1D15F, 0x1D16D, 461 0xac01, 462 0x63, 0x327, 0x1D165, 0x1D16D, 463 0x61, 464 0xf71, 0xf71, 0xf72, 0xf74, 0x301, 465 0x4e00, 0xf71, 0xf80 466 }; 467 468 FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL); 469 if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) { 470 return; 471 } 472 CodePointIterator cpi(cp, LENGTHOF(cp)); 473 checkFCD("FCDUTF16CollationIterator", u16ci, cpi); 474 475#if U_HAVE_STD_STRING 476 cpi.resetToStart(); 477 std::string utf8; 478 UnicodeString(s).toUTF8String(utf8); 479 FCDUTF8CollationIterator u8ci(data, FALSE, 480 reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1); 481 if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) { 482 return; 483 } 484 checkFCD("FCDUTF8CollationIterator", u8ci, cpi); 485#endif 486 487 cpi.resetToStart(); 488 UCharIterator iter; 489 uiter_setString(&iter, s, LENGTHOF(s) - 1); // -1: without the terminating NUL 490 FCDUIterCollationIterator uici(data, FALSE, iter, 0); 491 if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) { 492 return; 493 } 494 checkFCD("FCDUIterCollationIterator", uici, cpi); 495} 496 497void CollationTest::checkAllocWeights(CollationWeights &cw, 498 uint32_t lowerLimit, uint32_t upperLimit, int32_t n, 499 int32_t someLength, int32_t minCount) { 500 if(!cw.allocWeights(lowerLimit, upperLimit, n)) { 501 errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE", 502 (long)lowerLimit, (long)upperLimit, (long)n); 503 return; 504 } 505 uint32_t previous = lowerLimit; 506 int32_t count = 0; // number of weights that have someLength 507 for(int32_t i = 0; i < n; ++i) { 508 uint32_t w = cw.nextWeight(); 509 if(w == 0xffffffff) { 510 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " 511 "returns only %ld weights", 512 (long)lowerLimit, (long)upperLimit, (long)n, (long)i); 513 return; 514 } 515 if(!(previous < w && w < upperLimit)) { 516 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " 517 "number %ld -> %lx not between %lx and %lx", 518 (long)lowerLimit, (long)upperLimit, (long)n, 519 (long)(i + 1), (long)w, (long)previous, (long)upperLimit); 520 return; 521 } 522 if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; } 523 } 524 if(count < minCount) { 525 errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() " 526 "returns only %ld < %ld weights of length %d", 527 (long)lowerLimit, (long)upperLimit, (long)n, 528 (long)count, (long)minCount, (int)someLength); 529 } 530} 531 532void CollationTest::TestCollationWeights() { 533 CollationWeights cw; 534 535 // Non-compressible primaries use 254 second bytes 02..FF. 536 logln("CollationWeights.initForPrimary(non-compressible)"); 537 cw.initForPrimary(FALSE); 538 // Expect 1 weight 11 and 254 weights 12xx. 539 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1); 540 checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254); 541 // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202. 542 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255); 543 // Expect 254 two-byte weights from the ranges 10ff and 11xx. 544 checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254); 545 // Expect 254^2=64516 three-byte weights. 546 // During computation, there should be 3 three-byte ranges 547 // 10ffff, 11xxxx, 120202. 548 // The middle one should be split 64515:1, 549 // and the newly-split-off range and the last ranged lengthened. 550 checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516); 551 // Expect weights 1102 & 1103. 552 checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2); 553 // Expect weights 102102 & 102103. 554 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2); 555 556 // Compressible primaries use 251 second bytes 04..FE. 557 logln("CollationWeights.initForPrimary(compressible)"); 558 cw.initForPrimary(TRUE); 559 // Expect 1 weight 11 and 251 weights 12xx. 560 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1); 561 checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251); 562 // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204. 563 checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252); 564 // Expect weights 1104 & 1105. 565 checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2); 566 // Expect weights 102102 & 102103. 567 checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2); 568 569 // Secondary and tertiary weights use only bytes 3 & 4. 570 logln("CollationWeights.initForSecondary()"); 571 cw.initForSecondary(); 572 // Expect weights fbxx and all four fc..ff. 573 checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4); 574 575 logln("CollationWeights.initForTertiary()"); 576 cw.initForTertiary(); 577 // Expect weights 3dxx and both 3e & 3f. 578 checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2); 579} 580 581namespace { 582 583UBool isValidCE(const CollationRootElements &re, const CollationData &data, 584 uint32_t p, uint32_t s, uint32_t ctq) { 585 uint32_t p1 = p >> 24; 586 uint32_t p2 = (p >> 16) & 0xff; 587 uint32_t p3 = (p >> 8) & 0xff; 588 uint32_t p4 = p & 0xff; 589 uint32_t s1 = s >> 8; 590 uint32_t s2 = s & 0xff; 591 // ctq = Case, Tertiary, Quaternary 592 uint32_t c = (ctq & Collation::CASE_MASK) >> 14; 593 uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK; 594 uint32_t t1 = t >> 8; 595 uint32_t t2 = t & 0xff; 596 uint32_t q = ctq & Collation::QUATERNARY_MASK; 597 // No leading zero bytes. 598 if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) { 599 return FALSE; 600 } 601 // No intermediate zero bytes. 602 if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) { 603 return FALSE; 604 } 605 if(p2 != 0 && p3 == 0 && p4 != 0) { 606 return FALSE; 607 } 608 // Minimum & maximum lead bytes. 609 if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) || 610 (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) || 611 (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) { 612 return FALSE; 613 } 614 if(t1 != 0 && t1 > 0x3f) { 615 return FALSE; 616 } 617 if(c > 2) { 618 return FALSE; 619 } 620 // The valid byte range for the second primary byte depends on compressibility. 621 if(p2 != 0) { 622 if(data.isCompressibleLeadByte(p1)) { 623 if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE || 624 Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) { 625 return FALSE; 626 } 627 } else { 628 if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) { 629 return FALSE; 630 } 631 } 632 } 633 // Other bytes just need to avoid the level separator. 634 // Trailing zeros are ok. 635 U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1); 636 if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE || 637 s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) { 638 return FALSE; 639 } 640 // Well-formed CEs. 641 if(p == 0) { 642 if(s == 0) { 643 if(t == 0) { 644 // Completely ignorable CE. 645 // Quaternary CEs are not supported. 646 if(c != 0 || q != 0) { 647 return FALSE; 648 } 649 } else { 650 // Tertiary CE. 651 if(t < re.getTertiaryBoundary() || c != 2) { 652 return FALSE; 653 } 654 } 655 } else { 656 // Secondary CE. 657 if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) { 658 return FALSE; 659 } 660 } 661 } else { 662 // Primary CE. 663 if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) || 664 s >= re.getSecondaryBoundary()) { 665 return FALSE; 666 } 667 if(t == 0 || t >= re.getTertiaryBoundary()) { 668 return FALSE; 669 } 670 } 671 return TRUE; 672} 673 674UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) { 675 uint32_t p = (uint32_t)(ce >> 32); 676 uint32_t secTer = (uint32_t)ce; 677 return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff); 678} 679 680class RootElementsIterator { 681public: 682 RootElementsIterator(const CollationData &root) 683 : data(root), 684 elements(root.rootElements), length(root.rootElementsLength), 685 pri(0), secTer(0), 686 index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {} 687 688 UBool next() { 689 if(index >= length) { return FALSE; } 690 uint32_t p = elements[index]; 691 if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; } 692 if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) { 693 ++index; 694 secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG; 695 return TRUE; 696 } 697 if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) { 698 // End of a range, enumerate the primaries in the range. 699 int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK; 700 p &= 0xffffff00; 701 if(pri == p) { 702 // Finished the range, return the next CE after it. 703 ++index; 704 return next(); 705 } 706 U_ASSERT(pri < p); 707 // Return the next primary in this range. 708 UBool isCompressible = data.isCompressiblePrimary(pri); 709 if((pri & 0xffff) == 0) { 710 pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step); 711 } else { 712 pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step); 713 } 714 return TRUE; 715 } 716 // Simple primary CE. 717 ++index; 718 pri = p; 719 secTer = Collation::COMMON_SEC_AND_TER_CE; 720 return TRUE; 721 } 722 723 uint32_t getPrimary() const { return pri; } 724 uint32_t getSecTer() const { return secTer; } 725 726private: 727 const CollationData &data; 728 const uint32_t *elements; 729 int32_t length; 730 731 uint32_t pri; 732 uint32_t secTer; 733 int32_t index; 734}; 735 736} // namespace 737 738void CollationTest::TestRootElements() { 739 IcuTestErrorCode errorCode(*this, "TestRootElements"); 740 const CollationData *root = CollationRoot::getData(errorCode); 741 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { 742 return; 743 } 744 CollationRootElements rootElements(root->rootElements, root->rootElementsLength); 745 RootElementsIterator iter(*root); 746 747 // We check each root CE for validity, 748 // and we also verify that there is a tailoring gap between each two CEs. 749 CollationWeights cw1c; // compressible primary weights 750 CollationWeights cw1u; // uncompressible primary weights 751 CollationWeights cw2; 752 CollationWeights cw3; 753 754 cw1c.initForPrimary(TRUE); 755 cw1u.initForPrimary(FALSE); 756 cw2.initForSecondary(); 757 cw3.initForTertiary(); 758 759 // Note: The root elements do not include Han-implicit or unassigned-implicit CEs, 760 // nor the special merge-separator CE for U+FFFE. 761 uint32_t prevPri = 0; 762 uint32_t prevSec = 0; 763 uint32_t prevTer = 0; 764 while(iter.next()) { 765 uint32_t pri = iter.getPrimary(); 766 uint32_t secTer = iter.getSecTer(); 767 // CollationRootElements CEs must have 0 case and quaternary bits. 768 if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) { 769 errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx", 770 (long)pri, (long)secTer); 771 } 772 uint32_t sec = secTer >> 16; 773 uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK; 774 uint32_t ctq = ter; 775 if(pri == 0 && sec == 0 && ter != 0) { 776 // Tertiary CEs must have uppercase bits, 777 // but they are not stored in the CollationRootElements. 778 ctq |= 0x8000; 779 } 780 if(!isValidCE(rootElements, *root, pri, sec, ctq)) { 781 errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer); 782 } else { 783 if(pri != prevPri) { 784 uint32_t newWeight = 0; 785 if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) { 786 // There is currently no tailoring gap after primary ignorables, 787 // and we forbid tailoring after U+FFFD and U+FFFF. 788 } else if(root->isCompressiblePrimary(prevPri)) { 789 if(!cw1c.allocWeights(prevPri, pri, 1)) { 790 errln("no primary/compressible tailoring gap between %08lx and %08lx", 791 (long)prevPri, (long)pri); 792 } else { 793 newWeight = cw1c.nextWeight(); 794 } 795 } else { 796 if(!cw1u.allocWeights(prevPri, pri, 1)) { 797 errln("no primary/uncompressible tailoring gap between %08lx and %08lx", 798 (long)prevPri, (long)pri); 799 } else { 800 newWeight = cw1u.nextWeight(); 801 } 802 } 803 if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) { 804 errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx", 805 (long)prevPri, (long)newWeight, (long)pri); 806 } 807 } else if(sec != prevSec) { 808 uint32_t lowerLimit = 809 prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec; 810 if(!cw2.allocWeights(lowerLimit, sec, 1)) { 811 errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec); 812 } else { 813 uint32_t newWeight = cw2.nextWeight(); 814 if(!(prevSec < newWeight && newWeight < sec)) { 815 errln("mis-allocated secondary weight, should get %04x < %04x < %04x", 816 (long)lowerLimit, (long)newWeight, (long)sec); 817 } 818 } 819 } else if(ter != prevTer) { 820 uint32_t lowerLimit = 821 prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer; 822 if(!cw3.allocWeights(lowerLimit, ter, 1)) { 823 errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter); 824 } else { 825 uint32_t newWeight = cw3.nextWeight(); 826 if(!(prevTer < newWeight && newWeight < ter)) { 827 errln("mis-allocated secondary weight, should get %04x < %04x < %04x", 828 (long)lowerLimit, (long)newWeight, (long)ter); 829 } 830 } 831 } else { 832 errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer); 833 } 834 } 835 prevPri = pri; 836 prevSec = sec; 837 prevTer = ter; 838 } 839} 840 841void CollationTest::TestTailoredElements() { 842 IcuTestErrorCode errorCode(*this, "TestTailoredElements"); 843 const CollationData *root = CollationRoot::getData(errorCode); 844 if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) { 845 return; 846 } 847 CollationRootElements rootElements(root->rootElements, root->rootElementsLength); 848 849 UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode); 850 if(errorCode.logIfFailureAndReset("failed to create a hash table")) { 851 return; 852 } 853 uhash_setKeyDeleter(prevLocales, uprv_free); 854 // TestRootElements() tests the root collator which does not have tailorings. 855 uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode); 856 uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode); 857 uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode); 858 859 UVector64 ces(errorCode); 860 LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales()); 861 U_ASSERT(locales.isValid()); 862 const char *localeID = "root"; 863 do { 864 Locale locale(localeID); 865 LocalPointer<StringEnumeration> types( 866 Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode)); 867 errorCode.assertSuccess(); 868 const char *type = NULL; // default type 869 do { 870 Locale localeWithType(locale); 871 if(type != NULL) { 872 localeWithType.setKeywordValue("collation", type, errorCode); 873 } 874 errorCode.assertSuccess(); 875 LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode)); 876 if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)", 877 localeWithType.getName())) { 878 continue; 879 } 880 Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode); 881 if(uhash_geti(prevLocales, actual.getName()) != 0) { 882 continue; 883 } 884 uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode); 885 errorCode.assertSuccess(); 886 logln("TestTailoredElements(): requested %s -> actual %s", 887 localeWithType.getName(), actual.getName()); 888 RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias()); 889 if(rbc == NULL) { 890 continue; 891 } 892 // Note: It would be better to get tailored strings such that we can 893 // identify the prefix, and only get the CEs for the prefix+string, 894 // not also for the prefix. 895 // There is currently no API for that. 896 // It would help in an unusual case where a contraction starting in the prefix 897 // extends past its end, and we do not see the intended mapping. 898 // For example, for a mapping p|st, if there is also a contraction ps, 899 // then we get CEs(ps)+CEs(t), rather than CEs(p|st). 900 LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode)); 901 errorCode.assertSuccess(); 902 UnicodeSetIterator iter(*tailored); 903 while(iter.next()) { 904 const UnicodeString &s = iter.getString(); 905 ces.removeAllElements(); 906 rbc->internalGetCEs(s, ces, errorCode); 907 errorCode.assertSuccess(); 908 for(int32_t i = 0; i < ces.size(); ++i) { 909 int64_t ce = ces.elementAti(i); 910 if(!isValidCE(rootElements, *root, ce)) { 911 errln("invalid tailored CE %016llx at CE index %d from string:", 912 (long long)ce, (int)i); 913 infoln(prettify(s)); 914 } 915 } 916 } 917 } while((type = types->next(NULL, errorCode)) != NULL); 918 } while((localeID = locales->next(NULL, errorCode)) != NULL); 919 uhash_close(prevLocales); 920} 921 922UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) { 923 UnicodeString s; 924 for(int32_t i = 0; i < length; ++i) { 925 if(i > 0) { s.append((UChar)0x20); } 926 uint8_t b = p[i]; 927 if(b == 0) { 928 s.append((UChar)0x2e); // period 929 } else if(b == 1) { 930 s.append((UChar)0x7c); // vertical bar 931 } else { 932 appendHex(b, 2, s); 933 } 934 } 935 return s; 936} 937 938UnicodeString CollationTest::printCollationKey(const CollationKey &key) { 939 int32_t length; 940 const uint8_t *p = key.getByteArray(length); 941 return printSortKey(p, length); 942} 943 944UBool CollationTest::readLine(UCHARBUF *f, IcuTestErrorCode &errorCode) { 945 int32_t lineLength; 946 const UChar *line = ucbuf_readline(f, &lineLength, errorCode); 947 if(line == NULL || errorCode.isFailure()) { 948 fileLine.remove(); 949 return FALSE; 950 } 951 ++fileLineNumber; 952 // Strip trailing CR/LF, comments, and spaces. 953 const UChar *comment = u_memchr(line, 0x23, lineLength); // '#' 954 if(comment != NULL) { 955 lineLength = (int32_t)(comment - line); 956 } else { 957 while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; } 958 } 959 while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; } 960 fileLine.setTo(FALSE, line, lineLength); 961 return TRUE; 962} 963 964void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, 965 UErrorCode &errorCode) { 966 int32_t length = fileLine.length(); 967 int32_t i; 968 for(i = start; i < length && !isSpace(fileLine[i]); ++i) {} 969 int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start); // '|' 970 if(pipeIndex >= 0) { 971 prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape(); 972 if(prefix.isEmpty()) { 973 errln("empty prefix on line %d", (int)fileLineNumber); 974 infoln(fileLine); 975 errorCode = U_PARSE_ERROR; 976 return; 977 } 978 start = pipeIndex + 1; 979 } else { 980 prefix.remove(); 981 } 982 s = fileLine.tempSubStringBetween(start, i).unescape(); 983 if(s.isEmpty()) { 984 errln("empty string on line %d", (int)fileLineNumber); 985 infoln(fileLine); 986 errorCode = U_PARSE_ERROR; 987 return; 988 } 989 start = i; 990} 991 992Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) { 993 Collation::Level relation; 994 int32_t start; 995 if(fileLine[0] == 0x3c) { // < 996 UChar second = fileLine[1]; 997 start = 2; 998 switch(second) { 999 case 0x31: // <1 1000 relation = Collation::PRIMARY_LEVEL; 1001 break; 1002 case 0x32: // <2 1003 relation = Collation::SECONDARY_LEVEL; 1004 break; 1005 case 0x33: // <3 1006 relation = Collation::TERTIARY_LEVEL; 1007 break; 1008 case 0x34: // <4 1009 relation = Collation::QUATERNARY_LEVEL; 1010 break; 1011 case 0x63: // <c 1012 relation = Collation::CASE_LEVEL; 1013 break; 1014 case 0x69: // <i 1015 relation = Collation::IDENTICAL_LEVEL; 1016 break; 1017 default: // just < 1018 relation = Collation::NO_LEVEL; 1019 start = 1; 1020 break; 1021 } 1022 } else if(fileLine[0] == 0x3d) { // = 1023 relation = Collation::ZERO_LEVEL; 1024 start = 1; 1025 } else { 1026 start = 0; 1027 } 1028 if(start == 0 || !isSpace(fileLine[start])) { 1029 errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber); 1030 infoln(fileLine); 1031 errorCode.set(U_PARSE_ERROR); 1032 return Collation::NO_LEVEL; 1033 } 1034 start = skipSpaces(start); 1035 UnicodeString prefix; 1036 parseString(start, prefix, s, errorCode); 1037 if(errorCode.isSuccess() && !prefix.isEmpty()) { 1038 errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber); 1039 infoln(fileLine); 1040 errorCode.set(U_PARSE_ERROR); 1041 return Collation::NO_LEVEL; 1042 } 1043 if(start < fileLine.length()) { 1044 errln("unexpected line contents after test string on line %d", (int)fileLineNumber); 1045 infoln(fileLine); 1046 errorCode.set(U_PARSE_ERROR); 1047 return Collation::NO_LEVEL; 1048 } 1049 return relation; 1050} 1051 1052static const struct { 1053 const char *name; 1054 UColAttribute attr; 1055} attributes[] = { 1056 { "backwards", UCOL_FRENCH_COLLATION }, 1057 { "alternate", UCOL_ALTERNATE_HANDLING }, 1058 { "caseFirst", UCOL_CASE_FIRST }, 1059 { "caseLevel", UCOL_CASE_LEVEL }, 1060 // UCOL_NORMALIZATION_MODE is turned on and off automatically. 1061 { "strength", UCOL_STRENGTH }, 1062 // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated. 1063 { "numeric", UCOL_NUMERIC_COLLATION } 1064}; 1065 1066static const struct { 1067 const char *name; 1068 UColAttributeValue value; 1069} attributeValues[] = { 1070 { "default", UCOL_DEFAULT }, 1071 { "primary", UCOL_PRIMARY }, 1072 { "secondary", UCOL_SECONDARY }, 1073 { "tertiary", UCOL_TERTIARY }, 1074 { "quaternary", UCOL_QUATERNARY }, 1075 { "identical", UCOL_IDENTICAL }, 1076 { "off", UCOL_OFF }, 1077 { "on", UCOL_ON }, 1078 { "shifted", UCOL_SHIFTED }, 1079 { "non-ignorable", UCOL_NON_IGNORABLE }, 1080 { "lower", UCOL_LOWER_FIRST }, 1081 { "upper", UCOL_UPPER_FIRST } 1082}; 1083 1084void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) { 1085 int32_t start = skipSpaces(1); 1086 int32_t equalPos = fileLine.indexOf(0x3d); 1087 if(equalPos < 0) { 1088 if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) { 1089 parseAndSetReorderCodes(start + 7, errorCode); 1090 return; 1091 } 1092 errln("missing '=' on line %d", (int)fileLineNumber); 1093 infoln(fileLine); 1094 errorCode.set(U_PARSE_ERROR); 1095 return; 1096 } 1097 1098 UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos); 1099 UnicodeString valueString = fileLine.tempSubString(equalPos+1); 1100 if(attrString == UNICODE_STRING("maxVariable", 11)) { 1101 UColReorderCode max; 1102 if(valueString == UNICODE_STRING("space", 5)) { 1103 max = UCOL_REORDER_CODE_SPACE; 1104 } else if(valueString == UNICODE_STRING("punct", 5)) { 1105 max = UCOL_REORDER_CODE_PUNCTUATION; 1106 } else if(valueString == UNICODE_STRING("symbol", 6)) { 1107 max = UCOL_REORDER_CODE_SYMBOL; 1108 } else if(valueString == UNICODE_STRING("currency", 8)) { 1109 max = UCOL_REORDER_CODE_CURRENCY; 1110 } else { 1111 errln("invalid attribute value name on line %d", (int)fileLineNumber); 1112 infoln(fileLine); 1113 errorCode.set(U_PARSE_ERROR); 1114 return; 1115 } 1116 coll->setMaxVariable(max, errorCode); 1117 if(errorCode.isFailure()) { 1118 errln("setMaxVariable() failed on line %d: %s", 1119 (int)fileLineNumber, errorCode.errorName()); 1120 infoln(fileLine); 1121 return; 1122 } 1123 fileLine.remove(); 1124 return; 1125 } 1126 1127 UColAttribute attr; 1128 for(int32_t i = 0;; ++i) { 1129 if(i == LENGTHOF(attributes)) { 1130 errln("invalid attribute name on line %d", (int)fileLineNumber); 1131 infoln(fileLine); 1132 errorCode.set(U_PARSE_ERROR); 1133 return; 1134 } 1135 if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) { 1136 attr = attributes[i].attr; 1137 break; 1138 } 1139 } 1140 1141 UColAttributeValue value; 1142 for(int32_t i = 0;; ++i) { 1143 if(i == LENGTHOF(attributeValues)) { 1144 errln("invalid attribute value name on line %d", (int)fileLineNumber); 1145 infoln(fileLine); 1146 errorCode.set(U_PARSE_ERROR); 1147 return; 1148 } 1149 if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) { 1150 value = attributeValues[i].value; 1151 break; 1152 } 1153 } 1154 1155 coll->setAttribute(attr, value, errorCode); 1156 if(errorCode.isFailure()) { 1157 errln("illegal attribute=value combination on line %d: %s", 1158 (int)fileLineNumber, errorCode.errorName()); 1159 infoln(fileLine); 1160 return; 1161 } 1162 fileLine.remove(); 1163} 1164 1165void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) { 1166 UVector32 reorderCodes(errorCode); 1167 while(start < fileLine.length()) { 1168 start = skipSpaces(start); 1169 int32_t limit = start; 1170 while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; } 1171 CharString name; 1172 name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode); 1173 int32_t code = CollationRuleParser::getReorderCode(name.data()); 1174 if(code < -1) { 1175 errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber); 1176 infoln(fileLine); 1177 errorCode.set(U_PARSE_ERROR); 1178 return; 1179 } 1180 reorderCodes.addElement(code, errorCode); 1181 start = limit; 1182 } 1183 coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode); 1184 if(errorCode.isFailure()) { 1185 errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber, errorCode.errorName()); 1186 infoln(fileLine); 1187 return; 1188 } 1189 fileLine.remove(); 1190} 1191 1192void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) { 1193 UnicodeString rules; 1194 while(readLine(f, errorCode)) { 1195 if(fileLine.isEmpty()) { continue; } 1196 if(isSectionStarter(fileLine[0])) { break; } 1197 rules.append(fileLine.unescape()); 1198 } 1199 if(errorCode.isFailure()) { return; } 1200 logln(rules); 1201 1202 UParseError parseError; 1203 UnicodeString reason; 1204 delete coll; 1205 coll = new RuleBasedCollator(rules, parseError, reason, errorCode); 1206 if(coll == NULL) { 1207 errln("unable to allocate a new collator"); 1208 errorCode.set(U_MEMORY_ALLOCATION_ERROR); 1209 return; 1210 } 1211 if(errorCode.isFailure()) { 1212 errln("RuleBasedCollator(rules) failed - %s", errorCode.errorName()); 1213 infoln(UnicodeString(" reason: ") + reason); 1214 if(parseError.offset >= 0) { infoln(" rules offset: %d", (int)parseError.offset); } 1215 if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) { 1216 infoln(UnicodeString(" snippet: ...") + 1217 parseError.preContext + "(!)" + parseError.postContext + "..."); 1218 } 1219 } else { 1220 assertEquals("no error reason when RuleBasedCollator(rules) succeeds", 1221 UnicodeString(), reason); 1222 } 1223} 1224 1225void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) { 1226 if(errorCode.isFailure()) { return; } 1227 delete coll; 1228 coll = Collator::createInstance(Locale::getRoot(), errorCode); 1229 if(errorCode.isFailure()) { 1230 dataerrln("unable to create a root collator"); 1231 return; 1232 } 1233} 1234 1235void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) { 1236 if(errorCode.isFailure()) { return; } 1237 int32_t at = fileLine.indexOf((UChar)0x40, 9); // @ is not invariant 1238 if(at >= 0) { 1239 fileLine.setCharAt(at, (UChar)0x2a); // * 1240 } 1241 CharString localeID; 1242 localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode); 1243 if(at >= 0) { 1244 localeID.data()[at - 9] = '@'; 1245 } 1246 Locale locale(localeID.data()); 1247 if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) { 1248 errln("invalid language tag on line %d", (int)fileLineNumber); 1249 infoln(fileLine); 1250 if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); } 1251 return; 1252 } 1253 1254 logln("creating a collator for locale ID %s", locale.getName()); 1255 Collator *newColl = Collator::createInstance(locale, errorCode); 1256 if(errorCode.isFailure()) { 1257 dataerrln("unable to create a collator for locale %s on line %d", 1258 locale.getName(), (int)fileLineNumber); 1259 infoln(fileLine); 1260 return; 1261 } 1262 delete coll; 1263 coll = newColl; 1264} 1265 1266UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const { 1267 if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; } 1268 // In some sequences with Tibetan composite vowel signs, 1269 // even if the string passes the FCD check, 1270 // those composites must be decomposed. 1271 // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81. 1272 int32_t index = 0; 1273 while((index = s.indexOf((UChar)0xf71, index)) >= 0) { 1274 if(++index < s.length()) { 1275 UChar c = s[index]; 1276 if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; } 1277 } 1278 } 1279 return FALSE; 1280} 1281 1282UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length, 1283 CharString &dest, int32_t partSize, 1284 IcuTestErrorCode &errorCode) { 1285 if(errorCode.isFailure()) { return FALSE; } 1286 uint8_t part[32]; 1287 U_ASSERT(partSize <= LENGTHOF(part)); 1288 UCharIterator iter; 1289 uiter_setString(&iter, s, length); 1290 uint32_t state[2] = { 0, 0 }; 1291 for(;;) { 1292 int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode); 1293 UBool done = partLength < partSize; 1294 if(done) { 1295 // At the end, append the next byte as well which should be 00. 1296 ++partLength; 1297 } 1298 dest.append(reinterpret_cast<char *>(part), partLength, errorCode); 1299 if(done) { 1300 return errorCode.isSuccess(); 1301 } 1302 } 1303} 1304 1305UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line, 1306 const UChar *s, int32_t length, 1307 CollationKey &key, IcuTestErrorCode &errorCode) { 1308 if(errorCode.isFailure()) { return FALSE; } 1309 coll->getCollationKey(s, length, key, errorCode); 1310 if(errorCode.isFailure()) { 1311 infoln(fileTestName); 1312 errln("Collator(%s).getCollationKey() failed: %s", 1313 norm, errorCode.errorName()); 1314 infoln(line); 1315 return FALSE; 1316 } 1317 int32_t keyLength; 1318 const uint8_t *keyBytes = key.getByteArray(keyLength); 1319 if(keyLength == 0 || keyBytes[keyLength - 1] != 0) { 1320 infoln(fileTestName); 1321 errln("Collator(%s).getCollationKey() wrote an empty or unterminated key", 1322 norm); 1323 infoln(line); 1324 infoln(printCollationKey(key)); 1325 return FALSE; 1326 } 1327 1328 int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode); 1329 if(numLevels < UCOL_IDENTICAL) { 1330 ++numLevels; 1331 } else { 1332 numLevels = 5; 1333 } 1334 if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) { 1335 ++numLevels; 1336 } 1337 errorCode.assertSuccess(); 1338 int32_t numLevelSeparators = 0; 1339 for(int32_t i = 0; i < (keyLength - 1); ++i) { 1340 uint8_t b = keyBytes[i]; 1341 if(b == 0) { 1342 infoln(fileTestName); 1343 errln("Collator(%s).getCollationKey() contains a 00 byte", norm); 1344 infoln(line); 1345 infoln(printCollationKey(key)); 1346 return FALSE; 1347 } 1348 if(b == 1) { ++numLevelSeparators; } 1349 } 1350 if(numLevelSeparators != (numLevels - 1)) { 1351 infoln(fileTestName); 1352 errln("Collator(%s).getCollationKey() has %d level separators for %d levels", 1353 norm, (int)numLevelSeparators, (int)numLevels); 1354 infoln(line); 1355 infoln(printCollationKey(key)); 1356 return FALSE; 1357 } 1358 1359 // If s contains U+FFFE, check that merged segments make the same key. 1360 LocalMemory<uint8_t> mergedKey; 1361 int32_t mergedKeyLength = 0; 1362 int32_t mergedKeyCapacity = 0; 1363 int32_t sLength = (length >= 0) ? length : u_strlen(s); 1364 int32_t segmentStart = 0; 1365 for(int32_t i = 0;;) { 1366 if(i == sLength) { 1367 if(segmentStart == 0) { 1368 // s does not contain any U+FFFE. 1369 break; 1370 } 1371 } else if(s[i] != 0xfffe) { 1372 ++i; 1373 continue; 1374 } 1375 // Get the sort key for another segment and merge it into mergedKey. 1376 CollationKey key1(mergedKey.getAlias(), mergedKeyLength); // copies the bytes 1377 CollationKey key2; 1378 coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode); 1379 int32_t key1Length, key2Length; 1380 const uint8_t *key1Bytes = key1.getByteArray(key1Length); 1381 const uint8_t *key2Bytes = key2.getByteArray(key2Length); 1382 uint8_t *dest; 1383 int32_t minCapacity = key1Length + key2Length; 1384 if(key1Length > 0) { --minCapacity; } 1385 if(minCapacity <= mergedKeyCapacity) { 1386 dest = mergedKey.getAlias(); 1387 } else { 1388 if(minCapacity <= 200) { 1389 mergedKeyCapacity = 200; 1390 } else if(minCapacity <= 2 * mergedKeyCapacity) { 1391 mergedKeyCapacity *= 2; 1392 } else { 1393 mergedKeyCapacity = minCapacity; 1394 } 1395 dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity); 1396 } 1397 U_ASSERT(dest != NULL || mergedKeyCapacity == 0); 1398 if(key1Length == 0) { 1399 // key2 is the sort key for the first segment. 1400 uprv_memcpy(dest, key2Bytes, key2Length); 1401 mergedKeyLength = key2Length; 1402 } else { 1403 mergedKeyLength = 1404 ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length, 1405 dest, mergedKeyCapacity); 1406 } 1407 if(i == sLength) { break; } 1408 segmentStart = ++i; 1409 } 1410 if(segmentStart != 0 && 1411 (mergedKeyLength != keyLength || 1412 uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) { 1413 infoln(fileTestName); 1414 errln("Collator(%s).getCollationKey(with U+FFFE) != " 1415 "ucol_mergeSortkeys(segments)", 1416 norm); 1417 infoln(line); 1418 infoln(printCollationKey(key)); 1419 infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength)); 1420 return FALSE; 1421 } 1422 1423 // Check that internalNextSortKeyPart() makes the same key, with several part sizes. 1424 static const int32_t partSizes[] = { 32, 3, 1 }; 1425 for(int32_t psi = 0; psi < LENGTHOF(partSizes); ++psi) { 1426 int32_t partSize = partSizes[psi]; 1427 CharString parts; 1428 if(!getSortKeyParts(s, length, parts, 32, errorCode)) { 1429 infoln(fileTestName); 1430 errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s", 1431 norm, (int)partSize, errorCode.errorName()); 1432 infoln(line); 1433 return FALSE; 1434 } 1435 if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) { 1436 infoln(fileTestName); 1437 errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)", 1438 norm, (int)partSize); 1439 infoln(line); 1440 infoln(printCollationKey(key)); 1441 infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length())); 1442 return FALSE; 1443 } 1444 } 1445 return TRUE; 1446} 1447 1448namespace { 1449 1450/** 1451 * Replaces unpaired surrogates with U+FFFD. 1452 * Returns s if no replacement was made, otherwise buffer. 1453 */ 1454const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) { 1455 int32_t i = 0; 1456 while(i < s.length()) { 1457 UChar32 c = s.char32At(i); 1458 if(U_IS_SURROGATE(c)) { 1459 if(buffer.length() < i) { 1460 buffer.append(s, buffer.length(), i - buffer.length()); 1461 } 1462 buffer.append((UChar)0xfffd); 1463 } 1464 i += U16_LENGTH(c); 1465 } 1466 if(buffer.isEmpty()) { 1467 return s; 1468 } 1469 if(buffer.length() < i) { 1470 buffer.append(s, buffer.length(), i - buffer.length()); 1471 } 1472 return buffer; 1473} 1474 1475} 1476 1477UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine, 1478 const UnicodeString &prevString, const UnicodeString &s, 1479 UCollationResult expectedOrder, Collation::Level expectedLevel, 1480 IcuTestErrorCode &errorCode) { 1481 if(errorCode.isFailure()) { return FALSE; } 1482 1483 // Get the sort keys first, for error debug output. 1484 CollationKey prevKey; 1485 if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(), 1486 prevKey, errorCode)) { 1487 return FALSE; 1488 } 1489 CollationKey key; 1490 if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; } 1491 1492 UCollationResult order = coll->compare(prevString, s, errorCode); 1493 if(order != expectedOrder || errorCode.isFailure()) { 1494 infoln(fileTestName); 1495 errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)", 1496 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); 1497 infoln(prevFileLine); 1498 infoln(fileLine); 1499 infoln(printCollationKey(prevKey)); 1500 infoln(printCollationKey(key)); 1501 return FALSE; 1502 } 1503 order = coll->compare(s, prevString, errorCode); 1504 if(order != -expectedOrder || errorCode.isFailure()) { 1505 infoln(fileTestName); 1506 errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)", 1507 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName()); 1508 infoln(prevFileLine); 1509 infoln(fileLine); 1510 infoln(printCollationKey(prevKey)); 1511 infoln(printCollationKey(key)); 1512 return FALSE; 1513 } 1514 // Test NUL-termination if the strings do not contain NUL characters. 1515 UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0; 1516 if(!containNUL) { 1517 order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode); 1518 if(order != expectedOrder || errorCode.isFailure()) { 1519 infoln(fileTestName); 1520 errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)", 1521 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); 1522 infoln(prevFileLine); 1523 infoln(fileLine); 1524 infoln(printCollationKey(prevKey)); 1525 infoln(printCollationKey(key)); 1526 return FALSE; 1527 } 1528 order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode); 1529 if(order != -expectedOrder || errorCode.isFailure()) { 1530 infoln(fileTestName); 1531 errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)", 1532 (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName()); 1533 infoln(prevFileLine); 1534 infoln(fileLine); 1535 infoln(printCollationKey(prevKey)); 1536 infoln(printCollationKey(key)); 1537 return FALSE; 1538 } 1539 } 1540 1541#if U_HAVE_STD_STRING 1542 // compare(UTF-16) treats unpaired surrogates like unassigned code points. 1543 // Unpaired surrogates cannot be converted to UTF-8. 1544 // Create valid UTF-16 strings if necessary, and use those for 1545 // both the expected compare() result and for the input to compare(UTF-8). 1546 UnicodeString prevBuffer, sBuffer; 1547 const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer); 1548 const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer); 1549 std::string prevUTF8, sUTF8; 1550 UnicodeString(prevValid).toUTF8String(prevUTF8); 1551 UnicodeString(sValid).toUTF8String(sUTF8); 1552 UCollationResult expectedUTF8Order; 1553 if(&prevValid == &prevString && &sValid == &s) { 1554 expectedUTF8Order = expectedOrder; 1555 } else { 1556 expectedUTF8Order = coll->compare(prevValid, sValid, errorCode); 1557 } 1558 1559 order = coll->compareUTF8(prevUTF8, sUTF8, errorCode); 1560 if(order != expectedUTF8Order || errorCode.isFailure()) { 1561 infoln(fileTestName); 1562 errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)", 1563 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName()); 1564 infoln(prevFileLine); 1565 infoln(fileLine); 1566 infoln(printCollationKey(prevKey)); 1567 infoln(printCollationKey(key)); 1568 return FALSE; 1569 } 1570 order = coll->compareUTF8(sUTF8, prevUTF8, errorCode); 1571 if(order != -expectedUTF8Order || errorCode.isFailure()) { 1572 infoln(fileTestName); 1573 errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)", 1574 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName()); 1575 infoln(prevFileLine); 1576 infoln(fileLine); 1577 infoln(printCollationKey(prevKey)); 1578 infoln(printCollationKey(key)); 1579 return FALSE; 1580 } 1581 // Test NUL-termination if the strings do not contain NUL characters. 1582 if(!containNUL) { 1583 order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode); 1584 if(order != expectedUTF8Order || errorCode.isFailure()) { 1585 infoln(fileTestName); 1586 errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)", 1587 (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName()); 1588 infoln(prevFileLine); 1589 infoln(fileLine); 1590 infoln(printCollationKey(prevKey)); 1591 infoln(printCollationKey(key)); 1592 return FALSE; 1593 } 1594 order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode); 1595 if(order != -expectedUTF8Order || errorCode.isFailure()) { 1596 infoln(fileTestName); 1597 errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)", 1598 (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName()); 1599 infoln(prevFileLine); 1600 infoln(fileLine); 1601 infoln(printCollationKey(prevKey)); 1602 infoln(printCollationKey(key)); 1603 return FALSE; 1604 } 1605 } 1606#endif 1607 1608 UCharIterator leftIter; 1609 UCharIterator rightIter; 1610 uiter_setString(&leftIter, prevString.getBuffer(), prevString.length()); 1611 uiter_setString(&rightIter, s.getBuffer(), s.length()); 1612 order = coll->compare(leftIter, rightIter, errorCode); 1613 if(order != expectedOrder || errorCode.isFailure()) { 1614 infoln(fileTestName); 1615 errln("line %d Collator(%s).compare(UCharIterator: previous, current) " 1616 "wrong order: %d != %d (%s)", 1617 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); 1618 infoln(prevFileLine); 1619 infoln(fileLine); 1620 infoln(printCollationKey(prevKey)); 1621 infoln(printCollationKey(key)); 1622 return FALSE; 1623 } 1624 1625 order = prevKey.compareTo(key, errorCode); 1626 if(order != expectedOrder || errorCode.isFailure()) { 1627 infoln(fileTestName); 1628 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)", 1629 (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName()); 1630 infoln(prevFileLine); 1631 infoln(fileLine); 1632 infoln(printCollationKey(prevKey)); 1633 infoln(printCollationKey(key)); 1634 return FALSE; 1635 } 1636 if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) { 1637 int32_t prevKeyLength; 1638 const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength); 1639 int32_t keyLength; 1640 const uint8_t *bytes = key.getByteArray(keyLength); 1641 int32_t level = Collation::PRIMARY_LEVEL; 1642 for(int32_t i = 0;; ++i) { 1643 uint8_t b = prevBytes[i]; 1644 if(b != bytes[i]) { break; } 1645 if(b == Collation::LEVEL_SEPARATOR_BYTE) { 1646 ++level; 1647 if(level == Collation::CASE_LEVEL && 1648 coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) { 1649 ++level; 1650 } 1651 } 1652 } 1653 if(level != expectedLevel) { 1654 infoln(fileTestName); 1655 errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d", 1656 (int)fileLineNumber, norm, order, level, expectedLevel); 1657 infoln(prevFileLine); 1658 infoln(fileLine); 1659 infoln(printCollationKey(prevKey)); 1660 infoln(printCollationKey(key)); 1661 return FALSE; 1662 } 1663 } 1664 return TRUE; 1665} 1666 1667void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) { 1668 if(errorCode.isFailure()) { return; } 1669 UnicodeString prevFileLine = UNICODE_STRING("(none)", 6); 1670 UnicodeString prevString, s; 1671 prevString.getTerminatedBuffer(); // Ensure NUL-termination. 1672 while(readLine(f, errorCode)) { 1673 if(fileLine.isEmpty()) { continue; } 1674 if(isSectionStarter(fileLine[0])) { break; } 1675 Collation::Level relation = parseRelationAndString(s, errorCode); 1676 if(errorCode.isFailure()) { 1677 errorCode.reset(); 1678 break; 1679 } 1680 UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS; 1681 Collation::Level expectedLevel = relation; 1682 s.getTerminatedBuffer(); // Ensure NUL-termination. 1683 UBool isOk = TRUE; 1684 if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) { 1685 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode); 1686 isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s, 1687 expectedOrder, expectedLevel, errorCode); 1688 } 1689 if(isOk) { 1690 coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode); 1691 isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s, 1692 expectedOrder, expectedLevel, errorCode); 1693 } 1694 if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) { 1695 UnicodeString pn = nfd->normalize(prevString, errorCode); 1696 UnicodeString n = nfd->normalize(s, errorCode); 1697 pn.getTerminatedBuffer(); 1698 n.getTerminatedBuffer(); 1699 errorCode.assertSuccess(); 1700 isOk = checkCompareTwo("NFD input", prevFileLine, pn, n, 1701 expectedOrder, expectedLevel, errorCode); 1702 } 1703 if(!isOk) { 1704 errorCode.reset(); // already reported 1705 } 1706 prevFileLine = fileLine; 1707 prevString = s; 1708 prevString.getTerminatedBuffer(); // Ensure NUL-termination. 1709 } 1710} 1711 1712void CollationTest::TestDataDriven() { 1713 IcuTestErrorCode errorCode(*this, "TestDataDriven"); 1714 1715 fcd = Normalizer2Factory::getFCDInstance(errorCode); 1716 nfd = Normalizer2Factory::getNFDInstance(errorCode); 1717 if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) { 1718 return; 1719 } 1720 1721 CharString path(getSourceTestData(errorCode), errorCode); 1722 path.appendPathPart("collationtest.txt", errorCode); 1723 const char *codePage = "UTF-8"; 1724 LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode)); 1725 if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) { 1726 return; 1727 } 1728 while(errorCode.isSuccess()) { 1729 // Read a new line if necessary. 1730 // Sub-parsers leave the first line set that they do not handle. 1731 if(fileLine.isEmpty()) { 1732 if(!readLine(f.getAlias(), errorCode)) { break; } 1733 continue; 1734 } 1735 if(!isSectionStarter(fileLine[0])) { 1736 errln("syntax error on line %d", (int)fileLineNumber); 1737 infoln(fileLine); 1738 return; 1739 } 1740 if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) { 1741 fileTestName = fileLine; 1742 logln(fileLine); 1743 fileLine.remove(); 1744 } else if(fileLine == UNICODE_STRING("@ root", 6)) { 1745 setRootCollator(errorCode); 1746 fileLine.remove(); 1747 } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) { 1748 setLocaleCollator(errorCode); 1749 fileLine.remove(); 1750 } else if(fileLine == UNICODE_STRING("@ rules", 7)) { 1751 buildTailoring(f.getAlias(), errorCode); 1752 } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) { // % 1753 parseAndSetAttribute(errorCode); 1754 } else if(fileLine == UNICODE_STRING("* compare", 9)) { 1755 checkCompareStrings(f.getAlias(), errorCode); 1756 } else { 1757 errln("syntax error on line %d", (int)fileLineNumber); 1758 infoln(fileLine); 1759 return; 1760 } 1761 } 1762} 1763 1764#endif // !UCONFIG_NO_COLLATION 1765