1/*
2*******************************************************************************
3* Copyright (C) 2012-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationtest.cpp
7*
8* created on: 2012apr27
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/coll.h"
17#include "unicode/errorcode.h"
18#include "unicode/localpointer.h"
19#include "unicode/normalizer2.h"
20#include "unicode/sortkey.h"
21#include "unicode/std_string.h"
22#include "unicode/strenum.h"
23#include "unicode/tblcoll.h"
24#include "unicode/uiter.h"
25#include "unicode/uniset.h"
26#include "unicode/unistr.h"
27#include "unicode/usetiter.h"
28#include "unicode/ustring.h"
29#include "charstr.h"
30#include "cmemory.h"
31#include "collation.h"
32#include "collationdata.h"
33#include "collationfcd.h"
34#include "collationiterator.h"
35#include "collationroot.h"
36#include "collationrootelements.h"
37#include "collationruleparser.h"
38#include "collationweights.h"
39#include "cstring.h"
40#include "intltest.h"
41#include "normalizer2impl.h"
42#include "ucbuf.h"
43#include "uhash.h"
44#include "uitercollationiterator.h"
45#include "utf16collationiterator.h"
46#include "utf8collationiterator.h"
47#include "uvectr32.h"
48#include "uvectr64.h"
49#include "writesrc.h"
50
// Number of elements of an array, as an int32_t.
// The argument must be a real array: for a pointer, sizeof(array) would be the pointer size.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

// TODO: Move to ucbuf.h
// NOTE(review): presumably defines LocalUCHARBUFPointer as an owner of a UCHARBUF
// that releases it via ucbuf_close(); confirm against the macro definition.
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);

// Defined further below; declared here because CollationTest::checkFCD() takes one by reference.
class CodePointIterator;
57
58// TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
59
60class CollationTest : public IntlTest {
61public:
62    CollationTest()
63            : fcd(NULL), nfd(NULL),
64              fileLineNumber(0),
65              coll(NULL) {}
66
67    ~CollationTest() {
68        delete coll;
69    }
70
71    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
72
73    void TestMinMax();
74    void TestImplicits();
75    void TestNulTerminated();
76    void TestIllegalUTF8();
77    void TestShortFCDData();
78    void TestFCD();
79    void TestCollationWeights();
80    void TestRootElements();
81    void TestTailoredElements();
82    void TestDataDriven();
83
84private:
85    void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
86    void checkAllocWeights(CollationWeights &cw,
87                           uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
88                           int32_t someLength, int32_t minCount);
89
90    static UnicodeString printSortKey(const uint8_t *p, int32_t length);
91    static UnicodeString printCollationKey(const CollationKey &key);
92
93    // Helpers & fields for data-driven test.
94    static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
95    static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
96    static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
97    int32_t skipSpaces(int32_t i) {
98        while(isSpace(fileLine[i])) { ++i; }
99        return i;
100    }
101
102    UBool readLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
103    void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
104    Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
105    void parseAndSetAttribute(IcuTestErrorCode &errorCode);
106    void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
107    void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
108    void setRootCollator(IcuTestErrorCode &errorCode);
109    void setLocaleCollator(IcuTestErrorCode &errorCode);
110
111    UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
112
113    UBool getSortKeyParts(const UChar *s, int32_t length,
114                          CharString &dest, int32_t partSize,
115                          IcuTestErrorCode &errorCode);
116    UBool getCollationKey(const char *norm, const UnicodeString &line,
117                          const UChar *s, int32_t length,
118                          CollationKey &key, IcuTestErrorCode &errorCode);
119    UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
120                          const UnicodeString &prevString, const UnicodeString &s,
121                          UCollationResult expectedOrder, Collation::Level expectedLevel,
122                          IcuTestErrorCode &errorCode);
123    void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
124
125    const Normalizer2 *fcd, *nfd;
126    UnicodeString fileLine;
127    int32_t fileLineNumber;
128    UnicodeString fileTestName;
129    Collator *coll;
130};
131
132extern IntlTest *createCollationTest() {
133    return new CollationTest();
134}
135
// Dispatcher for the test cases of this suite.
// Logs the suite name when exec is set, then hands control to the TESTCASE_AUTO macros.
// The last parameter (par) is unused.
// NOTE(review): the TESTCASE_AUTO* macros are defined outside this file; presumably they
// map index to the listed test functions in order and set name accordingly -- confirm in intltest.h.
void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
    if(exec) {
        logln("TestSuite CollationTest: ");
    }
    // The order of the entries below is significant if indexes are assigned sequentially.
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestMinMax);
    TESTCASE_AUTO(TestImplicits);
    TESTCASE_AUTO(TestNulTerminated);
    TESTCASE_AUTO(TestIllegalUTF8);
    TESTCASE_AUTO(TestShortFCDData);
    TESTCASE_AUTO(TestFCD);
    TESTCASE_AUTO(TestCollationWeights);
    TESTCASE_AUTO(TestRootElements);
    TESTCASE_AUTO(TestTailoredElements);
    TESTCASE_AUTO(TestDataDriven);
    TESTCASE_AUTO_END;
}
153
154void CollationTest::TestMinMax() {
155    IcuTestErrorCode errorCode(*this, "TestMinMax");
156
157    setRootCollator(errorCode);
158    if(errorCode.isFailure()) {
159        errorCode.reset();
160        return;
161    }
162    RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
163    if(rbc == NULL) {
164        errln("the root collator is not a RuleBasedCollator");
165        return;
166    }
167
168    static const UChar s[2] = { 0xfffe, 0xffff };
169    UVector64 ces(errorCode);
170    rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
171    errorCode.assertSuccess();
172    if(ces.size() != 2) {
173        errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
174        return;
175    }
176    int64_t ce = ces.elementAti(0);
177    int64_t expected =
178        ((int64_t)Collation::MERGE_SEPARATOR_PRIMARY << 32) |
179        Collation::MERGE_SEPARATOR_LOWER32;
180    if(ce != expected) {
181        errln("CE(U+fffe)=%04lx != 02.02.02", (long)ce);
182    }
183
184    ce = ces.elementAti(1);
185    expected = Collation::makeCE(Collation::MAX_PRIMARY);
186    if(ce != expected) {
187        errln("CE(U+ffff)=%04lx != max..", (long)ce);
188    }
189}
190
191void CollationTest::TestImplicits() {
192    IcuTestErrorCode errorCode(*this, "TestImplicits");
193
194    const CollationData *cd = CollationRoot::getData(errorCode);
195    if(errorCode.logDataIfFailureAndReset("CollationRoot::getBaseData()")) {
196        return;
197    }
198
199    // Implicit primary weights should be assigned for the following sets,
200    // and sort in ascending order by set and then code point.
201    // See http://www.unicode.org/reports/tr10/#Implicit_Weights
202    // core Han Unified Ideographs
203    UnicodeSet coreHan("[\\p{unified_ideograph}&"
204                            "[\\p{Block=CJK_Unified_Ideographs}"
205                            "\\p{Block=CJK_Compatibility_Ideographs}]]",
206                       errorCode);
207    // all other Unified Han ideographs
208    UnicodeSet otherHan("[\\p{unified ideograph}-"
209                            "[\\p{Block=CJK_Unified_Ideographs}"
210                            "\\p{Block=CJK_Compatibility_Ideographs}]]",
211                        errorCode);
212    UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
213    unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
214    if(errorCode.logIfFailureAndReset("UnicodeSet")) {
215        return;
216    }
217    const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
218    UChar32 prev = 0;
219    uint32_t prevPrimary = 0;
220    UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
221    for(int32_t i = 0; i < LENGTHOF(sets); ++i) {
222        LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
223        while(iter->next()) {
224            UChar32 c = iter->getCodepoint();
225            UnicodeString s(c);
226            ci.setText(s.getBuffer(), s.getBuffer() + s.length());
227            int64_t ce = ci.nextCE(errorCode);
228            int64_t ce2 = ci.nextCE(errorCode);
229            if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
230                return;
231            }
232            if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
233                errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
234                continue;
235            }
236            if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
237                errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
238                      (long)c, (long)(ce & 0xffffffff));
239                continue;
240            }
241            uint32_t primary = (uint32_t)(ce >> 32);
242            if(!(primary > prevPrimary)) {
243                errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
244                      (long)c, (long)primary, (long)prev, (long)prevPrimary);
245            }
246            prev = c;
247            prevPrimary = primary;
248        }
249    }
250}
251
252void CollationTest::TestNulTerminated() {
253    IcuTestErrorCode errorCode(*this, "TestNulTerminated");
254    const CollationData *data = CollationRoot::getData(errorCode);
255    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
256        return;
257    }
258
259    static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
260
261    UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
262    UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
263    for(int32_t i = 0;; ++i) {
264        int64_t ce1 = ci1.nextCE(errorCode);
265        int64_t ce2 = ci2.nextCE(errorCode);
266        if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
267            return;
268        }
269        if(ce1 != ce2) {
270            errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
271            break;
272        }
273        if(ce1 == Collation::NO_CE) { break; }
274    }
275}
276
277void CollationTest::TestIllegalUTF8() {
278    IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
279
280    setRootCollator(errorCode);
281    if(errorCode.isFailure()) {
282        errorCode.reset();
283        return;
284    }
285    coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
286
287    static const char *strings[] = {
288        // U+FFFD
289        "a\xef\xbf\xbdz",
290        // illegal byte sequences
291        "a\x80z",  // trail byte
292        "a\xc1\x81z",  // non-shortest form
293        "a\xe0\x82\x83z",  // non-shortest form
294        "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
295        "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
296        "a\xf0\x8f\xbf\xbfz",  // non-shortest form
297        "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
298    };
299
300    StringPiece fffd(strings[0]);
301    for(int32_t i = 1; i < LENGTHOF(strings); ++i) {
302        StringPiece illegal(strings[i]);
303        UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
304        if(order != UCOL_EQUAL) {
305            errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
306                  (int)i, order);
307        }
308    }
309}
310
311namespace {
312
313void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
314    for(UChar32 c = 0x10000; c < 0x110000;) {
315        UChar32 next = c + 0x400;
316        if(src.containsSome(c, next - 1)) {
317            dest.add(U16_LEAD(c));
318        }
319        c = next;
320    }
321}
322
323}  // namespace
324
325void CollationTest::TestShortFCDData() {
326    // See CollationFCD class comments.
327    IcuTestErrorCode errorCode(*this, "TestShortFCDData");
328    UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
329    errorCode.assertSuccess();
330    expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
331    addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
332    UnicodeSet lccc;  // actual
333    for(UChar32 c = 0; c <= 0xffff; ++c) {
334        if(CollationFCD::hasLccc(c)) { lccc.add(c); }
335    }
336    UnicodeSet diff(expectedLccc);
337    diff.removeAll(lccc);
338    diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
339    UnicodeString empty("[]");
340    UnicodeString diffString;
341    diff.toPattern(diffString, TRUE);
342    assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
343    diff = lccc;
344    diff.removeAll(expectedLccc);
345    diff.toPattern(diffString, TRUE);
346    assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
347
348    UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
349    if (errorCode.isSuccess()) {
350        addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
351        addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
352        UnicodeSet tccc;  // actual
353        for(UChar32 c = 0; c <= 0xffff; ++c) {
354            if(CollationFCD::hasTccc(c)) { tccc.add(c); }
355        }
356        diff = expectedTccc;
357        diff.removeAll(tccc);
358        diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
359        assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
360        diff = tccc;
361        diff.removeAll(expectedTccc);
362        diff.toPattern(diffString, TRUE);
363        assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
364    }
365}
366
367class CodePointIterator {
368public:
369    CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
370    void resetToStart() { pos = 0; }
371    UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
372    UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
373    int32_t getLength() const { return length; }
374    int getIndex() const { return (int)pos; }
375private:
376    const UChar32 *cp;
377    int32_t length;
378    int32_t pos;
379};
380
381void CollationTest::checkFCD(const char *name,
382                             CollationIterator &ci, CodePointIterator &cpi) {
383    IcuTestErrorCode errorCode(*this, "checkFCD");
384
385    // Iterate forward to the limit.
386    for(;;) {
387        UChar32 c1 = ci.nextCodePoint(errorCode);
388        UChar32 c2 = cpi.next();
389        if(c1 != c2) {
390            errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
391                  name, (long)c1, (long)c2, cpi.getIndex());
392            return;
393        }
394        if(c1 < 0) { break; }
395    }
396
397    // Iterate backward most of the way.
398    for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
399        UChar32 c1 = ci.previousCodePoint(errorCode);
400        UChar32 c2 = cpi.previous();
401        if(c1 != c2) {
402            errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
403                  name, (long)c1, (long)c2, cpi.getIndex());
404            return;
405        }
406    }
407
408    // Forward again.
409    for(;;) {
410        UChar32 c1 = ci.nextCodePoint(errorCode);
411        UChar32 c2 = cpi.next();
412        if(c1 != c2) {
413            errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
414                  name, (long)c1, (long)c2, cpi.getIndex());
415            return;
416        }
417        if(c1 < 0) { break; }
418    }
419
420    // Iterate backward to the start.
421    for(;;) {
422        UChar32 c1 = ci.previousCodePoint(errorCode);
423        UChar32 c2 = cpi.previous();
424        if(c1 != c2) {
425            errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
426                  name, (long)c1, (long)c2, cpi.getIndex());
427            return;
428        }
429        if(c1 < 0) { break; }
430    }
431}
432
// Runs checkFCD() on the FCD-checking collation iterators for UTF-16, UTF-8
// (only if U_HAVE_STD_STRING) and UCharIterator input.
// All of them read the same non-FCD input text s and are compared with
// the same expected code point sequence cp.
void CollationTest::TestFCD() {
    IcuTestErrorCode errorCode(*this, "TestFCD");
    const CollationData *data = CollationRoot::getData(errorCode);
    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
        return;
    }

    // Input string, not FCD, NUL-terminated.
    static const UChar s[] = {
        0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
        0x327, 0x308,  // ccc=202, 230
        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
        0xac01,
        0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
        0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
        0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
        0x4e00, 0xf81,
        0
    };
    // Expected code points.
    static const UChar32 cp[] = {
        0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
        0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
        0x1D15F, 0x1D16D,
        0xac01,
        0x63, 0x327, 0x1D165, 0x1D16D,
        0x61,
        0xf71, 0xf71, 0xf72, 0xf74, 0x301,
        0x4e00, 0xf71, 0xf80
    };

    // UTF-16 input, NUL-terminated (limit==NULL).
    FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
    if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
        return;
    }
    CodePointIterator cpi(cp, LENGTHOF(cp));
    checkFCD("FCDUTF16CollationIterator", u16ci, cpi);

#if U_HAVE_STD_STRING
    // UTF-8 input: the same text converted to a std::string; the length is passed as -1.
    // checkFCD() leaves cpi wherever it stopped, so rewind the expected sequence first.
    cpi.resetToStart();
    std::string utf8;
    UnicodeString(s).toUTF8String(utf8);
    FCDUTF8CollationIterator u8ci(data, FALSE,
                                  reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
    if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
        return;
    }
    checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
#endif

    // UCharIterator input over the same UTF-16 text, with an explicit length.
    cpi.resetToStart();
    UCharIterator iter;
    uiter_setString(&iter, s, LENGTHOF(s) - 1);  // -1: without the terminating NUL
    FCDUIterCollationIterator uici(data, FALSE, iter, 0);
    if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
        return;
    }
    checkFCD("FCDUIterCollationIterator", uici, cpi);
}
496
497void CollationTest::checkAllocWeights(CollationWeights &cw,
498                                      uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
499                                      int32_t someLength, int32_t minCount) {
500    if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
501        errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
502              (long)lowerLimit, (long)upperLimit, (long)n);
503        return;
504    }
505    uint32_t previous = lowerLimit;
506    int32_t count = 0;  // number of weights that have someLength
507    for(int32_t i = 0; i < n; ++i) {
508        uint32_t w = cw.nextWeight();
509        if(w == 0xffffffff) {
510            errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
511                  "returns only %ld weights",
512                  (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
513            return;
514        }
515        if(!(previous < w && w < upperLimit)) {
516            errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
517                  "number %ld -> %lx not between %lx and %lx",
518                  (long)lowerLimit, (long)upperLimit, (long)n,
519                  (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
520            return;
521        }
522        if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
523    }
524    if(count < minCount) {
525        errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
526              "returns only %ld < %ld weights of length %d",
527              (long)lowerLimit, (long)upperLimit, (long)n,
528              (long)count, (long)minCount, (int)someLength);
529    }
530}
531
// Exercises CollationWeights allocation via checkAllocWeights() for
// non-compressible primary, compressible primary, secondary and tertiary weights.
// The arguments of each checkAllocWeights() call are:
// lower limit, upper limit, number of weights to allocate,
// a weight length in bytes, and the minimum number of allocated weights with that length.
void CollationTest::TestCollationWeights() {
    CollationWeights cw;

    // Non-compressible primaries use 254 second bytes 02..FF.
    logln("CollationWeights.initForPrimary(non-compressible)");
    cw.initForPrimary(FALSE);
    // Expect 1 weight 11 and 254 weights 12xx.
    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
    // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
    // Expect 254 two-byte weights from the ranges 10ff and 11xx.
    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
    // Expect 254^2=64516 three-byte weights.
    // During computation, there should be 3 three-byte ranges
    // 10ffff, 11xxxx, 120202.
    // The middle one should be split 64515:1,
    // and the newly-split-off range and the last range lengthened.
    checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
    // Expect weights 1102 & 1103.
    checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
    // Expect weights 102102 & 102103.
    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);

    // Compressible primaries use 251 second bytes 04..FE.
    logln("CollationWeights.initForPrimary(compressible)");
    cw.initForPrimary(TRUE);
    // Expect 1 weight 11 and 251 weights 12xx.
    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
    // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
    checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
    // Expect weights 1104 & 1105.
    checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
    // Expect weights 102102 & 102103.
    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);

    // Secondary and tertiary weights use only bytes 3 & 4.
    logln("CollationWeights.initForSecondary()");
    cw.initForSecondary();
    // Expect weights fbxx and all four fc..ff.
    checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);

    logln("CollationWeights.initForTertiary()");
    cw.initForTertiary();
    // Expect weights 3dxx and both 3e & 3f.
    checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
}
580
581namespace {
582
583UBool isValidCE(const CollationRootElements &re, const CollationData &data,
584                uint32_t p, uint32_t s, uint32_t ctq) {
585    uint32_t p1 = p >> 24;
586    uint32_t p2 = (p >> 16) & 0xff;
587    uint32_t p3 = (p >> 8) & 0xff;
588    uint32_t p4 = p & 0xff;
589    uint32_t s1 = s >> 8;
590    uint32_t s2 = s & 0xff;
591    // ctq = Case, Tertiary, Quaternary
592    uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
593    uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
594    uint32_t t1 = t >> 8;
595    uint32_t t2 = t & 0xff;
596    uint32_t q = ctq & Collation::QUATERNARY_MASK;
597    // No leading zero bytes.
598    if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
599        return FALSE;
600    }
601    // No intermediate zero bytes.
602    if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
603        return FALSE;
604    }
605    if(p2 != 0 && p3 == 0 && p4 != 0) {
606        return FALSE;
607    }
608    // Minimum & maximum lead bytes.
609    if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
610            (s1 != 0 && s1 <= Collation::MERGE_SEPARATOR_BYTE) ||
611            (t1 != 0 && t1 <= Collation::MERGE_SEPARATOR_BYTE)) {
612        return FALSE;
613    }
614    if(t1 != 0 && t1 > 0x3f) {
615        return FALSE;
616    }
617    if(c > 2) {
618        return FALSE;
619    }
620    // The valid byte range for the second primary byte depends on compressibility.
621    if(p2 != 0) {
622        if(data.isCompressibleLeadByte(p1)) {
623            if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
624                    Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
625                return FALSE;
626            }
627        } else {
628            if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
629                return FALSE;
630            }
631        }
632    }
633    // Other bytes just need to avoid the level separator.
634    // Trailing zeros are ok.
635    U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
636    if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
637            s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
638        return FALSE;
639    }
640    // Well-formed CEs.
641    if(p == 0) {
642        if(s == 0) {
643            if(t == 0) {
644                // Completely ignorable CE.
645                // Quaternary CEs are not supported.
646                if(c != 0 || q != 0) {
647                    return FALSE;
648                }
649            } else {
650                // Tertiary CE.
651                if(t < re.getTertiaryBoundary() || c != 2) {
652                    return FALSE;
653                }
654            }
655        } else {
656            // Secondary CE.
657            if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
658                return FALSE;
659            }
660        }
661    } else {
662        // Primary CE.
663        if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
664                s >= re.getSecondaryBoundary()) {
665            return FALSE;
666        }
667        if(t == 0 || t >= re.getTertiaryBoundary()) {
668            return FALSE;
669        }
670    }
671    return TRUE;
672}
673
674UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
675    uint32_t p = (uint32_t)(ce >> 32);
676    uint32_t secTer = (uint32_t)ce;
677    return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
678}
679
680class RootElementsIterator {
681public:
682    RootElementsIterator(const CollationData &root)
683            : data(root),
684              elements(root.rootElements), length(root.rootElementsLength),
685              pri(0), secTer(0),
686              index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
687
688    UBool next() {
689        if(index >= length) { return FALSE; }
690        uint32_t p = elements[index];
691        if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
692        if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
693            ++index;
694            secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
695            return TRUE;
696        }
697        if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
698            // End of a range, enumerate the primaries in the range.
699            int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
700            p &= 0xffffff00;
701            if(pri == p) {
702                // Finished the range, return the next CE after it.
703                ++index;
704                return next();
705            }
706            U_ASSERT(pri < p);
707            // Return the next primary in this range.
708            UBool isCompressible = data.isCompressiblePrimary(pri);
709            if((pri & 0xffff) == 0) {
710                pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
711            } else {
712                pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
713            }
714            return TRUE;
715        }
716        // Simple primary CE.
717        ++index;
718        pri = p;
719        secTer = Collation::COMMON_SEC_AND_TER_CE;
720        return TRUE;
721    }
722
723    uint32_t getPrimary() const { return pri; }
724    uint32_t getSecTer() const { return secTer; }
725
726private:
727    const CollationData &data;
728    const uint32_t *elements;
729    int32_t length;
730
731    uint32_t pri;
732    uint32_t secTer;
733    int32_t index;
734};
735
736}  // namespace
737
738void CollationTest::TestRootElements() {
739    IcuTestErrorCode errorCode(*this, "TestRootElements");
740    const CollationData *root = CollationRoot::getData(errorCode);
741    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
742        return;
743    }
744    CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
745    RootElementsIterator iter(*root);
746
747    // We check each root CE for validity,
748    // and we also verify that there is a tailoring gap between each two CEs.
749    CollationWeights cw1c;  // compressible primary weights
750    CollationWeights cw1u;  // uncompressible primary weights
751    CollationWeights cw2;
752    CollationWeights cw3;
753
754    cw1c.initForPrimary(TRUE);
755    cw1u.initForPrimary(FALSE);
756    cw2.initForSecondary();
757    cw3.initForTertiary();
758
759    // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
760    // nor the special merge-separator CE for U+FFFE.
761    uint32_t prevPri = 0;
762    uint32_t prevSec = 0;
763    uint32_t prevTer = 0;
764    while(iter.next()) {
765        uint32_t pri = iter.getPrimary();
766        uint32_t secTer = iter.getSecTer();
767        // CollationRootElements CEs must have 0 case and quaternary bits.
768        if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
769            errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
770                  (long)pri, (long)secTer);
771        }
772        uint32_t sec = secTer >> 16;
773        uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
774        uint32_t ctq = ter;
775        if(pri == 0 && sec == 0 && ter != 0) {
776            // Tertiary CEs must have uppercase bits,
777            // but they are not stored in the CollationRootElements.
778            ctq |= 0x8000;
779        }
780        if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
781            errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
782        } else {
783            if(pri != prevPri) {
784                uint32_t newWeight = 0;
785                if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
786                    // There is currently no tailoring gap after primary ignorables,
787                    // and we forbid tailoring after U+FFFD and U+FFFF.
788                } else if(root->isCompressiblePrimary(prevPri)) {
789                    if(!cw1c.allocWeights(prevPri, pri, 1)) {
790                        errln("no primary/compressible tailoring gap between %08lx and %08lx",
791                              (long)prevPri, (long)pri);
792                    } else {
793                        newWeight = cw1c.nextWeight();
794                    }
795                } else {
796                    if(!cw1u.allocWeights(prevPri, pri, 1)) {
797                        errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
798                              (long)prevPri, (long)pri);
799                    } else {
800                        newWeight = cw1u.nextWeight();
801                    }
802                }
803                if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
804                    errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
805                          (long)prevPri, (long)newWeight, (long)pri);
806                }
807            } else if(sec != prevSec) {
808                uint32_t lowerLimit =
809                    prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
810                if(!cw2.allocWeights(lowerLimit, sec, 1)) {
811                    errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
812                } else {
813                    uint32_t newWeight = cw2.nextWeight();
814                    if(!(prevSec < newWeight && newWeight < sec)) {
815                        errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
816                              (long)lowerLimit, (long)newWeight, (long)sec);
817                    }
818                }
819            } else if(ter != prevTer) {
820                uint32_t lowerLimit =
821                    prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
822                if(!cw3.allocWeights(lowerLimit, ter, 1)) {
823                    errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
824                } else {
825                    uint32_t newWeight = cw3.nextWeight();
826                    if(!(prevTer < newWeight && newWeight < ter)) {
827                        errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
828                              (long)lowerLimit, (long)newWeight, (long)ter);
829                    }
830                }
831            } else {
832                errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
833            }
834        }
835        prevPri = pri;
836        prevSec = sec;
837        prevTer = ter;
838    }
839}
840
841void CollationTest::TestTailoredElements() {
842    IcuTestErrorCode errorCode(*this, "TestTailoredElements");
843    const CollationData *root = CollationRoot::getData(errorCode);
844    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
845        return;
846    }
847    CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
848
849    UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
850    if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
851        return;
852    }
853    uhash_setKeyDeleter(prevLocales, uprv_free);
854    // TestRootElements() tests the root collator which does not have tailorings.
855    uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
856    uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
857    uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
858
859    UVector64 ces(errorCode);
860    LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
861    U_ASSERT(locales.isValid());
862    const char *localeID = "root";
863    do {
864        Locale locale(localeID);
865        LocalPointer<StringEnumeration> types(
866                Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
867        errorCode.assertSuccess();
868        const char *type = NULL;  // default type
869        do {
870            Locale localeWithType(locale);
871            if(type != NULL) {
872                localeWithType.setKeywordValue("collation", type, errorCode);
873            }
874            errorCode.assertSuccess();
875            LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
876            if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
877                                              localeWithType.getName())) {
878                continue;
879            }
880            Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
881            if(uhash_geti(prevLocales, actual.getName()) != 0) {
882                continue;
883            }
884            uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
885            errorCode.assertSuccess();
886            logln("TestTailoredElements(): requested %s -> actual %s",
887                  localeWithType.getName(), actual.getName());
888            RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
889            if(rbc == NULL) {
890                continue;
891            }
892            // Note: It would be better to get tailored strings such that we can
893            // identify the prefix, and only get the CEs for the prefix+string,
894            // not also for the prefix.
895            // There is currently no API for that.
896            // It would help in an unusual case where a contraction starting in the prefix
897            // extends past its end, and we do not see the intended mapping.
898            // For example, for a mapping p|st, if there is also a contraction ps,
899            // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
900            LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
901            errorCode.assertSuccess();
902            UnicodeSetIterator iter(*tailored);
903            while(iter.next()) {
904                const UnicodeString &s = iter.getString();
905                ces.removeAllElements();
906                rbc->internalGetCEs(s, ces, errorCode);
907                errorCode.assertSuccess();
908                for(int32_t i = 0; i < ces.size(); ++i) {
909                    int64_t ce = ces.elementAti(i);
910                    if(!isValidCE(rootElements, *root, ce)) {
911                        errln("invalid tailored CE %016llx at CE index %d from string:",
912                              (long long)ce, (int)i);
913                        infoln(prettify(s));
914                    }
915                }
916            }
917        } while((type = types->next(NULL, errorCode)) != NULL);
918    } while((localeID = locales->next(NULL, errorCode)) != NULL);
919    uhash_close(prevLocales);
920}
921
922UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
923    UnicodeString s;
924    for(int32_t i = 0; i < length; ++i) {
925        if(i > 0) { s.append((UChar)0x20); }
926        uint8_t b = p[i];
927        if(b == 0) {
928            s.append((UChar)0x2e);  // period
929        } else if(b == 1) {
930            s.append((UChar)0x7c);  // vertical bar
931        } else {
932            appendHex(b, 2, s);
933        }
934    }
935    return s;
936}
937
938UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
939    int32_t length;
940    const uint8_t *p = key.getByteArray(length);
941    return printSortKey(p, length);
942}
943
944UBool CollationTest::readLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
945    int32_t lineLength;
946    const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
947    if(line == NULL || errorCode.isFailure()) {
948        fileLine.remove();
949        return FALSE;
950    }
951    ++fileLineNumber;
952    // Strip trailing CR/LF, comments, and spaces.
953    const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
954    if(comment != NULL) {
955        lineLength = (int32_t)(comment - line);
956    } else {
957        while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
958    }
959    while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
960    fileLine.setTo(FALSE, line, lineLength);
961    return TRUE;
962}
963
964void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
965                                UErrorCode &errorCode) {
966    int32_t length = fileLine.length();
967    int32_t i;
968    for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
969    int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
970    if(pipeIndex >= 0) {
971        prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
972        if(prefix.isEmpty()) {
973            errln("empty prefix on line %d", (int)fileLineNumber);
974            infoln(fileLine);
975            errorCode = U_PARSE_ERROR;
976            return;
977        }
978        start = pipeIndex + 1;
979    } else {
980        prefix.remove();
981    }
982    s = fileLine.tempSubStringBetween(start, i).unescape();
983    if(s.isEmpty()) {
984        errln("empty string on line %d", (int)fileLineNumber);
985        infoln(fileLine);
986        errorCode = U_PARSE_ERROR;
987        return;
988    }
989    start = i;
990}
991
992Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
993    Collation::Level relation;
994    int32_t start;
995    if(fileLine[0] == 0x3c) {  // <
996        UChar second = fileLine[1];
997        start = 2;
998        switch(second) {
999        case 0x31:  // <1
1000            relation = Collation::PRIMARY_LEVEL;
1001            break;
1002        case 0x32:  // <2
1003            relation = Collation::SECONDARY_LEVEL;
1004            break;
1005        case 0x33:  // <3
1006            relation = Collation::TERTIARY_LEVEL;
1007            break;
1008        case 0x34:  // <4
1009            relation = Collation::QUATERNARY_LEVEL;
1010            break;
1011        case 0x63:  // <c
1012            relation = Collation::CASE_LEVEL;
1013            break;
1014        case 0x69:  // <i
1015            relation = Collation::IDENTICAL_LEVEL;
1016            break;
1017        default:  // just <
1018            relation = Collation::NO_LEVEL;
1019            start = 1;
1020            break;
1021        }
1022    } else if(fileLine[0] == 0x3d) {  // =
1023        relation = Collation::ZERO_LEVEL;
1024        start = 1;
1025    } else {
1026        start = 0;
1027    }
1028    if(start == 0 || !isSpace(fileLine[start])) {
1029        errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1030        infoln(fileLine);
1031        errorCode.set(U_PARSE_ERROR);
1032        return Collation::NO_LEVEL;
1033    }
1034    start = skipSpaces(start);
1035    UnicodeString prefix;
1036    parseString(start, prefix, s, errorCode);
1037    if(errorCode.isSuccess() && !prefix.isEmpty()) {
1038        errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1039        infoln(fileLine);
1040        errorCode.set(U_PARSE_ERROR);
1041        return Collation::NO_LEVEL;
1042    }
1043    if(start < fileLine.length()) {
1044        errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1045        infoln(fileLine);
1046        errorCode.set(U_PARSE_ERROR);
1047        return Collation::NO_LEVEL;
1048    }
1049    return relation;
1050}
1051
1052static const struct {
1053    const char *name;
1054    UColAttribute attr;
1055} attributes[] = {
1056    { "backwards", UCOL_FRENCH_COLLATION },
1057    { "alternate", UCOL_ALTERNATE_HANDLING },
1058    { "caseFirst", UCOL_CASE_FIRST },
1059    { "caseLevel", UCOL_CASE_LEVEL },
1060    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
1061    { "strength", UCOL_STRENGTH },
1062    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
1063    { "numeric", UCOL_NUMERIC_COLLATION }
1064};
1065
1066static const struct {
1067    const char *name;
1068    UColAttributeValue value;
1069} attributeValues[] = {
1070    { "default", UCOL_DEFAULT },
1071    { "primary", UCOL_PRIMARY },
1072    { "secondary", UCOL_SECONDARY },
1073    { "tertiary", UCOL_TERTIARY },
1074    { "quaternary", UCOL_QUATERNARY },
1075    { "identical", UCOL_IDENTICAL },
1076    { "off", UCOL_OFF },
1077    { "on", UCOL_ON },
1078    { "shifted", UCOL_SHIFTED },
1079    { "non-ignorable", UCOL_NON_IGNORABLE },
1080    { "lower", UCOL_LOWER_FIRST },
1081    { "upper", UCOL_UPPER_FIRST }
1082};
1083
1084void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1085    int32_t start = skipSpaces(1);
1086    int32_t equalPos = fileLine.indexOf(0x3d);
1087    if(equalPos < 0) {
1088        if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1089            parseAndSetReorderCodes(start + 7, errorCode);
1090            return;
1091        }
1092        errln("missing '=' on line %d", (int)fileLineNumber);
1093        infoln(fileLine);
1094        errorCode.set(U_PARSE_ERROR);
1095        return;
1096    }
1097
1098    UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1099    UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1100    if(attrString == UNICODE_STRING("maxVariable", 11)) {
1101        UColReorderCode max;
1102        if(valueString == UNICODE_STRING("space", 5)) {
1103            max = UCOL_REORDER_CODE_SPACE;
1104        } else if(valueString == UNICODE_STRING("punct", 5)) {
1105            max = UCOL_REORDER_CODE_PUNCTUATION;
1106        } else if(valueString == UNICODE_STRING("symbol", 6)) {
1107            max = UCOL_REORDER_CODE_SYMBOL;
1108        } else if(valueString == UNICODE_STRING("currency", 8)) {
1109            max = UCOL_REORDER_CODE_CURRENCY;
1110        } else {
1111            errln("invalid attribute value name on line %d", (int)fileLineNumber);
1112            infoln(fileLine);
1113            errorCode.set(U_PARSE_ERROR);
1114            return;
1115        }
1116        coll->setMaxVariable(max, errorCode);
1117        if(errorCode.isFailure()) {
1118            errln("setMaxVariable() failed on line %d: %s",
1119                  (int)fileLineNumber, errorCode.errorName());
1120            infoln(fileLine);
1121            return;
1122        }
1123        fileLine.remove();
1124        return;
1125    }
1126
1127    UColAttribute attr;
1128    for(int32_t i = 0;; ++i) {
1129        if(i == LENGTHOF(attributes)) {
1130            errln("invalid attribute name on line %d", (int)fileLineNumber);
1131            infoln(fileLine);
1132            errorCode.set(U_PARSE_ERROR);
1133            return;
1134        }
1135        if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1136            attr = attributes[i].attr;
1137            break;
1138        }
1139    }
1140
1141    UColAttributeValue value;
1142    for(int32_t i = 0;; ++i) {
1143        if(i == LENGTHOF(attributeValues)) {
1144            errln("invalid attribute value name on line %d", (int)fileLineNumber);
1145            infoln(fileLine);
1146            errorCode.set(U_PARSE_ERROR);
1147            return;
1148        }
1149        if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1150            value = attributeValues[i].value;
1151            break;
1152        }
1153    }
1154
1155    coll->setAttribute(attr, value, errorCode);
1156    if(errorCode.isFailure()) {
1157        errln("illegal attribute=value combination on line %d: %s",
1158              (int)fileLineNumber, errorCode.errorName());
1159        infoln(fileLine);
1160        return;
1161    }
1162    fileLine.remove();
1163}
1164
1165void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1166    UVector32 reorderCodes(errorCode);
1167    while(start < fileLine.length()) {
1168        start = skipSpaces(start);
1169        int32_t limit = start;
1170        while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1171        CharString name;
1172        name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1173        int32_t code = CollationRuleParser::getReorderCode(name.data());
1174        if(code < -1) {
1175            errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1176            infoln(fileLine);
1177            errorCode.set(U_PARSE_ERROR);
1178            return;
1179        }
1180        reorderCodes.addElement(code, errorCode);
1181        start = limit;
1182    }
1183    coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1184    if(errorCode.isFailure()) {
1185        errln("setReorderCodes() failed on line %d: %s", (int)fileLineNumber, errorCode.errorName());
1186        infoln(fileLine);
1187        return;
1188    }
1189    fileLine.remove();
1190}
1191
1192void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1193    UnicodeString rules;
1194    while(readLine(f, errorCode)) {
1195        if(fileLine.isEmpty()) { continue; }
1196        if(isSectionStarter(fileLine[0])) { break; }
1197        rules.append(fileLine.unescape());
1198    }
1199    if(errorCode.isFailure()) { return; }
1200    logln(rules);
1201
1202    UParseError parseError;
1203    UnicodeString reason;
1204    delete coll;
1205    coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1206    if(coll == NULL) {
1207        errln("unable to allocate a new collator");
1208        errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1209        return;
1210    }
1211    if(errorCode.isFailure()) {
1212        errln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1213        infoln(UnicodeString("  reason: ") + reason);
1214        if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1215        if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1216            infoln(UnicodeString("  snippet: ...") +
1217                parseError.preContext + "(!)" + parseError.postContext + "...");
1218        }
1219    } else {
1220        assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1221                     UnicodeString(), reason);
1222    }
1223}
1224
1225void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1226    if(errorCode.isFailure()) { return; }
1227    delete coll;
1228    coll = Collator::createInstance(Locale::getRoot(), errorCode);
1229    if(errorCode.isFailure()) {
1230        dataerrln("unable to create a root collator");
1231        return;
1232    }
1233}
1234
1235void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1236    if(errorCode.isFailure()) { return; }
1237    int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1238    if(at >= 0) {
1239        fileLine.setCharAt(at, (UChar)0x2a);  // *
1240    }
1241    CharString localeID;
1242    localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1243    if(at >= 0) {
1244        localeID.data()[at - 9] = '@';
1245    }
1246    Locale locale(localeID.data());
1247    if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1248        errln("invalid language tag on line %d", (int)fileLineNumber);
1249        infoln(fileLine);
1250        if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1251        return;
1252    }
1253
1254    logln("creating a collator for locale ID %s", locale.getName());
1255    Collator *newColl = Collator::createInstance(locale, errorCode);
1256    if(errorCode.isFailure()) {
1257        dataerrln("unable to create a collator for locale %s on line %d",
1258                  locale.getName(), (int)fileLineNumber);
1259        infoln(fileLine);
1260        return;
1261    }
1262    delete coll;
1263    coll = newColl;
1264}
1265
1266UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1267    if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1268    // In some sequences with Tibetan composite vowel signs,
1269    // even if the string passes the FCD check,
1270    // those composites must be decomposed.
1271    // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1272    int32_t index = 0;
1273    while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1274        if(++index < s.length()) {
1275            UChar c = s[index];
1276            if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1277        }
1278    }
1279    return FALSE;
1280}
1281
1282UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1283                                     CharString &dest, int32_t partSize,
1284                                     IcuTestErrorCode &errorCode) {
1285    if(errorCode.isFailure()) { return FALSE; }
1286    uint8_t part[32];
1287    U_ASSERT(partSize <= LENGTHOF(part));
1288    UCharIterator iter;
1289    uiter_setString(&iter, s, length);
1290    uint32_t state[2] = { 0, 0 };
1291    for(;;) {
1292        int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1293        UBool done = partLength < partSize;
1294        if(done) {
1295            // At the end, append the next byte as well which should be 00.
1296            ++partLength;
1297        }
1298        dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1299        if(done) {
1300            return errorCode.isSuccess();
1301        }
1302    }
1303}
1304
1305UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1306                                     const UChar *s, int32_t length,
1307                                     CollationKey &key, IcuTestErrorCode &errorCode) {
1308    if(errorCode.isFailure()) { return FALSE; }
1309    coll->getCollationKey(s, length, key, errorCode);
1310    if(errorCode.isFailure()) {
1311        infoln(fileTestName);
1312        errln("Collator(%s).getCollationKey() failed: %s",
1313              norm, errorCode.errorName());
1314        infoln(line);
1315        return FALSE;
1316    }
1317    int32_t keyLength;
1318    const uint8_t *keyBytes = key.getByteArray(keyLength);
1319    if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1320        infoln(fileTestName);
1321        errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1322              norm);
1323        infoln(line);
1324        infoln(printCollationKey(key));
1325        return FALSE;
1326    }
1327
1328    int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1329    if(numLevels < UCOL_IDENTICAL) {
1330        ++numLevels;
1331    } else {
1332        numLevels = 5;
1333    }
1334    if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1335        ++numLevels;
1336    }
1337    errorCode.assertSuccess();
1338    int32_t numLevelSeparators = 0;
1339    for(int32_t i = 0; i < (keyLength - 1); ++i) {
1340        uint8_t b = keyBytes[i];
1341        if(b == 0) {
1342            infoln(fileTestName);
1343            errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1344            infoln(line);
1345            infoln(printCollationKey(key));
1346            return FALSE;
1347        }
1348        if(b == 1) { ++numLevelSeparators; }
1349    }
1350    if(numLevelSeparators != (numLevels - 1)) {
1351        infoln(fileTestName);
1352        errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1353              norm, (int)numLevelSeparators, (int)numLevels);
1354        infoln(line);
1355        infoln(printCollationKey(key));
1356        return FALSE;
1357    }
1358
1359    // If s contains U+FFFE, check that merged segments make the same key.
1360    LocalMemory<uint8_t> mergedKey;
1361    int32_t mergedKeyLength = 0;
1362    int32_t mergedKeyCapacity = 0;
1363    int32_t sLength = (length >= 0) ? length : u_strlen(s);
1364    int32_t segmentStart = 0;
1365    for(int32_t i = 0;;) {
1366        if(i == sLength) {
1367            if(segmentStart == 0) {
1368                // s does not contain any U+FFFE.
1369                break;
1370            }
1371        } else if(s[i] != 0xfffe) {
1372            ++i;
1373            continue;
1374        }
1375        // Get the sort key for another segment and merge it into mergedKey.
1376        CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1377        CollationKey key2;
1378        coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1379        int32_t key1Length, key2Length;
1380        const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1381        const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1382        uint8_t *dest;
1383        int32_t minCapacity = key1Length + key2Length;
1384        if(key1Length > 0) { --minCapacity; }
1385        if(minCapacity <= mergedKeyCapacity) {
1386            dest = mergedKey.getAlias();
1387        } else {
1388            if(minCapacity <= 200) {
1389                mergedKeyCapacity = 200;
1390            } else if(minCapacity <= 2 * mergedKeyCapacity) {
1391                mergedKeyCapacity *= 2;
1392            } else {
1393                mergedKeyCapacity = minCapacity;
1394            }
1395            dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1396        }
1397        U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1398        if(key1Length == 0) {
1399            // key2 is the sort key for the first segment.
1400            uprv_memcpy(dest, key2Bytes, key2Length);
1401            mergedKeyLength = key2Length;
1402        } else {
1403            mergedKeyLength =
1404                ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1405                                   dest, mergedKeyCapacity);
1406        }
1407        if(i == sLength) { break; }
1408        segmentStart = ++i;
1409    }
1410    if(segmentStart != 0 &&
1411            (mergedKeyLength != keyLength ||
1412            uprv_memcmp(mergedKey.getAlias(), keyBytes, keyLength) != 0)) {
1413        infoln(fileTestName);
1414        errln("Collator(%s).getCollationKey(with U+FFFE) != "
1415              "ucol_mergeSortkeys(segments)",
1416              norm);
1417        infoln(line);
1418        infoln(printCollationKey(key));
1419        infoln(printSortKey(mergedKey.getAlias(), mergedKeyLength));
1420        return FALSE;
1421    }
1422
1423    // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1424    static const int32_t partSizes[] = { 32, 3, 1 };
1425    for(int32_t psi = 0; psi < LENGTHOF(partSizes); ++psi) {
1426        int32_t partSize = partSizes[psi];
1427        CharString parts;
1428        if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1429            infoln(fileTestName);
1430            errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1431                  norm, (int)partSize, errorCode.errorName());
1432            infoln(line);
1433            return FALSE;
1434        }
1435        if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1436            infoln(fileTestName);
1437            errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1438                  norm, (int)partSize);
1439            infoln(line);
1440            infoln(printCollationKey(key));
1441            infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1442            return FALSE;
1443        }
1444    }
1445    return TRUE;
1446}
1447
1448namespace {
1449
1450/**
1451 * Replaces unpaired surrogates with U+FFFD.
1452 * Returns s if no replacement was made, otherwise buffer.
1453 */
1454const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1455    int32_t i = 0;
1456    while(i < s.length()) {
1457        UChar32 c = s.char32At(i);
1458        if(U_IS_SURROGATE(c)) {
1459            if(buffer.length() < i) {
1460                buffer.append(s, buffer.length(), i - buffer.length());
1461            }
1462            buffer.append((UChar)0xfffd);
1463        }
1464        i += U16_LENGTH(c);
1465    }
1466    if(buffer.isEmpty()) {
1467        return s;
1468    }
1469    if(buffer.length() < i) {
1470        buffer.append(s, buffer.length(), i - buffer.length());
1471    }
1472    return buffer;
1473}
1474
1475}
1476
1477UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1478                                     const UnicodeString &prevString, const UnicodeString &s,
1479                                     UCollationResult expectedOrder, Collation::Level expectedLevel,
1480                                     IcuTestErrorCode &errorCode) {
1481    if(errorCode.isFailure()) { return FALSE; }
1482
1483    // Get the sort keys first, for error debug output.
1484    CollationKey prevKey;
1485    if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1486                        prevKey, errorCode)) {
1487        return FALSE;
1488    }
1489    CollationKey key;
1490    if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1491
1492    UCollationResult order = coll->compare(prevString, s, errorCode);
1493    if(order != expectedOrder || errorCode.isFailure()) {
1494        infoln(fileTestName);
1495        errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1496              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1497        infoln(prevFileLine);
1498        infoln(fileLine);
1499        infoln(printCollationKey(prevKey));
1500        infoln(printCollationKey(key));
1501        return FALSE;
1502    }
1503    order = coll->compare(s, prevString, errorCode);
1504    if(order != -expectedOrder || errorCode.isFailure()) {
1505        infoln(fileTestName);
1506        errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1507              (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1508        infoln(prevFileLine);
1509        infoln(fileLine);
1510        infoln(printCollationKey(prevKey));
1511        infoln(printCollationKey(key));
1512        return FALSE;
1513    }
1514    // Test NUL-termination if the strings do not contain NUL characters.
1515    UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1516    if(!containNUL) {
1517        order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1518        if(order != expectedOrder || errorCode.isFailure()) {
1519            infoln(fileTestName);
1520            errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1521                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1522            infoln(prevFileLine);
1523            infoln(fileLine);
1524            infoln(printCollationKey(prevKey));
1525            infoln(printCollationKey(key));
1526            return FALSE;
1527        }
1528        order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1529        if(order != -expectedOrder || errorCode.isFailure()) {
1530            infoln(fileTestName);
1531            errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1532                  (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1533            infoln(prevFileLine);
1534            infoln(fileLine);
1535            infoln(printCollationKey(prevKey));
1536            infoln(printCollationKey(key));
1537            return FALSE;
1538        }
1539    }
1540
1541#if U_HAVE_STD_STRING
1542    // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1543    // Unpaired surrogates cannot be converted to UTF-8.
1544    // Create valid UTF-16 strings if necessary, and use those for
1545    // both the expected compare() result and for the input to compare(UTF-8).
1546    UnicodeString prevBuffer, sBuffer;
1547    const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1548    const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1549    std::string prevUTF8, sUTF8;
1550    UnicodeString(prevValid).toUTF8String(prevUTF8);
1551    UnicodeString(sValid).toUTF8String(sUTF8);
1552    UCollationResult expectedUTF8Order;
1553    if(&prevValid == &prevString && &sValid == &s) {
1554        expectedUTF8Order = expectedOrder;
1555    } else {
1556        expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1557    }
1558
1559    order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1560    if(order != expectedUTF8Order || errorCode.isFailure()) {
1561        infoln(fileTestName);
1562        errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1563              (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1564        infoln(prevFileLine);
1565        infoln(fileLine);
1566        infoln(printCollationKey(prevKey));
1567        infoln(printCollationKey(key));
1568        return FALSE;
1569    }
1570    order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1571    if(order != -expectedUTF8Order || errorCode.isFailure()) {
1572        infoln(fileTestName);
1573        errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1574              (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1575        infoln(prevFileLine);
1576        infoln(fileLine);
1577        infoln(printCollationKey(prevKey));
1578        infoln(printCollationKey(key));
1579        return FALSE;
1580    }
1581    // Test NUL-termination if the strings do not contain NUL characters.
1582    if(!containNUL) {
1583        order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1584        if(order != expectedUTF8Order || errorCode.isFailure()) {
1585            infoln(fileTestName);
1586            errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1587                  (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1588            infoln(prevFileLine);
1589            infoln(fileLine);
1590            infoln(printCollationKey(prevKey));
1591            infoln(printCollationKey(key));
1592            return FALSE;
1593        }
1594        order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1595        if(order != -expectedUTF8Order || errorCode.isFailure()) {
1596            infoln(fileTestName);
1597            errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1598                  (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1599            infoln(prevFileLine);
1600            infoln(fileLine);
1601            infoln(printCollationKey(prevKey));
1602            infoln(printCollationKey(key));
1603            return FALSE;
1604        }
1605    }
1606#endif
1607
1608    UCharIterator leftIter;
1609    UCharIterator rightIter;
1610    uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1611    uiter_setString(&rightIter, s.getBuffer(), s.length());
1612    order = coll->compare(leftIter, rightIter, errorCode);
1613    if(order != expectedOrder || errorCode.isFailure()) {
1614        infoln(fileTestName);
1615        errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1616              "wrong order: %d != %d (%s)",
1617              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1618        infoln(prevFileLine);
1619        infoln(fileLine);
1620        infoln(printCollationKey(prevKey));
1621        infoln(printCollationKey(key));
1622        return FALSE;
1623    }
1624
1625    order = prevKey.compareTo(key, errorCode);
1626    if(order != expectedOrder || errorCode.isFailure()) {
1627        infoln(fileTestName);
1628        errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1629              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1630        infoln(prevFileLine);
1631        infoln(fileLine);
1632        infoln(printCollationKey(prevKey));
1633        infoln(printCollationKey(key));
1634        return FALSE;
1635    }
1636    if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1637        int32_t prevKeyLength;
1638        const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1639        int32_t keyLength;
1640        const uint8_t *bytes = key.getByteArray(keyLength);
1641        int32_t level = Collation::PRIMARY_LEVEL;
1642        for(int32_t i = 0;; ++i) {
1643            uint8_t b = prevBytes[i];
1644            if(b != bytes[i]) { break; }
1645            if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1646                ++level;
1647                if(level == Collation::CASE_LEVEL &&
1648                        coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_OFF) {
1649                    ++level;
1650                }
1651            }
1652        }
1653        if(level != expectedLevel) {
1654            infoln(fileTestName);
1655            errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1656                  (int)fileLineNumber, norm, order, level, expectedLevel);
1657            infoln(prevFileLine);
1658            infoln(fileLine);
1659            infoln(printCollationKey(prevKey));
1660            infoln(printCollationKey(key));
1661            return FALSE;
1662        }
1663    }
1664    return TRUE;
1665}
1666
1667void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1668    if(errorCode.isFailure()) { return; }
1669    UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1670    UnicodeString prevString, s;
1671    prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1672    while(readLine(f, errorCode)) {
1673        if(fileLine.isEmpty()) { continue; }
1674        if(isSectionStarter(fileLine[0])) { break; }
1675        Collation::Level relation = parseRelationAndString(s, errorCode);
1676        if(errorCode.isFailure()) {
1677            errorCode.reset();
1678            break;
1679        }
1680        UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1681        Collation::Level expectedLevel = relation;
1682        s.getTerminatedBuffer();  // Ensure NUL-termination.
1683        UBool isOk = TRUE;
1684        if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1685            coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1686            isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1687                                   expectedOrder, expectedLevel, errorCode);
1688        }
1689        if(isOk) {
1690            coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1691            isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1692                                   expectedOrder, expectedLevel, errorCode);
1693        }
1694        if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1695            UnicodeString pn = nfd->normalize(prevString, errorCode);
1696            UnicodeString n = nfd->normalize(s, errorCode);
1697            pn.getTerminatedBuffer();
1698            n.getTerminatedBuffer();
1699            errorCode.assertSuccess();
1700            isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1701                                   expectedOrder, expectedLevel, errorCode);
1702        }
1703        if(!isOk) {
1704            errorCode.reset();  // already reported
1705        }
1706        prevFileLine = fileLine;
1707        prevString = s;
1708        prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1709    }
1710}
1711
1712void CollationTest::TestDataDriven() {
1713    IcuTestErrorCode errorCode(*this, "TestDataDriven");
1714
1715    fcd = Normalizer2Factory::getFCDInstance(errorCode);
1716    nfd = Normalizer2Factory::getNFDInstance(errorCode);
1717    if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1718        return;
1719    }
1720
1721    CharString path(getSourceTestData(errorCode), errorCode);
1722    path.appendPathPart("collationtest.txt", errorCode);
1723    const char *codePage = "UTF-8";
1724    LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1725    if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1726        return;
1727    }
1728    while(errorCode.isSuccess()) {
1729        // Read a new line if necessary.
1730        // Sub-parsers leave the first line set that they do not handle.
1731        if(fileLine.isEmpty()) {
1732            if(!readLine(f.getAlias(), errorCode)) { break; }
1733            continue;
1734        }
1735        if(!isSectionStarter(fileLine[0])) {
1736            errln("syntax error on line %d", (int)fileLineNumber);
1737            infoln(fileLine);
1738            return;
1739        }
1740        if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1741            fileTestName = fileLine;
1742            logln(fileLine);
1743            fileLine.remove();
1744        } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1745            setRootCollator(errorCode);
1746            fileLine.remove();
1747        } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1748            setLocaleCollator(errorCode);
1749            fileLine.remove();
1750        } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1751            buildTailoring(f.getAlias(), errorCode);
1752        } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1753            parseAndSetAttribute(errorCode);
1754        } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1755            checkCompareStrings(f.getAlias(), errorCode);
1756        } else {
1757            errln("syntax error on line %d", (int)fileLineNumber);
1758            infoln(fileLine);
1759            return;
1760        }
1761    }
1762}
1763
1764#endif  // !UCONFIG_NO_COLLATION
1765