1/*
2*******************************************************************************
3* Copyright (C) 2012-2015, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationtest.cpp
7*
8* created on: 2012apr27
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/coll.h"
17#include "unicode/errorcode.h"
18#include "unicode/localpointer.h"
19#include "unicode/normalizer2.h"
20#include "unicode/sortkey.h"
21#include "unicode/std_string.h"
22#include "unicode/strenum.h"
23#include "unicode/tblcoll.h"
24#include "unicode/uiter.h"
25#include "unicode/uniset.h"
26#include "unicode/unistr.h"
27#include "unicode/usetiter.h"
28#include "unicode/ustring.h"
29#include "charstr.h"
30#include "cmemory.h"
31#include "collation.h"
32#include "collationdata.h"
33#include "collationfcd.h"
34#include "collationiterator.h"
35#include "collationroot.h"
36#include "collationrootelements.h"
37#include "collationruleparser.h"
38#include "collationweights.h"
39#include "cstring.h"
40#include "intltest.h"
41#include "normalizer2impl.h"
42#include "ucbuf.h"
43#include "uhash.h"
44#include "uitercollationiterator.h"
45#include "utf16collationiterator.h"
46#include "utf8collationiterator.h"
47#include "uvectr32.h"
48#include "uvectr64.h"
49#include "writesrc.h"
50
51// TODO: Move to ucbuf.h
52U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
53
54class CodePointIterator;
55
56// TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
57
58class CollationTest : public IntlTest {
59public:
60    CollationTest()
61            : fcd(NULL), nfd(NULL),
62              fileLineNumber(0),
63              coll(NULL) {}
64
65    ~CollationTest() {
66        delete coll;
67    }
68
69    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
70
71    void TestMinMax();
72    void TestImplicits();
73    void TestNulTerminated();
74    void TestIllegalUTF8();
75    void TestShortFCDData();
76    void TestFCD();
77    void TestCollationWeights();
78    void TestRootElements();
79    void TestTailoredElements();
80    void TestDataDriven();
81
82private:
83    void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
84    void checkAllocWeights(CollationWeights &cw,
85                           uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
86                           int32_t someLength, int32_t minCount);
87
88    static UnicodeString printSortKey(const uint8_t *p, int32_t length);
89    static UnicodeString printCollationKey(const CollationKey &key);
90
91    // Helpers & fields for data-driven test.
92    static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
93    static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
94    static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
95    int32_t skipSpaces(int32_t i) {
96        while(isSpace(fileLine[i])) { ++i; }
97        return i;
98    }
99
100    UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
101    void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
102    Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
103    void parseAndSetAttribute(IcuTestErrorCode &errorCode);
104    void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
105    void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
106    void setRootCollator(IcuTestErrorCode &errorCode);
107    void setLocaleCollator(IcuTestErrorCode &errorCode);
108
109    UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
110
111    UBool getSortKeyParts(const UChar *s, int32_t length,
112                          CharString &dest, int32_t partSize,
113                          IcuTestErrorCode &errorCode);
114    UBool getCollationKey(const char *norm, const UnicodeString &line,
115                          const UChar *s, int32_t length,
116                          CollationKey &key, IcuTestErrorCode &errorCode);
117    UBool getMergedCollationKey(const UChar *s, int32_t length,
118                                CollationKey &key, IcuTestErrorCode &errorCode);
119    UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
120                          const UnicodeString &prevString, const UnicodeString &s,
121                          UCollationResult expectedOrder, Collation::Level expectedLevel,
122                          IcuTestErrorCode &errorCode);
123    void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
124
125    const Normalizer2 *fcd, *nfd;
126    UnicodeString fileLine;
127    int32_t fileLineNumber;
128    UnicodeString fileTestName;
129    Collator *coll;
130};
131
132extern IntlTest *createCollationTest() {
133    return new CollationTest();
134}
135
136void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
137    if(exec) {
138        logln("TestSuite CollationTest: ");
139    }
140    TESTCASE_AUTO_BEGIN;
141    TESTCASE_AUTO(TestMinMax);
142    TESTCASE_AUTO(TestImplicits);
143    TESTCASE_AUTO(TestNulTerminated);
144    TESTCASE_AUTO(TestIllegalUTF8);
145    TESTCASE_AUTO(TestShortFCDData);
146    TESTCASE_AUTO(TestFCD);
147    TESTCASE_AUTO(TestCollationWeights);
148    TESTCASE_AUTO(TestRootElements);
149    TESTCASE_AUTO(TestTailoredElements);
150    TESTCASE_AUTO(TestDataDriven);
151    TESTCASE_AUTO_END;
152}
153
154void CollationTest::TestMinMax() {
155    IcuTestErrorCode errorCode(*this, "TestMinMax");
156
157    setRootCollator(errorCode);
158    if(errorCode.isFailure()) {
159        errorCode.reset();
160        return;
161    }
162    RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
163    if(rbc == NULL) {
164        errln("the root collator is not a RuleBasedCollator");
165        return;
166    }
167
168    static const UChar s[2] = { 0xfffe, 0xffff };
169    UVector64 ces(errorCode);
170    rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
171    errorCode.assertSuccess();
172    if(ces.size() != 2) {
173        errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
174        return;
175    }
176    int64_t ce = ces.elementAti(0);
177    int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
178    if(ce != expected) {
179        errln("CE(U+fffe)=%04lx != 02..", (long)ce);
180    }
181
182    ce = ces.elementAti(1);
183    expected = Collation::makeCE(Collation::MAX_PRIMARY);
184    if(ce != expected) {
185        errln("CE(U+ffff)=%04lx != max..", (long)ce);
186    }
187}
188
189void CollationTest::TestImplicits() {
190    IcuTestErrorCode errorCode(*this, "TestImplicits");
191
192    const CollationData *cd = CollationRoot::getData(errorCode);
193    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
194        return;
195    }
196
197    // Implicit primary weights should be assigned for the following sets,
198    // and sort in ascending order by set and then code point.
199    // See http://www.unicode.org/reports/tr10/#Implicit_Weights
200
201    // core Han Unified Ideographs
202    UnicodeSet coreHan("[\\p{unified_ideograph}&"
203                            "[\\p{Block=CJK_Unified_Ideographs}"
204                            "\\p{Block=CJK_Compatibility_Ideographs}]]",
205                       errorCode);
206    // all other Unified Han ideographs
207    UnicodeSet otherHan("[\\p{unified ideograph}-"
208                            "[\\p{Block=CJK_Unified_Ideographs}"
209                            "\\p{Block=CJK_Compatibility_Ideographs}]]",
210                        errorCode);
211    UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
212    unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
213
214    // Starting with CLDR 26/ICU 54, the root Han order may instead be
215    // the Unihan radical-stroke order.
216    // The tests should pass either way, so we only test the order of a small set of Han characters
217    // whose radical-stroke order is the same as their code point order.
218    UnicodeSet someHanInCPOrder(
219            "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
220            "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
221            errorCode);
222    UnicodeSet inOrder(someHanInCPOrder);
223    inOrder.addAll(unassigned).freeze();
224    if(errorCode.logIfFailureAndReset("UnicodeSet")) {
225        return;
226    }
227    const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
228    UChar32 prev = 0;
229    uint32_t prevPrimary = 0;
230    UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
231    for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
232        LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
233        while(iter->next()) {
234            UChar32 c = iter->getCodepoint();
235            UnicodeString s(c);
236            ci.setText(s.getBuffer(), s.getBuffer() + s.length());
237            int64_t ce = ci.nextCE(errorCode);
238            int64_t ce2 = ci.nextCE(errorCode);
239            if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
240                return;
241            }
242            if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
243                errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
244                continue;
245            }
246            if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
247                errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
248                      (long)c, (long)(ce & 0xffffffff));
249                continue;
250            }
251            uint32_t primary = (uint32_t)(ce >> 32);
252            if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
253                errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
254                      (long)c, (long)primary, (long)prev, (long)prevPrimary);
255            }
256            prev = c;
257            prevPrimary = primary;
258        }
259    }
260}
261
262void CollationTest::TestNulTerminated() {
263    IcuTestErrorCode errorCode(*this, "TestNulTerminated");
264    const CollationData *data = CollationRoot::getData(errorCode);
265    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
266        return;
267    }
268
269    static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
270
271    UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
272    UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
273    for(int32_t i = 0;; ++i) {
274        int64_t ce1 = ci1.nextCE(errorCode);
275        int64_t ce2 = ci2.nextCE(errorCode);
276        if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
277            return;
278        }
279        if(ce1 != ce2) {
280            errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
281            break;
282        }
283        if(ce1 == Collation::NO_CE) { break; }
284    }
285}
286
287void CollationTest::TestIllegalUTF8() {
288    IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
289
290    setRootCollator(errorCode);
291    if(errorCode.isFailure()) {
292        errorCode.reset();
293        return;
294    }
295    coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
296
297    static const char *strings[] = {
298        // U+FFFD
299        "a\xef\xbf\xbdz",
300        // illegal byte sequences
301        "a\x80z",  // trail byte
302        "a\xc1\x81z",  // non-shortest form
303        "a\xe0\x82\x83z",  // non-shortest form
304        "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
305        "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
306        "a\xf0\x8f\xbf\xbfz",  // non-shortest form
307        "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
308    };
309
310    StringPiece fffd(strings[0]);
311    for(int32_t i = 1; i < UPRV_LENGTHOF(strings); ++i) {
312        StringPiece illegal(strings[i]);
313        UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
314        if(order != UCOL_EQUAL) {
315            errln("compareUTF8(U+FFFD, string %d with illegal UTF-8)=%d != UCOL_EQUAL",
316                  (int)i, order);
317        }
318    }
319}
320
321namespace {
322
323void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
324    for(UChar32 c = 0x10000; c < 0x110000;) {
325        UChar32 next = c + 0x400;
326        if(src.containsSome(c, next - 1)) {
327            dest.add(U16_LEAD(c));
328        }
329        c = next;
330    }
331}
332
333}  // namespace
334
335void CollationTest::TestShortFCDData() {
336    // See CollationFCD class comments.
337    IcuTestErrorCode errorCode(*this, "TestShortFCDData");
338    UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
339    errorCode.assertSuccess();
340    expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
341    addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
342    UnicodeSet lccc;  // actual
343    for(UChar32 c = 0; c <= 0xffff; ++c) {
344        if(CollationFCD::hasLccc(c)) { lccc.add(c); }
345    }
346    UnicodeSet diff(expectedLccc);
347    diff.removeAll(lccc);
348    diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
349    UnicodeString empty("[]");
350    UnicodeString diffString;
351    diff.toPattern(diffString, TRUE);
352    assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
353    diff = lccc;
354    diff.removeAll(expectedLccc);
355    diff.toPattern(diffString, TRUE);
356    assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
357
358    UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
359    if (errorCode.isSuccess()) {
360        addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
361        addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
362        UnicodeSet tccc;  // actual
363        for(UChar32 c = 0; c <= 0xffff; ++c) {
364            if(CollationFCD::hasTccc(c)) { tccc.add(c); }
365        }
366        diff = expectedTccc;
367        diff.removeAll(tccc);
368        diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
369        assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
370        diff = tccc;
371        diff.removeAll(expectedTccc);
372        diff.toPattern(diffString, TRUE);
373        assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
374    }
375}
376
377class CodePointIterator {
378public:
379    CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
380    void resetToStart() { pos = 0; }
381    UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
382    UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
383    int32_t getLength() const { return length; }
384    int getIndex() const { return (int)pos; }
385private:
386    const UChar32 *cp;
387    int32_t length;
388    int32_t pos;
389};
390
391void CollationTest::checkFCD(const char *name,
392                             CollationIterator &ci, CodePointIterator &cpi) {
393    IcuTestErrorCode errorCode(*this, "checkFCD");
394
395    // Iterate forward to the limit.
396    for(;;) {
397        UChar32 c1 = ci.nextCodePoint(errorCode);
398        UChar32 c2 = cpi.next();
399        if(c1 != c2) {
400            errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
401                  name, (long)c1, (long)c2, cpi.getIndex());
402            return;
403        }
404        if(c1 < 0) { break; }
405    }
406
407    // Iterate backward most of the way.
408    for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
409        UChar32 c1 = ci.previousCodePoint(errorCode);
410        UChar32 c2 = cpi.previous();
411        if(c1 != c2) {
412            errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
413                  name, (long)c1, (long)c2, cpi.getIndex());
414            return;
415        }
416    }
417
418    // Forward again.
419    for(;;) {
420        UChar32 c1 = ci.nextCodePoint(errorCode);
421        UChar32 c2 = cpi.next();
422        if(c1 != c2) {
423            errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
424                  name, (long)c1, (long)c2, cpi.getIndex());
425            return;
426        }
427        if(c1 < 0) { break; }
428    }
429
430    // Iterate backward to the start.
431    for(;;) {
432        UChar32 c1 = ci.previousCodePoint(errorCode);
433        UChar32 c2 = cpi.previous();
434        if(c1 != c2) {
435            errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
436                  name, (long)c1, (long)c2, cpi.getIndex());
437            return;
438        }
439        if(c1 < 0) { break; }
440    }
441}
442
443void CollationTest::TestFCD() {
444    IcuTestErrorCode errorCode(*this, "TestFCD");
445    const CollationData *data = CollationRoot::getData(errorCode);
446    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
447        return;
448    }
449
450    // Input string, not FCD, NUL-terminated.
451    static const UChar s[] = {
452        0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
453        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
454        0x327, 0x308,  // ccc=202, 230
455        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
456        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
457        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
458        0xac01,
459        0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
460        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
461        0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
462        0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
463        0x4e00, 0xf81,
464        0
465    };
466    // Expected code points.
467    static const UChar32 cp[] = {
468        0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
469        0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
470        0x1D15F, 0x1D16D,
471        0xac01,
472        0x63, 0x327, 0x1D165, 0x1D16D,
473        0x61,
474        0xf71, 0xf71, 0xf72, 0xf74, 0x301,
475        0x4e00, 0xf71, 0xf80
476    };
477
478    FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
479    if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
480        return;
481    }
482    CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
483    checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
484
485#if U_HAVE_STD_STRING
486    cpi.resetToStart();
487    std::string utf8;
488    UnicodeString(s).toUTF8String(utf8);
489    FCDUTF8CollationIterator u8ci(data, FALSE,
490                                  reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
491    if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
492        return;
493    }
494    checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
495#endif
496
497    cpi.resetToStart();
498    UCharIterator iter;
499    uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
500    FCDUIterCollationIterator uici(data, FALSE, iter, 0);
501    if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
502        return;
503    }
504    checkFCD("FCDUIterCollationIterator", uici, cpi);
505}
506
507void CollationTest::checkAllocWeights(CollationWeights &cw,
508                                      uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
509                                      int32_t someLength, int32_t minCount) {
510    if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
511        errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
512              (long)lowerLimit, (long)upperLimit, (long)n);
513        return;
514    }
515    uint32_t previous = lowerLimit;
516    int32_t count = 0;  // number of weights that have someLength
517    for(int32_t i = 0; i < n; ++i) {
518        uint32_t w = cw.nextWeight();
519        if(w == 0xffffffff) {
520            errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
521                  "returns only %ld weights",
522                  (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
523            return;
524        }
525        if(!(previous < w && w < upperLimit)) {
526            errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
527                  "number %ld -> %lx not between %lx and %lx",
528                  (long)lowerLimit, (long)upperLimit, (long)n,
529                  (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
530            return;
531        }
532        if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
533    }
534    if(count < minCount) {
535        errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
536              "returns only %ld < %ld weights of length %d",
537              (long)lowerLimit, (long)upperLimit, (long)n,
538              (long)count, (long)minCount, (int)someLength);
539    }
540}
541
542void CollationTest::TestCollationWeights() {
543    CollationWeights cw;
544
545    // Non-compressible primaries use 254 second bytes 02..FF.
546    logln("CollationWeights.initForPrimary(non-compressible)");
547    cw.initForPrimary(FALSE);
548    // Expect 1 weight 11 and 254 weights 12xx.
549    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
550    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
551    // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
552    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
553    // Expect 254 two-byte weights from the ranges 10ff and 11xx.
554    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
555    // Expect 254^2=64516 three-byte weights.
556    // During computation, there should be 3 three-byte ranges
557    // 10ffff, 11xxxx, 120202.
558    // The middle one should be split 64515:1,
559    // and the newly-split-off range and the last ranged lengthened.
560    checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
561    // Expect weights 1102 & 1103.
562    checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
563    // Expect weights 102102 & 102103.
564    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
565
566    // Compressible primaries use 251 second bytes 04..FE.
567    logln("CollationWeights.initForPrimary(compressible)");
568    cw.initForPrimary(TRUE);
569    // Expect 1 weight 11 and 251 weights 12xx.
570    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
571    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
572    // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
573    checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
574    // Expect weights 1104 & 1105.
575    checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
576    // Expect weights 102102 & 102103.
577    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
578
579    // Secondary and tertiary weights use only bytes 3 & 4.
580    logln("CollationWeights.initForSecondary()");
581    cw.initForSecondary();
582    // Expect weights fbxx and all four fc..ff.
583    checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
584
585    logln("CollationWeights.initForTertiary()");
586    cw.initForTertiary();
587    // Expect weights 3dxx and both 3e & 3f.
588    checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
589}
590
591namespace {
592
593UBool isValidCE(const CollationRootElements &re, const CollationData &data,
594                uint32_t p, uint32_t s, uint32_t ctq) {
595    uint32_t p1 = p >> 24;
596    uint32_t p2 = (p >> 16) & 0xff;
597    uint32_t p3 = (p >> 8) & 0xff;
598    uint32_t p4 = p & 0xff;
599    uint32_t s1 = s >> 8;
600    uint32_t s2 = s & 0xff;
601    // ctq = Case, Tertiary, Quaternary
602    uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
603    uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
604    uint32_t t1 = t >> 8;
605    uint32_t t2 = t & 0xff;
606    uint32_t q = ctq & Collation::QUATERNARY_MASK;
607    // No leading zero bytes.
608    if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
609        return FALSE;
610    }
611    // No intermediate zero bytes.
612    if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
613        return FALSE;
614    }
615    if(p2 != 0 && p3 == 0 && p4 != 0) {
616        return FALSE;
617    }
618    // Minimum & maximum lead bytes.
619    if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
620            s1 == Collation::LEVEL_SEPARATOR_BYTE ||
621            t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
622        return FALSE;
623    }
624    if(c > 2) {
625        return FALSE;
626    }
627    // The valid byte range for the second primary byte depends on compressibility.
628    if(p2 != 0) {
629        if(data.isCompressibleLeadByte(p1)) {
630            if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
631                    Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
632                return FALSE;
633            }
634        } else {
635            if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
636                return FALSE;
637            }
638        }
639    }
640    // Other bytes just need to avoid the level separator.
641    // Trailing zeros are ok.
642    U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
643    if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
644            s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
645        return FALSE;
646    }
647    // Well-formed CEs.
648    if(p == 0) {
649        if(s == 0) {
650            if(t == 0) {
651                // Completely ignorable CE.
652                // Quaternary CEs are not supported.
653                if(c != 0 || q != 0) {
654                    return FALSE;
655                }
656            } else {
657                // Tertiary CE.
658                if(t < re.getTertiaryBoundary() || c != 2) {
659                    return FALSE;
660                }
661            }
662        } else {
663            // Secondary CE.
664            if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
665                return FALSE;
666            }
667        }
668    } else {
669        // Primary CE.
670        if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
671                s >= re.getSecondaryBoundary()) {
672            return FALSE;
673        }
674        if(t == 0 || t >= re.getTertiaryBoundary()) {
675            return FALSE;
676        }
677    }
678    return TRUE;
679}
680
681UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
682    uint32_t p = (uint32_t)(ce >> 32);
683    uint32_t secTer = (uint32_t)ce;
684    return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
685}
686
687class RootElementsIterator {
688public:
689    RootElementsIterator(const CollationData &root)
690            : data(root),
691              elements(root.rootElements), length(root.rootElementsLength),
692              pri(0), secTer(0),
693              index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
694
695    UBool next() {
696        if(index >= length) { return FALSE; }
697        uint32_t p = elements[index];
698        if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
699        if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
700            ++index;
701            secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
702            return TRUE;
703        }
704        if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
705            // End of a range, enumerate the primaries in the range.
706            int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
707            p &= 0xffffff00;
708            if(pri == p) {
709                // Finished the range, return the next CE after it.
710                ++index;
711                return next();
712            }
713            U_ASSERT(pri < p);
714            // Return the next primary in this range.
715            UBool isCompressible = data.isCompressiblePrimary(pri);
716            if((pri & 0xffff) == 0) {
717                pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
718            } else {
719                pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
720            }
721            return TRUE;
722        }
723        // Simple primary CE.
724        ++index;
725        pri = p;
726        // Does this have an explicit below-common sec/ter unit,
727        // or does it imply a common one?
728        if(index == length) {
729            secTer = Collation::COMMON_SEC_AND_TER_CE;
730        } else {
731            secTer = elements[index];
732            if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
733                // No sec/ter delta.
734                secTer = Collation::COMMON_SEC_AND_TER_CE;
735            } else {
736                secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
737                if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
738                    // Implied sec/ter.
739                    secTer = Collation::COMMON_SEC_AND_TER_CE;
740                } else {
741                    // Explicit sec/ter below common/common.
742                    ++index;
743                }
744            }
745        }
746        return TRUE;
747    }
748
749    uint32_t getPrimary() const { return pri; }
750    uint32_t getSecTer() const { return secTer; }
751
752private:
753    const CollationData &data;
754    const uint32_t *elements;
755    int32_t length;
756
757    uint32_t pri;
758    uint32_t secTer;
759    int32_t index;
760};
761
762}  // namespace
763
764void CollationTest::TestRootElements() {
765    IcuTestErrorCode errorCode(*this, "TestRootElements");
766    const CollationData *root = CollationRoot::getData(errorCode);
767    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
768        return;
769    }
770    CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
771    RootElementsIterator iter(*root);
772
773    // We check each root CE for validity,
774    // and we also verify that there is a tailoring gap between each two CEs.
775    CollationWeights cw1c;  // compressible primary weights
776    CollationWeights cw1u;  // uncompressible primary weights
777    CollationWeights cw2;
778    CollationWeights cw3;
779
780    cw1c.initForPrimary(TRUE);
781    cw1u.initForPrimary(FALSE);
782    cw2.initForSecondary();
783    cw3.initForTertiary();
784
785    // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
786    // nor the special merge-separator CE for U+FFFE.
787    uint32_t prevPri = 0;
788    uint32_t prevSec = 0;
789    uint32_t prevTer = 0;
790    while(iter.next()) {
791        uint32_t pri = iter.getPrimary();
792        uint32_t secTer = iter.getSecTer();
793        // CollationRootElements CEs must have 0 case and quaternary bits.
794        if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
795            errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
796                  (long)pri, (long)secTer);
797        }
798        uint32_t sec = secTer >> 16;
799        uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
800        uint32_t ctq = ter;
801        if(pri == 0 && sec == 0 && ter != 0) {
802            // Tertiary CEs must have uppercase bits,
803            // but they are not stored in the CollationRootElements.
804            ctq |= 0x8000;
805        }
806        if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
807            errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
808        } else {
809            if(pri != prevPri) {
810                uint32_t newWeight = 0;
811                if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
812                    // There is currently no tailoring gap after primary ignorables,
813                    // and we forbid tailoring after U+FFFD and U+FFFF.
814                } else if(root->isCompressiblePrimary(prevPri)) {
815                    if(!cw1c.allocWeights(prevPri, pri, 1)) {
816                        errln("no primary/compressible tailoring gap between %08lx and %08lx",
817                              (long)prevPri, (long)pri);
818                    } else {
819                        newWeight = cw1c.nextWeight();
820                    }
821                } else {
822                    if(!cw1u.allocWeights(prevPri, pri, 1)) {
823                        errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
824                              (long)prevPri, (long)pri);
825                    } else {
826                        newWeight = cw1u.nextWeight();
827                    }
828                }
829                if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
830                    errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
831                          (long)prevPri, (long)newWeight, (long)pri);
832                }
833            } else if(sec != prevSec) {
834                uint32_t lowerLimit =
835                    prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
836                if(!cw2.allocWeights(lowerLimit, sec, 1)) {
837                    errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
838                } else {
839                    uint32_t newWeight = cw2.nextWeight();
840                    if(!(prevSec < newWeight && newWeight < sec)) {
841                        errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
842                              (long)lowerLimit, (long)newWeight, (long)sec);
843                    }
844                }
845            } else if(ter != prevTer) {
846                uint32_t lowerLimit =
847                    prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
848                if(!cw3.allocWeights(lowerLimit, ter, 1)) {
849                    errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
850                } else {
851                    uint32_t newWeight = cw3.nextWeight();
852                    if(!(prevTer < newWeight && newWeight < ter)) {
853                        errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
854                              (long)lowerLimit, (long)newWeight, (long)ter);
855                    }
856                }
857            } else {
858                errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
859            }
860        }
861        prevPri = pri;
862        prevSec = sec;
863        prevTer = ter;
864    }
865}
866
867void CollationTest::TestTailoredElements() {
868    IcuTestErrorCode errorCode(*this, "TestTailoredElements");
869    const CollationData *root = CollationRoot::getData(errorCode);
870    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
871        return;
872    }
873    CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
874
875    UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
876    if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
877        return;
878    }
879    uhash_setKeyDeleter(prevLocales, uprv_free);
880    // TestRootElements() tests the root collator which does not have tailorings.
881    uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
882    uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
883    uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
884
885    UVector64 ces(errorCode);
886    LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
887    U_ASSERT(locales.isValid());
888    const char *localeID = "root";
889    do {
890        Locale locale(localeID);
891        LocalPointer<StringEnumeration> types(
892                Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
893        errorCode.assertSuccess();
894        const char *type;  // first: default type
895        while((type = types->next(NULL, errorCode)) != NULL) {
896            if(strncmp(type, "private-", 8) == 0) {
897                errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
898                        localeID, type);
899            }
900            Locale localeWithType(locale);
901            localeWithType.setKeywordValue("collation", type, errorCode);
902            errorCode.assertSuccess();
903            LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
904            if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
905                                              localeWithType.getName())) {
906                continue;
907            }
908            Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
909            if(uhash_geti(prevLocales, actual.getName()) != 0) {
910                continue;
911            }
912            uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
913            errorCode.assertSuccess();
914            logln("TestTailoredElements(): requested %s -> actual %s",
915                  localeWithType.getName(), actual.getName());
916            RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
917            if(rbc == NULL) {
918                continue;
919            }
920            // Note: It would be better to get tailored strings such that we can
921            // identify the prefix, and only get the CEs for the prefix+string,
922            // not also for the prefix.
923            // There is currently no API for that.
924            // It would help in an unusual case where a contraction starting in the prefix
925            // extends past its end, and we do not see the intended mapping.
926            // For example, for a mapping p|st, if there is also a contraction ps,
927            // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
928            LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
929            errorCode.assertSuccess();
930            UnicodeSetIterator iter(*tailored);
931            while(iter.next()) {
932                const UnicodeString &s = iter.getString();
933                ces.removeAllElements();
934                rbc->internalGetCEs(s, ces, errorCode);
935                errorCode.assertSuccess();
936                for(int32_t i = 0; i < ces.size(); ++i) {
937                    int64_t ce = ces.elementAti(i);
938                    if(!isValidCE(rootElements, *root, ce)) {
939                        errln("invalid tailored CE %016llx at CE index %d from string:",
940                              (long long)ce, (int)i);
941                        infoln(prettify(s));
942                    }
943                }
944            }
945        }
946    } while((localeID = locales->next(NULL, errorCode)) != NULL);
947    uhash_close(prevLocales);
948}
949
950UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
951    UnicodeString s;
952    for(int32_t i = 0; i < length; ++i) {
953        if(i > 0) { s.append((UChar)0x20); }
954        uint8_t b = p[i];
955        if(b == 0) {
956            s.append((UChar)0x2e);  // period
957        } else if(b == 1) {
958            s.append((UChar)0x7c);  // vertical bar
959        } else {
960            appendHex(b, 2, s);
961        }
962    }
963    return s;
964}
965
966UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
967    int32_t length;
968    const uint8_t *p = key.getByteArray(length);
969    return printSortKey(p, length);
970}
971
972UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
973    for(;;) {
974        int32_t lineLength;
975        const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
976        if(line == NULL || errorCode.isFailure()) {
977            fileLine.remove();
978            return FALSE;
979        }
980        ++fileLineNumber;
981        // Strip trailing CR/LF, comments, and spaces.
982        const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
983        if(comment != NULL) {
984            lineLength = (int32_t)(comment - line);
985        } else {
986            while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
987        }
988        while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
989        if(lineLength != 0) {
990            fileLine.setTo(FALSE, line, lineLength);
991            return TRUE;
992        }
993        // Empty line, continue.
994    }
995}
996
997void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
998                                UErrorCode &errorCode) {
999    int32_t length = fileLine.length();
1000    int32_t i;
1001    for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
1002    int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
1003    if(pipeIndex >= 0) {
1004        prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1005        if(prefix.isEmpty()) {
1006            errln("empty prefix on line %d", (int)fileLineNumber);
1007            infoln(fileLine);
1008            errorCode = U_PARSE_ERROR;
1009            return;
1010        }
1011        start = pipeIndex + 1;
1012    } else {
1013        prefix.remove();
1014    }
1015    s = fileLine.tempSubStringBetween(start, i).unescape();
1016    if(s.isEmpty()) {
1017        errln("empty string on line %d", (int)fileLineNumber);
1018        infoln(fileLine);
1019        errorCode = U_PARSE_ERROR;
1020        return;
1021    }
1022    start = i;
1023}
1024
1025Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1026    Collation::Level relation;
1027    int32_t start;
1028    if(fileLine[0] == 0x3c) {  // <
1029        UChar second = fileLine[1];
1030        start = 2;
1031        switch(second) {
1032        case 0x31:  // <1
1033            relation = Collation::PRIMARY_LEVEL;
1034            break;
1035        case 0x32:  // <2
1036            relation = Collation::SECONDARY_LEVEL;
1037            break;
1038        case 0x33:  // <3
1039            relation = Collation::TERTIARY_LEVEL;
1040            break;
1041        case 0x34:  // <4
1042            relation = Collation::QUATERNARY_LEVEL;
1043            break;
1044        case 0x63:  // <c
1045            relation = Collation::CASE_LEVEL;
1046            break;
1047        case 0x69:  // <i
1048            relation = Collation::IDENTICAL_LEVEL;
1049            break;
1050        default:  // just <
1051            relation = Collation::NO_LEVEL;
1052            start = 1;
1053            break;
1054        }
1055    } else if(fileLine[0] == 0x3d) {  // =
1056        relation = Collation::ZERO_LEVEL;
1057        start = 1;
1058    } else {
1059        start = 0;
1060    }
1061    if(start == 0 || !isSpace(fileLine[start])) {
1062        errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1063        infoln(fileLine);
1064        errorCode.set(U_PARSE_ERROR);
1065        return Collation::NO_LEVEL;
1066    }
1067    start = skipSpaces(start);
1068    UnicodeString prefix;
1069    parseString(start, prefix, s, errorCode);
1070    if(errorCode.isSuccess() && !prefix.isEmpty()) {
1071        errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1072        infoln(fileLine);
1073        errorCode.set(U_PARSE_ERROR);
1074        return Collation::NO_LEVEL;
1075    }
1076    if(start < fileLine.length()) {
1077        errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1078        infoln(fileLine);
1079        errorCode.set(U_PARSE_ERROR);
1080        return Collation::NO_LEVEL;
1081    }
1082    return relation;
1083}
1084
// Maps attribute names as written in the test data file to UColAttribute constants.
// Searched linearly by CollationTest::parseAndSetAttribute().
static const struct {
    const char *name;
    UColAttribute attr;
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1098
// Maps attribute value names as written in the test data file to UColAttributeValue constants.
// Searched linearly by CollationTest::parseAndSetAttribute();
// whether a value is legal for a given attribute is left to Collator::setAttribute().
static const struct {
    const char *name;
    UColAttributeValue value;
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1116
1117void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1118    // Parse attributes even if the Collator could not be created,
1119    // in order to report syntax errors.
1120    int32_t start = skipSpaces(1);
1121    int32_t equalPos = fileLine.indexOf(0x3d);
1122    if(equalPos < 0) {
1123        if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1124            parseAndSetReorderCodes(start + 7, errorCode);
1125            return;
1126        }
1127        errln("missing '=' on line %d", (int)fileLineNumber);
1128        infoln(fileLine);
1129        errorCode.set(U_PARSE_ERROR);
1130        return;
1131    }
1132
1133    UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1134    UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1135    if(attrString == UNICODE_STRING("maxVariable", 11)) {
1136        UColReorderCode max;
1137        if(valueString == UNICODE_STRING("space", 5)) {
1138            max = UCOL_REORDER_CODE_SPACE;
1139        } else if(valueString == UNICODE_STRING("punct", 5)) {
1140            max = UCOL_REORDER_CODE_PUNCTUATION;
1141        } else if(valueString == UNICODE_STRING("symbol", 6)) {
1142            max = UCOL_REORDER_CODE_SYMBOL;
1143        } else if(valueString == UNICODE_STRING("currency", 8)) {
1144            max = UCOL_REORDER_CODE_CURRENCY;
1145        } else {
1146            errln("invalid attribute value name on line %d", (int)fileLineNumber);
1147            infoln(fileLine);
1148            errorCode.set(U_PARSE_ERROR);
1149            return;
1150        }
1151        if(coll != NULL) {
1152            coll->setMaxVariable(max, errorCode);
1153            if(errorCode.isFailure()) {
1154                errln("setMaxVariable() failed on line %d: %s",
1155                      (int)fileLineNumber, errorCode.errorName());
1156                infoln(fileLine);
1157                return;
1158            }
1159        }
1160        fileLine.remove();
1161        return;
1162    }
1163
1164    UColAttribute attr;
1165    for(int32_t i = 0;; ++i) {
1166        if(i == UPRV_LENGTHOF(attributes)) {
1167            errln("invalid attribute name on line %d", (int)fileLineNumber);
1168            infoln(fileLine);
1169            errorCode.set(U_PARSE_ERROR);
1170            return;
1171        }
1172        if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1173            attr = attributes[i].attr;
1174            break;
1175        }
1176    }
1177
1178    UColAttributeValue value;
1179    for(int32_t i = 0;; ++i) {
1180        if(i == UPRV_LENGTHOF(attributeValues)) {
1181            errln("invalid attribute value name on line %d", (int)fileLineNumber);
1182            infoln(fileLine);
1183            errorCode.set(U_PARSE_ERROR);
1184            return;
1185        }
1186        if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1187            value = attributeValues[i].value;
1188            break;
1189        }
1190    }
1191
1192    if(coll != NULL) {
1193        coll->setAttribute(attr, value, errorCode);
1194        if(errorCode.isFailure()) {
1195            errln("illegal attribute=value combination on line %d: %s",
1196                  (int)fileLineNumber, errorCode.errorName());
1197            infoln(fileLine);
1198            return;
1199        }
1200    }
1201    fileLine.remove();
1202}
1203
1204void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1205    UVector32 reorderCodes(errorCode);
1206    while(start < fileLine.length()) {
1207        start = skipSpaces(start);
1208        int32_t limit = start;
1209        while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1210        CharString name;
1211        name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1212        int32_t code = CollationRuleParser::getReorderCode(name.data());
1213        if(code < 0) {
1214            if(uprv_stricmp(name.data(), "default") == 0) {
1215                code = UCOL_REORDER_CODE_DEFAULT;  // -1
1216            } else {
1217                errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1218                infoln(fileLine);
1219                errorCode.set(U_PARSE_ERROR);
1220                return;
1221            }
1222        }
1223        reorderCodes.addElement(code, errorCode);
1224        start = limit;
1225    }
1226    if(coll != NULL) {
1227        coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1228        if(errorCode.isFailure()) {
1229            errln("setReorderCodes() failed on line %d: %s",
1230                  (int)fileLineNumber, errorCode.errorName());
1231            infoln(fileLine);
1232            return;
1233        }
1234    }
1235    fileLine.remove();
1236}
1237
1238void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1239    UnicodeString rules;
1240    while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1241        rules.append(fileLine.unescape());
1242    }
1243    if(errorCode.isFailure()) { return; }
1244    logln(rules);
1245
1246    UParseError parseError;
1247    UnicodeString reason;
1248    delete coll;
1249    coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1250    if(coll == NULL) {
1251        errln("unable to allocate a new collator");
1252        errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1253        return;
1254    }
1255    if(errorCode.isFailure()) {
1256        dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1257        infoln(UnicodeString("  reason: ") + reason);
1258        if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1259        if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1260            infoln(UnicodeString("  snippet: ...") +
1261                parseError.preContext + "(!)" + parseError.postContext + "...");
1262        }
1263        delete coll;
1264        coll = NULL;
1265        errorCode.reset();
1266    } else {
1267        assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1268                     UnicodeString(), reason);
1269    }
1270}
1271
1272void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1273    if(errorCode.isFailure()) { return; }
1274    delete coll;
1275    coll = Collator::createInstance(Locale::getRoot(), errorCode);
1276    if(errorCode.isFailure()) {
1277        dataerrln("unable to create a root collator");
1278        return;
1279    }
1280}
1281
1282void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1283    if(errorCode.isFailure()) { return; }
1284    delete coll;
1285    coll = NULL;
1286    int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1287    if(at >= 0) {
1288        fileLine.setCharAt(at, (UChar)0x2a);  // *
1289    }
1290    CharString localeID;
1291    localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1292    if(at >= 0) {
1293        localeID.data()[at - 9] = '@';
1294    }
1295    Locale locale(localeID.data());
1296    if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1297        errln("invalid language tag on line %d", (int)fileLineNumber);
1298        infoln(fileLine);
1299        if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1300        return;
1301    }
1302
1303    logln("creating a collator for locale ID %s", locale.getName());
1304    coll = Collator::createInstance(locale, errorCode);
1305    if(errorCode.isFailure()) {
1306        dataerrln("unable to create a collator for locale %s on line %d",
1307                  locale.getName(), (int)fileLineNumber);
1308        infoln(fileLine);
1309        delete coll;
1310        coll = NULL;
1311        errorCode.reset();
1312    }
1313}
1314
1315UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1316    if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1317    // In some sequences with Tibetan composite vowel signs,
1318    // even if the string passes the FCD check,
1319    // those composites must be decomposed.
1320    // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1321    int32_t index = 0;
1322    while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1323        if(++index < s.length()) {
1324            UChar c = s[index];
1325            if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1326        }
1327    }
1328    return FALSE;
1329}
1330
1331UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1332                                     CharString &dest, int32_t partSize,
1333                                     IcuTestErrorCode &errorCode) {
1334    if(errorCode.isFailure()) { return FALSE; }
1335    uint8_t part[32];
1336    U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1337    UCharIterator iter;
1338    uiter_setString(&iter, s, length);
1339    uint32_t state[2] = { 0, 0 };
1340    for(;;) {
1341        int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1342        UBool done = partLength < partSize;
1343        if(done) {
1344            // At the end, append the next byte as well which should be 00.
1345            ++partLength;
1346        }
1347        dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1348        if(done) {
1349            return errorCode.isSuccess();
1350        }
1351    }
1352}
1353
1354UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1355                                     const UChar *s, int32_t length,
1356                                     CollationKey &key, IcuTestErrorCode &errorCode) {
1357    if(errorCode.isFailure()) { return FALSE; }
1358    coll->getCollationKey(s, length, key, errorCode);
1359    if(errorCode.isFailure()) {
1360        infoln(fileTestName);
1361        errln("Collator(%s).getCollationKey() failed: %s",
1362              norm, errorCode.errorName());
1363        infoln(line);
1364        return FALSE;
1365    }
1366    int32_t keyLength;
1367    const uint8_t *keyBytes = key.getByteArray(keyLength);
1368    if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1369        infoln(fileTestName);
1370        errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1371              norm);
1372        infoln(line);
1373        infoln(printCollationKey(key));
1374        return FALSE;
1375    }
1376
1377    int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1378    if(numLevels < UCOL_IDENTICAL) {
1379        ++numLevels;
1380    } else {
1381        numLevels = 5;
1382    }
1383    if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1384        ++numLevels;
1385    }
1386    errorCode.assertSuccess();
1387    int32_t numLevelSeparators = 0;
1388    for(int32_t i = 0; i < (keyLength - 1); ++i) {
1389        uint8_t b = keyBytes[i];
1390        if(b == 0) {
1391            infoln(fileTestName);
1392            errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1393            infoln(line);
1394            infoln(printCollationKey(key));
1395            return FALSE;
1396        }
1397        if(b == 1) { ++numLevelSeparators; }
1398    }
1399    if(numLevelSeparators != (numLevels - 1)) {
1400        infoln(fileTestName);
1401        errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1402              norm, (int)numLevelSeparators, (int)numLevels);
1403        infoln(line);
1404        infoln(printCollationKey(key));
1405        return FALSE;
1406    }
1407
1408    // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1409    static const int32_t partSizes[] = { 32, 3, 1 };
1410    for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1411        int32_t partSize = partSizes[psi];
1412        CharString parts;
1413        if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1414            infoln(fileTestName);
1415            errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1416                  norm, (int)partSize, errorCode.errorName());
1417            infoln(line);
1418            return FALSE;
1419        }
1420        if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1421            infoln(fileTestName);
1422            errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1423                  norm, (int)partSize);
1424            infoln(line);
1425            infoln(printCollationKey(key));
1426            infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1427            return FALSE;
1428        }
1429    }
1430    return TRUE;
1431}
1432
/**
 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
 * Leaves key unchanged if s does not contain U+FFFE.
 * Each segment gets its own sort key from coll, which is then merged
 * into the result so far via ucol_mergeSortkeys().
 *
 * @param s the string
 * @param length the length of s, or negative if s is NUL-terminated
 * @param key in/out: replaced with the merged key when TRUE is returned
 * @param errorCode nothing is done if this is already a failure
 * @return TRUE if the key was successfully changed
 */
UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
                                           CollationKey &key, IcuTestErrorCode &errorCode) {
    if(errorCode.isFailure()) { return FALSE; }
    // Buffer for the merged key so far; (re)allocated below as needed.
    LocalMemory<uint8_t> mergedKey;
    int32_t mergedKeyLength = 0;
    int32_t mergedKeyCapacity = 0;
    int32_t sLength = (length >= 0) ? length : u_strlen(s);
    int32_t segmentStart = 0;
    // i scans for the next U+FFFE or the end of the string;
    // the code after the scan handles the segment [segmentStart, i[.
    for(int32_t i = 0;;) {
        if(i == sLength) {
            if(segmentStart == 0) {
                // s does not contain any U+FFFE.
                return FALSE;
            }
            // Otherwise fall through to process the last segment.
        } else if(s[i] != 0xfffe) {
            ++i;
            continue;
        }
        // Get the sort key for another segment and merge it into mergedKey.
        CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
        CollationKey key2;
        coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
        int32_t key1Length, key2Length;
        const uint8_t *key1Bytes = key1.getByteArray(key1Length);
        const uint8_t *key2Bytes = key2.getByteArray(key2Length);
        uint8_t *dest;
        // When there is a previous key, one byte less than the sum of the lengths is needed.
        int32_t minCapacity = key1Length + key2Length;
        if(key1Length > 0) { --minCapacity; }
        if(minCapacity <= mergedKeyCapacity) {
            dest = mergedKey.getAlias();
        } else {
            // Grow the buffer: at least 200 bytes, then double it if that is enough.
            // key1 holds a copy of the previous contents, so they need not be preserved.
            if(minCapacity <= 200) {
                mergedKeyCapacity = 200;
            } else if(minCapacity <= 2 * mergedKeyCapacity) {
                mergedKeyCapacity *= 2;
            } else {
                mergedKeyCapacity = minCapacity;
            }
            dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
        }
        U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
        if(key1Length == 0) {
            // key2 is the sort key for the first segment.
            uprv_memcpy(dest, key2Bytes, key2Length);
            mergedKeyLength = key2Length;
        } else {
            mergedKeyLength =
                ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
                                   dest, mergedKeyCapacity);
        }
        if(i == sLength) { break; }
        // Skip the U+FFFE and start the next segment after it.
        segmentStart = ++i;
    }
    key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
    return TRUE;
}
1494
1495namespace {
1496
1497/**
1498 * Replaces unpaired surrogates with U+FFFD.
1499 * Returns s if no replacement was made, otherwise buffer.
1500 */
1501const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1502    int32_t i = 0;
1503    while(i < s.length()) {
1504        UChar32 c = s.char32At(i);
1505        if(U_IS_SURROGATE(c)) {
1506            if(buffer.length() < i) {
1507                buffer.append(s, buffer.length(), i - buffer.length());
1508            }
1509            buffer.append((UChar)0xfffd);
1510        }
1511        i += U16_LENGTH(c);
1512    }
1513    if(buffer.isEmpty()) {
1514        return s;
1515    }
1516    if(buffer.length() < i) {
1517        buffer.append(s, buffer.length(), i - buffer.length());
1518    }
1519    return buffer;
1520}
1521
1522int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1523                           UCollationResult order, UBool collHasCaseLevel) {
1524    if(order == UCOL_EQUAL) {
1525        return Collation::NO_LEVEL;
1526    }
1527    int32_t prevKeyLength;
1528    const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1529    int32_t keyLength;
1530    const uint8_t *bytes = key.getByteArray(keyLength);
1531    int32_t level = Collation::PRIMARY_LEVEL;
1532    for(int32_t i = 0;; ++i) {
1533        uint8_t b = prevBytes[i];
1534        if(b != bytes[i]) { break; }
1535        if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1536            ++level;
1537            if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1538                ++level;
1539            }
1540        }
1541    }
1542    return level;
1543}
1544
1545}
1546
// Checks that the current collator (coll) orders prevString and s as expected,
// using several different comparison APIs in turn.
//
// norm:          label for the current test variant; used only in messages here
//                and passed on to getCollationKey().
// prevFileLine:  the test file line for prevString, printed on failure.
// prevString, s: the two strings to compare. Their getBuffer() contents are also
//                passed with length -1, so they must be NUL-terminated
//                (checkCompareStrings() calls getTerminatedBuffer() on them).
// expectedOrder: expected result of comparing prevString with s; the reverse
//                comparison is expected to yield -expectedOrder.
// expectedLevel: expected level of the first sort key difference;
//                Collation::NO_LEVEL disables the level check.
// errorCode:     if already a failure on entry, nothing is checked.
//
// Returns TRUE if all checks pass. On the first failing check, the failure is
// reported together with both file lines and both sort keys, and FALSE is returned.
UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
                                     const UnicodeString &prevString, const UnicodeString &s,
                                     UCollationResult expectedOrder, Collation::Level expectedLevel,
                                     IcuTestErrorCode &errorCode) {
    if(errorCode.isFailure()) { return FALSE; }

    // Get the sort keys first, for error debug output.
    CollationKey prevKey;
    if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
                        prevKey, errorCode)) {
        return FALSE;
    }
    CollationKey key;
    if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }

    // compare(UnicodeString, UnicodeString), in both directions.
    UCollationResult order = coll->compare(prevString, s, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compare(s, prevString, errorCode);
    if(order != -expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    // (A string with an embedded NUL would be cut short when passed with length -1.)
    UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
    if(!containNUL) {
        order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
        if(order != -expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

#if U_HAVE_STD_STRING
    // compare(UTF-16) treats unpaired surrogates like unassigned code points.
    // Unpaired surrogates cannot be converted to UTF-8.
    // Create valid UTF-16 strings if necessary, and use those for
    // both the expected compare() result and for the input to compare(UTF-8).
    UnicodeString prevBuffer, sBuffer;
    const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
    const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
    std::string prevUTF8, sUTF8;
    UnicodeString(prevValid).toUTF8String(prevUTF8);
    UnicodeString(sValid).toUTF8String(sUTF8);
    UCollationResult expectedUTF8Order;
    // surrogatesToFFFD() hands back its input when nothing was replaced,
    // so comparing addresses tells whether either string was modified.
    if(&prevValid == &prevString && &sValid == &s) {
        expectedUTF8Order = expectedOrder;
    } else {
        expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
    }

    // compareUTF8() with explicit lengths, in both directions.
    order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
    if(order != expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
    if(order != -expectedUTF8Order || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // Test NUL-termination if the strings do not contain NUL characters.
    if(!containNUL) {
        order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
        if(order != expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
        if(order != -expectedUTF8Order || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
                  (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }
#endif

    // compare(UCharIterator, UCharIterator); only the previous-vs-current direction is checked.
    UCharIterator leftIter;
    UCharIterator rightIter;
    uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
    uiter_setString(&rightIter, s.getBuffer(), s.length());
    order = coll->compare(leftIter, rightIter, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
              "wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }

    // The sort keys must compare in the same order as the strings.
    order = prevKey.compareTo(key, errorCode);
    if(order != expectedOrder || errorCode.isFailure()) {
        infoln(fileTestName);
        errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
        infoln(prevFileLine);
        infoln(fileLine);
        infoln(printCollationKey(prevKey));
        infoln(printCollationKey(key));
        return FALSE;
    }
    // The first sort key difference must be on the expected level,
    // unless the keys are equal or no particular level is expected.
    UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
    int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
    if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
        if(level != expectedLevel) {
            infoln(fileTestName);
            errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
                  (int)fileLineNumber, norm, order, level, expectedLevel);
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
    }

    // If either string contains U+FFFE, then their sort keys must compare the same as
    // the merged sort keys of each string's between-FFFE segments.
    //
    // It is not required that
    //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
    // only that those two methods yield the same order.
    //
    // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
    // Note: prevKey and key are passed to getMergedCollationKey() as outputs and
    // are used as the merged keys from here on.
    if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
                getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
            errorCode.isFailure()) {
        order = prevKey.compareTo(key, errorCode);
        if(order != expectedOrder || errorCode.isFailure()) {
            infoln(fileTestName);
            errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
                (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
            infoln(prevFileLine);
            infoln(fileLine);
            infoln(printCollationKey(prevKey));
            infoln(printCollationKey(key));
            return FALSE;
        }
        // The merged keys must differ on the same level as the regular keys did.
        int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
        if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
            if(mergedLevel != level) {
                infoln(fileTestName);
                errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
                    "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
                    (int)fileLineNumber, norm, order, mergedLevel, level);
                infoln(prevFileLine);
                infoln(fileLine);
                infoln(printCollationKey(prevKey));
                infoln(printCollationKey(key));
                return FALSE;
            }
        }
    }
    return TRUE;
}
1761
1762void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1763    if(errorCode.isFailure()) { return; }
1764    UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1765    UnicodeString prevString, s;
1766    prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1767    while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1768        // Parse the line even if it will be ignored (when we do not have a Collator)
1769        // in order to report syntax issues.
1770        Collation::Level relation = parseRelationAndString(s, errorCode);
1771        if(errorCode.isFailure()) {
1772            errorCode.reset();
1773            break;
1774        }
1775        if(coll == NULL) {
1776            // We were unable to create the Collator but continue with tests.
1777            // Ignore test data for this Collator.
1778            // The next Collator creation might work.
1779            continue;
1780        }
1781        UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1782        Collation::Level expectedLevel = relation;
1783        s.getTerminatedBuffer();  // Ensure NUL-termination.
1784        UBool isOk = TRUE;
1785        if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1786            coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1787            isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1788                                   expectedOrder, expectedLevel, errorCode);
1789        }
1790        if(isOk) {
1791            coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1792            isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1793                                   expectedOrder, expectedLevel, errorCode);
1794        }
1795        if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1796            UnicodeString pn = nfd->normalize(prevString, errorCode);
1797            UnicodeString n = nfd->normalize(s, errorCode);
1798            pn.getTerminatedBuffer();
1799            n.getTerminatedBuffer();
1800            errorCode.assertSuccess();
1801            isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1802                                   expectedOrder, expectedLevel, errorCode);
1803        }
1804        if(!isOk) {
1805            errorCode.reset();  // already reported
1806        }
1807        prevFileLine = fileLine;
1808        prevString = s;
1809        prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1810    }
1811}
1812
1813void CollationTest::TestDataDriven() {
1814    IcuTestErrorCode errorCode(*this, "TestDataDriven");
1815
1816    fcd = Normalizer2Factory::getFCDInstance(errorCode);
1817    nfd = Normalizer2::getNFDInstance(errorCode);
1818    if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1819        return;
1820    }
1821
1822    CharString path(getSourceTestData(errorCode), errorCode);
1823    path.appendPathPart("collationtest.txt", errorCode);
1824    const char *codePage = "UTF-8";
1825    LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1826    if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1827        return;
1828    }
1829    // Read a new line if necessary.
1830    // Sub-parsers leave the first line set that they do not handle.
1831    while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1832        if(!isSectionStarter(fileLine[0])) {
1833            errln("syntax error on line %d", (int)fileLineNumber);
1834            infoln(fileLine);
1835            return;
1836        }
1837        if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1838            fileTestName = fileLine;
1839            logln(fileLine);
1840            fileLine.remove();
1841        } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1842            setRootCollator(errorCode);
1843            fileLine.remove();
1844        } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1845            setLocaleCollator(errorCode);
1846            fileLine.remove();
1847        } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1848            buildTailoring(f.getAlias(), errorCode);
1849        } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1850            parseAndSetAttribute(errorCode);
1851        } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1852            checkCompareStrings(f.getAlias(), errorCode);
1853        } else {
1854            errln("syntax error on line %d", (int)fileLineNumber);
1855            infoln(fileLine);
1856            return;
1857        }
1858    }
1859}
1860
1861#endif  // !UCONFIG_NO_COLLATION
1862