1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2012-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* collationtest.cpp
9*
10* created on: 2012apr27
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/coll.h"
19#include "unicode/errorcode.h"
20#include "unicode/localpointer.h"
21#include "unicode/normalizer2.h"
22#include "unicode/sortkey.h"
23#include "unicode/std_string.h"
24#include "unicode/strenum.h"
25#include "unicode/tblcoll.h"
26#include "unicode/uiter.h"
27#include "unicode/uniset.h"
28#include "unicode/unistr.h"
29#include "unicode/usetiter.h"
30#include "unicode/ustring.h"
31#include "charstr.h"
32#include "cmemory.h"
33#include "collation.h"
34#include "collationdata.h"
35#include "collationfcd.h"
36#include "collationiterator.h"
37#include "collationroot.h"
38#include "collationrootelements.h"
39#include "collationruleparser.h"
40#include "collationweights.h"
41#include "cstring.h"
42#include "intltest.h"
43#include "normalizer2impl.h"
44#include "ucbuf.h"
45#include "uhash.h"
46#include "uitercollationiterator.h"
47#include "utf16collationiterator.h"
48#include "utf8collationiterator.h"
49#include "uvectr32.h"
50#include "uvectr64.h"
51#include "writesrc.h"
52
53class CodePointIterator;
54
55// TODO: try to share code with IntlTestCollator; for example, prettify(CollationKey)
56
57class CollationTest : public IntlTest {
58public:
59    CollationTest()
60            : fcd(NULL), nfd(NULL),
61              fileLineNumber(0),
62              coll(NULL) {}
63
64    ~CollationTest() {
65        delete coll;
66    }
67
68    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
69
70    void TestMinMax();
71    void TestImplicits();
72    void TestNulTerminated();
73    void TestIllegalUTF8();
74    void TestShortFCDData();
75    void TestFCD();
76    void TestCollationWeights();
77    void TestRootElements();
78    void TestTailoredElements();
79    void TestDataDriven();
80
81private:
82    void checkFCD(const char *name, CollationIterator &ci, CodePointIterator &cpi);
83    void checkAllocWeights(CollationWeights &cw,
84                           uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
85                           int32_t someLength, int32_t minCount);
86
87    static UnicodeString printSortKey(const uint8_t *p, int32_t length);
88    static UnicodeString printCollationKey(const CollationKey &key);
89
90    // Helpers & fields for data-driven test.
91    static UBool isCROrLF(UChar c) { return c == 0xa || c == 0xd; }
92    static UBool isSpace(UChar c) { return c == 9 || c == 0x20 || c == 0x3000; }
93    static UBool isSectionStarter(UChar c) { return c == 0x25 || c == 0x2a || c == 0x40; }  // %*@
94    int32_t skipSpaces(int32_t i) {
95        while(isSpace(fileLine[i])) { ++i; }
96        return i;
97    }
98
99    UBool readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode);
100    void parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s, UErrorCode &errorCode);
101    Collation::Level parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode);
102    void parseAndSetAttribute(IcuTestErrorCode &errorCode);
103    void parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode);
104    void buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode);
105    void setRootCollator(IcuTestErrorCode &errorCode);
106    void setLocaleCollator(IcuTestErrorCode &errorCode);
107
108    UBool needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const;
109
110    UBool getSortKeyParts(const UChar *s, int32_t length,
111                          CharString &dest, int32_t partSize,
112                          IcuTestErrorCode &errorCode);
113    UBool getCollationKey(const char *norm, const UnicodeString &line,
114                          const UChar *s, int32_t length,
115                          CollationKey &key, IcuTestErrorCode &errorCode);
116    UBool getMergedCollationKey(const UChar *s, int32_t length,
117                                CollationKey &key, IcuTestErrorCode &errorCode);
118    UBool checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
119                          const UnicodeString &prevString, const UnicodeString &s,
120                          UCollationResult expectedOrder, Collation::Level expectedLevel,
121                          IcuTestErrorCode &errorCode);
122    void checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode);
123
124    const Normalizer2 *fcd, *nfd;
125    UnicodeString fileLine;
126    int32_t fileLineNumber;
127    UnicodeString fileTestName;
128    Collator *coll;
129};
130
131extern IntlTest *createCollationTest() {
132    return new CollationTest();
133}
134
// Test dispatcher for this suite.
// When exec is true, the suite name is logged before dispatching.
// The TESTCASE_AUTO macros are defined outside this file; presumably they select
// the test case matching index, report its name through the name parameter,
// and run it when exec is true -- confirm against the macro definitions.
// The order of the TESTCASE_AUTO lines below is kept as is, since the macros
// likely derive each test's index from its position in the list.
// The last parameter (par) is not used.
void CollationTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
    if(exec) {
        logln("TestSuite CollationTest: ");
    }
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestMinMax);
    TESTCASE_AUTO(TestImplicits);
    TESTCASE_AUTO(TestNulTerminated);
    TESTCASE_AUTO(TestIllegalUTF8);
    TESTCASE_AUTO(TestShortFCDData);
    TESTCASE_AUTO(TestFCD);
    TESTCASE_AUTO(TestCollationWeights);
    TESTCASE_AUTO(TestRootElements);
    TESTCASE_AUTO(TestTailoredElements);
    TESTCASE_AUTO(TestDataDriven);
    TESTCASE_AUTO_END;
}
152
153void CollationTest::TestMinMax() {
154    IcuTestErrorCode errorCode(*this, "TestMinMax");
155
156    setRootCollator(errorCode);
157    if(errorCode.isFailure()) {
158        errorCode.reset();
159        return;
160    }
161    RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll);
162    if(rbc == NULL) {
163        errln("the root collator is not a RuleBasedCollator");
164        return;
165    }
166
167    static const UChar s[2] = { 0xfffe, 0xffff };
168    UVector64 ces(errorCode);
169    rbc->internalGetCEs(UnicodeString(FALSE, s, 2), ces, errorCode);
170    errorCode.assertSuccess();
171    if(ces.size() != 2) {
172        errln("expected 2 CEs for <FFFE, FFFF>, got %d", (int)ces.size());
173        return;
174    }
175    int64_t ce = ces.elementAti(0);
176    int64_t expected = Collation::makeCE(Collation::MERGE_SEPARATOR_PRIMARY);
177    if(ce != expected) {
178        errln("CE(U+fffe)=%04lx != 02..", (long)ce);
179    }
180
181    ce = ces.elementAti(1);
182    expected = Collation::makeCE(Collation::MAX_PRIMARY);
183    if(ce != expected) {
184        errln("CE(U+ffff)=%04lx != max..", (long)ce);
185    }
186}
187
188void CollationTest::TestImplicits() {
189    IcuTestErrorCode errorCode(*this, "TestImplicits");
190
191    const CollationData *cd = CollationRoot::getData(errorCode);
192    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
193        return;
194    }
195
196    // Implicit primary weights should be assigned for the following sets,
197    // and sort in ascending order by set and then code point.
198    // See http://www.unicode.org/reports/tr10/#Implicit_Weights
199
200    // core Han Unified Ideographs
201    UnicodeSet coreHan("[\\p{unified_ideograph}&"
202                            "[\\p{Block=CJK_Unified_Ideographs}"
203                            "\\p{Block=CJK_Compatibility_Ideographs}]]",
204                       errorCode);
205    // all other Unified Han ideographs
206    UnicodeSet otherHan("[\\p{unified ideograph}-"
207                            "[\\p{Block=CJK_Unified_Ideographs}"
208                            "\\p{Block=CJK_Compatibility_Ideographs}]]",
209                        errorCode);
210    UnicodeSet unassigned("[[:Cn:][:Cs:][:Co:]]", errorCode);
211    unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
212
213    // Starting with CLDR 26/ICU 54, the root Han order may instead be
214    // the Unihan radical-stroke order.
215    // The tests should pass either way, so we only test the order of a small set of Han characters
216    // whose radical-stroke order is the same as their code point order.
217    UnicodeSet someHanInCPOrder(
218            "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48"
219            "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]",
220            errorCode);
221    UnicodeSet inOrder(someHanInCPOrder);
222    inOrder.addAll(unassigned).freeze();
223    if(errorCode.logIfFailureAndReset("UnicodeSet")) {
224        return;
225    }
226    const UnicodeSet *sets[] = { &coreHan, &otherHan, &unassigned };
227    UChar32 prev = 0;
228    uint32_t prevPrimary = 0;
229    UTF16CollationIterator ci(cd, FALSE, NULL, NULL, NULL);
230    for(int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
231        LocalPointer<UnicodeSetIterator> iter(new UnicodeSetIterator(*sets[i]));
232        while(iter->next()) {
233            UChar32 c = iter->getCodepoint();
234            UnicodeString s(c);
235            ci.setText(s.getBuffer(), s.getBuffer() + s.length());
236            int64_t ce = ci.nextCE(errorCode);
237            int64_t ce2 = ci.nextCE(errorCode);
238            if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
239                return;
240            }
241            if(ce == Collation::NO_CE || ce2 != Collation::NO_CE) {
242                errln("CollationIterator.nextCE(U+%04lx) did not yield exactly one CE", (long)c);
243                continue;
244            }
245            if((ce & 0xffffffff) != Collation::COMMON_SEC_AND_TER_CE) {
246                errln("CollationIterator.nextCE(U+%04lx) has non-common sec/ter weights: %08lx",
247                      (long)c, (long)(ce & 0xffffffff));
248                continue;
249            }
250            uint32_t primary = (uint32_t)(ce >> 32);
251            if(!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
252                errln("CE(U+%04lx)=%04lx.. not greater than CE(U+%04lx)=%04lx..",
253                      (long)c, (long)primary, (long)prev, (long)prevPrimary);
254            }
255            prev = c;
256            prevPrimary = primary;
257        }
258    }
259}
260
261void CollationTest::TestNulTerminated() {
262    IcuTestErrorCode errorCode(*this, "TestNulTerminated");
263    const CollationData *data = CollationRoot::getData(errorCode);
264    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
265        return;
266    }
267
268    static const UChar s[] = { 0x61, 0x62, 0x61, 0x62, 0 };
269
270    UTF16CollationIterator ci1(data, FALSE, s, s, s + 2);
271    UTF16CollationIterator ci2(data, FALSE, s + 2, s + 2, NULL);
272    for(int32_t i = 0;; ++i) {
273        int64_t ce1 = ci1.nextCE(errorCode);
274        int64_t ce2 = ci2.nextCE(errorCode);
275        if(errorCode.logIfFailureAndReset("CollationIterator.nextCE()")) {
276            return;
277        }
278        if(ce1 != ce2) {
279            errln("CollationIterator.nextCE(with length) != nextCE(NUL-terminated) at CE %d", (int)i);
280            break;
281        }
282        if(ce1 == Collation::NO_CE) { break; }
283    }
284}
285
286void CollationTest::TestIllegalUTF8() {
287    IcuTestErrorCode errorCode(*this, "TestIllegalUTF8");
288
289    setRootCollator(errorCode);
290    if(errorCode.isFailure()) {
291        errorCode.reset();
292        return;
293    }
294    coll->setAttribute(UCOL_STRENGTH, UCOL_IDENTICAL, errorCode);
295
296    static const char *strings[] = {
297        // string with U+FFFD == illegal byte sequence
298        u8"a\uFFFDz", "a\x80z",  // trail byte
299        u8"a\uFFFD\uFFFDz", "a\xc1\x81z",  // non-shortest form
300        u8"a\uFFFD\uFFFD\uFFFDz", "a\xe0\x82\x83z",  // non-shortest form
301        u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xa0\x80z",  // lead surrogate: would be U+D800
302        u8"a\uFFFD\uFFFD\uFFFDz", "a\xed\xbf\xbfz",  // trail surrogate: would be U+DFFF
303        u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf0\x8f\xbf\xbfz",  // non-shortest form
304        u8"a\uFFFD\uFFFD\uFFFD\uFFFDz", "a\xf4\x90\x80\x80z"  // out of range: would be U+110000
305    };
306
307    for(int32_t i = 0; i < UPRV_LENGTHOF(strings); i += 2) {
308        StringPiece fffd(strings[i]);
309        StringPiece illegal(strings[i + 1]);
310        UCollationResult order = coll->compareUTF8(fffd, illegal, errorCode);
311        if(order != UCOL_EQUAL) {
312            errln("compareUTF8(pair %d: U+FFFD, illegal UTF-8)=%d != UCOL_EQUAL",
313                  (int)i, order);
314        }
315    }
316}
317
318namespace {
319
320void addLeadSurrogatesForSupplementary(const UnicodeSet &src, UnicodeSet &dest) {
321    for(UChar32 c = 0x10000; c < 0x110000;) {
322        UChar32 next = c + 0x400;
323        if(src.containsSome(c, next - 1)) {
324            dest.add(U16_LEAD(c));
325        }
326        c = next;
327    }
328}
329
330}  // namespace
331
332void CollationTest::TestShortFCDData() {
333    // See CollationFCD class comments.
334    IcuTestErrorCode errorCode(*this, "TestShortFCDData");
335    UnicodeSet expectedLccc("[:^lccc=0:]", errorCode);
336    errorCode.assertSuccess();
337    expectedLccc.add(0xdc00, 0xdfff);  // add all trail surrogates
338    addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
339    UnicodeSet lccc;  // actual
340    for(UChar32 c = 0; c <= 0xffff; ++c) {
341        if(CollationFCD::hasLccc(c)) { lccc.add(c); }
342    }
343    UnicodeSet diff(expectedLccc);
344    diff.removeAll(lccc);
345    diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
346    UnicodeString empty("[]");
347    UnicodeString diffString;
348    diff.toPattern(diffString, TRUE);
349    assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
350    diff = lccc;
351    diff.removeAll(expectedLccc);
352    diff.toPattern(diffString, TRUE);
353    assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString, TRUE);
354
355    UnicodeSet expectedTccc("[:^tccc=0:]", errorCode);
356    if (errorCode.isSuccess()) {
357        addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
358        addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
359        UnicodeSet tccc;  // actual
360        for(UChar32 c = 0; c <= 0xffff; ++c) {
361            if(CollationFCD::hasTccc(c)) { tccc.add(c); }
362        }
363        diff = expectedTccc;
364        diff.removeAll(tccc);
365        diff.remove(0x10000, 0x10ffff);  // hasTccc() only works for the BMP
366        assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
367        diff = tccc;
368        diff.removeAll(expectedTccc);
369        diff.toPattern(diffString, TRUE);
370        assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
371    }
372}
373
374class CodePointIterator {
375public:
376    CodePointIterator(const UChar32 *cp, int32_t length) : cp(cp), length(length), pos(0) {}
377    void resetToStart() { pos = 0; }
378    UChar32 next() { return (pos < length) ? cp[pos++] : U_SENTINEL; }
379    UChar32 previous() { return (pos > 0) ? cp[--pos] : U_SENTINEL; }
380    int32_t getLength() const { return length; }
381    int getIndex() const { return (int)pos; }
382private:
383    const UChar32 *cp;
384    int32_t length;
385    int32_t pos;
386};
387
388void CollationTest::checkFCD(const char *name,
389                             CollationIterator &ci, CodePointIterator &cpi) {
390    IcuTestErrorCode errorCode(*this, "checkFCD");
391
392    // Iterate forward to the limit.
393    for(;;) {
394        UChar32 c1 = ci.nextCodePoint(errorCode);
395        UChar32 c2 = cpi.next();
396        if(c1 != c2) {
397            errln("%s.nextCodePoint(to limit, 1st pass) = U+%04lx != U+%04lx at %d",
398                  name, (long)c1, (long)c2, cpi.getIndex());
399            return;
400        }
401        if(c1 < 0) { break; }
402    }
403
404    // Iterate backward most of the way.
405    for(int32_t n = (cpi.getLength() * 2) / 3; n > 0; --n) {
406        UChar32 c1 = ci.previousCodePoint(errorCode);
407        UChar32 c2 = cpi.previous();
408        if(c1 != c2) {
409            errln("%s.previousCodePoint() = U+%04lx != U+%04lx at %d",
410                  name, (long)c1, (long)c2, cpi.getIndex());
411            return;
412        }
413    }
414
415    // Forward again.
416    for(;;) {
417        UChar32 c1 = ci.nextCodePoint(errorCode);
418        UChar32 c2 = cpi.next();
419        if(c1 != c2) {
420            errln("%s.nextCodePoint(to limit again) = U+%04lx != U+%04lx at %d",
421                  name, (long)c1, (long)c2, cpi.getIndex());
422            return;
423        }
424        if(c1 < 0) { break; }
425    }
426
427    // Iterate backward to the start.
428    for(;;) {
429        UChar32 c1 = ci.previousCodePoint(errorCode);
430        UChar32 c2 = cpi.previous();
431        if(c1 != c2) {
432            errln("%s.previousCodePoint(to start) = U+%04lx != U+%04lx at %d",
433                  name, (long)c1, (long)c2, cpi.getIndex());
434            return;
435        }
436        if(c1 < 0) { break; }
437    }
438}
439
440void CollationTest::TestFCD() {
441    IcuTestErrorCode errorCode(*this, "TestFCD");
442    const CollationData *data = CollationRoot::getData(errorCode);
443    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
444        return;
445    }
446
447    // Input string, not FCD, NUL-terminated.
448    static const UChar s[] = {
449        0x308, 0xe1, 0x62, 0x301, 0x327, 0x430, 0x62,
450        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),  // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
451        0x327, 0x308,  // ccc=202, 230
452        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),  // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
453        U16_LEAD(0x1D15F), U16_TRAIL(0x1D15F),
454        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D),
455        0xac01,
456        0xe7,  // Character with tccc!=0 decomposed together with mis-ordered sequence.
457        U16_LEAD(0x1D16D), U16_TRAIL(0x1D16D), U16_LEAD(0x1D165), U16_TRAIL(0x1D165),
458        0xe1,  // Character with tccc!=0 decomposed together with decomposed sequence.
459        0xf73, 0xf75,  // Tibetan composite vowels must be decomposed.
460        0x4e00, 0xf81,
461        0
462    };
463    // Expected code points.
464    static const UChar32 cp[] = {
465        0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
466        0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
467        0x1D15F, 0x1D16D,
468        0xac01,
469        0x63, 0x327, 0x1D165, 0x1D16D,
470        0x61,
471        0xf71, 0xf71, 0xf72, 0xf74, 0x301,
472        0x4e00, 0xf71, 0xf80
473    };
474
475    FCDUTF16CollationIterator u16ci(data, FALSE, s, s, NULL);
476    if(errorCode.logIfFailureAndReset("FCDUTF16CollationIterator constructor")) {
477        return;
478    }
479    CodePointIterator cpi(cp, UPRV_LENGTHOF(cp));
480    checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
481
482    cpi.resetToStart();
483    std::string utf8;
484    UnicodeString(s).toUTF8String(utf8);
485    FCDUTF8CollationIterator u8ci(data, FALSE,
486                                  reinterpret_cast<const uint8_t *>(utf8.c_str()), 0, -1);
487    if(errorCode.logIfFailureAndReset("FCDUTF8CollationIterator constructor")) {
488        return;
489    }
490    checkFCD("FCDUTF8CollationIterator", u8ci, cpi);
491
492    cpi.resetToStart();
493    UCharIterator iter;
494    uiter_setString(&iter, s, UPRV_LENGTHOF(s) - 1);  // -1: without the terminating NUL
495    FCDUIterCollationIterator uici(data, FALSE, iter, 0);
496    if(errorCode.logIfFailureAndReset("FCDUIterCollationIterator constructor")) {
497        return;
498    }
499    checkFCD("FCDUIterCollationIterator", uici, cpi);
500}
501
502void CollationTest::checkAllocWeights(CollationWeights &cw,
503                                      uint32_t lowerLimit, uint32_t upperLimit, int32_t n,
504                                      int32_t someLength, int32_t minCount) {
505    if(!cw.allocWeights(lowerLimit, upperLimit, n)) {
506        errln("CollationWeights::allocWeights(%lx, %lx, %ld) = FALSE",
507              (long)lowerLimit, (long)upperLimit, (long)n);
508        return;
509    }
510    uint32_t previous = lowerLimit;
511    int32_t count = 0;  // number of weights that have someLength
512    for(int32_t i = 0; i < n; ++i) {
513        uint32_t w = cw.nextWeight();
514        if(w == 0xffffffff) {
515            errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
516                  "returns only %ld weights",
517                  (long)lowerLimit, (long)upperLimit, (long)n, (long)i);
518            return;
519        }
520        if(!(previous < w && w < upperLimit)) {
521            errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
522                  "number %ld -> %lx not between %lx and %lx",
523                  (long)lowerLimit, (long)upperLimit, (long)n,
524                  (long)(i + 1), (long)w, (long)previous, (long)upperLimit);
525            return;
526        }
527        if(CollationWeights::lengthOfWeight(w) == someLength) { ++count; }
528    }
529    if(count < minCount) {
530        errln("CollationWeights::allocWeights(%lx, %lx, %ld).nextWeight() "
531              "returns only %ld < %ld weights of length %d",
532              (long)lowerLimit, (long)upperLimit, (long)n,
533              (long)count, (long)minCount, (int)someLength);
534    }
535}
536
537void CollationTest::TestCollationWeights() {
538    CollationWeights cw;
539
540    // Non-compressible primaries use 254 second bytes 02..FF.
541    logln("CollationWeights.initForPrimary(non-compressible)");
542    cw.initForPrimary(FALSE);
543    // Expect 1 weight 11 and 254 weights 12xx.
544    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 1, 1);
545    checkAllocWeights(cw, 0x10000000, 0x13000000, 255, 2, 254);
546    // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
547    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 260, 2, 255);
548    // Expect 254 two-byte weights from the ranges 10ff and 11xx.
549    checkAllocWeights(cw, 0x10fefe40, 0x12030300, 600, 2, 254);
550    // Expect 254^2=64516 three-byte weights.
551    // During computation, there should be 3 three-byte ranges
552    // 10ffff, 11xxxx, 120202.
553    // The middle one should be split 64515:1,
554    // and the newly-split-off range and the last ranged lengthened.
555    checkAllocWeights(cw, 0x10fffe00, 0x12020300, 1 + 64516 + 254 + 1, 3, 64516);
556    // Expect weights 1102 & 1103.
557    checkAllocWeights(cw, 0x10ff0000, 0x11040000, 2, 2, 2);
558    // Expect weights 102102 & 102103.
559    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
560
561    // Compressible primaries use 251 second bytes 04..FE.
562    logln("CollationWeights.initForPrimary(compressible)");
563    cw.initForPrimary(TRUE);
564    // Expect 1 weight 11 and 251 weights 12xx.
565    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 1, 1);
566    checkAllocWeights(cw, 0x10000000, 0x13000000, 252, 2, 251);
567    // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
568    checkAllocWeights(cw, 0x10fdfe40, 0x12050300, 260, 2, 252);
569    // Expect weights 1104 & 1105.
570    checkAllocWeights(cw, 0x10fe0000, 0x11060000, 2, 2, 2);
571    // Expect weights 102102 & 102103.
572    checkAllocWeights(cw, 0x1020ff00, 0x10210400, 2, 3, 2);
573
574    // Secondary and tertiary weights use only bytes 3 & 4.
575    logln("CollationWeights.initForSecondary()");
576    cw.initForSecondary();
577    // Expect weights fbxx and all four fc..ff.
578    checkAllocWeights(cw, 0xfb20, 0x10000, 20, 3, 4);
579
580    logln("CollationWeights.initForTertiary()");
581    cw.initForTertiary();
582    // Expect weights 3dxx and both 3e & 3f.
583    checkAllocWeights(cw, 0x3d02, 0x4000, 10, 3, 2);
584}
585
586namespace {
587
588UBool isValidCE(const CollationRootElements &re, const CollationData &data,
589                uint32_t p, uint32_t s, uint32_t ctq) {
590    uint32_t p1 = p >> 24;
591    uint32_t p2 = (p >> 16) & 0xff;
592    uint32_t p3 = (p >> 8) & 0xff;
593    uint32_t p4 = p & 0xff;
594    uint32_t s1 = s >> 8;
595    uint32_t s2 = s & 0xff;
596    // ctq = Case, Tertiary, Quaternary
597    uint32_t c = (ctq & Collation::CASE_MASK) >> 14;
598    uint32_t t = ctq & Collation::ONLY_TERTIARY_MASK;
599    uint32_t t1 = t >> 8;
600    uint32_t t2 = t & 0xff;
601    uint32_t q = ctq & Collation::QUATERNARY_MASK;
602    // No leading zero bytes.
603    if((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
604        return FALSE;
605    }
606    // No intermediate zero bytes.
607    if(p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
608        return FALSE;
609    }
610    if(p2 != 0 && p3 == 0 && p4 != 0) {
611        return FALSE;
612    }
613    // Minimum & maximum lead bytes.
614    if((p1 != 0 && p1 <= Collation::MERGE_SEPARATOR_BYTE) ||
615            s1 == Collation::LEVEL_SEPARATOR_BYTE ||
616            t1 == Collation::LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
617        return FALSE;
618    }
619    if(c > 2) {
620        return FALSE;
621    }
622    // The valid byte range for the second primary byte depends on compressibility.
623    if(p2 != 0) {
624        if(data.isCompressibleLeadByte(p1)) {
625            if(p2 <= Collation::PRIMARY_COMPRESSION_LOW_BYTE ||
626                    Collation::PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
627                return FALSE;
628            }
629        } else {
630            if(p2 <= Collation::LEVEL_SEPARATOR_BYTE) {
631                return FALSE;
632            }
633        }
634    }
635    // Other bytes just need to avoid the level separator.
636    // Trailing zeros are ok.
637    U_ASSERT(Collation::LEVEL_SEPARATOR_BYTE == 1);
638    if(p3 == Collation::LEVEL_SEPARATOR_BYTE || p4 == Collation::LEVEL_SEPARATOR_BYTE ||
639            s2 == Collation::LEVEL_SEPARATOR_BYTE || t2 == Collation::LEVEL_SEPARATOR_BYTE) {
640        return FALSE;
641    }
642    // Well-formed CEs.
643    if(p == 0) {
644        if(s == 0) {
645            if(t == 0) {
646                // Completely ignorable CE.
647                // Quaternary CEs are not supported.
648                if(c != 0 || q != 0) {
649                    return FALSE;
650                }
651            } else {
652                // Tertiary CE.
653                if(t < re.getTertiaryBoundary() || c != 2) {
654                    return FALSE;
655                }
656            }
657        } else {
658            // Secondary CE.
659            if(s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
660                return FALSE;
661            }
662        }
663    } else {
664        // Primary CE.
665        if(s == 0 || (Collation::COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary()) ||
666                s >= re.getSecondaryBoundary()) {
667            return FALSE;
668        }
669        if(t == 0 || t >= re.getTertiaryBoundary()) {
670            return FALSE;
671        }
672    }
673    return TRUE;
674}
675
676UBool isValidCE(const CollationRootElements &re, const CollationData &data, int64_t ce) {
677    uint32_t p = (uint32_t)(ce >> 32);
678    uint32_t secTer = (uint32_t)ce;
679    return isValidCE(re, data, p, secTer >> 16, secTer & 0xffff);
680}
681
682class RootElementsIterator {
683public:
684    RootElementsIterator(const CollationData &root)
685            : data(root),
686              elements(root.rootElements), length(root.rootElementsLength),
687              pri(0), secTer(0),
688              index((int32_t)elements[CollationRootElements::IX_FIRST_TERTIARY_INDEX]) {}
689
690    UBool next() {
691        if(index >= length) { return FALSE; }
692        uint32_t p = elements[index];
693        if(p == CollationRootElements::PRIMARY_SENTINEL) { return FALSE; }
694        if((p & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
695            ++index;
696            secTer = p & ~CollationRootElements::SEC_TER_DELTA_FLAG;
697            return TRUE;
698        }
699        if((p & CollationRootElements::PRIMARY_STEP_MASK) != 0) {
700            // End of a range, enumerate the primaries in the range.
701            int32_t step = (int32_t)p & CollationRootElements::PRIMARY_STEP_MASK;
702            p &= 0xffffff00;
703            if(pri == p) {
704                // Finished the range, return the next CE after it.
705                ++index;
706                return next();
707            }
708            U_ASSERT(pri < p);
709            // Return the next primary in this range.
710            UBool isCompressible = data.isCompressiblePrimary(pri);
711            if((pri & 0xffff) == 0) {
712                pri = Collation::incTwoBytePrimaryByOffset(pri, isCompressible, step);
713            } else {
714                pri = Collation::incThreeBytePrimaryByOffset(pri, isCompressible, step);
715            }
716            return TRUE;
717        }
718        // Simple primary CE.
719        ++index;
720        pri = p;
721        // Does this have an explicit below-common sec/ter unit,
722        // or does it imply a common one?
723        if(index == length) {
724            secTer = Collation::COMMON_SEC_AND_TER_CE;
725        } else {
726            secTer = elements[index];
727            if((secTer & CollationRootElements::SEC_TER_DELTA_FLAG) == 0) {
728                // No sec/ter delta.
729                secTer = Collation::COMMON_SEC_AND_TER_CE;
730            } else {
731                secTer &= ~CollationRootElements::SEC_TER_DELTA_FLAG;
732                if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
733                    // Implied sec/ter.
734                    secTer = Collation::COMMON_SEC_AND_TER_CE;
735                } else {
736                    // Explicit sec/ter below common/common.
737                    ++index;
738                }
739            }
740        }
741        return TRUE;
742    }
743
744    uint32_t getPrimary() const { return pri; }
745    uint32_t getSecTer() const { return secTer; }
746
747private:
748    const CollationData &data;
749    const uint32_t *elements;
750    int32_t length;
751
752    uint32_t pri;
753    uint32_t secTer;
754    int32_t index;
755};
756
757}  // namespace
758
759void CollationTest::TestRootElements() {
760    IcuTestErrorCode errorCode(*this, "TestRootElements");
761    const CollationData *root = CollationRoot::getData(errorCode);
762    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
763        return;
764    }
765    CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
766    RootElementsIterator iter(*root);
767
768    // We check each root CE for validity,
769    // and we also verify that there is a tailoring gap between each two CEs.
770    CollationWeights cw1c;  // compressible primary weights
771    CollationWeights cw1u;  // uncompressible primary weights
772    CollationWeights cw2;
773    CollationWeights cw3;
774
775    cw1c.initForPrimary(TRUE);
776    cw1u.initForPrimary(FALSE);
777    cw2.initForSecondary();
778    cw3.initForTertiary();
779
780    // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
781    // nor the special merge-separator CE for U+FFFE.
782    uint32_t prevPri = 0;
783    uint32_t prevSec = 0;
784    uint32_t prevTer = 0;
785    while(iter.next()) {
786        uint32_t pri = iter.getPrimary();
787        uint32_t secTer = iter.getSecTer();
788        // CollationRootElements CEs must have 0 case and quaternary bits.
789        if((secTer & Collation::CASE_AND_QUATERNARY_MASK) != 0) {
790            errln("CollationRootElements CE has non-zero case and/or quaternary bits: %08lx %08lx",
791                  (long)pri, (long)secTer);
792        }
793        uint32_t sec = secTer >> 16;
794        uint32_t ter = secTer & Collation::ONLY_TERTIARY_MASK;
795        uint32_t ctq = ter;
796        if(pri == 0 && sec == 0 && ter != 0) {
797            // Tertiary CEs must have uppercase bits,
798            // but they are not stored in the CollationRootElements.
799            ctq |= 0x8000;
800        }
801        if(!isValidCE(rootElements, *root, pri, sec, ctq)) {
802            errln("invalid root CE %08lx %08lx", (long)pri, (long)secTer);
803        } else {
804            if(pri != prevPri) {
805                uint32_t newWeight = 0;
806                if(prevPri == 0 || prevPri >= Collation::FFFD_PRIMARY) {
807                    // There is currently no tailoring gap after primary ignorables,
808                    // and we forbid tailoring after U+FFFD and U+FFFF.
809                } else if(root->isCompressiblePrimary(prevPri)) {
810                    if(!cw1c.allocWeights(prevPri, pri, 1)) {
811                        errln("no primary/compressible tailoring gap between %08lx and %08lx",
812                              (long)prevPri, (long)pri);
813                    } else {
814                        newWeight = cw1c.nextWeight();
815                    }
816                } else {
817                    if(!cw1u.allocWeights(prevPri, pri, 1)) {
818                        errln("no primary/uncompressible tailoring gap between %08lx and %08lx",
819                              (long)prevPri, (long)pri);
820                    } else {
821                        newWeight = cw1u.nextWeight();
822                    }
823                }
824                if(newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
825                    errln("mis-allocated primary weight, should get %08lx < %08lx < %08lx",
826                          (long)prevPri, (long)newWeight, (long)pri);
827                }
828            } else if(sec != prevSec) {
829                uint32_t lowerLimit =
830                    prevSec == 0 ? rootElements.getSecondaryBoundary() - 0x100 : prevSec;
831                if(!cw2.allocWeights(lowerLimit, sec, 1)) {
832                    errln("no secondary tailoring gap between %04x and %04x", lowerLimit, sec);
833                } else {
834                    uint32_t newWeight = cw2.nextWeight();
835                    if(!(prevSec < newWeight && newWeight < sec)) {
836                        errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
837                              (long)lowerLimit, (long)newWeight, (long)sec);
838                    }
839                }
840            } else if(ter != prevTer) {
841                uint32_t lowerLimit =
842                    prevTer == 0 ? rootElements.getTertiaryBoundary() - 0x100 : prevTer;
843                if(!cw3.allocWeights(lowerLimit, ter, 1)) {
844                    errln("no teriary tailoring gap between %04x and %04x", lowerLimit, ter);
845                } else {
846                    uint32_t newWeight = cw3.nextWeight();
847                    if(!(prevTer < newWeight && newWeight < ter)) {
848                        errln("mis-allocated secondary weight, should get %04x < %04x < %04x",
849                              (long)lowerLimit, (long)newWeight, (long)ter);
850                    }
851                }
852            } else {
853                errln("duplicate root CE %08lx %08lx", (long)pri, (long)secTer);
854            }
855        }
856        prevPri = pri;
857        prevSec = sec;
858        prevTer = ter;
859    }
860}
861
862void CollationTest::TestTailoredElements() {
863    IcuTestErrorCode errorCode(*this, "TestTailoredElements");
864    const CollationData *root = CollationRoot::getData(errorCode);
865    if(errorCode.logDataIfFailureAndReset("CollationRoot::getData()")) {
866        return;
867    }
868    CollationRootElements rootElements(root->rootElements, root->rootElementsLength);
869
870    UHashtable *prevLocales = uhash_open(uhash_hashChars, uhash_compareChars, NULL, errorCode);
871    if(errorCode.logIfFailureAndReset("failed to create a hash table")) {
872        return;
873    }
874    uhash_setKeyDeleter(prevLocales, uprv_free);
875    // TestRootElements() tests the root collator which does not have tailorings.
876    uhash_puti(prevLocales, uprv_strdup(""), 1, errorCode);
877    uhash_puti(prevLocales, uprv_strdup("root"), 1, errorCode);
878    uhash_puti(prevLocales, uprv_strdup("root@collation=standard"), 1, errorCode);
879
880    UVector64 ces(errorCode);
881    LocalPointer<StringEnumeration> locales(Collator::getAvailableLocales());
882    U_ASSERT(locales.isValid());
883    const char *localeID = "root";
884    do {
885        Locale locale(localeID);
886        LocalPointer<StringEnumeration> types(
887                Collator::getKeywordValuesForLocale("collation", locale, FALSE, errorCode));
888        errorCode.assertSuccess();
889        const char *type;  // first: default type
890        while((type = types->next(NULL, errorCode)) != NULL) {
891            if(strncmp(type, "private-", 8) == 0) {
892                errln("Collator::getKeywordValuesForLocale(%s) returns private collation keyword: %s",
893                        localeID, type);
894            }
895            Locale localeWithType(locale);
896            localeWithType.setKeywordValue("collation", type, errorCode);
897            errorCode.assertSuccess();
898            LocalPointer<Collator> coll(Collator::createInstance(localeWithType, errorCode));
899            if(errorCode.logIfFailureAndReset("Collator::createInstance(%s)",
900                                              localeWithType.getName())) {
901                continue;
902            }
903            Locale actual = coll->getLocale(ULOC_ACTUAL_LOCALE, errorCode);
904            if(uhash_geti(prevLocales, actual.getName()) != 0) {
905                continue;
906            }
907            uhash_puti(prevLocales, uprv_strdup(actual.getName()), 1, errorCode);
908            errorCode.assertSuccess();
909            logln("TestTailoredElements(): requested %s -> actual %s",
910                  localeWithType.getName(), actual.getName());
911            RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(coll.getAlias());
912            if(rbc == NULL) {
913                continue;
914            }
915            // Note: It would be better to get tailored strings such that we can
916            // identify the prefix, and only get the CEs for the prefix+string,
917            // not also for the prefix.
918            // There is currently no API for that.
919            // It would help in an unusual case where a contraction starting in the prefix
920            // extends past its end, and we do not see the intended mapping.
921            // For example, for a mapping p|st, if there is also a contraction ps,
922            // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
923            LocalPointer<UnicodeSet> tailored(coll->getTailoredSet(errorCode));
924            errorCode.assertSuccess();
925            UnicodeSetIterator iter(*tailored);
926            while(iter.next()) {
927                const UnicodeString &s = iter.getString();
928                ces.removeAllElements();
929                rbc->internalGetCEs(s, ces, errorCode);
930                errorCode.assertSuccess();
931                for(int32_t i = 0; i < ces.size(); ++i) {
932                    int64_t ce = ces.elementAti(i);
933                    if(!isValidCE(rootElements, *root, ce)) {
934                        errln("invalid tailored CE %016llx at CE index %d from string:",
935                              (long long)ce, (int)i);
936                        infoln(prettify(s));
937                    }
938                }
939            }
940        }
941    } while((localeID = locales->next(NULL, errorCode)) != NULL);
942    uhash_close(prevLocales);
943}
944
945UnicodeString CollationTest::printSortKey(const uint8_t *p, int32_t length) {
946    UnicodeString s;
947    for(int32_t i = 0; i < length; ++i) {
948        if(i > 0) { s.append((UChar)0x20); }
949        uint8_t b = p[i];
950        if(b == 0) {
951            s.append((UChar)0x2e);  // period
952        } else if(b == 1) {
953            s.append((UChar)0x7c);  // vertical bar
954        } else {
955            appendHex(b, 2, s);
956        }
957    }
958    return s;
959}
960
961UnicodeString CollationTest::printCollationKey(const CollationKey &key) {
962    int32_t length;
963    const uint8_t *p = key.getByteArray(length);
964    return printSortKey(p, length);
965}
966
967UBool CollationTest::readNonEmptyLine(UCHARBUF *f, IcuTestErrorCode &errorCode) {
968    for(;;) {
969        int32_t lineLength;
970        const UChar *line = ucbuf_readline(f, &lineLength, errorCode);
971        if(line == NULL || errorCode.isFailure()) {
972            fileLine.remove();
973            return FALSE;
974        }
975        ++fileLineNumber;
976        // Strip trailing CR/LF, comments, and spaces.
977        const UChar *comment = u_memchr(line, 0x23, lineLength);  // '#'
978        if(comment != NULL) {
979            lineLength = (int32_t)(comment - line);
980        } else {
981            while(lineLength > 0 && isCROrLF(line[lineLength - 1])) { --lineLength; }
982        }
983        while(lineLength > 0 && isSpace(line[lineLength - 1])) { --lineLength; }
984        if(lineLength != 0) {
985            fileLine.setTo(FALSE, line, lineLength);
986            return TRUE;
987        }
988        // Empty line, continue.
989    }
990}
991
992void CollationTest::parseString(int32_t &start, UnicodeString &prefix, UnicodeString &s,
993                                UErrorCode &errorCode) {
994    int32_t length = fileLine.length();
995    int32_t i;
996    for(i = start; i < length && !isSpace(fileLine[i]); ++i) {}
997    int32_t pipeIndex = fileLine.indexOf((UChar)0x7c, start, i - start);  // '|'
998    if(pipeIndex >= 0) {
999        prefix = fileLine.tempSubStringBetween(start, pipeIndex).unescape();
1000        if(prefix.isEmpty()) {
1001            errln("empty prefix on line %d", (int)fileLineNumber);
1002            infoln(fileLine);
1003            errorCode = U_PARSE_ERROR;
1004            return;
1005        }
1006        start = pipeIndex + 1;
1007    } else {
1008        prefix.remove();
1009    }
1010    s = fileLine.tempSubStringBetween(start, i).unescape();
1011    if(s.isEmpty()) {
1012        errln("empty string on line %d", (int)fileLineNumber);
1013        infoln(fileLine);
1014        errorCode = U_PARSE_ERROR;
1015        return;
1016    }
1017    start = i;
1018}
1019
1020Collation::Level CollationTest::parseRelationAndString(UnicodeString &s, IcuTestErrorCode &errorCode) {
1021    Collation::Level relation;
1022    int32_t start;
1023    if(fileLine[0] == 0x3c) {  // <
1024        UChar second = fileLine[1];
1025        start = 2;
1026        switch(second) {
1027        case 0x31:  // <1
1028            relation = Collation::PRIMARY_LEVEL;
1029            break;
1030        case 0x32:  // <2
1031            relation = Collation::SECONDARY_LEVEL;
1032            break;
1033        case 0x33:  // <3
1034            relation = Collation::TERTIARY_LEVEL;
1035            break;
1036        case 0x34:  // <4
1037            relation = Collation::QUATERNARY_LEVEL;
1038            break;
1039        case 0x63:  // <c
1040            relation = Collation::CASE_LEVEL;
1041            break;
1042        case 0x69:  // <i
1043            relation = Collation::IDENTICAL_LEVEL;
1044            break;
1045        default:  // just <
1046            relation = Collation::NO_LEVEL;
1047            start = 1;
1048            break;
1049        }
1050    } else if(fileLine[0] == 0x3d) {  // =
1051        relation = Collation::ZERO_LEVEL;
1052        start = 1;
1053    } else {
1054        start = 0;
1055    }
1056    if(start == 0 || !isSpace(fileLine[start])) {
1057        errln("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line %d", (int)fileLineNumber);
1058        infoln(fileLine);
1059        errorCode.set(U_PARSE_ERROR);
1060        return Collation::NO_LEVEL;
1061    }
1062    start = skipSpaces(start);
1063    UnicodeString prefix;
1064    parseString(start, prefix, s, errorCode);
1065    if(errorCode.isSuccess() && !prefix.isEmpty()) {
1066        errln("prefix string not allowed for test string: on line %d", (int)fileLineNumber);
1067        infoln(fileLine);
1068        errorCode.set(U_PARSE_ERROR);
1069        return Collation::NO_LEVEL;
1070    }
1071    if(start < fileLine.length()) {
1072        errln("unexpected line contents after test string on line %d", (int)fileLineNumber);
1073        infoln(fileLine);
1074        errorCode.set(U_PARSE_ERROR);
1075        return Collation::NO_LEVEL;
1076    }
1077    return relation;
1078}
1079
/**
 * Attribute names recognized on the left side of '=' by parseAndSetAttribute(),
 * each paired with the UColAttribute that is passed to Collator::setAttribute().
 * The table is searched linearly by exact name.
 */
static const struct {
    const char *name;    // name as written in the test data
    UColAttribute attr;  // corresponding collation attribute
} attributes[] = {
    { "backwards", UCOL_FRENCH_COLLATION },
    { "alternate", UCOL_ALTERNATE_HANDLING },
    { "caseFirst", UCOL_CASE_FIRST },
    { "caseLevel", UCOL_CASE_LEVEL },
    // UCOL_NORMALIZATION_MODE is turned on and off automatically.
    { "strength", UCOL_STRENGTH },
    // UCOL_HIRAGANA_QUATERNARY_MODE is deprecated.
    { "numeric", UCOL_NUMERIC_COLLATION }
};
1093
/**
 * Attribute value names recognized on the right side of '=' by parseAndSetAttribute(),
 * each paired with the UColAttributeValue that is passed to Collator::setAttribute().
 * The table is shared by all attributes; whether a value is legal for a given
 * attribute is left to setAttribute(). Searched linearly by exact name.
 */
static const struct {
    const char *name;          // name as written in the test data
    UColAttributeValue value;  // corresponding attribute value
} attributeValues[] = {
    { "default", UCOL_DEFAULT },
    { "primary", UCOL_PRIMARY },
    { "secondary", UCOL_SECONDARY },
    { "tertiary", UCOL_TERTIARY },
    { "quaternary", UCOL_QUATERNARY },
    { "identical", UCOL_IDENTICAL },
    { "off", UCOL_OFF },
    { "on", UCOL_ON },
    { "shifted", UCOL_SHIFTED },
    { "non-ignorable", UCOL_NON_IGNORABLE },
    { "lower", UCOL_LOWER_FIRST },
    { "upper", UCOL_UPPER_FIRST }
};
1111
1112void CollationTest::parseAndSetAttribute(IcuTestErrorCode &errorCode) {
1113    // Parse attributes even if the Collator could not be created,
1114    // in order to report syntax errors.
1115    int32_t start = skipSpaces(1);
1116    int32_t equalPos = fileLine.indexOf((UChar)0x3d);
1117    if(equalPos < 0) {
1118        if(fileLine.compare(start, 7, UNICODE_STRING("reorder", 7)) == 0) {
1119            parseAndSetReorderCodes(start + 7, errorCode);
1120            return;
1121        }
1122        errln("missing '=' on line %d", (int)fileLineNumber);
1123        infoln(fileLine);
1124        errorCode.set(U_PARSE_ERROR);
1125        return;
1126    }
1127
1128    UnicodeString attrString = fileLine.tempSubStringBetween(start, equalPos);
1129    UnicodeString valueString = fileLine.tempSubString(equalPos+1);
1130    if(attrString == UNICODE_STRING("maxVariable", 11)) {
1131        UColReorderCode max;
1132        if(valueString == UNICODE_STRING("space", 5)) {
1133            max = UCOL_REORDER_CODE_SPACE;
1134        } else if(valueString == UNICODE_STRING("punct", 5)) {
1135            max = UCOL_REORDER_CODE_PUNCTUATION;
1136        } else if(valueString == UNICODE_STRING("symbol", 6)) {
1137            max = UCOL_REORDER_CODE_SYMBOL;
1138        } else if(valueString == UNICODE_STRING("currency", 8)) {
1139            max = UCOL_REORDER_CODE_CURRENCY;
1140        } else {
1141            errln("invalid attribute value name on line %d", (int)fileLineNumber);
1142            infoln(fileLine);
1143            errorCode.set(U_PARSE_ERROR);
1144            return;
1145        }
1146        if(coll != NULL) {
1147            coll->setMaxVariable(max, errorCode);
1148            if(errorCode.isFailure()) {
1149                errln("setMaxVariable() failed on line %d: %s",
1150                      (int)fileLineNumber, errorCode.errorName());
1151                infoln(fileLine);
1152                return;
1153            }
1154        }
1155        fileLine.remove();
1156        return;
1157    }
1158
1159    UColAttribute attr;
1160    for(int32_t i = 0;; ++i) {
1161        if(i == UPRV_LENGTHOF(attributes)) {
1162            errln("invalid attribute name on line %d", (int)fileLineNumber);
1163            infoln(fileLine);
1164            errorCode.set(U_PARSE_ERROR);
1165            return;
1166        }
1167        if(attrString == UnicodeString(attributes[i].name, -1, US_INV)) {
1168            attr = attributes[i].attr;
1169            break;
1170        }
1171    }
1172
1173    UColAttributeValue value;
1174    for(int32_t i = 0;; ++i) {
1175        if(i == UPRV_LENGTHOF(attributeValues)) {
1176            errln("invalid attribute value name on line %d", (int)fileLineNumber);
1177            infoln(fileLine);
1178            errorCode.set(U_PARSE_ERROR);
1179            return;
1180        }
1181        if(valueString == UnicodeString(attributeValues[i].name, -1, US_INV)) {
1182            value = attributeValues[i].value;
1183            break;
1184        }
1185    }
1186
1187    if(coll != NULL) {
1188        coll->setAttribute(attr, value, errorCode);
1189        if(errorCode.isFailure()) {
1190            errln("illegal attribute=value combination on line %d: %s",
1191                  (int)fileLineNumber, errorCode.errorName());
1192            infoln(fileLine);
1193            return;
1194        }
1195    }
1196    fileLine.remove();
1197}
1198
1199void CollationTest::parseAndSetReorderCodes(int32_t start, IcuTestErrorCode &errorCode) {
1200    UVector32 reorderCodes(errorCode);
1201    while(start < fileLine.length()) {
1202        start = skipSpaces(start);
1203        int32_t limit = start;
1204        while(limit < fileLine.length() && !isSpace(fileLine[limit])) { ++limit; }
1205        CharString name;
1206        name.appendInvariantChars(fileLine.tempSubStringBetween(start, limit), errorCode);
1207        int32_t code = CollationRuleParser::getReorderCode(name.data());
1208        if(code < 0) {
1209            if(uprv_stricmp(name.data(), "default") == 0) {
1210                code = UCOL_REORDER_CODE_DEFAULT;  // -1
1211            } else {
1212                errln("invalid reorder code '%s' on line %d", name.data(), (int)fileLineNumber);
1213                infoln(fileLine);
1214                errorCode.set(U_PARSE_ERROR);
1215                return;
1216            }
1217        }
1218        reorderCodes.addElement(code, errorCode);
1219        start = limit;
1220    }
1221    if(coll != NULL) {
1222        coll->setReorderCodes(reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
1223        if(errorCode.isFailure()) {
1224            errln("setReorderCodes() failed on line %d: %s",
1225                  (int)fileLineNumber, errorCode.errorName());
1226            infoln(fileLine);
1227            return;
1228        }
1229    }
1230    fileLine.remove();
1231}
1232
1233void CollationTest::buildTailoring(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1234    UnicodeString rules;
1235    while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1236        rules.append(fileLine.unescape());
1237    }
1238    if(errorCode.isFailure()) { return; }
1239    logln(rules);
1240
1241    UParseError parseError;
1242    UnicodeString reason;
1243    delete coll;
1244    coll = new RuleBasedCollator(rules, parseError, reason, errorCode);
1245    if(coll == NULL) {
1246        errln("unable to allocate a new collator");
1247        errorCode.set(U_MEMORY_ALLOCATION_ERROR);
1248        return;
1249    }
1250    if(errorCode.isFailure()) {
1251        dataerrln("RuleBasedCollator(rules) failed - %s", errorCode.errorName());
1252        infoln(UnicodeString("  reason: ") + reason);
1253        if(parseError.offset >= 0) { infoln("  rules offset: %d", (int)parseError.offset); }
1254        if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) {
1255            infoln(UnicodeString("  snippet: ...") +
1256                parseError.preContext + "(!)" + parseError.postContext + "...");
1257        }
1258        delete coll;
1259        coll = NULL;
1260        errorCode.reset();
1261    } else {
1262        assertEquals("no error reason when RuleBasedCollator(rules) succeeds",
1263                     UnicodeString(), reason);
1264    }
1265}
1266
1267void CollationTest::setRootCollator(IcuTestErrorCode &errorCode) {
1268    if(errorCode.isFailure()) { return; }
1269    delete coll;
1270    coll = Collator::createInstance(Locale::getRoot(), errorCode);
1271    if(errorCode.isFailure()) {
1272        dataerrln("unable to create a root collator");
1273        return;
1274    }
1275}
1276
1277void CollationTest::setLocaleCollator(IcuTestErrorCode &errorCode) {
1278    if(errorCode.isFailure()) { return; }
1279    delete coll;
1280    coll = NULL;
1281    int32_t at = fileLine.indexOf((UChar)0x40, 9);  // @ is not invariant
1282    if(at >= 0) {
1283        fileLine.setCharAt(at, (UChar)0x2a);  // *
1284    }
1285    CharString localeID;
1286    localeID.appendInvariantChars(fileLine.tempSubString(9), errorCode);
1287    if(at >= 0) {
1288        localeID.data()[at - 9] = '@';
1289    }
1290    Locale locale(localeID.data());
1291    if(fileLine.length() == 9 || errorCode.isFailure() || locale.isBogus()) {
1292        errln("invalid language tag on line %d", (int)fileLineNumber);
1293        infoln(fileLine);
1294        if(errorCode.isSuccess()) { errorCode.set(U_PARSE_ERROR); }
1295        return;
1296    }
1297
1298    logln("creating a collator for locale ID %s", locale.getName());
1299    coll = Collator::createInstance(locale, errorCode);
1300    if(errorCode.isFailure()) {
1301        dataerrln("unable to create a collator for locale %s on line %d",
1302                  locale.getName(), (int)fileLineNumber);
1303        infoln(fileLine);
1304        delete coll;
1305        coll = NULL;
1306        errorCode.reset();
1307    }
1308}
1309
1310UBool CollationTest::needsNormalization(const UnicodeString &s, UErrorCode &errorCode) const {
1311    if(U_FAILURE(errorCode) || !fcd->isNormalized(s, errorCode)) { return TRUE; }
1312    // In some sequences with Tibetan composite vowel signs,
1313    // even if the string passes the FCD check,
1314    // those composites must be decomposed.
1315    // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1316    int32_t index = 0;
1317    while((index = s.indexOf((UChar)0xf71, index)) >= 0) {
1318        if(++index < s.length()) {
1319            UChar c = s[index];
1320            if(c == 0xf73 || c == 0xf75 || c == 0xf81) { return TRUE; }
1321        }
1322    }
1323    return FALSE;
1324}
1325
1326UBool CollationTest::getSortKeyParts(const UChar *s, int32_t length,
1327                                     CharString &dest, int32_t partSize,
1328                                     IcuTestErrorCode &errorCode) {
1329    if(errorCode.isFailure()) { return FALSE; }
1330    uint8_t part[32];
1331    U_ASSERT(partSize <= UPRV_LENGTHOF(part));
1332    UCharIterator iter;
1333    uiter_setString(&iter, s, length);
1334    uint32_t state[2] = { 0, 0 };
1335    for(;;) {
1336        int32_t partLength = coll->internalNextSortKeyPart(&iter, state, part, partSize, errorCode);
1337        UBool done = partLength < partSize;
1338        if(done) {
1339            // At the end, append the next byte as well which should be 00.
1340            ++partLength;
1341        }
1342        dest.append(reinterpret_cast<char *>(part), partLength, errorCode);
1343        if(done) {
1344            return errorCode.isSuccess();
1345        }
1346    }
1347}
1348
1349UBool CollationTest::getCollationKey(const char *norm, const UnicodeString &line,
1350                                     const UChar *s, int32_t length,
1351                                     CollationKey &key, IcuTestErrorCode &errorCode) {
1352    if(errorCode.isFailure()) { return FALSE; }
1353    coll->getCollationKey(s, length, key, errorCode);
1354    if(errorCode.isFailure()) {
1355        infoln(fileTestName);
1356        errln("Collator(%s).getCollationKey() failed: %s",
1357              norm, errorCode.errorName());
1358        infoln(line);
1359        return FALSE;
1360    }
1361    int32_t keyLength;
1362    const uint8_t *keyBytes = key.getByteArray(keyLength);
1363    if(keyLength == 0 || keyBytes[keyLength - 1] != 0) {
1364        infoln(fileTestName);
1365        errln("Collator(%s).getCollationKey() wrote an empty or unterminated key",
1366              norm);
1367        infoln(line);
1368        infoln(printCollationKey(key));
1369        return FALSE;
1370    }
1371
1372    int32_t numLevels = coll->getAttribute(UCOL_STRENGTH, errorCode);
1373    if(numLevels < UCOL_IDENTICAL) {
1374        ++numLevels;
1375    } else {
1376        numLevels = 5;
1377    }
1378    if(coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON) {
1379        ++numLevels;
1380    }
1381    errorCode.assertSuccess();
1382    int32_t numLevelSeparators = 0;
1383    for(int32_t i = 0; i < (keyLength - 1); ++i) {
1384        uint8_t b = keyBytes[i];
1385        if(b == 0) {
1386            infoln(fileTestName);
1387            errln("Collator(%s).getCollationKey() contains a 00 byte", norm);
1388            infoln(line);
1389            infoln(printCollationKey(key));
1390            return FALSE;
1391        }
1392        if(b == 1) { ++numLevelSeparators; }
1393    }
1394    if(numLevelSeparators != (numLevels - 1)) {
1395        infoln(fileTestName);
1396        errln("Collator(%s).getCollationKey() has %d level separators for %d levels",
1397              norm, (int)numLevelSeparators, (int)numLevels);
1398        infoln(line);
1399        infoln(printCollationKey(key));
1400        return FALSE;
1401    }
1402
1403    // Check that internalNextSortKeyPart() makes the same key, with several part sizes.
1404    static const int32_t partSizes[] = { 32, 3, 1 };
1405    for(int32_t psi = 0; psi < UPRV_LENGTHOF(partSizes); ++psi) {
1406        int32_t partSize = partSizes[psi];
1407        CharString parts;
1408        if(!getSortKeyParts(s, length, parts, 32, errorCode)) {
1409            infoln(fileTestName);
1410            errln("Collator(%s).internalNextSortKeyPart(%d) failed: %s",
1411                  norm, (int)partSize, errorCode.errorName());
1412            infoln(line);
1413            return FALSE;
1414        }
1415        if(keyLength != parts.length() || uprv_memcmp(keyBytes, parts.data(), keyLength) != 0) {
1416            infoln(fileTestName);
1417            errln("Collator(%s).getCollationKey() != internalNextSortKeyPart(%d)",
1418                  norm, (int)partSize);
1419            infoln(line);
1420            infoln(printCollationKey(key));
1421            infoln(printSortKey(reinterpret_cast<uint8_t *>(parts.data()), parts.length()));
1422            return FALSE;
1423        }
1424    }
1425    return TRUE;
1426}
1427
1428/**
1429 * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1430 * Leaves key unchanged if s does not contain U+FFFE.
1431 * @return TRUE if the key was successfully changed
1432 */
1433UBool CollationTest::getMergedCollationKey(const UChar *s, int32_t length,
1434                                           CollationKey &key, IcuTestErrorCode &errorCode) {
1435    if(errorCode.isFailure()) { return FALSE; }
1436    LocalMemory<uint8_t> mergedKey;
1437    int32_t mergedKeyLength = 0;
1438    int32_t mergedKeyCapacity = 0;
1439    int32_t sLength = (length >= 0) ? length : u_strlen(s);
1440    int32_t segmentStart = 0;
1441    for(int32_t i = 0;;) {
1442        if(i == sLength) {
1443            if(segmentStart == 0) {
1444                // s does not contain any U+FFFE.
1445                return FALSE;
1446            }
1447        } else if(s[i] != 0xfffe) {
1448            ++i;
1449            continue;
1450        }
1451        // Get the sort key for another segment and merge it into mergedKey.
1452        CollationKey key1(mergedKey.getAlias(), mergedKeyLength);  // copies the bytes
1453        CollationKey key2;
1454        coll->getCollationKey(s + segmentStart, i - segmentStart, key2, errorCode);
1455        int32_t key1Length, key2Length;
1456        const uint8_t *key1Bytes = key1.getByteArray(key1Length);
1457        const uint8_t *key2Bytes = key2.getByteArray(key2Length);
1458        uint8_t *dest;
1459        int32_t minCapacity = key1Length + key2Length;
1460        if(key1Length > 0) { --minCapacity; }
1461        if(minCapacity <= mergedKeyCapacity) {
1462            dest = mergedKey.getAlias();
1463        } else {
1464            if(minCapacity <= 200) {
1465                mergedKeyCapacity = 200;
1466            } else if(minCapacity <= 2 * mergedKeyCapacity) {
1467                mergedKeyCapacity *= 2;
1468            } else {
1469                mergedKeyCapacity = minCapacity;
1470            }
1471            dest = mergedKey.allocateInsteadAndReset(mergedKeyCapacity);
1472        }
1473        U_ASSERT(dest != NULL || mergedKeyCapacity == 0);
1474        if(key1Length == 0) {
1475            // key2 is the sort key for the first segment.
1476            uprv_memcpy(dest, key2Bytes, key2Length);
1477            mergedKeyLength = key2Length;
1478        } else {
1479            mergedKeyLength =
1480                ucol_mergeSortkeys(key1Bytes, key1Length, key2Bytes, key2Length,
1481                                   dest, mergedKeyCapacity);
1482        }
1483        if(i == sLength) { break; }
1484        segmentStart = ++i;
1485    }
1486    key = CollationKey(mergedKey.getAlias(), mergedKeyLength);
1487    return TRUE;
1488}
1489
1490namespace {
1491
1492/**
1493 * Replaces unpaired surrogates with U+FFFD.
1494 * Returns s if no replacement was made, otherwise buffer.
1495 */
1496const UnicodeString &surrogatesToFFFD(const UnicodeString &s, UnicodeString &buffer) {
1497    int32_t i = 0;
1498    while(i < s.length()) {
1499        UChar32 c = s.char32At(i);
1500        if(U_IS_SURROGATE(c)) {
1501            if(buffer.length() < i) {
1502                buffer.append(s, buffer.length(), i - buffer.length());
1503            }
1504            buffer.append((UChar)0xfffd);
1505        }
1506        i += U16_LENGTH(c);
1507    }
1508    if(buffer.isEmpty()) {
1509        return s;
1510    }
1511    if(buffer.length() < i) {
1512        buffer.append(s, buffer.length(), i - buffer.length());
1513    }
1514    return buffer;
1515}
1516
1517int32_t getDifferenceLevel(const CollationKey &prevKey, const CollationKey &key,
1518                           UCollationResult order, UBool collHasCaseLevel) {
1519    if(order == UCOL_EQUAL) {
1520        return Collation::NO_LEVEL;
1521    }
1522    int32_t prevKeyLength;
1523    const uint8_t *prevBytes = prevKey.getByteArray(prevKeyLength);
1524    int32_t keyLength;
1525    const uint8_t *bytes = key.getByteArray(keyLength);
1526    int32_t level = Collation::PRIMARY_LEVEL;
1527    for(int32_t i = 0;; ++i) {
1528        uint8_t b = prevBytes[i];
1529        if(b != bytes[i]) { break; }
1530        if(b == Collation::LEVEL_SEPARATOR_BYTE) {
1531            ++level;
1532            if(level == Collation::CASE_LEVEL && !collHasCaseLevel) {
1533                ++level;
1534            }
1535        }
1536    }
1537    return level;
1538}
1539
1540}
1541
1542UBool CollationTest::checkCompareTwo(const char *norm, const UnicodeString &prevFileLine,
1543                                     const UnicodeString &prevString, const UnicodeString &s,
1544                                     UCollationResult expectedOrder, Collation::Level expectedLevel,
1545                                     IcuTestErrorCode &errorCode) {
1546    if(errorCode.isFailure()) { return FALSE; }
1547
1548    // Get the sort keys first, for error debug output.
1549    CollationKey prevKey;
1550    if(!getCollationKey(norm, prevFileLine, prevString.getBuffer(), prevString.length(),
1551                        prevKey, errorCode)) {
1552        return FALSE;
1553    }
1554    CollationKey key;
1555    if(!getCollationKey(norm, fileLine, s.getBuffer(), s.length(), key, errorCode)) { return FALSE; }
1556
1557    UCollationResult order = coll->compare(prevString, s, errorCode);
1558    if(order != expectedOrder || errorCode.isFailure()) {
1559        infoln(fileTestName);
1560        errln("line %d Collator(%s).compare(previous, current) wrong order: %d != %d (%s)",
1561              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1562        infoln(prevFileLine);
1563        infoln(fileLine);
1564        infoln(printCollationKey(prevKey));
1565        infoln(printCollationKey(key));
1566        return FALSE;
1567    }
1568    order = coll->compare(s, prevString, errorCode);
1569    if(order != -expectedOrder || errorCode.isFailure()) {
1570        infoln(fileTestName);
1571        errln("line %d Collator(%s).compare(current, previous) wrong order: %d != %d (%s)",
1572              (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1573        infoln(prevFileLine);
1574        infoln(fileLine);
1575        infoln(printCollationKey(prevKey));
1576        infoln(printCollationKey(key));
1577        return FALSE;
1578    }
1579    // Test NUL-termination if the strings do not contain NUL characters.
1580    UBool containNUL = prevString.indexOf((UChar)0) >= 0 || s.indexOf((UChar)0) >= 0;
1581    if(!containNUL) {
1582        order = coll->compare(prevString.getBuffer(), -1, s.getBuffer(), -1, errorCode);
1583        if(order != expectedOrder || errorCode.isFailure()) {
1584            infoln(fileTestName);
1585            errln("line %d Collator(%s).compare(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1586                  (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1587            infoln(prevFileLine);
1588            infoln(fileLine);
1589            infoln(printCollationKey(prevKey));
1590            infoln(printCollationKey(key));
1591            return FALSE;
1592        }
1593        order = coll->compare(s.getBuffer(), -1, prevString.getBuffer(), -1, errorCode);
1594        if(order != -expectedOrder || errorCode.isFailure()) {
1595            infoln(fileTestName);
1596            errln("line %d Collator(%s).compare(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1597                  (int)fileLineNumber, norm, order, -expectedOrder, errorCode.errorName());
1598            infoln(prevFileLine);
1599            infoln(fileLine);
1600            infoln(printCollationKey(prevKey));
1601            infoln(printCollationKey(key));
1602            return FALSE;
1603        }
1604    }
1605
1606    // compare(UTF-16) treats unpaired surrogates like unassigned code points.
1607    // Unpaired surrogates cannot be converted to UTF-8.
1608    // Create valid UTF-16 strings if necessary, and use those for
1609    // both the expected compare() result and for the input to compare(UTF-8).
1610    UnicodeString prevBuffer, sBuffer;
1611    const UnicodeString &prevValid = surrogatesToFFFD(prevString, prevBuffer);
1612    const UnicodeString &sValid = surrogatesToFFFD(s, sBuffer);
1613    std::string prevUTF8, sUTF8;
1614    UnicodeString(prevValid).toUTF8String(prevUTF8);
1615    UnicodeString(sValid).toUTF8String(sUTF8);
1616    UCollationResult expectedUTF8Order;
1617    if(&prevValid == &prevString && &sValid == &s) {
1618        expectedUTF8Order = expectedOrder;
1619    } else {
1620        expectedUTF8Order = coll->compare(prevValid, sValid, errorCode);
1621    }
1622
1623    order = coll->compareUTF8(prevUTF8, sUTF8, errorCode);
1624    if(order != expectedUTF8Order || errorCode.isFailure()) {
1625        infoln(fileTestName);
1626        errln("line %d Collator(%s).compareUTF8(previous, current) wrong order: %d != %d (%s)",
1627              (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1628        infoln(prevFileLine);
1629        infoln(fileLine);
1630        infoln(printCollationKey(prevKey));
1631        infoln(printCollationKey(key));
1632        return FALSE;
1633    }
1634    order = coll->compareUTF8(sUTF8, prevUTF8, errorCode);
1635    if(order != -expectedUTF8Order || errorCode.isFailure()) {
1636        infoln(fileTestName);
1637        errln("line %d Collator(%s).compareUTF8(current, previous) wrong order: %d != %d (%s)",
1638              (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1639        infoln(prevFileLine);
1640        infoln(fileLine);
1641        infoln(printCollationKey(prevKey));
1642        infoln(printCollationKey(key));
1643        return FALSE;
1644    }
1645    // Test NUL-termination if the strings do not contain NUL characters.
1646    if(!containNUL) {
1647        order = coll->internalCompareUTF8(prevUTF8.c_str(), -1, sUTF8.c_str(), -1, errorCode);
1648        if(order != expectedUTF8Order || errorCode.isFailure()) {
1649            infoln(fileTestName);
1650            errln("line %d Collator(%s).internalCompareUTF8(previous-NUL, current-NUL) wrong order: %d != %d (%s)",
1651                  (int)fileLineNumber, norm, order, expectedUTF8Order, errorCode.errorName());
1652            infoln(prevFileLine);
1653            infoln(fileLine);
1654            infoln(printCollationKey(prevKey));
1655            infoln(printCollationKey(key));
1656            return FALSE;
1657        }
1658        order = coll->internalCompareUTF8(sUTF8.c_str(), -1, prevUTF8.c_str(), -1, errorCode);
1659        if(order != -expectedUTF8Order || errorCode.isFailure()) {
1660            infoln(fileTestName);
1661            errln("line %d Collator(%s).internalCompareUTF8(current-NUL, previous-NUL) wrong order: %d != %d (%s)",
1662                  (int)fileLineNumber, norm, order, -expectedUTF8Order, errorCode.errorName());
1663            infoln(prevFileLine);
1664            infoln(fileLine);
1665            infoln(printCollationKey(prevKey));
1666            infoln(printCollationKey(key));
1667            return FALSE;
1668        }
1669    }
1670
1671    UCharIterator leftIter;
1672    UCharIterator rightIter;
1673    uiter_setString(&leftIter, prevString.getBuffer(), prevString.length());
1674    uiter_setString(&rightIter, s.getBuffer(), s.length());
1675    order = coll->compare(leftIter, rightIter, errorCode);
1676    if(order != expectedOrder || errorCode.isFailure()) {
1677        infoln(fileTestName);
1678        errln("line %d Collator(%s).compare(UCharIterator: previous, current) "
1679              "wrong order: %d != %d (%s)",
1680              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1681        infoln(prevFileLine);
1682        infoln(fileLine);
1683        infoln(printCollationKey(prevKey));
1684        infoln(printCollationKey(key));
1685        return FALSE;
1686    }
1687
1688    order = prevKey.compareTo(key, errorCode);
1689    if(order != expectedOrder || errorCode.isFailure()) {
1690        infoln(fileTestName);
1691        errln("line %d Collator(%s).getCollationKey(previous, current).compareTo() wrong order: %d != %d (%s)",
1692              (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1693        infoln(prevFileLine);
1694        infoln(fileLine);
1695        infoln(printCollationKey(prevKey));
1696        infoln(printCollationKey(key));
1697        return FALSE;
1698    }
1699    UBool collHasCaseLevel = coll->getAttribute(UCOL_CASE_LEVEL, errorCode) == UCOL_ON;
1700    int32_t level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1701    if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1702        if(level != expectedLevel) {
1703            infoln(fileTestName);
1704            errln("line %d Collator(%s).getCollationKey(previous, current).compareTo()=%d wrong level: %d != %d",
1705                  (int)fileLineNumber, norm, order, level, expectedLevel);
1706            infoln(prevFileLine);
1707            infoln(fileLine);
1708            infoln(printCollationKey(prevKey));
1709            infoln(printCollationKey(key));
1710            return FALSE;
1711        }
1712    }
1713
1714    // If either string contains U+FFFE, then their sort keys must compare the same as
1715    // the merged sort keys of each string's between-FFFE segments.
1716    //
1717    // It is not required that
1718    //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1719    // only that those two methods yield the same order.
1720    //
1721    // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1722    if((getMergedCollationKey(prevString.getBuffer(), prevString.length(), prevKey, errorCode) |
1723                getMergedCollationKey(s.getBuffer(), s.length(), key, errorCode)) ||
1724            errorCode.isFailure()) {
1725        order = prevKey.compareTo(key, errorCode);
1726        if(order != expectedOrder || errorCode.isFailure()) {
1727            infoln(fileTestName);
1728            errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1729                "(previous, current segments between U+FFFE)).compareTo() wrong order: %d != %d (%s)",
1730                (int)fileLineNumber, norm, order, expectedOrder, errorCode.errorName());
1731            infoln(prevFileLine);
1732            infoln(fileLine);
1733            infoln(printCollationKey(prevKey));
1734            infoln(printCollationKey(key));
1735            return FALSE;
1736        }
1737        int32_t mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1738        if(order != UCOL_EQUAL && expectedLevel != Collation::NO_LEVEL) {
1739            if(mergedLevel != level) {
1740                infoln(fileTestName);
1741                errln("line %d ucol_mergeSortkeys(Collator(%s).getCollationKey"
1742                    "(previous, current segments between U+FFFE)).compareTo()=%d wrong level: %d != %d",
1743                    (int)fileLineNumber, norm, order, mergedLevel, level);
1744                infoln(prevFileLine);
1745                infoln(fileLine);
1746                infoln(printCollationKey(prevKey));
1747                infoln(printCollationKey(key));
1748                return FALSE;
1749            }
1750        }
1751    }
1752    return TRUE;
1753}
1754
1755void CollationTest::checkCompareStrings(UCHARBUF *f, IcuTestErrorCode &errorCode) {
1756    if(errorCode.isFailure()) { return; }
1757    UnicodeString prevFileLine = UNICODE_STRING("(none)", 6);
1758    UnicodeString prevString, s;
1759    prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1760    while(readNonEmptyLine(f, errorCode) && !isSectionStarter(fileLine[0])) {
1761        // Parse the line even if it will be ignored (when we do not have a Collator)
1762        // in order to report syntax issues.
1763        Collation::Level relation = parseRelationAndString(s, errorCode);
1764        if(errorCode.isFailure()) {
1765            errorCode.reset();
1766            break;
1767        }
1768        if(coll == NULL) {
1769            // We were unable to create the Collator but continue with tests.
1770            // Ignore test data for this Collator.
1771            // The next Collator creation might work.
1772            continue;
1773        }
1774        UCollationResult expectedOrder = (relation == Collation::ZERO_LEVEL) ? UCOL_EQUAL : UCOL_LESS;
1775        Collation::Level expectedLevel = relation;
1776        s.getTerminatedBuffer();  // Ensure NUL-termination.
1777        UBool isOk = TRUE;
1778        if(!needsNormalization(prevString, errorCode) && !needsNormalization(s, errorCode)) {
1779            coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_OFF, errorCode);
1780            isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1781                                   expectedOrder, expectedLevel, errorCode);
1782        }
1783        if(isOk) {
1784            coll->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, errorCode);
1785            isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1786                                   expectedOrder, expectedLevel, errorCode);
1787        }
1788        if(isOk && (!nfd->isNormalized(prevString, errorCode) || !nfd->isNormalized(s, errorCode))) {
1789            UnicodeString pn = nfd->normalize(prevString, errorCode);
1790            UnicodeString n = nfd->normalize(s, errorCode);
1791            pn.getTerminatedBuffer();
1792            n.getTerminatedBuffer();
1793            errorCode.assertSuccess();
1794            isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1795                                   expectedOrder, expectedLevel, errorCode);
1796        }
1797        if(!isOk) {
1798            errorCode.reset();  // already reported
1799        }
1800        prevFileLine = fileLine;
1801        prevString = s;
1802        prevString.getTerminatedBuffer();  // Ensure NUL-termination.
1803    }
1804}
1805
1806void CollationTest::TestDataDriven() {
1807    IcuTestErrorCode errorCode(*this, "TestDataDriven");
1808
1809    fcd = Normalizer2Factory::getFCDInstance(errorCode);
1810    nfd = Normalizer2::getNFDInstance(errorCode);
1811    if(errorCode.logDataIfFailureAndReset("Normalizer2Factory::getFCDInstance() or getNFDInstance()")) {
1812        return;
1813    }
1814
1815    CharString path(getSourceTestData(errorCode), errorCode);
1816    path.appendPathPart("collationtest.txt", errorCode);
1817    const char *codePage = "UTF-8";
1818    LocalUCHARBUFPointer f(ucbuf_open(path.data(), &codePage, TRUE, FALSE, errorCode));
1819    if(errorCode.logIfFailureAndReset("ucbuf_open(collationtest.txt)")) {
1820        return;
1821    }
1822    // Read a new line if necessary.
1823    // Sub-parsers leave the first line set that they do not handle.
1824    while(errorCode.isSuccess() && (!fileLine.isEmpty() || readNonEmptyLine(f.getAlias(), errorCode))) {
1825        if(!isSectionStarter(fileLine[0])) {
1826            errln("syntax error on line %d", (int)fileLineNumber);
1827            infoln(fileLine);
1828            return;
1829        }
1830        if(fileLine.startsWith(UNICODE_STRING("** test: ", 9))) {
1831            fileTestName = fileLine;
1832            logln(fileLine);
1833            fileLine.remove();
1834        } else if(fileLine == UNICODE_STRING("@ root", 6)) {
1835            setRootCollator(errorCode);
1836            fileLine.remove();
1837        } else if(fileLine.startsWith(UNICODE_STRING("@ locale ", 9))) {
1838            setLocaleCollator(errorCode);
1839            fileLine.remove();
1840        } else if(fileLine == UNICODE_STRING("@ rules", 7)) {
1841            buildTailoring(f.getAlias(), errorCode);
1842        } else if(fileLine[0] == 0x25 && isSpace(fileLine[1])) {  // %
1843            parseAndSetAttribute(errorCode);
1844        } else if(fileLine == UNICODE_STRING("* compare", 9)) {
1845            checkCompareStrings(f.getAlias(), errorCode);
1846        } else {
1847            errln("syntax error on line %d", (int)fileLineNumber);
1848            infoln(fileLine);
1849            return;
1850        }
1851    }
1852}
1853
1854#endif  // !UCONFIG_NO_COLLATION
1855