1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/**
4 *******************************************************************************
5 * Copyright (C) 2001-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 * CollationTest.java, ported from collationtest.cpp
9 * C++ version created on: 2012apr27
10 * created by: Markus W. Scherer
11 */
12package com.ibm.icu.dev.test.collator;
13
14import java.io.BufferedReader;
15import java.io.IOException;
16import java.text.ParseException;
17import java.util.HashSet;
18import java.util.Set;
19
20import org.junit.Test;
21
22import com.ibm.icu.dev.test.TestFmwk;
23import com.ibm.icu.dev.test.TestUtil;
24import com.ibm.icu.impl.Norm2AllModes;
25import com.ibm.icu.impl.Utility;
26import com.ibm.icu.impl.coll.Collation;
27import com.ibm.icu.impl.coll.CollationData;
28import com.ibm.icu.impl.coll.CollationFCD;
29import com.ibm.icu.impl.coll.CollationIterator;
30import com.ibm.icu.impl.coll.CollationRoot;
31import com.ibm.icu.impl.coll.CollationRootElements;
32import com.ibm.icu.impl.coll.CollationRuleParser;
33import com.ibm.icu.impl.coll.CollationWeights;
34import com.ibm.icu.impl.coll.FCDIterCollationIterator;
35import com.ibm.icu.impl.coll.FCDUTF16CollationIterator;
36import com.ibm.icu.impl.coll.UTF16CollationIterator;
37import com.ibm.icu.impl.coll.UVector32;
38import com.ibm.icu.text.CollationElementIterator;
39import com.ibm.icu.text.CollationKey;
40import com.ibm.icu.text.Collator;
41import com.ibm.icu.text.Collator.ReorderCodes;
42import com.ibm.icu.text.Normalizer2;
43import com.ibm.icu.text.RawCollationKey;
44import com.ibm.icu.text.RuleBasedCollator;
45import com.ibm.icu.text.UCharacterIterator;
46import com.ibm.icu.text.UTF16;
47import com.ibm.icu.text.UnicodeSet;
48import com.ibm.icu.text.UnicodeSetIterator;
49import com.ibm.icu.util.IllformedLocaleException;
50import com.ibm.icu.util.Output;
51import com.ibm.icu.util.ULocale;
52
53public class CollationTest extends TestFmwk {
    /** Creates the test instance; none of the fields below are assigned here. */
    public CollationTest() {
    }

    // Fields
    // Normalizer instances shared by the tests.
    // NOTE(review): presumably the FCD and NFD normalizers, going by the names -- confirm where they are assigned.
    Normalizer2 fcd, nfd;
    // Collator under test; TestMinMax() casts it to RuleBasedCollator right after calling setRootCollator().
    Collator coll;
    // NOTE(review): the next three fields look like bookkeeping for a test-data file
    // that is being read line by line -- confirm against the code that uses them.
    String fileLine;
    int fileLineNumber;
    String fileTestName;
63
64    // package private methods ----------------------------------------------
65
66    static void doTest(TestFmwk test, RuleBasedCollator col, String source,
67                       String target, int result)
68    {
69        doTestVariant(test, col, source, target, result);
70        if (result == -1) {
71            doTestVariant(test, col, target, source, 1);
72        }
73        else if (result == 1) {
74            doTestVariant(test, col, target, source, -1);
75        }
76        else {
77            doTestVariant(test, col, target, source, 0);
78        }
79
80        CollationElementIterator iter = col.getCollationElementIterator(source);
81        backAndForth(test, iter);
82        iter.setText(target);
83        backAndForth(test, iter);
84    }
85
86    /**
87     * Return an integer array containing all of the collation orders
88     * returned by calls to next on the specified iterator
89     */
90    static int[] getOrders(CollationElementIterator iter)
91    {
92        int maxSize = 100;
93        int size = 0;
94        int[] orders = new int[maxSize];
95
96        int order;
97        while ((order = iter.next()) != CollationElementIterator.NULLORDER) {
98            if (size == maxSize) {
99                maxSize *= 2;
100                int[] temp = new int[maxSize];
101                System.arraycopy(orders, 0, temp,  0, size);
102                orders = temp;
103            }
104            orders[size++] = order;
105        }
106
107        if (maxSize > size) {
108            int[] temp = new int[size];
109            System.arraycopy(orders, 0, temp,  0, size);
110            orders = temp;
111        }
112        return orders;
113    }
114
115    static void backAndForth(TestFmwk test, CollationElementIterator iter)
116    {
117        // Run through the iterator forwards and stick it into an array
118        iter.reset();
119        int[] orders = getOrders(iter);
120
121        // Now go through it backwards and make sure we get the same values
122        int index = orders.length;
123        int o;
124
125        // reset the iterator
126        iter.reset();
127
128        while ((o = iter.previous()) != CollationElementIterator.NULLORDER) {
129            if (o != orders[--index]) {
130                if (o == 0) {
131                    index ++;
132                } else {
133                    while (index > 0 && orders[index] == 0) {
134                        index --;
135                    }
136                    if (o != orders[index]) {
137                        TestFmwk.errln("Mismatch at index " + index + ": 0x"
138                            + Utility.hex(orders[index]) + " vs 0x" + Utility.hex(o));
139                        break;
140                    }
141                }
142            }
143        }
144
145        while (index != 0 && orders[index - 1] == 0) {
146          index --;
147        }
148
149        if (index != 0) {
150            String msg = "Didn't get back to beginning - index is ";
151            TestFmwk.errln(msg + index);
152
153            iter.reset();
154            TestFmwk.err("next: ");
155            while ((o = iter.next()) != CollationElementIterator.NULLORDER) {
156                String hexString = "0x" + Utility.hex(o) + " ";
157                TestFmwk.err(hexString);
158            }
159            TestFmwk.errln("");
160            TestFmwk.err("prev: ");
161            while ((o = iter.previous()) != CollationElementIterator.NULLORDER) {
162                String hexString = "0x" + Utility.hex(o) + " ";
163                 TestFmwk.err(hexString);
164            }
165            TestFmwk.errln("");
166        }
167    }
168
169    static final String appendCompareResult(int result, String target){
170        if (result == -1) {
171            target += "LESS";
172        } else if (result == 0) {
173            target += "EQUAL";
174        } else if (result == 1) {
175            target += "GREATER";
176        } else {
177            String huh = "?";
178            target += huh + result;
179        }
180        return target;
181    }
182
183    static final String prettify(CollationKey key) {
184        byte[] bytes = key.toByteArray();
185        return prettify(bytes, bytes.length);
186    }
187
188    static final String prettify(RawCollationKey key) {
189        return prettify(key.bytes, key.size);
190    }
191
192    static final String prettify(byte[] skBytes, int length) {
193        StringBuilder target = new StringBuilder(length * 3 + 2).append('[');
194
195        for (int i = 0; i < length; i++) {
196            String numStr = Integer.toHexString(skBytes[i] & 0xff);
197            if (numStr.length() < 2) {
198                target.append('0');
199            }
200            target.append(numStr).append(' ');
201        }
202        target.append(']');
203        return target.toString();
204    }
205
206    private static void doTestVariant(TestFmwk test,
207                                      RuleBasedCollator myCollation,
208                                      String source, String target, int result)
209    {
210        int compareResult  = myCollation.compare(source, target);
211        if (compareResult != result) {
212
213            // !!! if not mod build, error, else nothing.
214            // warnln if not build, error, else always print warning.
215            // do we need a 'quiet warning?' (err or log).  Hmmm,
216            // would it work to have the 'verbose' flag let you
217            // suppress warnings?  Are there ever some warnings you
218            // want to suppress, and others you don't?
219            TestFmwk.errln("Comparing \"" + Utility.hex(source) + "\" with \""
220                    + Utility.hex(target) + "\" expected " + result
221                    + " but got " + compareResult);
222        }
223        CollationKey ssk = myCollation.getCollationKey(source);
224        CollationKey tsk = myCollation.getCollationKey(target);
225        compareResult = ssk.compareTo(tsk);
226        if (compareResult != result) {
227            TestFmwk.errln("Comparing CollationKeys of \"" + Utility.hex(source)
228            + "\" with \"" + Utility.hex(target)
229            + "\" expected " + result + " but got "
230            + compareResult);
231        }
232        RawCollationKey srsk = new RawCollationKey();
233        myCollation.getRawCollationKey(source, srsk);
234        RawCollationKey trsk = new RawCollationKey();
235        myCollation.getRawCollationKey(target, trsk);
236        compareResult = ssk.compareTo(tsk);
237        if (compareResult != result) {
238            TestFmwk.errln("Comparing RawCollationKeys of \""
239                    + Utility.hex(source)
240                    + "\" with \"" + Utility.hex(target)
241                    + "\" expected " + result + " but got "
242                    + compareResult);
243        }
244    }
245
246    @Test
247    public void TestMinMax() {
248        setRootCollator();
249        RuleBasedCollator rbc = (RuleBasedCollator)coll;
250
251        final String s = "\uFFFE\uFFFF";
252        long[] ces;
253
254        ces = rbc.internalGetCEs(s);
255        if (ces.length != 2) {
256            errln("expected 2 CEs for <FFFE, FFFF>, got " + ces.length);
257            return;
258        }
259
260        long ce = ces[0];
261        long expected = Collation.makeCE(Collation.MERGE_SEPARATOR_PRIMARY);
262        if (ce != expected) {
263            errln("CE(U+fffe)=0x" + Utility.hex(ce) + " != 02..");
264        }
265
266        ce = ces[1];
267        expected = Collation.makeCE(Collation.MAX_PRIMARY);
268        if (ce != expected) {
269            errln("CE(U+ffff)=0x" + Utility.hex(ce) + " != max..");
270        }
271    }
272
273    @Test
274    public void TestImplicits() {
275        CollationData cd = CollationRoot.getData();
276
277        // Implicit primary weights should be assigned for the following sets,
278        // and sort in ascending order by set and then code point.
279        // See http://www.unicode.org/reports/tr10/#Implicit_Weights
280        // core Han Unified Ideographs
281        UnicodeSet coreHan = new UnicodeSet("[\\p{unified_ideograph}&"
282                                 + "[\\p{Block=CJK_Unified_Ideographs}"
283                                 + "\\p{Block=CJK_Compatibility_Ideographs}]]");
284        // all other Unified Han ideographs
285        UnicodeSet otherHan = new UnicodeSet("[\\p{unified ideograph}-"
286                                 + "[\\p{Block=CJK_Unified_Ideographs}"
287                                 + "\\p{Block=CJK_Compatibility_Ideographs}]]");
288
289        UnicodeSet unassigned = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]");
290        unassigned.remove(0xfffe, 0xffff);  // These have special CLDR root mappings.
291
292        // Starting with CLDR 26/ICU 54, the root Han order may instead be
293        // the Unihan radical-stroke order.
294        // The tests should pass either way, so we only test the order of a small set of Han characters
295        // whose radical-stroke order is the same as their code point order.
296        UnicodeSet someHanInCPOrder = new UnicodeSet(
297                "[\\u4E00-\\u4E16\\u4E18-\\u4E2B\\u4E2D-\\u4E3C\\u4E3E-\\u4E48" +
298                "\\u4E4A-\\u4E60\\u4E63-\\u4E8F\\u4E91-\\u4F63\\u4F65-\\u50F1\\u50F3-\\u50F6]");
299        UnicodeSet inOrder = new UnicodeSet(someHanInCPOrder);
300        inOrder.addAll(unassigned).freeze();
301
302        UnicodeSet[] sets = { coreHan, otherHan, unassigned };
303        int prev = 0;
304        long prevPrimary = 0;
305        UTF16CollationIterator ci = new UTF16CollationIterator(cd, false, "", 0);
306        for (int i = 0; i < sets.length; ++i) {
307            UnicodeSetIterator iter = new UnicodeSetIterator(sets[i]);
308            while (iter.next()) {
309                String s = iter.getString();
310                int c = s.codePointAt(0);
311                ci.setText(false, s, 0);
312                long ce = ci.nextCE();
313                long ce2 = ci.nextCE();
314                if (ce == Collation.NO_CE || ce2 != Collation.NO_CE) {
315                    errln("CollationIterator.nextCE(0x" + Utility.hex(c)
316                            + ") did not yield exactly one CE");
317                    continue;
318
319                }
320                if ((ce & 0xffffffffL) != Collation.COMMON_SEC_AND_TER_CE) {
321                    errln("CollationIterator.nextCE(U+" + Utility.hex(c, 4)
322                            + ") has non-common sec/ter weights: 0x" + Utility.hex(ce & 0xffffffffL, 8));
323                    continue;
324                }
325                long primary = ce >>> 32;
326                if (!(primary > prevPrimary) && inOrder.contains(c) && inOrder.contains(prev)) {
327                    errln("CE(U+" + Utility.hex(c) + ")=0x" + Utility.hex(primary)
328                            + ".. not greater than CE(U+" + Utility.hex(prev)
329                            + ")=0x" + Utility.hex(prevPrimary) + "..");
330
331                }
332                prev = c;
333                prevPrimary = primary;
334            }
335        }
336    }
337
338    // ICU4C: TestNulTerminated / renamed for ICU4J
339    @Test
340    public void TestSubSequence() {
341        CollationData data = CollationRoot.getData();
342        final String s = "abab"; // { 0x61, 0x62, 0x61, 0x62 }
343
344        UTF16CollationIterator ci1 = new UTF16CollationIterator(data, false, s, 0);
345        UTF16CollationIterator ci2 = new UTF16CollationIterator(data, false, s, 2);
346
347        for (int i = 0; i < 2; ++i) {
348            long ce1 = ci1.nextCE();
349            long ce2 = ci2.nextCE();
350
351            if (ce1 != ce2) {
352                errln("CollationIterator.nextCE(with start position at 0) != "
353                      + "nextCE(with start position at 2) at CE " + i);
354            }
355        }
356    }
357
358
359    // ICU4C: TestIllegalUTF8 / not applicable to ICU4J
360
361
362    private static void addLeadSurrogatesForSupplementary(UnicodeSet src, UnicodeSet dest) {
363        for(int c = 0x10000; c < 0x110000;) {
364            int next = c + 0x400;
365            if(src.containsSome(c, next - 1)) {
366                dest.add(UTF16.getLeadSurrogate(c));
367            }
368            c = next;
369        }
370    }
371
372    @Test
373    public void TestShortFCDData() {
374        UnicodeSet expectedLccc = new UnicodeSet("[:^lccc=0:]");
375        expectedLccc.add(0xdc00, 0xdfff);   // add all trail surrogates
376        addLeadSurrogatesForSupplementary(expectedLccc, expectedLccc);
377
378        UnicodeSet lccc = new UnicodeSet(); // actual
379        for (int c = 0; c <= 0xffff; ++c) {
380            if (CollationFCD.hasLccc(c)) {
381                lccc.add(c);
382            }
383        }
384
385        UnicodeSet diff = new UnicodeSet(expectedLccc);
386        diff.removeAll(lccc);
387        diff.remove(0x10000, 0x10ffff);  // hasLccc() only works for the BMP
388
389        String empty = "[]";
390        String diffString;
391
392        diffString = diff.toPattern(true);
393        assertEquals("CollationFCD::hasLccc() expected-actual", empty, diffString);
394
395        diff = lccc;
396        diff.removeAll(expectedLccc);
397        diffString = diff.toPattern(true);
398        assertEquals("CollationFCD::hasLccc() actual-expected", empty, diffString);
399
400        UnicodeSet expectedTccc = new UnicodeSet("[:^tccc=0:]");
401        addLeadSurrogatesForSupplementary(expectedLccc, expectedTccc);
402        addLeadSurrogatesForSupplementary(expectedTccc, expectedTccc);
403
404        UnicodeSet tccc = new UnicodeSet(); // actual
405        for(int c = 0; c <= 0xffff; ++c) {
406            if (CollationFCD.hasTccc(c)) {
407                tccc.add(c);
408            }
409        }
410
411        diff = new UnicodeSet(expectedTccc);
412        diff.removeAll(tccc);
413        diff.remove(0x10000, 0x10ffff); // hasTccc() only works for the BMP
414        assertEquals("CollationFCD::hasTccc() expected-actual", empty, diffString);
415
416        diff = tccc;
417        diff.removeAll(expectedTccc);
418        diffString = diff.toPattern(true);
419        assertEquals("CollationFCD::hasTccc() actual-expected", empty, diffString);
420    }
421
422    private static class CodePointIterator {
423        int[] cp;
424        int length;
425        int pos;
426
427        CodePointIterator(int[] cp) {
428            this.cp = cp;
429            this.length = cp.length;
430            this.pos = 0;
431        }
432
433        void resetToStart() {
434            pos = 0;
435        }
436
437        int next() {
438            return (pos < length) ? cp[pos++] : Collation.SENTINEL_CP;
439        }
440
441        int previous() {
442            return (pos > 0) ? cp[--pos] : Collation.SENTINEL_CP;
443        }
444
445        int getLength() {
446            return length;
447        }
448
449        int getIndex() {
450            return pos;
451        }
452    }
453
454    private void checkFCD(String name, CollationIterator ci, CodePointIterator cpi) {
455        // Iterate forward to the limit.
456        for (;;) {
457            int c1 = ci.nextCodePoint();
458            int c2 = cpi.next();
459            if (c1 != c2) {
460                errln(name + ".nextCodePoint(to limit, 1st pass) = U+" + Utility.hex(c1)
461                        + " != U+" + Utility.hex(c1) + " at " + cpi.getIndex());
462                return;
463            }
464            if (c1 < 0) {
465                break;
466            }
467        }
468
469        // Iterate backward most of the way.
470        for (int n = (cpi.getLength() * 2) / 3; n > 0; --n) {
471            int c1 = ci.previousCodePoint();
472            int c2 = cpi.previous();
473            if (c1 != c2) {
474                errln(name + ".previousCodePoint() = U+" + Utility.hex(c1) +
475                        " != U+" + Utility.hex(c2) + " at " + cpi.getIndex());
476                return;
477            }
478        }
479
480        // Forward again.
481        for (;;) {
482            int c1 = ci.nextCodePoint();
483            int c2 = cpi.next();
484            if (c1 != c2) {
485                errln(name + ".nextCodePoint(to limit again) = U+" + Utility.hex(c1)
486                        + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex());
487                return;
488            }
489            if (c1 < 0) {
490                break;
491            }
492        }
493
494        // Iterate backward to the start.
495        for (;;) {
496            int c1 = ci.previousCodePoint();
497            int c2 = cpi.previous();
498            if (c1 != c2) {
499                errln(name + ".nextCodePoint(to start) = U+" + Utility.hex(c1)
500                        + " != U+" + Utility.hex(c2) + " at " + cpi.getIndex());
501                return;
502            }
503            if (c1 < 0) {
504                break;
505            }
506        }
507    }
508
509    @Test
510    public void TestFCD() {
511        CollationData data = CollationRoot.getData();
512
513        // Input string, not FCD.
514        StringBuilder buf = new StringBuilder();
515        buf.append("\u0308\u00e1\u0062\u0301\u0327\u0430\u0062")
516            .appendCodePoint(0x1D15F)   // MUSICAL SYMBOL QUARTER NOTE=1D158 1D165, ccc=0, 216
517            .append("\u0327\u0308")     // ccc=202, 230
518            .appendCodePoint(0x1D16D)   // MUSICAL SYMBOL COMBINING AUGMENTATION DOT, ccc=226
519            .appendCodePoint(0x1D15F)
520            .appendCodePoint(0x1D16D)
521            .append("\uac01")
522            .append("\u00e7")           // Character with tccc!=0 decomposed together with mis-ordered sequence.
523            .appendCodePoint(0x1D16D).appendCodePoint(0x1D165)
524            .append("\u00e1")           // Character with tccc!=0 decomposed together with decomposed sequence.
525            .append("\u0f73\u0f75")     // Tibetan composite vowels must be decomposed.
526            .append("\u4e00\u0f81");
527        String s = buf.toString();
528
529        // Expected code points.
530        int[] cp = {
531            0x308, 0xe1, 0x62, 0x327, 0x301, 0x430, 0x62,
532            0x1D158, 0x327, 0x1D165, 0x1D16D, 0x308,
533            0x1D15F, 0x1D16D,
534            0xac01,
535            0x63, 0x327, 0x1D165, 0x1D16D,
536            0x61,
537            0xf71, 0xf71, 0xf72, 0xf74, 0x301,
538            0x4e00, 0xf71, 0xf80
539        };
540
541        FCDUTF16CollationIterator u16ci = new FCDUTF16CollationIterator(data, false, s, 0);
542        CodePointIterator cpi = new CodePointIterator(cp);
543        checkFCD("FCDUTF16CollationIterator", u16ci, cpi);
544
545        cpi.resetToStart();
546        UCharacterIterator iter = UCharacterIterator.getInstance(s);
547        FCDIterCollationIterator uici = new FCDIterCollationIterator(data, false, iter, 0);
548        checkFCD("FCDIterCollationIterator", uici, cpi);
549    }
550
551    private void checkAllocWeights(CollationWeights cw, long lowerLimit, long upperLimit,
552            int n, int someLength, int minCount) {
553
554        if (!cw.allocWeights(lowerLimit, upperLimit, n)) {
555            errln("CollationWeights::allocWeights(0x"
556                    + Utility.hex(lowerLimit) + ",0x"
557                    + Utility.hex(upperLimit) + ","
558                    + n + ") = false");
559            return;
560        }
561        long previous = lowerLimit;
562        int count = 0; // number of weights that have someLength
563        for (int i = 0; i < n; ++i) {
564            long w = cw.nextWeight();
565            if (w == 0xffffffffL) {
566                errln("CollationWeights::allocWeights(0x"
567                        + Utility.hex(lowerLimit) + ",0x"
568                        + Utility.hex(upperLimit) + ",0x"
569                        + n + ").nextWeight() returns only "
570                        + i + " weights");
571                return;
572            }
573            if (!(previous < w && w < upperLimit)) {
574                errln("CollationWeights::allocWeights(0x"
575                        + Utility.hex(lowerLimit) + ",0x"
576                        + Utility.hex(upperLimit) + ","
577                        + n + ").nextWeight() number "
578                        + (i + 1) + " -> 0x" + Utility.hex(w)
579                        + " not between "
580                        + Utility.hex(previous) + " and "
581                        + Utility.hex(upperLimit));
582                return;
583            }
584            if (CollationWeights.lengthOfWeight(w) == someLength) {
585                ++count;
586            }
587        }
588        if (count < minCount) {
589            errln("CollationWeights::allocWeights(0x"
590                    + Utility.hex(lowerLimit) + ",0x"
591                    + Utility.hex(upperLimit) + ","
592                    + n + ").nextWeight() returns only "
593                    + count + " < " + minCount + " weights of length "
594                    + someLength);
595
596        }
597    }
598
599    @Test
600    public void TestCollationWeights() {
601        CollationWeights cw = new CollationWeights();
602
603        // Non-compressible primaries use 254 second bytes 02..FF.
604        logln("CollationWeights.initForPrimary(non-compressible)");
605        cw.initForPrimary(false);
606        // Expect 1 weight 11 and 254 weights 12xx.
607        checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 1, 1);
608        checkAllocWeights(cw, 0x10000000L, 0x13000000L, 255, 2, 254);
609        // Expect 255 two-byte weights from the ranges 10ff, 11xx, 1202.
610        checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 260, 2, 255);
611        // Expect 254 two-byte weights from the ranges 10ff and 11xx.
612        checkAllocWeights(cw, 0x10fefe40L, 0x12030300L, 600, 2, 254);
613        // Expect 254^2=64516 three-byte weights.
614        // During computation, there should be 3 three-byte ranges
615        // 10ffff, 11xxxx, 120202.
616        // The middle one should be split 64515:1,
617        // and the newly-split-off range and the last ranged lengthened.
618        checkAllocWeights(cw, 0x10fffe00L, 0x12020300L, 1 + 64516 + 254 + 1, 3, 64516);
619        // Expect weights 1102 & 1103.
620        checkAllocWeights(cw, 0x10ff0000L, 0x11040000L, 2, 2, 2);
621        // Expect weights 102102 & 102103.
622        checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2);
623
624        // Compressible primaries use 251 second bytes 04..FE.
625        logln("CollationWeights.initForPrimary(compressible)");
626        cw.initForPrimary(true);
627        // Expect 1 weight 11 and 251 weights 12xx.
628        checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 1, 1);
629        checkAllocWeights(cw, 0x10000000L, 0x13000000L, 252, 2, 251);
630        // Expect 252 two-byte weights from the ranges 10fe, 11xx, 1204.
631        checkAllocWeights(cw, 0x10fdfe40L, 0x12050300L, 260, 2, 252);
632        // Expect weights 1104 & 1105.
633        checkAllocWeights(cw, 0x10fe0000L, 0x11060000L, 2, 2, 2);
634        // Expect weights 102102 & 102103.
635        checkAllocWeights(cw, 0x1020ff00L, 0x10210400L, 2, 3, 2);
636
637        // Secondary and tertiary weights use only bytes 3 & 4.
638        logln("CollationWeights.initForSecondary()");
639        cw.initForSecondary();
640        // Expect weights fbxx and all four fc..ff.
641        checkAllocWeights(cw, 0xfb20L, 0x10000L, 20, 3, 4);
642
643        logln("CollationWeights.initForTertiary()");
644        cw.initForTertiary();
645        // Expect weights 3dxx and both 3e & 3f.
646        checkAllocWeights(cw, 0x3d02L, 0x4000L, 10, 3, 2);
647    }
648
    /**
     * Checks whether the weights of one collation element are well-formed.
     * The weights are split into bytes and run through a series of checks;
     * the first failing check returns false.
     *
     * @param re   supplies the secondary/tertiary boundaries and the last common secondary
     * @param data used to ask whether the primary lead byte is compressible
     * @param p    the primary weight (up to four bytes, p1 being the most significant)
     * @param s    the secondary weight (two bytes)
     * @param ctq  the combined case, tertiary and quaternary bits
     * @return true if none of the checks below rejects the weights
     */
    private static boolean isValidCE(CollationRootElements re, CollationData data, long p, long s, long ctq) {
        // Split the primary into its four bytes, most significant first.
        long p1 = p >>> 24;
        long p2 = (p >>> 16) & 0xff;
        long p3 = (p >>> 8) & 0xff;
        long p4 = p & 0xff;
        // Split the secondary into its two bytes.
        long s1 = s >>> 8;
        long s2 = s & 0xff;
        // ctq = Case, Tertiary, Quaternary
        long c = (ctq & Collation.CASE_MASK) >>> 14;
        long t = ctq & Collation.ONLY_TERTIARY_MASK;
        long t1 = t >>> 8;
        long t2 = t & 0xff;
        long q = ctq & Collation.QUATERNARY_MASK;
        // No leading zero bytes.
        if ((p != 0 && p1 == 0) || (s != 0 && s1 == 0) || (t != 0 && t1 == 0)) {
            return false;
        }
        // No intermediate zero bytes.
        if (p1 != 0 && p2 == 0 && (p & 0xffff) != 0) {
            return false;
        }
        if (p2 != 0 && p3 == 0 && p4 != 0) {
            return false;
        }
        // Minimum & maximum lead bytes.
        if ((p1 != 0 && p1 <= Collation.MERGE_SEPARATOR_BYTE)
                || s1 == Collation.LEVEL_SEPARATOR_BYTE
                || t1 == Collation.LEVEL_SEPARATOR_BYTE || t1 > 0x3f) {
            return false;
        }
        // The case bits, shifted down, must not exceed 2.
        if (c > 2) {
            return false;
        }
        // The valid byte range for the second primary byte depends on compressibility.
        if (p2 != 0) {
            if (data.isCompressibleLeadByte((int)p1)) {
                // Compressible: p2 must lie strictly between the low and high compression bytes.
                if (p2 <= Collation.PRIMARY_COMPRESSION_LOW_BYTE
                        || Collation.PRIMARY_COMPRESSION_HIGH_BYTE <= p2) {
                    return false;
                }
            } else {
                // Not compressible: p2 only has to be above the level separator.
                if (p2 <= Collation.LEVEL_SEPARATOR_BYTE) {
                    return false;
                }
            }
        }
        // Other bytes just need to avoid the level separator.
        // Trailing zeros are ok.
        // assert (Collation.LEVEL_SEPARATOR_BYTE == 1);
        if (p3 == Collation.LEVEL_SEPARATOR_BYTE || p4 == Collation.LEVEL_SEPARATOR_BYTE
                || s2 == Collation.LEVEL_SEPARATOR_BYTE || t2 == Collation.LEVEL_SEPARATOR_BYTE) {
            return false;
        }
        // Well-formed CEs.
        // The remaining rules depend on which is the first non-zero level.
        if (p == 0) {
            if (s == 0) {
                if (t == 0) {
                    // Completely ignorable CE.
                    // Quaternary CEs are not supported.
                    if (c != 0 || q != 0) {
                        return false;
                    }
                } else {
                    // Tertiary CE.
                    // Its tertiary must be at or above the tertiary boundary, with case bits equal to 2.
                    if (t < re.getTertiaryBoundary() || c != 2) {
                        return false;
                    }
                }
            } else {
                // Secondary CE.
                // Its secondary must be at or above the secondary boundary;
                // its tertiary must be non-zero and below the tertiary boundary.
                if (s < re.getSecondaryBoundary() || t == 0 || t >= re.getTertiaryBoundary()) {
                    return false;
                }
            }
        } else {
            // Primary CE.
            // Its secondary must be non-zero, must not fall in the range
            // (COMMON_WEIGHT16, last common secondary], and must be below the secondary boundary.
            if (s == 0 || (Collation.COMMON_WEIGHT16 < s && s <= re.getLastCommonSecondary())
                    || s >= re.getSecondaryBoundary()) {
                return false;
            }
            // Its tertiary must be non-zero and below the tertiary boundary.
            if (t == 0 || t >= re.getTertiaryBoundary()) {
                return false;
            }
        }
        return true;
    }
735
736    private static boolean isValidCE(CollationRootElements re, CollationData data, long ce) {
737        long p = ce >>> 32;
738        long secTer = ce & 0xffffffffL;
739        return isValidCE(re, data, p, secTer >>> 16, secTer & 0xffff);
740    }
741
    /**
     * Steps through the {@code rootElements} array of a CollationData object,
     * exposing the current primary weight and the current secondary/tertiary
     * value after each successful call to {@link #next()}.
     */
    private static class RootElementsIterator {
        CollationData data;
        // The root elements array and its length.
        long[] elements;
        int length;

        // Current primary weight and current sec/ter value.
        long pri;
        long secTer;
        // Index of the next array element to look at.
        int index;

        /**
         * Starts with primary 0 and sec/ter 0, at the array index stored in
         * elements[CollationRootElements.IX_FIRST_TERTIARY_INDEX].
         */
        RootElementsIterator(CollationData root) {
            data = root;
            elements = root.rootElements;
            length = elements.length;
            pri = 0;
            secTer = 0;
            index = (int)elements[CollationRootElements.IX_FIRST_TERTIARY_INDEX];
        }

        /**
         * Advances to the next element.
         *
         * @return false at the end of the array or at the primary sentinel;
         *         true after pri and/or secTer have been updated
         */
        boolean next() {
            if (index >= length) {
                return false;
            }
            long p = elements[index];
            if (p == CollationRootElements.PRIMARY_SENTINEL) {
                return false;
            }
            if ((p & CollationRootElements.SEC_TER_DELTA_FLAG) != 0) {
                // Sec/ter delta: consume it and store it without the flag bit; pri is unchanged.
                ++index;
                secTer = p & ~CollationRootElements.SEC_TER_DELTA_FLAG;
                return true;
            }
            if ((p & CollationRootElements.PRIMARY_STEP_MASK) != 0) {
                // End of a range, enumerate the primaries in the range.
                int step = (int)p & CollationRootElements.PRIMARY_STEP_MASK;
                // Clear the low byte to get the primary at the end of the range.
                p &= 0xffffff00;
                if (pri == p) {
                    // Finished the range, return the next CE after it.
                    ++index;
                    return next();
                }
                assert (pri < p);
                // Return the next primary in this range.
                // index is not advanced, so the following call looks at this range end again.
                boolean isCompressible = data.isCompressiblePrimary(pri);
                if ((pri & 0xffff) == 0) {
                    pri = Collation.incTwoBytePrimaryByOffset(pri, isCompressible, step);
                } else {
                    pri = Collation.incThreeBytePrimaryByOffset(pri, isCompressible, step);
                }
                return true;
            }
            // Simple primary CE.
            ++index;
            pri = p;
            // Does this have an explicit below-common sec/ter unit,
            // or does it imply a common one?
            if(index == length) {
                secTer = Collation.COMMON_SEC_AND_TER_CE;
            } else {
                // Peek at the following element without consuming it yet.
                secTer = elements[index];
                if((secTer & CollationRootElements.SEC_TER_DELTA_FLAG) == 0) {
                    // No sec/ter delta.
                    secTer = Collation.COMMON_SEC_AND_TER_CE;
                } else {
                    secTer &= ~CollationRootElements.SEC_TER_DELTA_FLAG;
                    if(secTer > Collation.COMMON_SEC_AND_TER_CE) {
                        // Implied sec/ter.
                        // The peeked delta is left in place and is handled by a later call.
                        secTer = Collation.COMMON_SEC_AND_TER_CE;
                    } else {
                        // Explicit sec/ter below common/common.
                        ++index;
                    }
                }
            }
            return true;
        }

        /** Returns the current primary weight. */
        long getPrimary() {
            return pri;
        }

        /** Returns the current secondary/tertiary value. */
        long getSecTer() {
            return secTer;
        }
    }
826
    /**
     * Iterates over the root collation elements and reports an error for each one that
     * has case/quaternary bits set, fails {@code isValidCE()}, duplicates its predecessor,
     * or leaves no room to allocate one new weight between it and its predecessor
     * on the first level (primary, secondary, tertiary) where the two differ.
     */
    @Test
    public void TestRootElements() {
        CollationData root = CollationRoot.getData();

        CollationRootElements rootElements = new CollationRootElements(root.rootElements);
        RootElementsIterator iter = new RootElementsIterator(root);

        // We check each root CE for validity,
        // and we also verify that there is a tailoring gap between each two CEs.
        CollationWeights cw1c = new CollationWeights(); // compressible primary weights
        CollationWeights cw1u = new CollationWeights(); // uncompressible primary weights
        CollationWeights cw2 = new CollationWeights();  // secondary weights
        CollationWeights cw3 = new CollationWeights();  // tertiary weights

        cw1c.initForPrimary(true);
        cw1u.initForPrimary(false);
        cw2.initForSecondary();
        cw3.initForTertiary();

        // Note: The root elements do not include Han-implicit or unassigned-implicit CEs,
        // nor the special merge-separator CE for U+FFFE.
        // Weights of the previously visited CE, for the gap checks below.
        long prevPri = 0;
        long prevSec = 0;
        long prevTer = 0;

        while (iter.next()) {
            long pri = iter.getPrimary();
            long secTer = iter.getSecTer();
            // CollationRootElements CEs must have 0 case and quaternary bits.
            if ((secTer & Collation.CASE_AND_QUATERNARY_MASK) != 0) {
                errln("CollationRootElements CE has non-zero case and/or quaternary bits: "
                        + "0x" + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8));
            }
            // Split the combined value: secondary in the upper 16 bits, tertiary in the masked lower bits.
            long sec = secTer >>> 16;
            long ter = secTer & Collation.ONLY_TERTIARY_MASK;
            long ctq = ter;
            if (pri == 0 && sec == 0 && ter != 0) {
                // Tertiary CEs must have uppercase bits,
                // but they are not stored in the CollationRootElements.
                ctq |= 0x8000;
            }
            if (!isValidCE(rootElements, root, pri, sec, ctq)) {
                errln("invalid root CE 0x"
                        + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8));
            } else {
                // Check the tailoring gap on the most significant level that changed.
                if (pri != prevPri) {
                    // newWeight stays 0 when no gap is required or when allocation already failed.
                    long newWeight = 0;
                    if (prevPri == 0 || prevPri >= Collation.FFFD_PRIMARY) {
                        // There is currently no tailoring gap after primary ignorables,
                        // and we forbid tailoring after U+FFFD and U+FFFF.
                    } else if (root.isCompressiblePrimary(prevPri)) {
                        if (!cw1c.allocWeights(prevPri, pri, 1)) {
                            errln("no primary/compressible tailoring gap between "
                                    + "0x" + Utility.hex(prevPri, 8)
                                    + " and 0x" + Utility.hex(pri, 8));
                        } else {
                            newWeight = cw1c.nextWeight();
                        }
                    } else {
                        if (!cw1u.allocWeights(prevPri, pri, 1)) {
                            errln("no primary/uncompressible tailoring gap between "
                                    + "0x" + Utility.hex(prevPri, 8)
                                    + " and 0x" + Utility.hex(pri, 8));
                        } else {
                            newWeight = cw1u.nextWeight();
                        }
                    }
                    if (newWeight != 0 && !(prevPri < newWeight && newWeight < pri)) {
                        errln("mis-allocated primary weight, should get "
                                + "0x" + Utility.hex(prevPri, 8)
                                + " < 0x" + Utility.hex(newWeight, 8)
                                + " < 0x" + Utility.hex(pri, 8));
                    }
                } else if (sec != prevSec) {
                    // After a zero secondary, allocate from 0x100 below the secondary boundary.
                    long lowerLimit = prevSec == 0 ?
                            rootElements.getSecondaryBoundary() - 0x100 : prevSec;
                    if (!cw2.allocWeights(lowerLimit, sec, 1)) {
                        errln("no secondary tailoring gap between "
                                + "0x" + Utility.hex(lowerLimit)
                                + " and 0x" + Utility.hex(sec));
                    } else {
                        long newWeight = cw2.nextWeight();
                        // NOTE(review): this check compares against prevSec while the message
                        // prints lowerLimit; the two differ when prevSec == 0 -- confirm intent.
                        if (!(prevSec < newWeight && newWeight < sec)) {
                            errln("mis-allocated secondary weight, should get "
                                    + "0x" + Utility.hex(lowerLimit)
                                    + " < 0x" + Utility.hex(newWeight)
                                    + " < 0x" + Utility.hex(sec));
                        }
                    }
                } else if (ter != prevTer) {
                    // After a zero tertiary, allocate from 0x100 below the tertiary boundary.
                    long lowerLimit = prevTer == 0 ?
                            rootElements.getTertiaryBoundary() - 0x100 : prevTer;
                    if (!cw3.allocWeights(lowerLimit, ter, 1)) {
                        errln("no tertiary tailoring gap between "
                                + "0x" + Utility.hex(lowerLimit)
                                + " and 0x" + Utility.hex(ter));
                    } else {
                        long newWeight = cw3.nextWeight();
                        // NOTE(review): as for secondaries, the check uses prevTer but the
                        // message prints lowerLimit -- confirm intent.
                        if (!(prevTer < newWeight && newWeight < ter)) {
                            errln("mis-allocated tertiary weight, should get "
                                    + "0x" + Utility.hex(lowerLimit)
                                    + " < 0x" + Utility.hex(newWeight)
                                    + " < 0x" + Utility.hex(ter));
                        }
                    }
                } else {
                    // All three levels equal the previous CE.
                    errln("duplicate root CE 0x"
                            + Utility.hex(pri, 8) + " 0x" + Utility.hex(secTer, 8));
                }
            }
            prevPri = pri;
            prevSec = sec;
            prevTer = ter;
        }
    }
942
943    @Test
944    public void TestTailoredElements() {
945        CollationData root = CollationRoot.getData();
946        CollationRootElements rootElements = new CollationRootElements(root.rootElements);
947
948        Set<String> prevLocales = new HashSet<String>();
949        prevLocales.add("");
950        prevLocales.add("root");
951        prevLocales.add("root@collation=standard");
952
953        long[] ces;
954        ULocale[] locales = Collator.getAvailableULocales();
955        String localeID = "root";
956        int locIdx = 0;
957
958        for (; locIdx < locales.length; localeID = locales[locIdx++].getName()) {
959            ULocale locale = new ULocale(localeID);
960            String[] types = Collator.getKeywordValuesForLocale("collation", locale, false);
961            for (int typeIdx = 0; typeIdx < types.length; ++typeIdx) {
962                String type = types[typeIdx];  // first: default type
963                if (type.startsWith("private-")) {
964                    errln("Collator.getKeywordValuesForLocale(" + localeID +
965                            ") returns private collation keyword: " + type);
966                }
967                ULocale localeWithType = locale.setKeywordValue("collation", type);
968                Collator coll = Collator.getInstance(localeWithType);
969                ULocale actual = coll.getLocale(ULocale.ACTUAL_LOCALE);
970                if (prevLocales.contains(actual.getName())) {
971                    continue;
972                }
973                prevLocales.add(actual.getName());
974                logln("TestTailoredElements(): requested " + localeWithType.getName()
975                        + " -> actual " + actual.getName());
976                if (!(coll instanceof RuleBasedCollator)) {
977                    continue;
978                }
979                RuleBasedCollator rbc = (RuleBasedCollator) coll;
980
981                // Note: It would be better to get tailored strings such that we can
982                // identify the prefix, and only get the CEs for the prefix+string,
983                // not also for the prefix.
984                // There is currently no API for that.
985                // It would help in an unusual case where a contraction starting in the prefix
986                // extends past its end, and we do not see the intended mapping.
987                // For example, for a mapping p|st, if there is also a contraction ps,
988                // then we get CEs(ps)+CEs(t), rather than CEs(p|st).
989                UnicodeSet tailored = coll.getTailoredSet();
990                UnicodeSetIterator iter = new UnicodeSetIterator(tailored);
991                while (iter.next()) {
992                    String s = iter.getString();
993                    ces = rbc.internalGetCEs(s);
994                    for (int i = 0; i < ces.length; ++i) {
995                        long ce = ces[i];
996                        if (!isValidCE(rootElements, root, ce)) {
997                            logln(prettify(s));
998                            errln("invalid tailored CE 0x" + Utility.hex(ce, 16)
999                                    + " at CE index " + i + " from string:");
1000                        }
1001                    }
1002                }
1003            }
1004        }
1005    }
1006
1007    private static boolean isSpace(char c) {
1008        return (c == 0x09 || c == 0x20 || c == 0x3000);
1009    }
1010
1011    private static boolean isSectionStarter(char c) {
1012        return (c == '%' || c == '*' || c == '@');
1013    }
1014
1015    private int skipSpaces(int i) {
1016        while (isSpace(fileLine.charAt(i))) {
1017            ++i;
1018        }
1019        return i;
1020    }
1021
1022    private String printSortKey(byte[] p) {
1023        StringBuilder s = new StringBuilder();
1024        for (int i = 0; i < p.length; ++i) {
1025            if (i > 0) {
1026                s.append(' ');
1027            }
1028            byte b = p[i];
1029            if (b == 0) {
1030                s.append('.');
1031            } else if (b == 1) {
1032                s.append('|');
1033            } else {
1034                s.append(String.format("%02x", b & 0xff));
1035            }
1036        }
1037        return s.toString();
1038    }
1039
1040    private String printCollationKey(CollationKey key) {
1041        byte[] p = key.toByteArray();
1042        return printSortKey(p);
1043    }
1044
1045    private boolean readNonEmptyLine(BufferedReader in) throws IOException {
1046        for (;;) {
1047            String line = in.readLine();
1048            if (line == null) {
1049                fileLine = null;
1050                return false;
1051            }
1052            if (fileLineNumber == 0 && line.length() != 0 && line.charAt(0) == '\uFEFF') {
1053                line = line.substring(1);  // Remove the BOM.
1054            }
1055            ++fileLineNumber;
1056            // Strip trailing comments and spaces
1057            int idx = line.indexOf('#');
1058            if (idx < 0) {
1059                idx = line.length();
1060            }
1061            while (idx > 0 && isSpace(line.charAt(idx - 1))) {
1062                --idx;
1063            }
1064            if (idx != 0) {
1065                fileLine = idx < line.length() ? line.substring(0, idx) : line;
1066                return true;
1067            }
1068            // Empty line, continue.
1069        }
1070    }
1071
1072    private int parseString(int start, Output<String> prefix, Output<String> s) throws ParseException {
1073        int length = fileLine.length();
1074        int i;
1075        for (i = start; i < length && !isSpace(fileLine.charAt(i)); ++i) {
1076        }
1077        int pipeIndex = fileLine.indexOf('|', start);
1078        if (pipeIndex >= 0 && pipeIndex < i) {
1079            String tmpPrefix  = Utility.unescape(fileLine.substring(start, pipeIndex));
1080            if (tmpPrefix.length() == 0) {
1081                prefix.value = null;
1082                logln(fileLine);
1083                throw new ParseException("empty prefix on line " + fileLineNumber, fileLineNumber);
1084            }
1085            prefix.value = tmpPrefix;
1086            start = pipeIndex + 1;
1087        } else {
1088            prefix.value = null;
1089        }
1090
1091        String tmp = Utility.unescape(fileLine.substring(start, i));
1092        if (tmp.length() == 0) {
1093            s.value = null;
1094            logln(fileLine);
1095            throw new ParseException("empty string on line " + fileLineNumber, fileLineNumber);
1096        }
1097        s.value = tmp;
1098        return i;
1099    }
1100
1101    private int parseRelationAndString(Output<String> s) throws ParseException {
1102        int relation = Collation.NO_LEVEL;
1103        int start;
1104        if (fileLine.charAt(0) == '<') {
1105            char second = fileLine.charAt(1);
1106            start = 2;
1107            switch(second) {
1108            case 0x31:  // <1
1109                relation = Collation.PRIMARY_LEVEL;
1110                break;
1111            case 0x32:  // <2
1112                relation = Collation.SECONDARY_LEVEL;
1113                break;
1114            case 0x33:  // <3
1115                relation = Collation.TERTIARY_LEVEL;
1116                break;
1117            case 0x34:  // <4
1118                relation = Collation.QUATERNARY_LEVEL;
1119                break;
1120            case 0x63:  // <c
1121                relation = Collation.CASE_LEVEL;
1122                break;
1123            case 0x69:  // <i
1124                relation = Collation.IDENTICAL_LEVEL;
1125                break;
1126            default:  // just <
1127                relation = Collation.NO_LEVEL;
1128                start = 1;
1129                break;
1130            }
1131        } else if (fileLine.charAt(0) == '=') {
1132            relation = Collation.ZERO_LEVEL;
1133            start = 1;
1134        } else {
1135            start = 0;
1136        }
1137
1138        if (start == 0 || !isSpace(fileLine.charAt(start))) {
1139            logln(fileLine);
1140            throw new ParseException("no relation (= < <1 <2 <c <3 <4 <i) at beginning of line "
1141                                        + fileLineNumber, fileLineNumber);
1142        }
1143
1144        start = skipSpaces(start);
1145        Output<String> prefixOut = new Output<String>();
1146        start = parseString(start, prefixOut, s);
1147        if (prefixOut.value != null) {
1148            logln(fileLine);
1149            throw new ParseException("prefix string not allowed for test string: on line "
1150                                        + fileLineNumber, fileLineNumber);
1151        }
1152        if (start < fileLine.length()) {
1153            logln(fileLine);
1154            throw new ParseException("unexpected line contents after test string on line "
1155                                        + fileLineNumber, fileLineNumber);
1156        }
1157
1158        return relation;
1159    }
1160
1161    private void parseAndSetAttribute() throws ParseException {
1162        // Parse attributes even if the Collator could not be created,
1163        // in order to report syntax errors.
1164        int start = skipSpaces(1);
1165        int equalPos = fileLine.indexOf('=');
1166        if (equalPos < 0) {
1167            if (fileLine.regionMatches(start, "reorder", 0, 7)) {
1168                parseAndSetReorderCodes(start + 7);
1169                return;
1170            }
1171            logln(fileLine);
1172            throw new ParseException("missing '=' on line " + fileLineNumber, fileLineNumber);
1173        }
1174
1175        String attrString = fileLine.substring(start,  equalPos);
1176        String valueString = fileLine.substring(equalPos + 1);
1177        if (attrString.equals("maxVariable")) {
1178            int max;
1179            if (valueString.equals("space")) {
1180                max = ReorderCodes.SPACE;
1181            } else if(valueString.equals("punct")) {
1182                max = ReorderCodes.PUNCTUATION;
1183            } else if(valueString.equals("symbol")) {
1184                max = ReorderCodes.SYMBOL;
1185            } else if(valueString.equals("currency")) {
1186                max = ReorderCodes.CURRENCY;
1187            } else {
1188                logln(fileLine);
1189                throw new ParseException("invalid attribute value name on line "
1190                                            + fileLineNumber, fileLineNumber);
1191            }
1192            if (coll != null) {
1193                coll.setMaxVariable(max);
1194            }
1195            fileLine = null;
1196            return;
1197        }
1198
1199        boolean parsed = true;
1200        RuleBasedCollator rbc = (RuleBasedCollator)coll;
1201        if (attrString.equals("backwards")) {
1202            if (valueString.equals("on")) {
1203                if (rbc != null) rbc.setFrenchCollation(true);
1204            } else if (valueString.equals("off")) {
1205                if (rbc != null) rbc.setFrenchCollation(false);
1206            } else if (valueString.equals("default")) {
1207                if (rbc != null) rbc.setFrenchCollationDefault();
1208            } else {
1209                parsed = false;
1210            }
1211        } else if (attrString.equals("alternate")) {
1212            if (valueString.equals("non-ignorable")) {
1213                if (rbc != null) rbc.setAlternateHandlingShifted(false);
1214            } else if (valueString.equals("shifted")) {
1215                if (rbc != null) rbc.setAlternateHandlingShifted(true);
1216            } else if (valueString.equals("default")) {
1217                if (rbc != null) rbc.setAlternateHandlingDefault();
1218            } else {
1219                parsed = false;
1220            }
1221        } else if (attrString.equals("caseFirst")) {
1222            if (valueString.equals("upper")) {
1223                if (rbc != null) rbc.setUpperCaseFirst(true);
1224            } else if (valueString.equals("lower")) {
1225                if (rbc != null) rbc.setLowerCaseFirst(true);
1226            } else if (valueString.equals("default")) {
1227                if (rbc != null) rbc.setCaseFirstDefault();
1228            } else {
1229                parsed = false;
1230            }
1231        } else if (attrString.equals("caseLevel")) {
1232            if (valueString.equals("on")) {
1233                if (rbc != null) rbc.setCaseLevel(true);
1234            } else if (valueString.equals("off")) {
1235                if (rbc != null) rbc.setCaseLevel(false);
1236            } else if (valueString.equals("default")) {
1237                if (rbc != null) rbc.setCaseLevelDefault();
1238            } else {
1239                parsed = false;
1240            }
1241        } else if (attrString.equals("strength")) {
1242            if (valueString.equals("primary")) {
1243                if (rbc != null) rbc.setStrength(Collator.PRIMARY);
1244            } else if (valueString.equals("secondary")) {
1245                if (rbc != null) rbc.setStrength(Collator.SECONDARY);
1246            } else if (valueString.equals("tertiary")) {
1247                if (rbc != null) rbc.setStrength(Collator.TERTIARY);
1248            } else if (valueString.equals("quaternary")) {
1249                if (rbc != null) rbc.setStrength(Collator.QUATERNARY);
1250            } else if (valueString.equals("identical")) {
1251                if (rbc != null) rbc.setStrength(Collator.IDENTICAL);
1252            } else if (valueString.equals("default")) {
1253                if (rbc != null) rbc.setStrengthDefault();
1254            } else {
1255                parsed = false;
1256            }
1257        } else if (attrString.equals("numeric")) {
1258            if (valueString.equals("on")) {
1259                if (rbc != null) rbc.setNumericCollation(true);
1260            } else if (valueString.equals("off")) {
1261                if (rbc != null) rbc.setNumericCollation(false);
1262            } else if (valueString.equals("default")) {
1263                if (rbc != null) rbc.setNumericCollationDefault();
1264            } else {
1265                parsed = false;
1266            }
1267        } else {
1268            logln(fileLine);
1269            throw new ParseException("invalid attribute name on line "
1270                                        + fileLineNumber, fileLineNumber);
1271        }
1272        if (!parsed) {
1273            logln(fileLine);
1274            throw new ParseException(
1275                    "invalid attribute value name or attribute=value combination on line "
1276                    + fileLineNumber, fileLineNumber);
1277        }
1278
1279        fileLine = null;
1280    }
1281
1282    private void parseAndSetReorderCodes(int start) throws ParseException {
1283        UVector32 reorderCodes = new UVector32();
1284        while (start < fileLine.length()) {
1285            start = skipSpaces(start);
1286            int limit = start;
1287            while (limit < fileLine.length() && !isSpace(fileLine.charAt(limit))) {
1288                ++limit;
1289            }
1290            String name = fileLine.substring(start, limit);
1291            int code = CollationRuleParser.getReorderCode(name);
1292            if (code < -1) {
1293                if (name.equalsIgnoreCase("default")) {
1294                    code = ReorderCodes.DEFAULT;  // -1
1295                } else {
1296                    logln(fileLine);
1297                    throw new ParseException("invalid reorder code '" + name + "' on line "
1298                                                + fileLineNumber, fileLineNumber);
1299                }
1300            }
1301            reorderCodes.addElement(code);
1302            start = limit;
1303        }
1304        if (coll != null) {
1305            int[] reorderCodesArray = new int[reorderCodes.size()];
1306            System.arraycopy(reorderCodes.getBuffer(), 0,
1307                    reorderCodesArray, 0, reorderCodes.size());
1308            coll.setReorderCodes(reorderCodesArray);
1309        }
1310
1311        fileLine = null;
1312    }
1313
1314    private void buildTailoring(BufferedReader in) throws IOException {
1315        StringBuilder rules = new StringBuilder();
1316        while (readNonEmptyLine(in) && !isSectionStarter(fileLine.charAt(0))) {
1317            rules.append(Utility.unescape(fileLine));
1318        }
1319
1320        try {
1321            coll = new RuleBasedCollator(rules.toString());
1322        } catch (Exception e) {
1323            logln(rules.toString());
1324            // Android patch: Add --omitCollationRules to genrb.
1325            logln("RuleBasedCollator(rules) failed - " + e.getMessage());
1326            // Android patch end.
1327            coll = null;
1328        }
1329    }
1330
1331    private void setRootCollator() {
1332        coll = Collator.getInstance(ULocale.ROOT);
1333    }
1334
1335    private void setLocaleCollator() {
1336        coll = null;
1337        ULocale locale = null;
1338        if (fileLine.length() > 9) {
1339            String localeID = fileLine.substring(9); // "@ locale <langTag>"
1340            try {
1341                locale = new ULocale(localeID);  // either locale ID or language tag
1342            } catch (IllformedLocaleException e) {
1343                locale = null;
1344            }
1345        }
1346        if (locale == null) {
1347            logln(fileLine);
1348            errln("invalid language tag on line " + fileLineNumber);
1349            return;
1350        }
1351
1352        logln("creating a collator for locale ID " + locale.getName());
1353        try {
1354            coll = Collator.getInstance(locale);
1355        } catch (Exception e) {
1356            errln("unable to create a collator for locale " + locale +
1357                    " on line " + fileLineNumber + " - " + e);
1358        }
1359    }
1360
1361    private boolean needsNormalization(String s) {
1362        if (!fcd.isNormalized(s)) {
1363            return true;
1364        }
1365        // In some sequences with Tibetan composite vowel signs,
1366        // even if the string passes the FCD check,
1367        // those composites must be decomposed.
1368        // Check if s contains 0F71 immediately followed by 0F73 or 0F75 or 0F81.
1369        int index = 0;
1370        while((index = s.indexOf(0xf71, index)) >= 0) {
1371            if (++index < s.length()) {
1372                char c = s.charAt(index);
1373                if (c == 0xf73 || c == 0xf75 || c == 0xf81) {
1374                    return true;
1375                }
1376            }
1377        }
1378        return false;
1379    }
1380
1381    private boolean getCollationKey(String norm, String line, String s, Output<CollationKey> keyOut) {
1382        CollationKey key = coll.getCollationKey(s);
1383        keyOut.value = key;
1384
1385        byte[] keyBytes = key.toByteArray();
1386        if (keyBytes.length == 0 || keyBytes[keyBytes.length - 1] != 0) {
1387            logln(fileTestName);
1388            logln(line);
1389            logln(printCollationKey(key));
1390            errln("Collator(" + norm + ").getCollationKey() wrote an empty or unterminated key");
1391            return false;
1392        }
1393
1394        int numLevels = coll.getStrength();
1395        if (numLevels < Collator.IDENTICAL) {
1396            ++numLevels;
1397        } else {
1398            numLevels = 5;
1399        }
1400        if (((RuleBasedCollator)coll).isCaseLevel()) {
1401            ++numLevels;
1402        }
1403        int numLevelSeparators = 0;
1404        for (int i = 0; i < (keyBytes.length - 1); ++i) {
1405            byte b = keyBytes[i];
1406            if (b == 0) {
1407                logln(fileTestName);
1408                logln(line);
1409                logln(printCollationKey(key));
1410                errln("Collator(" + norm + ").getCollationKey() contains a 00 byte");
1411                return false;
1412            }
1413            if (b == 1) {
1414                ++numLevelSeparators;
1415            }
1416        }
1417        if (numLevelSeparators != (numLevels - 1)) {
1418            logln(fileTestName);
1419            logln(line);
1420            logln(printCollationKey(key));
1421            errln("Collator(" + norm + ").getCollationKey() has "
1422                    + numLevelSeparators + " level separators for "
1423                    + numLevels + " levels");
1424            return false;
1425        }
1426
1427        // No nextSortKeyPart support in ICU4J
1428
1429        return true;
1430    }
1431
1432    /**
1433     * Changes the key to the merged segments of the U+FFFE-separated substrings of s.
1434     * Leaves key unchanged if s does not contain U+FFFE.
1435     * @return true if the key was successfully changed
1436     */
1437    private boolean getMergedCollationKey(String s, Output<CollationKey> key) {
1438        CollationKey mergedKey = null;
1439        int sLength = s.length();
1440        int segmentStart = 0;
1441        for (int i = 0;;) {
1442            if (i == sLength) {
1443                if (segmentStart == 0) {
1444                    // s does not contain any U+FFFE.
1445                    return false;
1446                }
1447            } else if (s.charAt(i) != '\uFFFE') {
1448                ++i;
1449                continue;
1450            }
1451            // Get the sort key for another segment and merge it into mergedKey.
1452            CollationKey tmpKey = coll.getCollationKey(s.substring(segmentStart, i));
1453            if (mergedKey == null) {
1454                mergedKey = tmpKey;
1455            } else {
1456                mergedKey = mergedKey.merge(tmpKey);
1457            }
1458            if (i == sLength) {
1459                break;
1460            }
1461            segmentStart = ++i;
1462        }
1463        key.value = mergedKey;
1464        return true;
1465    }
1466
1467    private static int getDifferenceLevel(CollationKey prevKey, CollationKey key,
1468            int order, boolean collHasCaseLevel) {
1469        if (order == Collation.EQUAL) {
1470            return Collation.NO_LEVEL;
1471        }
1472        byte[] prevBytes = prevKey.toByteArray();
1473        byte[] bytes = key.toByteArray();
1474        int level = Collation.PRIMARY_LEVEL;
1475        for (int i = 0;; ++i) {
1476            byte b = prevBytes[i];
1477            if (b != bytes[i]) {
1478                break;
1479            }
1480            if ((int)b == Collation.LEVEL_SEPARATOR_BYTE) {
1481                ++level;
1482                if (level == Collation.CASE_LEVEL && !collHasCaseLevel) {
1483                    ++level;
1484                }
1485            }
1486        }
1487        return level;
1488    }
1489
1490    private boolean checkCompareTwo(String norm, String prevFileLine, String prevString, String s,
1491                                    int expectedOrder, int expectedLevel) {
1492        // Get the sort keys first, for error debug output.
1493        Output<CollationKey> prevKeyOut = new Output<CollationKey>();
1494        CollationKey prevKey;
1495        if (!getCollationKey(norm, fileLine, prevString, prevKeyOut)) {
1496            return false;
1497        }
1498        prevKey = prevKeyOut.value;
1499
1500        Output<CollationKey> keyOut = new Output<CollationKey>();
1501        CollationKey key;
1502        if (!getCollationKey(norm, fileLine, s, keyOut)) {
1503            return false;
1504        }
1505        key = keyOut.value;
1506
1507        int order = coll.compare(prevString, s);
1508        if (order != expectedOrder) {
1509            logln(fileTestName);
1510            logln(prevFileLine);
1511            logln(fileLine);
1512            logln(printCollationKey(prevKey));
1513            logln(printCollationKey(key));
1514            errln("line " + fileLineNumber
1515                    + " Collator(" + norm + ").compare(previous, current) wrong order: "
1516                    + order + " != " + expectedOrder);
1517            return false;
1518        }
1519        order = coll.compare(s, prevString);
1520        if (order != -expectedOrder) {
1521            logln(fileTestName);
1522            logln(prevFileLine);
1523            logln(fileLine);
1524            logln(printCollationKey(prevKey));
1525            logln(printCollationKey(key));
1526            errln("line " + fileLineNumber
1527                    + " Collator(" + norm + ").compare(current, previous) wrong order: "
1528                    + order + " != " + -expectedOrder);
1529            return false;
1530        }
1531
1532        order = prevKey.compareTo(key);
1533        if (order != expectedOrder) {
1534            logln(fileTestName);
1535            logln(prevFileLine);
1536            logln(fileLine);
1537            logln(printCollationKey(prevKey));
1538            logln(printCollationKey(key));
1539            errln("line " + fileLineNumber
1540                    + " Collator(" + norm + ").getCollationKey(previous, current).compareTo() wrong order: "
1541                    + order + " != " + expectedOrder);
1542            return false;
1543        }
1544        boolean collHasCaseLevel = ((RuleBasedCollator)coll).isCaseLevel();
1545        int level = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1546        if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) {
1547            if (level != expectedLevel) {
1548                logln(fileTestName);
1549                logln(prevFileLine);
1550                logln(fileLine);
1551                logln(printCollationKey(prevKey));
1552                logln(printCollationKey(key));
1553                errln("line " + fileLineNumber
1554                        + " Collator(" + norm + ").getCollationKey(previous, current).compareTo()="
1555                        + order + " wrong level: " + level + " != " + expectedLevel);
1556                return false;
1557            }
1558        }
1559
1560        // If either string contains U+FFFE, then their sort keys must compare the same as
1561        // the merged sort keys of each string's between-FFFE segments.
1562        //
1563        // It is not required that
1564        //   sortkey(str1 + "\uFFFE" + str2) == mergeSortkeys(sortkey(str1), sortkey(str2))
1565        // only that those two methods yield the same order.
1566        //
1567        // Use bit-wise OR so that getMergedCollationKey() is always called for both strings.
1568        Output<CollationKey> outPrevKey = new Output<CollationKey>(prevKey);
1569        Output<CollationKey> outKey = new Output<CollationKey>(key);
1570        if (getMergedCollationKey(prevString, outPrevKey) | getMergedCollationKey(s, outKey)) {
1571            prevKey = outPrevKey.value;
1572            key = outKey.value;
1573            order = prevKey.compareTo(key);
1574            if (order != expectedOrder) {
1575                logln(fileTestName);
1576                errln("line " + fileLineNumber
1577                        + " Collator(" + norm + ").getCollationKey"
1578                        + "(previous, current segments between U+FFFE)).merge().compareTo() wrong order: "
1579                        + order + " != " + expectedOrder);
1580                logln(prevFileLine);
1581                logln(fileLine);
1582                logln(printCollationKey(prevKey));
1583                logln(printCollationKey(key));
1584                return false;
1585            }
1586            int mergedLevel = getDifferenceLevel(prevKey, key, order, collHasCaseLevel);
1587            if (order != Collation.EQUAL && expectedLevel != Collation.NO_LEVEL) {
1588                if(mergedLevel != level) {
1589                    logln(fileTestName);
1590                    errln("line " + fileLineNumber
1591                        + " Collator(" + norm + ").getCollationKey"
1592                        + "(previous, current segments between U+FFFE)).merge().compareTo()="
1593                        + order + " wrong level: " + mergedLevel + " != " + level);
1594                    logln(prevFileLine);
1595                    logln(fileLine);
1596                    logln(printCollationKey(prevKey));
1597                    logln(printCollationKey(key));
1598                    return false;
1599                }
1600            }
1601        }
1602        return true;
1603    }
1604
1605    private void checkCompareStrings(BufferedReader in) throws IOException {
1606        String prevFileLine = "(none)";
1607        String prevString = "";
1608        Output<String> sOut = new Output<String>();
1609        while (readNonEmptyLine(in) && !isSectionStarter(fileLine.charAt(0))) {
1610            // Parse the line even if it will be ignored (when we do not have a Collator)
1611            // in order to report syntax issues.
1612            int relation;
1613            try {
1614                relation = parseRelationAndString(sOut);
1615            } catch (ParseException pe) {
1616                errln(pe.toString());
1617                break;
1618            }
1619            if(coll == null) {
1620                // We were unable to create the Collator but continue with tests.
1621                // Ignore test data for this Collator.
1622                // The next Collator creation might work.
1623                continue;
1624            }
1625            String s = sOut.value;
1626            int expectedOrder = (relation == Collation.ZERO_LEVEL) ? Collation.EQUAL : Collation.LESS;
1627            int expectedLevel = relation;
1628            boolean isOk = true;
1629            if (!needsNormalization(prevString) && !needsNormalization(s)) {
1630                coll.setDecomposition(Collator.NO_DECOMPOSITION);
1631                isOk = checkCompareTwo("normalization=off", prevFileLine, prevString, s,
1632                                        expectedOrder, expectedLevel);
1633            }
1634            if (isOk) {
1635                coll.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
1636                isOk = checkCompareTwo("normalization=on", prevFileLine, prevString, s,
1637                                        expectedOrder, expectedLevel);
1638            }
1639            if (isOk && (!nfd.isNormalized(prevString) || !nfd.isNormalized(s))) {
1640                String pn = nfd.normalize(prevString);
1641                String n = nfd.normalize(s);
1642                isOk = checkCompareTwo("NFD input", prevFileLine, pn, n,
1643                                        expectedOrder, expectedLevel);
1644            }
1645            prevFileLine = fileLine;
1646            prevString = s;
1647        }
1648    }
1649
1650    @Test
1651    public void TestDataDriven() {
1652        nfd = Normalizer2.getNFDInstance();
1653        fcd = Norm2AllModes.getFCDNormalizer2();
1654
1655        BufferedReader in = null;
1656
1657        try {
1658            in = TestUtil.getDataReader("collationtest.txt", "UTF-8");
1659
1660            // Read a new line if necessary.
1661            // Sub-parsers leave the first line set that they do not handle.
1662            while (fileLine != null || readNonEmptyLine(in)) {
1663                if (!isSectionStarter(fileLine.charAt(0))) {
1664                    logln(fileLine);
1665                    errln("syntax error on line " + fileLineNumber);
1666                    return;
1667                }
1668                if (fileLine.startsWith("** test: ")) {
1669                    fileTestName = fileLine;
1670                    logln(fileLine);
1671                    fileLine = null;
1672                } else if (fileLine.equals("@ root")) {
1673                    setRootCollator();
1674                    fileLine = null;
1675                } else if (fileLine.startsWith("@ locale ")) {
1676                    setLocaleCollator();
1677                    fileLine = null;
1678                } else if (fileLine.equals("@ rules")) {
1679                    buildTailoring(in);
1680                } else if (fileLine.charAt(0) == '%'
1681                        && fileLine.length() > 1 && isSpace(fileLine.charAt(1))) {
1682                    parseAndSetAttribute();
1683                } else if (fileLine.equals("* compare")) {
1684                    checkCompareStrings(in);
1685                } else {
1686                    logln(fileLine);
1687                    errln("syntax error on line " + fileLineNumber);
1688                    return;
1689                }
1690            }
1691        } catch (ParseException pe) {
1692            errln(pe.toString());
1693        } catch (IOException e) {
1694            errln(e.getMessage());
1695        } finally {
1696            try {
1697                if (in != null) {
1698                    in.close();
1699                }
1700            } catch (IOException e) {
1701                e.printStackTrace();
1702            }
1703        }
1704    }
1705}
1706