1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5*******************************************************************************
6* Copyright (C) 2010-2015, International Business Machines
7* Corporation and others.  All Rights Reserved.
8*******************************************************************************
9* CollationData.java, ported from collationdata.h/.cpp
10*
11* C++ version created on: 2010oct27
12* created by: Markus W. Scherer
13*/
14
15package android.icu.impl.coll;
16
17import android.icu.impl.Normalizer2Impl;
18import android.icu.impl.Trie2_32;
19import android.icu.lang.UScript;
20import android.icu.text.Collator;
21import android.icu.text.UnicodeSet;
22import android.icu.util.ICUException;
23
24/**
25 * Collation data container.
26 * Immutable data created by a CollationDataBuilder, or loaded from a file,
27 * or deserialized from API-provided binary data.
28 *
29 * Includes data for the collation base (root/default), aliased if this is not the base.
30 * @hide Only a subset of ICU is exposed in Android
31 */
32public final class CollationData {
33    // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
34    // parallel with the ranges, and resetting ranges that are indexed.
35    // The reordering builder code could clone the resulting template array.
36    static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14;
37    static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15;
38
39    static final int MAX_NUM_SPECIAL_REORDER_CODES = 8;
40
41    CollationData(Normalizer2Impl nfc) {
42        nfcImpl = nfc;
43    }
44
45    public int getCE32(int c) {
46        return trie.get(c);
47    }
48
49    int getCE32FromSupplementary(int c) {
50        return trie.get(c);  // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java?
51    }
52
53    boolean isDigit(int c) {
54        return c < 0x660 ? c <= 0x39 && 0x30 <= c :
55                Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG);
56    }
57
58    public boolean isUnsafeBackward(int c, boolean numeric) {
59        return unsafeBackwardSet.contains(c) || (numeric && isDigit(c));
60    }
61
62    public boolean isCompressibleLeadByte(int b) {
63        return compressibleBytes[b];
64    }
65
66    public boolean isCompressiblePrimary(long p) {
67        return isCompressibleLeadByte((int)p >>> 24);
68    }
69
70    /**
71     * Returns the CE32 from two contexts words.
72     * Access to the defaultCE32 for contraction and prefix matching.
73     */
74    int getCE32FromContexts(int index) {
75        return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1);
76    }
77
78    /**
79     * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
80     * Requires that ce32 is special.
81     */
82    int getIndirectCE32(int ce32) {
83        assert(Collation.isSpecialCE32(ce32));
84        int tag = Collation.tagFromCE32(ce32);
85        if(tag == Collation.DIGIT_TAG) {
86            // Fetch the non-numeric-collation CE32.
87            ce32 = ce32s[Collation.indexFromCE32(ce32)];
88        } else if(tag == Collation.LEAD_SURROGATE_TAG) {
89            ce32 = Collation.UNASSIGNED_CE32;
90        } else if(tag == Collation.U0000_TAG) {
91            // Fetch the normal ce32 for U+0000.
92            ce32 = ce32s[0];
93        }
94        return ce32;
95    }
96
97    /**
98     * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
99     * if ce32 is special.
100     */
101    int getFinalCE32(int ce32) {
102        if(Collation.isSpecialCE32(ce32)) {
103            ce32 = getIndirectCE32(ce32);
104        }
105        return ce32;
106    }
107
108    /**
109     * Computes a CE from c's ce32 which has the OFFSET_TAG.
110     */
111    long getCEFromOffsetCE32(int c, int ce32) {
112        long dataCE = ces[Collation.indexFromCE32(ce32)];
113        return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
114    }
115
116    /**
117     * Returns the single CE that c maps to.
118     * Throws UnsupportedOperationException if c does not map to a single CE.
119     */
120    long getSingleCE(int c) {
121        CollationData d;
122        int ce32 = getCE32(c);
123        if(ce32 == Collation.FALLBACK_CE32) {
124            d = base;
125            ce32 = base.getCE32(c);
126        } else {
127            d = this;
128        }
129        while(Collation.isSpecialCE32(ce32)) {
130            switch(Collation.tagFromCE32(ce32)) {
131            case Collation.LATIN_EXPANSION_TAG:
132            case Collation.BUILDER_DATA_TAG:
133            case Collation.PREFIX_TAG:
134            case Collation.CONTRACTION_TAG:
135            case Collation.HANGUL_TAG:
136            case Collation.LEAD_SURROGATE_TAG:
137                throw new UnsupportedOperationException(String.format(
138                        "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
139                        c, ce32));
140            case Collation.FALLBACK_TAG:
141            case Collation.RESERVED_TAG_3:
142                throw new AssertionError(String.format(
143                        "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32));
144            case Collation.LONG_PRIMARY_TAG:
145                return Collation.ceFromLongPrimaryCE32(ce32);
146            case Collation.LONG_SECONDARY_TAG:
147                return Collation.ceFromLongSecondaryCE32(ce32);
148            case Collation.EXPANSION32_TAG:
149                if(Collation.lengthFromCE32(ce32) == 1) {
150                    ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
151                    break;
152                } else {
153                    throw new UnsupportedOperationException(String.format(
154                            "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
155                            c, ce32));
156                }
157            case Collation.EXPANSION_TAG: {
158                if(Collation.lengthFromCE32(ce32) == 1) {
159                    return d.ces[Collation.indexFromCE32(ce32)];
160                } else {
161                    throw new UnsupportedOperationException(String.format(
162                            "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
163                            c, ce32));
164                }
165            }
166            case Collation.DIGIT_TAG:
167                // Fetch the non-numeric-collation CE32 and continue.
168                ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
169                break;
170            case Collation.U0000_TAG:
171                assert(c == 0);
172                // Fetch the normal ce32 for U+0000 and continue.
173                ce32 = d.ce32s[0];
174                break;
175            case Collation.OFFSET_TAG:
176                return d.getCEFromOffsetCE32(c, ce32);
177            case Collation.IMPLICIT_TAG:
178                return Collation.unassignedCEFromCodePoint(c);
179            }
180        }
181        return Collation.ceFromSimpleCE32(ce32);
182    }
183
184    /**
185     * Returns the FCD16 value for code point c. c must be >= 0.
186     */
187    int getFCD16(int c) {
188        return nfcImpl.getFCD16(c);
189    }
190
191    /**
192     * Returns the first primary for the script's reordering group.
193     * @return the primary with only the first primary lead byte of the group
194     *         (not necessarily an actual root collator primary weight),
195     *         or 0 if the script is unknown
196     */
197    long getFirstPrimaryForGroup(int script) {
198        int index = getScriptIndex(script);
199        return index == 0 ? 0 : (long)scriptStarts[index] << 16;
200    }
201
202    /**
203     * Returns the last primary for the script's reordering group.
204     * @return the last primary of the group
205     *         (not an actual root collator primary weight),
206     *         or 0 if the script is unknown
207     */
208    public long getLastPrimaryForGroup(int script) {
209        int index = getScriptIndex(script);
210        if(index == 0) {
211            return 0;
212        }
213        long limit = scriptStarts[index + 1];
214        return (limit << 16) - 1;
215    }
216
217    /**
218     * Finds the reordering group which contains the primary weight.
219     * @return the first script of the group, or -1 if the weight is beyond the last group
220     */
221    public int getGroupForPrimary(long p) {
222        p >>= 16;
223        if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) {
224            return -1;
225        }
226        int index = 1;
227        while(p >= scriptStarts[index + 1]) { ++index; }
228        for(int i = 0; i < numScripts; ++i) {
229            if(scriptsIndex[i] == index) {
230                return i;
231            }
232        }
233        for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
234            if(scriptsIndex[numScripts + i] == index) {
235                return Collator.ReorderCodes.FIRST + i;
236            }
237        }
238        return -1;
239    }
240
241    private int getScriptIndex(int script) {
242        if(script < 0) {
243            return 0;
244        } else if(script < numScripts) {
245            return scriptsIndex[script];
246        } else if(script < Collator.ReorderCodes.FIRST) {
247            return 0;
248        } else {
249            script -= Collator.ReorderCodes.FIRST;
250            if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
251                return scriptsIndex[numScripts + script];
252            } else {
253                return 0;
254            }
255        }
256    }
257
258    public int[] getEquivalentScripts(int script) {
259        int index = getScriptIndex(script);
260        if(index == 0) { return EMPTY_INT_ARRAY; }
261        if(script >= Collator.ReorderCodes.FIRST) {
262            // Special groups have no aliases.
263            return new int[] { script };
264        }
265
266        int length = 0;
267        for(int i = 0; i < numScripts; ++i) {
268            if(scriptsIndex[i] == index) {
269                ++length;
270            }
271        }
272        int[] dest = new int[length];
273        if(length == 1) {
274            dest[0] = script;
275            return dest;
276        }
277        length = 0;
278        for(int i = 0; i < numScripts; ++i) {
279            if(scriptsIndex[i] == index) {
280                dest[length++] = i;
281            }
282        }
283        return dest;
284    }
285
286    /**
287     * Writes the permutation of primary-weight ranges
288     * for the given reordering of scripts and groups.
289     * The caller checks for illegal arguments and
290     * takes care of [DEFAULT] and memory allocation.
291     *
292     * <p>Each list element will be a (limit, offset) pair as described
293     * for the CollationSettings.reorderRanges.
294     * The list will be empty if no ranges are reordered.
295     */
296    void makeReorderRanges(int[] reorder, UVector32 ranges) {
297        makeReorderRanges(reorder, false, ranges);
298    }
299
300    private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) {
301        ranges.removeAllElements();
302        int length = reorder.length;
303        if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) {
304            return;
305        }
306
307        // Maps each script-or-group range to a new lead byte.
308        short[] table = new short[scriptStarts.length - 1];  // C++: uint8_t[]
309
310        {
311            // Set "don't care" values for reserved ranges.
312            int index = scriptsIndex[
313                    numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST];
314            if(index != 0) {
315                table[index] = 0xff;
316            }
317            index = scriptsIndex[
318                    numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST];
319            if(index != 0) {
320                table[index] = 0xff;
321            }
322        }
323
324        // Never reorder special low and high primary lead bytes.
325        assert(scriptStarts.length >= 2);
326        assert(scriptStarts[0] == 0);
327        int lowStart = scriptStarts[1];
328        assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8));
329        int highLimit = scriptStarts[scriptStarts.length - 1];
330        assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8));
331
332        // Get the set of special reorder codes in the input list.
333        // This supports a fixed number of special reorder codes;
334        // it works for data with codes beyond Collator.ReorderCodes.LIMIT.
335        int specials = 0;
336        for(int i = 0; i < length; ++i) {
337            int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
338            if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
339                specials |= 1 << reorderCode;
340            }
341        }
342
343        // Start the reordering with the special low reorder codes that do not occur in the input.
344        for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
345            int index = scriptsIndex[numScripts + i];
346            if(index != 0 && (specials & (1 << i)) == 0) {
347                lowStart = addLowScriptRange(table, index, lowStart);
348            }
349        }
350
351        // Skip the reserved range before Latin if Latin is the first script,
352        // so that we do not move it unnecessarily.
353        int skippedReserved = 0;
354        if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) {
355            int index = scriptsIndex[UScript.LATIN];
356            assert(index != 0);
357            int start = scriptStarts[index];
358            assert(lowStart <= start);
359            skippedReserved = start - lowStart;
360            lowStart = start;
361        }
362
363        // Reorder according to the input scripts, continuing from the bottom of the primary range.
364        boolean hasReorderToEnd = false;
365        for(int i = 0; i < length;) {
366            int script = reorder[i++];
367            if(script == UScript.UNKNOWN) {
368                // Put the remaining scripts at the top.
369                hasReorderToEnd = true;
370                while(i < length) {
371                    script = reorder[--length];
372                    if(script == UScript.UNKNOWN) {  // Must occur at most once.
373                        throw new IllegalArgumentException(
374                                "setReorderCodes(): duplicate UScript.UNKNOWN");
375                    }
376                    if(script == Collator.ReorderCodes.DEFAULT) {
377                        throw new IllegalArgumentException(
378                                "setReorderCodes(): UScript.DEFAULT together with other scripts");
379                    }
380                    int index = getScriptIndex(script);
381                    if(index == 0) { continue; }
382                    if(table[index] != 0) {  // Duplicate or equivalent script.
383                        throw new IllegalArgumentException(
384                                "setReorderCodes(): duplicate or equivalent script " +
385                                scriptCodeString(script));
386                    }
387                    highLimit = addHighScriptRange(table, index, highLimit);
388                }
389                break;
390            }
391            if(script == Collator.ReorderCodes.DEFAULT) {
392                // The default code must be the only one in the list, and that is handled by the caller.
393                // Otherwise it must not be used.
394                throw new IllegalArgumentException(
395                        "setReorderCodes(): UScript.DEFAULT together with other scripts");
396            }
397            int index = getScriptIndex(script);
398            if(index == 0) { continue; }
399            if(table[index] != 0) {  // Duplicate or equivalent script.
400                throw new IllegalArgumentException(
401                        "setReorderCodes(): duplicate or equivalent script " +
402                        scriptCodeString(script));
403            }
404            lowStart = addLowScriptRange(table, index, lowStart);
405        }
406
407        // Put all remaining scripts into the middle.
408        for(int i = 1; i < scriptStarts.length - 1; ++i) {
409            int leadByte = table[i];
410            if(leadByte != 0) { continue; }
411            int start = scriptStarts[i];
412            if(!hasReorderToEnd && start > lowStart) {
413                // No need to move this script.
414                lowStart = start;
415            }
416            lowStart = addLowScriptRange(table, i, lowStart);
417        }
418        if(lowStart > highLimit) {
419            if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
420                // Try not skipping the before-Latin reserved range.
421                makeReorderRanges(reorder, true, ranges);
422                return;
423            }
424            // We need more primary lead bytes than available, despite the reserved ranges.
425            throw new ICUException(
426                    "setReorderCodes(): reordering too many partial-primary-lead-byte scripts");
427        }
428
429        // Turn lead bytes into a list of (limit, offset) pairs.
430        // Encode each pair in one list element:
431        // Upper 16 bits = limit, lower 16 = signed lead byte offset.
432        int offset = 0;
433        for(int i = 1;; ++i) {
434            int nextOffset = offset;
435            while(i < scriptStarts.length - 1) {
436                int newLeadByte = table[i];
437                if(newLeadByte == 0xff) {
438                    // "Don't care" lead byte for reserved range, continue with current offset.
439                } else {
440                    nextOffset = newLeadByte - (scriptStarts[i] >> 8);
441                    if(nextOffset != offset) { break; }
442                }
443                ++i;
444            }
445            if(offset != 0 || i < scriptStarts.length - 1) {
446                ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff));
447            }
448            if(i == scriptStarts.length - 1) { break; }
449            offset = nextOffset;
450        }
451    }
452
453    private int addLowScriptRange(short[] table, int index, int lowStart) {
454        int start = scriptStarts[index];
455        if((start & 0xff) < (lowStart & 0xff)) {
456            lowStart += 0x100;
457        }
458        table[index] = (short)(lowStart >> 8);
459        int limit = scriptStarts[index + 1];
460        lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
461        return lowStart;
462    }
463
464    private int addHighScriptRange(short[] table, int index, int highLimit) {
465        int limit = scriptStarts[index + 1];
466        if((limit & 0xff) > (highLimit & 0xff)) {
467            highLimit -= 0x100;
468        }
469        int start = scriptStarts[index];
470        highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
471        table[index] = (short)(highLimit >> 8);
472        return highLimit;
473    }
474
475    private static String scriptCodeString(int script) {
476        // Do not use the script name here: We do not want to depend on that data.
477        return (script < Collator.ReorderCodes.FIRST) ?
478                Integer.toString(script) : "0x" + Integer.toHexString(script);
479    }
480
481    private static final int[] EMPTY_INT_ARRAY = new int[0];
482
483    /** @see jamoCE32s */
484    static final int JAMO_CE32S_LENGTH = 19 + 21 + 27;
485
486    /** Main lookup trie. */
487    Trie2_32 trie;
488    /**
489     * Array of CE32 values.
490     * At index 0 there must be CE32(U+0000)
491     * to support U+0000's special-tag for NUL-termination handling.
492     */
493    int[] ce32s;
494    /** Array of CE values for expansions and OFFSET_TAG. */
495    long[] ces;
496    /** Array of prefix and contraction-suffix matching data. */
497    String contexts;
498    /** Base collation data, or null if this data itself is a base. */
499    public CollationData base;
500    /**
501     * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
502     * They are normally simple CE32s, rarely expansions.
503     * For fast handling of HANGUL_TAG.
504     */
505    int[] jamoCE32s = new int[JAMO_CE32S_LENGTH];
506    public Normalizer2Impl nfcImpl;
507    /** The single-byte primary weight (xx000000) for numeric collation. */
508    long numericPrimary = 0x12000000;
509
510    /** 256 flags for which primary-weight lead bytes are compressible. */
511    public boolean[] compressibleBytes;
512    /**
513     * Set of code points that are unsafe for starting string comparison after an identical prefix,
514     * or in backwards CE iteration.
515     */
516    UnicodeSet unsafeBackwardSet;
517
518    /**
519     * Fast Latin table for common-Latin-text string comparisons.
520     * Data structure see class CollationFastLatin.
521     */
522    public char[] fastLatinTable;
523    /**
524     * Header portion of the fastLatinTable.
525     * In C++, these are one array, and the header is skipped for mapping characters.
526     * In Java, two arrays work better.
527     */
528    char[] fastLatinTableHeader;
529
530    /**
531     * Data for scripts and reordering groups.
532     * Uses include building a reordering permutation table and
533     * providing script boundaries to AlphabeticIndex.
534     */
535    int numScripts;
536    /**
537     * The length of scriptsIndex is numScripts+16.
538     * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
539     * 16 special reorder codes (not all used) are mapped starting at numScripts.
540     * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
541     * There are special codes at the end for reorder-reserved primary ranges.
542     *
543     * <p>Multiple scripts may share a range and index, for example Hira & Kana.
544     */
545    char[] scriptsIndex;
546    /**
547     * Start primary weight (top 16 bits only) for a group/script/reserved range
548     * indexed by scriptsIndex.
549     * The first range (separators & terminators) and the last range (trailing weights)
550     * are not reorderable, and no scriptsIndex entry points to them.
551     */
552    char[] scriptStarts;
553
554    /**
555     * Collation elements in the root collator.
556     * Used by the CollationRootElements class. The data structure is described there.
557     * null in a tailoring.
558     */
559    public long[] rootElements;
560}
561