1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2010-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* CollationData.java, ported from collationdata.h/.cpp
9*
10* C++ version created on: 2010oct27
11* created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16import com.ibm.icu.impl.Normalizer2Impl;
17import com.ibm.icu.impl.Trie2_32;
18import com.ibm.icu.lang.UScript;
19import com.ibm.icu.text.Collator;
20import com.ibm.icu.text.UnicodeSet;
21import com.ibm.icu.util.ICUException;
22
23/**
24 * Collation data container.
25 * Immutable data created by a CollationDataBuilder, or loaded from a file,
26 * or deserialized from API-provided binary data.
27 *
28 * Includes data for the collation base (root/default), aliased if this is not the base.
29 */
30public final class CollationData {
31    // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
32    // parallel with the ranges, and resetting ranges that are indexed.
33    // The reordering builder code could clone the resulting template array.
34    static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14;
35    static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15;
36
37    static final int MAX_NUM_SPECIAL_REORDER_CODES = 8;
38
39    CollationData(Normalizer2Impl nfc) {
40        nfcImpl = nfc;
41    }
42
43    public int getCE32(int c) {
44        return trie.get(c);
45    }
46
47    int getCE32FromSupplementary(int c) {
48        return trie.get(c);  // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java?
49    }
50
51    boolean isDigit(int c) {
52        return c < 0x660 ? c <= 0x39 && 0x30 <= c :
53                Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG);
54    }
55
56    public boolean isUnsafeBackward(int c, boolean numeric) {
57        return unsafeBackwardSet.contains(c) || (numeric && isDigit(c));
58    }
59
60    public boolean isCompressibleLeadByte(int b) {
61        return compressibleBytes[b];
62    }
63
64    public boolean isCompressiblePrimary(long p) {
65        return isCompressibleLeadByte((int)p >>> 24);
66    }
67
68    /**
69     * Returns the CE32 from two contexts words.
70     * Access to the defaultCE32 for contraction and prefix matching.
71     */
72    int getCE32FromContexts(int index) {
73        return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1);
74    }
75
76    /**
77     * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
78     * Requires that ce32 is special.
79     */
80    int getIndirectCE32(int ce32) {
81        assert(Collation.isSpecialCE32(ce32));
82        int tag = Collation.tagFromCE32(ce32);
83        if(tag == Collation.DIGIT_TAG) {
84            // Fetch the non-numeric-collation CE32.
85            ce32 = ce32s[Collation.indexFromCE32(ce32)];
86        } else if(tag == Collation.LEAD_SURROGATE_TAG) {
87            ce32 = Collation.UNASSIGNED_CE32;
88        } else if(tag == Collation.U0000_TAG) {
89            // Fetch the normal ce32 for U+0000.
90            ce32 = ce32s[0];
91        }
92        return ce32;
93    }
94
95    /**
96     * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
97     * if ce32 is special.
98     */
99    int getFinalCE32(int ce32) {
100        if(Collation.isSpecialCE32(ce32)) {
101            ce32 = getIndirectCE32(ce32);
102        }
103        return ce32;
104    }
105
106    /**
107     * Computes a CE from c's ce32 which has the OFFSET_TAG.
108     */
109    long getCEFromOffsetCE32(int c, int ce32) {
110        long dataCE = ces[Collation.indexFromCE32(ce32)];
111        return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
112    }
113
114    /**
115     * Returns the single CE that c maps to.
116     * Throws UnsupportedOperationException if c does not map to a single CE.
117     */
118    long getSingleCE(int c) {
119        CollationData d;
120        int ce32 = getCE32(c);
121        if(ce32 == Collation.FALLBACK_CE32) {
122            d = base;
123            ce32 = base.getCE32(c);
124        } else {
125            d = this;
126        }
127        while(Collation.isSpecialCE32(ce32)) {
128            switch(Collation.tagFromCE32(ce32)) {
129            case Collation.LATIN_EXPANSION_TAG:
130            case Collation.BUILDER_DATA_TAG:
131            case Collation.PREFIX_TAG:
132            case Collation.CONTRACTION_TAG:
133            case Collation.HANGUL_TAG:
134            case Collation.LEAD_SURROGATE_TAG:
135                throw new UnsupportedOperationException(String.format(
136                        "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
137                        c, ce32));
138            case Collation.FALLBACK_TAG:
139            case Collation.RESERVED_TAG_3:
140                throw new AssertionError(String.format(
141                        "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32));
142            case Collation.LONG_PRIMARY_TAG:
143                return Collation.ceFromLongPrimaryCE32(ce32);
144            case Collation.LONG_SECONDARY_TAG:
145                return Collation.ceFromLongSecondaryCE32(ce32);
146            case Collation.EXPANSION32_TAG:
147                if(Collation.lengthFromCE32(ce32) == 1) {
148                    ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
149                    break;
150                } else {
151                    throw new UnsupportedOperationException(String.format(
152                            "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
153                            c, ce32));
154                }
155            case Collation.EXPANSION_TAG: {
156                if(Collation.lengthFromCE32(ce32) == 1) {
157                    return d.ces[Collation.indexFromCE32(ce32)];
158                } else {
159                    throw new UnsupportedOperationException(String.format(
160                            "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
161                            c, ce32));
162                }
163            }
164            case Collation.DIGIT_TAG:
165                // Fetch the non-numeric-collation CE32 and continue.
166                ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
167                break;
168            case Collation.U0000_TAG:
169                assert(c == 0);
170                // Fetch the normal ce32 for U+0000 and continue.
171                ce32 = d.ce32s[0];
172                break;
173            case Collation.OFFSET_TAG:
174                return d.getCEFromOffsetCE32(c, ce32);
175            case Collation.IMPLICIT_TAG:
176                return Collation.unassignedCEFromCodePoint(c);
177            }
178        }
179        return Collation.ceFromSimpleCE32(ce32);
180    }
181
182    /**
183     * Returns the FCD16 value for code point c. c must be >= 0.
184     */
185    int getFCD16(int c) {
186        return nfcImpl.getFCD16(c);
187    }
188
189    /**
190     * Returns the first primary for the script's reordering group.
191     * @return the primary with only the first primary lead byte of the group
192     *         (not necessarily an actual root collator primary weight),
193     *         or 0 if the script is unknown
194     */
195    long getFirstPrimaryForGroup(int script) {
196        int index = getScriptIndex(script);
197        return index == 0 ? 0 : (long)scriptStarts[index] << 16;
198    }
199
200    /**
201     * Returns the last primary for the script's reordering group.
202     * @return the last primary of the group
203     *         (not an actual root collator primary weight),
204     *         or 0 if the script is unknown
205     */
206    public long getLastPrimaryForGroup(int script) {
207        int index = getScriptIndex(script);
208        if(index == 0) {
209            return 0;
210        }
211        long limit = scriptStarts[index + 1];
212        return (limit << 16) - 1;
213    }
214
215    /**
216     * Finds the reordering group which contains the primary weight.
217     * @return the first script of the group, or -1 if the weight is beyond the last group
218     */
219    public int getGroupForPrimary(long p) {
220        p >>= 16;
221        if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) {
222            return -1;
223        }
224        int index = 1;
225        while(p >= scriptStarts[index + 1]) { ++index; }
226        for(int i = 0; i < numScripts; ++i) {
227            if(scriptsIndex[i] == index) {
228                return i;
229            }
230        }
231        for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
232            if(scriptsIndex[numScripts + i] == index) {
233                return Collator.ReorderCodes.FIRST + i;
234            }
235        }
236        return -1;
237    }
238
239    private int getScriptIndex(int script) {
240        if(script < 0) {
241            return 0;
242        } else if(script < numScripts) {
243            return scriptsIndex[script];
244        } else if(script < Collator.ReorderCodes.FIRST) {
245            return 0;
246        } else {
247            script -= Collator.ReorderCodes.FIRST;
248            if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
249                return scriptsIndex[numScripts + script];
250            } else {
251                return 0;
252            }
253        }
254    }
255
256    public int[] getEquivalentScripts(int script) {
257        int index = getScriptIndex(script);
258        if(index == 0) { return EMPTY_INT_ARRAY; }
259        if(script >= Collator.ReorderCodes.FIRST) {
260            // Special groups have no aliases.
261            return new int[] { script };
262        }
263
264        int length = 0;
265        for(int i = 0; i < numScripts; ++i) {
266            if(scriptsIndex[i] == index) {
267                ++length;
268            }
269        }
270        int[] dest = new int[length];
271        if(length == 1) {
272            dest[0] = script;
273            return dest;
274        }
275        length = 0;
276        for(int i = 0; i < numScripts; ++i) {
277            if(scriptsIndex[i] == index) {
278                dest[length++] = i;
279            }
280        }
281        return dest;
282    }
283
284    /**
285     * Writes the permutation of primary-weight ranges
286     * for the given reordering of scripts and groups.
287     * The caller checks for illegal arguments and
288     * takes care of [DEFAULT] and memory allocation.
289     *
290     * <p>Each list element will be a (limit, offset) pair as described
291     * for the CollationSettings.reorderRanges.
292     * The list will be empty if no ranges are reordered.
293     */
294    void makeReorderRanges(int[] reorder, UVector32 ranges) {
295        makeReorderRanges(reorder, false, ranges);
296    }
297
298    private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) {
299        ranges.removeAllElements();
300        int length = reorder.length;
301        if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) {
302            return;
303        }
304
305        // Maps each script-or-group range to a new lead byte.
306        short[] table = new short[scriptStarts.length - 1];  // C++: uint8_t[]
307
308        {
309            // Set "don't care" values for reserved ranges.
310            int index = scriptsIndex[
311                    numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST];
312            if(index != 0) {
313                table[index] = 0xff;
314            }
315            index = scriptsIndex[
316                    numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST];
317            if(index != 0) {
318                table[index] = 0xff;
319            }
320        }
321
322        // Never reorder special low and high primary lead bytes.
323        assert(scriptStarts.length >= 2);
324        assert(scriptStarts[0] == 0);
325        int lowStart = scriptStarts[1];
326        assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8));
327        int highLimit = scriptStarts[scriptStarts.length - 1];
328        assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8));
329
330        // Get the set of special reorder codes in the input list.
331        // This supports a fixed number of special reorder codes;
332        // it works for data with codes beyond Collator.ReorderCodes.LIMIT.
333        int specials = 0;
334        for(int i = 0; i < length; ++i) {
335            int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
336            if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
337                specials |= 1 << reorderCode;
338            }
339        }
340
341        // Start the reordering with the special low reorder codes that do not occur in the input.
342        for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
343            int index = scriptsIndex[numScripts + i];
344            if(index != 0 && (specials & (1 << i)) == 0) {
345                lowStart = addLowScriptRange(table, index, lowStart);
346            }
347        }
348
349        // Skip the reserved range before Latin if Latin is the first script,
350        // so that we do not move it unnecessarily.
351        int skippedReserved = 0;
352        if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) {
353            int index = scriptsIndex[UScript.LATIN];
354            assert(index != 0);
355            int start = scriptStarts[index];
356            assert(lowStart <= start);
357            skippedReserved = start - lowStart;
358            lowStart = start;
359        }
360
361        // Reorder according to the input scripts, continuing from the bottom of the primary range.
362        boolean hasReorderToEnd = false;
363        for(int i = 0; i < length;) {
364            int script = reorder[i++];
365            if(script == UScript.UNKNOWN) {
366                // Put the remaining scripts at the top.
367                hasReorderToEnd = true;
368                while(i < length) {
369                    script = reorder[--length];
370                    if(script == UScript.UNKNOWN) {  // Must occur at most once.
371                        throw new IllegalArgumentException(
372                                "setReorderCodes(): duplicate UScript.UNKNOWN");
373                    }
374                    if(script == Collator.ReorderCodes.DEFAULT) {
375                        throw new IllegalArgumentException(
376                                "setReorderCodes(): UScript.DEFAULT together with other scripts");
377                    }
378                    int index = getScriptIndex(script);
379                    if(index == 0) { continue; }
380                    if(table[index] != 0) {  // Duplicate or equivalent script.
381                        throw new IllegalArgumentException(
382                                "setReorderCodes(): duplicate or equivalent script " +
383                                scriptCodeString(script));
384                    }
385                    highLimit = addHighScriptRange(table, index, highLimit);
386                }
387                break;
388            }
389            if(script == Collator.ReorderCodes.DEFAULT) {
390                // The default code must be the only one in the list, and that is handled by the caller.
391                // Otherwise it must not be used.
392                throw new IllegalArgumentException(
393                        "setReorderCodes(): UScript.DEFAULT together with other scripts");
394            }
395            int index = getScriptIndex(script);
396            if(index == 0) { continue; }
397            if(table[index] != 0) {  // Duplicate or equivalent script.
398                throw new IllegalArgumentException(
399                        "setReorderCodes(): duplicate or equivalent script " +
400                        scriptCodeString(script));
401            }
402            lowStart = addLowScriptRange(table, index, lowStart);
403        }
404
405        // Put all remaining scripts into the middle.
406        for(int i = 1; i < scriptStarts.length - 1; ++i) {
407            int leadByte = table[i];
408            if(leadByte != 0) { continue; }
409            int start = scriptStarts[i];
410            if(!hasReorderToEnd && start > lowStart) {
411                // No need to move this script.
412                lowStart = start;
413            }
414            lowStart = addLowScriptRange(table, i, lowStart);
415        }
416        if(lowStart > highLimit) {
417            if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
418                // Try not skipping the before-Latin reserved range.
419                makeReorderRanges(reorder, true, ranges);
420                return;
421            }
422            // We need more primary lead bytes than available, despite the reserved ranges.
423            throw new ICUException(
424                    "setReorderCodes(): reordering too many partial-primary-lead-byte scripts");
425        }
426
427        // Turn lead bytes into a list of (limit, offset) pairs.
428        // Encode each pair in one list element:
429        // Upper 16 bits = limit, lower 16 = signed lead byte offset.
430        int offset = 0;
431        for(int i = 1;; ++i) {
432            int nextOffset = offset;
433            while(i < scriptStarts.length - 1) {
434                int newLeadByte = table[i];
435                if(newLeadByte == 0xff) {
436                    // "Don't care" lead byte for reserved range, continue with current offset.
437                } else {
438                    nextOffset = newLeadByte - (scriptStarts[i] >> 8);
439                    if(nextOffset != offset) { break; }
440                }
441                ++i;
442            }
443            if(offset != 0 || i < scriptStarts.length - 1) {
444                ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff));
445            }
446            if(i == scriptStarts.length - 1) { break; }
447            offset = nextOffset;
448        }
449    }
450
451    private int addLowScriptRange(short[] table, int index, int lowStart) {
452        int start = scriptStarts[index];
453        if((start & 0xff) < (lowStart & 0xff)) {
454            lowStart += 0x100;
455        }
456        table[index] = (short)(lowStart >> 8);
457        int limit = scriptStarts[index + 1];
458        lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
459        return lowStart;
460    }
461
462    private int addHighScriptRange(short[] table, int index, int highLimit) {
463        int limit = scriptStarts[index + 1];
464        if((limit & 0xff) > (highLimit & 0xff)) {
465            highLimit -= 0x100;
466        }
467        int start = scriptStarts[index];
468        highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
469        table[index] = (short)(highLimit >> 8);
470        return highLimit;
471    }
472
473    private static String scriptCodeString(int script) {
474        // Do not use the script name here: We do not want to depend on that data.
475        return (script < Collator.ReorderCodes.FIRST) ?
476                Integer.toString(script) : "0x" + Integer.toHexString(script);
477    }
478
479    private static final int[] EMPTY_INT_ARRAY = new int[0];
480
481    /** @see jamoCE32s */
482    static final int JAMO_CE32S_LENGTH = 19 + 21 + 27;
483
484    /** Main lookup trie. */
485    Trie2_32 trie;
486    /**
487     * Array of CE32 values.
488     * At index 0 there must be CE32(U+0000)
489     * to support U+0000's special-tag for NUL-termination handling.
490     */
491    int[] ce32s;
492    /** Array of CE values for expansions and OFFSET_TAG. */
493    long[] ces;
494    /** Array of prefix and contraction-suffix matching data. */
495    String contexts;
496    /** Base collation data, or null if this data itself is a base. */
497    public CollationData base;
498    /**
499     * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
500     * They are normally simple CE32s, rarely expansions.
501     * For fast handling of HANGUL_TAG.
502     */
503    int[] jamoCE32s = new int[JAMO_CE32S_LENGTH];
504    public Normalizer2Impl nfcImpl;
505    /** The single-byte primary weight (xx000000) for numeric collation. */
506    long numericPrimary = 0x12000000;
507
508    /** 256 flags for which primary-weight lead bytes are compressible. */
509    public boolean[] compressibleBytes;
510    /**
511     * Set of code points that are unsafe for starting string comparison after an identical prefix,
512     * or in backwards CE iteration.
513     */
514    UnicodeSet unsafeBackwardSet;
515
516    /**
517     * Fast Latin table for common-Latin-text string comparisons.
518     * Data structure see class CollationFastLatin.
519     */
520    public char[] fastLatinTable;
521    /**
522     * Header portion of the fastLatinTable.
523     * In C++, these are one array, and the header is skipped for mapping characters.
524     * In Java, two arrays work better.
525     */
526    char[] fastLatinTableHeader;
527
528    /**
529     * Data for scripts and reordering groups.
530     * Uses include building a reordering permutation table and
531     * providing script boundaries to AlphabeticIndex.
532     */
533    int numScripts;
534    /**
535     * The length of scriptsIndex is numScripts+16.
536     * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
537     * 16 special reorder codes (not all used) are mapped starting at numScripts.
538     * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
539     * There are special codes at the end for reorder-reserved primary ranges.
540     *
541     * <p>Multiple scripts may share a range and index, for example Hira & Kana.
542     */
543    char[] scriptsIndex;
544    /**
545     * Start primary weight (top 16 bits only) for a group/script/reserved range
546     * indexed by scriptsIndex.
547     * The first range (separators & terminators) and the last range (trailing weights)
548     * are not reorderable, and no scriptsIndex entry points to them.
549     */
550    char[] scriptStarts;
551
552    /**
553     * Collation elements in the root collator.
554     * Used by the CollationRootElements class. The data structure is described there.
555     * null in a tailoring.
556     */
557    public long[] rootElements;
558}
559