1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2013-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* CollationSettings.java, ported from collationsettings.h/.cpp
9*
10* C++ version created on: 2013feb07
11* created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16import java.util.Arrays;
17
18import com.ibm.icu.text.Collator;
19
20/**
21 * Collation settings/options/attributes.
22 * These are the values that can be changed via API.
23 */
24public final class CollationSettings extends SharedObject {
25    /**
26     * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
27     */
28    public static final int CHECK_FCD = 1;
29    /**
30     * Options bit 1: Numeric collation.
31     * Also known as CODAN = COllate Digits As Numbers.
32     *
33     * Treat digit sequences as numbers with CE sequences in numeric order,
34     * rather than returning a normal CE for each digit.
35     */
36    public static final int NUMERIC = 2;
37    /**
38     * "Shifted" alternate handling, see ALTERNATE_MASK.
39     */
40    static final int SHIFTED = 4;
41    /**
42     * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
43     * Reserve values 8 and 0xc for shift-trimmed and blanked.
44     */
45    static final int ALTERNATE_MASK = 0xc;
46    /**
47     * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
48     */
49    static final int MAX_VARIABLE_SHIFT = 4;
50    /** maxVariable options bit mask before shifting. */
51    static final int MAX_VARIABLE_MASK = 0x70;
52    /** Options bit 7: Reserved/unused/0. */
53    /**
54     * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
55     */
56    static final int UPPER_FIRST = 0x100;
57    /**
58     * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
59     * unless case level is on (when they are *moved* into the separate case level).
60     * By default, the case bits are removed from the tertiary weight (ignored).
61     *
62     * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
63     * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
64     */
65    public static final int CASE_FIRST = 0x200;
66    /**
67     * Options bit mask for caseFirst and upperFirst, before shifting.
68     * Same value as caseFirst==upperFirst.
69     */
70    public static final int CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
71    /**
72     * Options bit 10: Insert the case level between the secondary and tertiary levels.
73     */
74    public static final int CASE_LEVEL = 0x400;
75    /**
76     * Options bit 11: Compare secondary weights backwards. ("French secondary")
77     */
78    public static final int BACKWARD_SECONDARY = 0x800;
79    /**
80     * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
81     * It is the top used bit field in the options. (No need to mask after shifting.)
82     */
83    static final int STRENGTH_SHIFT = 12;
84    /** Strength options bit mask before shifting. */
85    static final int STRENGTH_MASK = 0xf000;
86
87    /** maxVariable values */
88    static final int MAX_VAR_SPACE = 0;
89    static final int MAX_VAR_PUNCT = 1;
90    static final int MAX_VAR_SYMBOL = 2;
91    static final int MAX_VAR_CURRENCY = 3;
92
93    CollationSettings() {}
94
95    @Override
96    public CollationSettings clone() {
97        CollationSettings newSettings = (CollationSettings)super.clone();
98        // Note: The reorderTable, reorderRanges, and reorderCodes need not be cloned
99        // because, in Java, they only get replaced but not modified.
100        newSettings.fastLatinPrimaries = fastLatinPrimaries.clone();
101        return newSettings;
102    }
103
104    @Override
105    public boolean equals(Object other) {
106        if(other == null) { return false; }
107        if(!this.getClass().equals(other.getClass())) { return false; }
108        CollationSettings o = (CollationSettings)other;
109        if(options != o.options) { return false; }
110        if((options & ALTERNATE_MASK) != 0 && variableTop != o.variableTop) { return false; }
111        if(!Arrays.equals(reorderCodes, o.reorderCodes)) { return false; }
112        return true;
113    }
114
115    @Override
116    public int hashCode() {
117        int h = options << 8;
118        if((options & ALTERNATE_MASK) != 0) { h ^= variableTop; }
119        h ^= reorderCodes.length;
120        for(int i = 0; i < reorderCodes.length; ++i) {
121            h ^= (reorderCodes[i] << i);
122        }
123        return h;
124    }
125
126    public void resetReordering() {
127        // When we turn off reordering, we want to set a null permutation
128        // rather than a no-op permutation.
129        reorderTable = null;
130        minHighNoReorder = 0;
131        reorderRanges = null;
132        reorderCodes = EMPTY_INT_ARRAY;
133    }
134
135    void aliasReordering(CollationData data, int[] codesAndRanges, int codesLength, byte[] table) {
136        int[] codes;
137        if(codesLength == codesAndRanges.length) {
138            codes = codesAndRanges;
139        } else {
140            // TODO: Java 6: Arrays.copyOf(codes, codesLength);
141            codes = new int[codesLength];
142            System.arraycopy(codesAndRanges, 0, codes, 0, codesLength);
143        }
144        int rangesStart = codesLength;
145        int rangesLimit = codesAndRanges.length;
146        int rangesLength = rangesLimit - rangesStart;
147        if(table != null &&
148                (rangesLength == 0 ?
149                        !reorderTableHasSplitBytes(table) :
150                        rangesLength >= 2 &&
151                        // The first offset must be 0. The last offset must not be 0.
152                        (codesAndRanges[rangesStart] & 0xffff) == 0 &&
153                        (codesAndRanges[rangesLimit - 1] & 0xffff) != 0)) {
154            reorderTable = table;
155            reorderCodes = codes;
156            // Drop ranges before the first split byte. They are reordered by the table.
157            // This then speeds up reordering of the remaining ranges.
158            int firstSplitByteRangeIndex = rangesStart;
159            while(firstSplitByteRangeIndex < rangesLimit &&
160                    (codesAndRanges[firstSplitByteRangeIndex] & 0xff0000) == 0) {
161                // The second byte of the primary limit is 0.
162                ++firstSplitByteRangeIndex;
163            }
164            if(firstSplitByteRangeIndex == rangesLimit) {
165                assert(!reorderTableHasSplitBytes(table));
166                minHighNoReorder = 0;
167                reorderRanges = null;
168            } else {
169                assert(table[codesAndRanges[firstSplitByteRangeIndex] >>> 24] == 0);
170                minHighNoReorder = codesAndRanges[rangesLimit - 1] & 0xffff0000L;
171                setReorderRanges(codesAndRanges, firstSplitByteRangeIndex,
172                        rangesLimit - firstSplitByteRangeIndex);
173            }
174            return;
175        }
176        // Regenerate missing data.
177        setReordering(data, codes);
178    }
179
180    public void setReordering(CollationData data, int[] codes) {
181        if(codes.length == 0 || (codes.length == 1 && codes[0] == Collator.ReorderCodes.NONE)) {
182            resetReordering();
183            return;
184        }
185        UVector32 rangesList = new UVector32();
186        data.makeReorderRanges(codes, rangesList);
187        int rangesLength = rangesList.size();
188        if(rangesLength == 0) {
189            resetReordering();
190            return;
191        }
192        int[] ranges = rangesList.getBuffer();
193        // ranges[] contains at least two (limit, offset) pairs.
194        // The first offset must be 0. The last offset must not be 0.
195        // Separators (at the low end) and trailing weights (at the high end)
196        // are never reordered.
197        assert(rangesLength >= 2);
198        assert((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0);
199        minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000L;
200
201        // Write the lead byte permutation table.
202        // Set a 0 for each lead byte that has a range boundary in the middle.
203        byte[] table = new byte[256];
204        int b = 0;
205        int firstSplitByteRangeIndex = -1;
206        for(int i = 0; i < rangesLength; ++i) {
207            int pair = ranges[i];
208            int limit1 = pair >>> 24;
209            while(b < limit1) {
210                table[b] = (byte)(b + pair);
211                ++b;
212            }
213            // Check the second byte of the limit.
214            if((pair & 0xff0000) != 0) {
215                table[limit1] = 0;
216                b = limit1 + 1;
217                if(firstSplitByteRangeIndex < 0) {
218                    firstSplitByteRangeIndex = i;
219                }
220            }
221        }
222        while(b <= 0xff) {
223            table[b] = (byte)b;
224            ++b;
225        }
226        int rangesStart;
227        if(firstSplitByteRangeIndex < 0) {
228            // The lead byte permutation table alone suffices for reordering.
229            rangesStart = rangesLength = 0;
230        } else {
231            // Remove the ranges below the first split byte.
232            rangesStart = firstSplitByteRangeIndex;
233            rangesLength -= firstSplitByteRangeIndex;
234        }
235        setReorderArrays(codes, ranges, rangesStart, rangesLength, table);
236    }
237
238    private void setReorderArrays(int[] codes,
239            int[] ranges, int rangesStart, int rangesLength, byte[] table) {
240        // Very different from C++. See the comments after the reorderCodes declaration.
241        if(codes == null) {
242            codes = EMPTY_INT_ARRAY;
243        }
244        assert (codes.length == 0) == (table == null);
245        reorderTable = table;
246        reorderCodes = codes;
247        setReorderRanges(ranges, rangesStart, rangesLength);
248    }
249
250    private void setReorderRanges(int[] ranges, int rangesStart, int rangesLength) {
251        if(rangesLength == 0) {
252            reorderRanges = null;
253        } else {
254            reorderRanges = new long[rangesLength];
255            int i = 0;
256            do {
257                reorderRanges[i++] = ranges[rangesStart++] & 0xffffffffL;
258            } while(i < rangesLength);
259        }
260    }
261
262    public void copyReorderingFrom(CollationSettings other) {
263        if(!other.hasReordering()) {
264            resetReordering();
265            return;
266        }
267        minHighNoReorder = other.minHighNoReorder;
268        reorderTable = other.reorderTable;
269        reorderRanges = other.reorderRanges;
270        reorderCodes = other.reorderCodes;
271    }
272
273    public boolean hasReordering() { return reorderTable != null; }
274
275    private static boolean reorderTableHasSplitBytes(byte[] table) {
276        assert(table[0] == 0);
277        for(int i = 1; i < 256; ++i) {
278            if(table[i] == 0) {
279                return true;
280            }
281        }
282        return false;
283    }
284
285    public long reorder(long p) {
286        byte b = reorderTable[(int)p >>> 24];
287        if(b != 0 || p <= Collation.NO_CE_PRIMARY) {
288            return ((b & 0xffL) << 24) | (p & 0xffffff);
289        } else {
290            return reorderEx(p);
291        }
292    }
293
294    private long reorderEx(long p) {
295        assert minHighNoReorder > 0;
296        if(p >= minHighNoReorder) { return p; }
297        // Round up p so that its lower 16 bits are >= any offset bits.
298        // Then compare q directly with (limit, offset) pairs.
299        long q = p | 0xffff;
300        long r;
301        int i = 0;
302        while(q >= (r = reorderRanges[i])) { ++i; }
303        return p + ((long)(short)r << 24);
304    }
305
306    // In C++, we use enums for attributes and their values, with a special value for the default.
307    // Combined getter/setter methods handle many attributes.
308    // In Java, we have specific methods for getting, setting, and set-to-default,
309    // except that this class uses bits in its own bit set for simple values.
310
311    public void setStrength(int value) {
312        int noStrength = options & ~STRENGTH_MASK;
313        switch(value) {
314        case Collator.PRIMARY:
315        case Collator.SECONDARY:
316        case Collator.TERTIARY:
317        case Collator.QUATERNARY:
318        case Collator.IDENTICAL:
319            options = noStrength | (value << STRENGTH_SHIFT);
320            break;
321        default:
322            throw new IllegalArgumentException("illegal strength value " + value);
323        }
324    }
325
326    public void setStrengthDefault(int defaultOptions) {
327        int noStrength = options & ~STRENGTH_MASK;
328        options = noStrength | (defaultOptions & STRENGTH_MASK);
329    }
330
331    static int getStrength(int options) {
332        return options >> STRENGTH_SHIFT;
333    }
334
335    public int getStrength() {
336        return getStrength(options);
337    }
338
339    /** Sets the options bit for an on/off attribute. */
340    public void setFlag(int bit, boolean value) {
341        if(value) {
342            options |= bit;
343        } else {
344            options &= ~bit;
345        }
346    }
347
348    public void setFlagDefault(int bit, int defaultOptions) {
349        options = (options & ~bit) | (defaultOptions & bit);
350    }
351
352    public boolean getFlag(int bit) {
353        return (options & bit) != 0;
354    }
355
356    public void setCaseFirst(int value) {
357        assert value == 0 || value == CASE_FIRST || value == CASE_FIRST_AND_UPPER_MASK;
358        int noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK;
359        options = noCaseFirst | value;
360    }
361
362    public void setCaseFirstDefault(int defaultOptions) {
363        int noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK;
364        options = noCaseFirst | (defaultOptions & CASE_FIRST_AND_UPPER_MASK);
365    }
366
367    public int getCaseFirst() {
368        return options & CASE_FIRST_AND_UPPER_MASK;
369    }
370
371    public void setAlternateHandlingShifted(boolean value) {
372        int noAlternate = options & ~ALTERNATE_MASK;
373        if(value) {
374            options = noAlternate | SHIFTED;
375        } else {
376            options = noAlternate;
377        }
378    }
379
380    public void setAlternateHandlingDefault(int defaultOptions) {
381        int noAlternate = options & ~ALTERNATE_MASK;
382        options = noAlternate | (defaultOptions & ALTERNATE_MASK);
383    }
384
385    public boolean getAlternateHandling() {
386        return (options & ALTERNATE_MASK) != 0;
387    }
388
389    public void setMaxVariable(int value, int defaultOptions) {
390        int noMax = options & ~MAX_VARIABLE_MASK;
391        switch(value) {
392        case MAX_VAR_SPACE:
393        case MAX_VAR_PUNCT:
394        case MAX_VAR_SYMBOL:
395        case MAX_VAR_CURRENCY:
396            options = noMax | (value << MAX_VARIABLE_SHIFT);
397            break;
398        case -1:
399            options = noMax | (defaultOptions & MAX_VARIABLE_MASK);
400            break;
401        default:
402            throw new IllegalArgumentException("illegal maxVariable value " + value);
403        }
404    }
405
406    public int getMaxVariable() {
407        return (options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT;
408    }
409
410    /**
411     * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
412     */
413    static boolean isTertiaryWithCaseBits(int options) {
414        return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
415    }
416    static int getTertiaryMask(int options) {
417        // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
418        return isTertiaryWithCaseBits(options) ?
419                Collation.CASE_AND_TERTIARY_MASK : Collation.ONLY_TERTIARY_MASK;
420    }
421
422    static boolean sortsTertiaryUpperCaseFirst(int options) {
423        // On tertiary level, consider case bits and sort uppercase first
424        // if caseLevel is off and caseFirst==upperFirst.
425        return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
426    }
427
428    public boolean dontCheckFCD() {
429        return (options & CHECK_FCD) == 0;
430    }
431
432    boolean hasBackwardSecondary() {
433        return (options & BACKWARD_SECONDARY) != 0;
434    }
435
436    public boolean isNumeric() {
437        return (options & NUMERIC) != 0;
438    }
439
440    /** CHECK_FCD etc. */
441    public int options = (Collator.TERTIARY << STRENGTH_SHIFT) |  // DEFAULT_STRENGTH
442            (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT);
443    /** Variable-top primary weight. */
444    public long variableTop;
445    /**
446     * 256-byte table for reordering permutation of primary lead bytes; null if no reordering.
447     * A 0 entry at a non-zero index means that the primary lead byte is "split"
448     * (there are different offsets for primaries that share that lead byte)
449     * and the reordering offset must be determined via the reorderRanges.
450     */
451    public byte[] reorderTable;
452    /** Limit of last reordered range. 0 if no reordering or no split bytes. */
453    long minHighNoReorder;
454    /**
455     * Primary-weight ranges for script reordering,
456     * to be used by reorder(p) for split-reordered primary lead bytes.
457     *
458     * <p>Each entry is a (limit, offset) pair.
459     * The upper 16 bits of the entry are the upper 16 bits of the
460     * exclusive primary limit of a range.
461     * Primaries between the previous limit and this one have their lead bytes
462     * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
463     *
464     * <p>CollationData.makeReorderRanges() writes a full list where the first range
465     * (at least for terminators and separators) has a 0 offset.
466     * The last range has a non-zero offset.
467     * minHighNoReorder is set to the limit of that last range.
468     *
469     * <p>In the settings object, the initial ranges before the first split lead byte
470     * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
471     * If there are no split-reordered lead bytes, then no ranges are needed.
472     */
473    long[] reorderRanges;
474    /** Array of reorder codes; ignored if length == 0. */
475    public int[] reorderCodes = EMPTY_INT_ARRAY;
476    // Note: In C++, we keep a memory block around for the reorder codes,
477    // the ranges, and the permutation table,
478    // and modify them for new codes.
479    // In Java, we simply copy references and then never modify the array contents.
480    // The caller must abandon the arrays.
481    // Reorder codes from the public setter API must be cloned.
482    private static final int[] EMPTY_INT_ARRAY = new int[0];
483
484    /** Options for CollationFastLatin. Negative if disabled. */
485    public int fastLatinOptions = -1;
486    // fastLatinPrimaries.length must be equal to CollationFastLatin.LATIN_LIMIT,
487    // but we do not import CollationFastLatin to reduce circular dependencies.
488    public char[] fastLatinPrimaries = new char[0x180];  // mutable contents
489}
490