1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2013-2014, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* CollationRootElements.java, ported from collationrootelements.h/.cpp
9*
10* C++ version created on: 2013mar01
11* created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16/**
17 * Container and access methods for collation elements and weights
18 * that occur in the root collator.
19 * Needed for finding boundaries for building a tailoring.
20 *
21 * This class takes and returns 16-bit secondary and tertiary weights.
22 */
23public final class CollationRootElements {
24    public CollationRootElements(long[] rootElements) {
25        elements = rootElements;
26    }
27
28    /**
29     * Higher than any root primary.
30     */
31    public static final long PRIMARY_SENTINEL = 0xffffff00L;
32
33    /**
34     * Flag in a root element, set if the element contains secondary & tertiary weights,
35     * rather than a primary.
36     */
37    public static final int SEC_TER_DELTA_FLAG = 0x80;
38    /**
39     * Mask for getting the primary range step value from a primary-range-end element.
40     */
41    public static final int PRIMARY_STEP_MASK = 0x7f;
42
43    /**
44     * Index of the first CE with a non-zero tertiary weight.
45     * Same as the start of the compact root elements table.
46     */
47    public static final int IX_FIRST_TERTIARY_INDEX = 0;
48    /**
49     * Index of the first CE with a non-zero secondary weight.
50     */
51    static final int IX_FIRST_SECONDARY_INDEX = 1;
52    /**
53     * Index of the first CE with a non-zero primary weight.
54     */
55    static final int IX_FIRST_PRIMARY_INDEX = 2;
56    /**
57     * Must match Collation.COMMON_SEC_AND_TER_CE.
58     */
59    static final int IX_COMMON_SEC_AND_TER_CE = 3;
60    /**
61     * Secondary & tertiary boundaries.
62     * Bits 31..24: [fixed last secondary common byte 45]
63     * Bits 23..16: [fixed first ignorable secondary byte 80]
64     * Bits 15.. 8: reserved, 0
65     * Bits  7.. 0: [fixed first ignorable tertiary byte 3C]
66     */
67    static final int IX_SEC_TER_BOUNDARIES = 4;
68    /**
69     * The current number of indexes.
70     * Currently the same as elements[IX_FIRST_TERTIARY_INDEX].
71     */
72    static final int IX_COUNT = 5;
73
74    /**
75     * Returns the boundary between tertiary weights of primary/secondary CEs
76     * and those of tertiary CEs.
77     * This is the upper limit for tertiaries of primary/secondary CEs.
78     * This minus one is the lower limit for tertiaries of tertiary CEs.
79     */
80    public int getTertiaryBoundary() {
81        return ((int)elements[IX_SEC_TER_BOUNDARIES] << 8) & 0xff00;
82    }
83
84    /**
85     * Returns the first assigned tertiary CE.
86     */
87    long getFirstTertiaryCE() {
88        return elements[(int)elements[IX_FIRST_TERTIARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
89    }
90
91    /**
92     * Returns the last assigned tertiary CE.
93     */
94    long getLastTertiaryCE() {
95        return elements[(int)elements[IX_FIRST_SECONDARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
96    }
97
98    /**
99     * Returns the last common secondary weight.
100     * This is the lower limit for secondaries of primary CEs.
101     */
102    public int getLastCommonSecondary() {
103        return ((int)elements[IX_SEC_TER_BOUNDARIES] >> 16) & 0xff00;
104    }
105
106    /**
107     * Returns the boundary between secondary weights of primary CEs
108     * and those of secondary CEs.
109     * This is the upper limit for secondaries of primary CEs.
110     * This minus one is the lower limit for secondaries of secondary CEs.
111     */
112    public int getSecondaryBoundary() {
113        return ((int)elements[IX_SEC_TER_BOUNDARIES] >> 8) & 0xff00;
114    }
115
116    /**
117     * Returns the first assigned secondary CE.
118     */
119    long getFirstSecondaryCE() {
120        return elements[(int)elements[IX_FIRST_SECONDARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
121    }
122
123    /**
124     * Returns the last assigned secondary CE.
125     */
126    long getLastSecondaryCE() {
127        return elements[(int)elements[IX_FIRST_PRIMARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
128    }
129
130    /**
131     * Returns the first assigned primary weight.
132     */
133    long getFirstPrimary() {
134        return elements[(int)elements[IX_FIRST_PRIMARY_INDEX]];  // step=0: cannot be a range end
135    }
136
137    /**
138     * Returns the first assigned primary CE.
139     */
140    long getFirstPrimaryCE() {
141        return Collation.makeCE(getFirstPrimary());
142    }
143
144    /**
145     * Returns the last root CE with a primary weight before p.
146     * Intended only for reordering group boundaries.
147     */
148    long lastCEWithPrimaryBefore(long p) {
149        if(p == 0) { return 0; }
150        assert(p > elements[(int)elements[IX_FIRST_PRIMARY_INDEX]]);
151        int index = findP(p);
152        long q = elements[index];
153        long secTer;
154        if(p == (q & 0xffffff00L)) {
155            // p == elements[index] is a root primary. Find the CE before it.
156            // We must not be in a primary range.
157            assert((q & PRIMARY_STEP_MASK) == 0);
158            secTer = elements[index - 1];
159            if((secTer & SEC_TER_DELTA_FLAG) == 0) {
160                // Primary CE just before p.
161                p = secTer & 0xffffff00L;
162                secTer = Collation.COMMON_SEC_AND_TER_CE;
163            } else {
164                // secTer = last secondary & tertiary for the previous primary
165                index -= 2;
166                for(;;) {
167                    p = elements[index];
168                    if((p & SEC_TER_DELTA_FLAG) == 0) {
169                        p &= 0xffffff00L;
170                        break;
171                    }
172                    --index;
173                }
174            }
175        } else {
176            // p > elements[index] which is the previous primary.
177            // Find the last secondary & tertiary weights for it.
178            p = q & 0xffffff00L;
179            secTer = Collation.COMMON_SEC_AND_TER_CE;
180            for(;;) {
181                q = elements[++index];
182                if((q & SEC_TER_DELTA_FLAG) == 0) {
183                    // We must not be in a primary range.
184                    assert((q & PRIMARY_STEP_MASK) == 0);
185                    break;
186                }
187                secTer = q;
188            }
189        }
190        return (p << 32) | (secTer & ~SEC_TER_DELTA_FLAG);
191    }
192
193    /**
194     * Returns the first root CE with a primary weight of at least p.
195     * Intended only for reordering group boundaries.
196     */
197    long firstCEWithPrimaryAtLeast(long p) {
198        if(p == 0) { return 0; }
199        int index = findP(p);
200        if(p != (elements[index] & 0xffffff00L)) {
201            for(;;) {
202                p = elements[++index];
203                if((p & SEC_TER_DELTA_FLAG) == 0) {
204                    // First primary after p. We must not be in a primary range.
205                    assert((p & PRIMARY_STEP_MASK) == 0);
206                    break;
207                }
208            }
209        }
210        // The code above guarantees that p has at most 3 bytes: (p & 0xff) == 0.
211        return (p << 32) | Collation.COMMON_SEC_AND_TER_CE;
212    }
213
214    /**
215     * Returns the primary weight before p.
216     * p must be greater than the first root primary.
217     */
218    long getPrimaryBefore(long p, boolean isCompressible) {
219        int index = findPrimary(p);
220        int step;
221        long q = elements[index];
222        if(p == (q & 0xffffff00L)) {
223            // Found p itself. Return the previous primary.
224            // See if p is at the end of a previous range.
225            step = (int)q & PRIMARY_STEP_MASK;
226            if(step == 0) {
227                // p is not at the end of a range. Look for the previous primary.
228                do {
229                    p = elements[--index];
230                } while((p & SEC_TER_DELTA_FLAG) != 0);
231                return p & 0xffffff00L;
232            }
233        } else {
234            // p is in a range, and not at the start.
235            long nextElement = elements[index + 1];
236            assert(isEndOfPrimaryRange(nextElement));
237            step = (int)nextElement & PRIMARY_STEP_MASK;
238        }
239        // Return the previous range primary.
240        if((p & 0xffff) == 0) {
241            return Collation.decTwoBytePrimaryByOneStep(p, isCompressible, step);
242        } else {
243            return Collation.decThreeBytePrimaryByOneStep(p, isCompressible, step);
244        }
245    }
246
247    /** Returns the secondary weight before [p, s]. */
248    int getSecondaryBefore(long p, int s) {
249        int index;
250        int previousSec, sec;
251        if(p == 0) {
252            index = (int)elements[IX_FIRST_SECONDARY_INDEX];
253            // Gap at the beginning of the secondary CE range.
254            previousSec = 0;
255            sec = (int)(elements[index] >> 16);
256        } else {
257            index = findPrimary(p) + 1;
258            previousSec = Collation.BEFORE_WEIGHT16;
259            sec = (int)getFirstSecTerForPrimary(index) >>> 16;
260        }
261        assert(s >= sec);
262        while(s > sec) {
263            previousSec = sec;
264            assert((elements[index] & SEC_TER_DELTA_FLAG) != 0);
265            sec = (int)(elements[index++] >> 16);
266        }
267        assert(sec == s);
268        return previousSec;
269    }
270
271    /** Returns the tertiary weight before [p, s, t]. */
272    int getTertiaryBefore(long p, int s, int t) {
273        assert((t & ~Collation.ONLY_TERTIARY_MASK) == 0);
274        int index;
275        int previousTer;
276        long secTer;
277        if(p == 0) {
278            if(s == 0) {
279                index = (int)elements[IX_FIRST_TERTIARY_INDEX];
280                // Gap at the beginning of the tertiary CE range.
281                previousTer = 0;
282            } else {
283                index = (int)elements[IX_FIRST_SECONDARY_INDEX];
284                previousTer = Collation.BEFORE_WEIGHT16;
285            }
286            secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
287        } else {
288            index = findPrimary(p) + 1;
289            previousTer = Collation.BEFORE_WEIGHT16;
290            secTer = getFirstSecTerForPrimary(index);
291        }
292        long st = ((long)s << 16) | t;
293        while(st > secTer) {
294            if((int)(secTer >> 16) == s) { previousTer = (int)secTer; }
295            assert((elements[index] & SEC_TER_DELTA_FLAG) != 0);
296            secTer = elements[index++] & ~SEC_TER_DELTA_FLAG;
297        }
298        assert(secTer == st);
299        return previousTer & 0xffff;
300    }
301
302    /**
303     * Finds the index of the input primary.
304     * p must occur as a root primary, and must not be 0.
305     */
306    int findPrimary(long p) {
307        // Requirement: p must occur as a root primary.
308        assert((p & 0xff) == 0);  // at most a 3-byte primary
309        int index = findP(p);
310        // If p is in a range, then we just assume that p is an actual primary in this range.
311        // (Too cumbersome/expensive to check.)
312        // Otherwise, it must be an exact match.
313        assert(isEndOfPrimaryRange(elements[index + 1]) || p == (elements[index] & 0xffffff00L));
314        return index;
315    }
316
317    /**
318     * Returns the primary weight after p where index=findPrimary(p).
319     * p must be at least the first root primary.
320     */
321    long getPrimaryAfter(long p, int index, boolean isCompressible) {
322        assert(p == (elements[index] & 0xffffff00L) || isEndOfPrimaryRange(elements[index + 1]));
323        long q = elements[++index];
324        int step;
325        if((q & SEC_TER_DELTA_FLAG) == 0 && (step = (int)q & PRIMARY_STEP_MASK) != 0) {
326            // Return the next primary in this range.
327            if((p & 0xffff) == 0) {
328                return Collation.incTwoBytePrimaryByOffset(p, isCompressible, step);
329            } else {
330                return Collation.incThreeBytePrimaryByOffset(p, isCompressible, step);
331            }
332        } else {
333            // Return the next primary in the list.
334            while((q & SEC_TER_DELTA_FLAG) != 0) {
335                q = elements[++index];
336            }
337            assert((q & PRIMARY_STEP_MASK) == 0);
338            return q;
339        }
340    }
341    /**
342     * Returns the secondary weight after [p, s] where index=findPrimary(p)
343     * except use index=0 for p=0.
344     *
345     * <p>Must return a weight for every root [p, s] as well as for every weight
346     * returned by getSecondaryBefore(). If p!=0 then s can be BEFORE_WEIGHT16.
347     *
348     * <p>Exception: [0, 0] is handled by the CollationBuilder:
349     * Both its lower and upper boundaries are special.
350     */
351    int getSecondaryAfter(int index, int s) {
352        long secTer;
353        int secLimit;
354        if(index == 0) {
355            // primary = 0
356            assert(s != 0);
357            index = (int)elements[IX_FIRST_SECONDARY_INDEX];
358            secTer = elements[index];
359            // Gap at the end of the secondary CE range.
360            secLimit = 0x10000;
361        } else {
362            assert(index >= (int)elements[IX_FIRST_PRIMARY_INDEX]);
363            secTer = getFirstSecTerForPrimary(index + 1);
364            // If this is an explicit sec/ter unit, then it will be read once more.
365            // Gap for secondaries of primary CEs.
366            secLimit = getSecondaryBoundary();
367        }
368        for(;;) {
369            int sec = (int)(secTer >> 16);
370            if(sec > s) { return sec; }
371            secTer = elements[++index];
372            if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
373        }
374    }
375    /**
376     * Returns the tertiary weight after [p, s, t] where index=findPrimary(p)
377     * except use index=0 for p=0.
378     *
379     * <p>Must return a weight for every root [p, s, t] as well as for every weight
380     * returned by getTertiaryBefore(). If s!=0 then t can be BEFORE_WEIGHT16.
381     *
382     * <p>Exception: [0, 0, 0] is handled by the CollationBuilder:
383     * Both its lower and upper boundaries are special.
384     */
385    int getTertiaryAfter(int index, int s, int t) {
386        long secTer;
387        int terLimit;
388        if(index == 0) {
389            // primary = 0
390            if(s == 0) {
391                assert(t != 0);
392                index = (int)elements[IX_FIRST_TERTIARY_INDEX];
393                // Gap at the end of the tertiary CE range.
394                terLimit = 0x4000;
395            } else {
396                index = (int)elements[IX_FIRST_SECONDARY_INDEX];
397                // Gap for tertiaries of primary/secondary CEs.
398                terLimit = getTertiaryBoundary();
399            }
400            secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
401        } else {
402            assert(index >= (int)elements[IX_FIRST_PRIMARY_INDEX]);
403            secTer = getFirstSecTerForPrimary(index + 1);
404            // If this is an explicit sec/ter unit, then it will be read once more.
405            terLimit = getTertiaryBoundary();
406        }
407        long st = (((long)s & 0xffffffffL) << 16) | t;
408        for(;;) {
409            if(secTer > st) {
410                assert((secTer >> 16) == s);
411                return (int)secTer & 0xffff;
412            }
413            secTer = elements[++index];
414            // No tertiary greater than t for this primary+secondary.
415            if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; }
416            secTer &= ~SEC_TER_DELTA_FLAG;
417        }
418    }
419
420    /**
421     * Returns the first secondary & tertiary weights for p where index=findPrimary(p)+1.
422     */
423    private long getFirstSecTerForPrimary(int index) {
424        long secTer = elements[index];
425        if((secTer & SEC_TER_DELTA_FLAG) == 0) {
426            // No sec/ter delta.
427            return Collation.COMMON_SEC_AND_TER_CE;
428        }
429        secTer &= ~SEC_TER_DELTA_FLAG;
430        if(secTer > Collation.COMMON_SEC_AND_TER_CE) {
431            // Implied sec/ter.
432            return Collation.COMMON_SEC_AND_TER_CE;
433        }
434        // Explicit sec/ter below common/common.
435        return secTer;
436    }
437
438    /**
439     * Finds the largest index i where elements[i]<=p.
440     * Requires first primary<=p<0xffffff00 (PRIMARY_SENTINEL).
441     * Does not require that p is a root collator primary.
442     */
443    private int findP(long p) {
444        // p need not occur as a root primary.
445        // For example, it might be a reordering group boundary.
446        assert((p >> 24) != Collation.UNASSIGNED_IMPLICIT_BYTE);
447        // modified binary search
448        int start = (int)elements[IX_FIRST_PRIMARY_INDEX];
449        assert(p >= elements[start]);
450        int limit = elements.length - 1;
451        assert(elements[limit] >= PRIMARY_SENTINEL);
452        assert(p < elements[limit]);
453        while((start + 1) < limit) {
454            // Invariant: elements[start] and elements[limit] are primaries,
455            // and elements[start]<=p<=elements[limit].
456            int i = (int)(((long)start + (long)limit) / 2);
457            long q = elements[i];
458            if((q & SEC_TER_DELTA_FLAG) != 0) {
459                // Find the next primary.
460                int j = i + 1;
461                for(;;) {
462                    if(j == limit) { break; }
463                    q = elements[j];
464                    if((q & SEC_TER_DELTA_FLAG) == 0) {
465                        i = j;
466                        break;
467                    }
468                    ++j;
469                }
470                if((q & SEC_TER_DELTA_FLAG) != 0) {
471                    // Find the preceding primary.
472                    j = i - 1;
473                    for(;;) {
474                        if(j == start) { break; }
475                        q = elements[j];
476                        if((q & SEC_TER_DELTA_FLAG) == 0) {
477                            i = j;
478                            break;
479                        }
480                        --j;
481                    }
482                    if((q & SEC_TER_DELTA_FLAG) != 0) {
483                        // No primary between start and limit.
484                        break;
485                    }
486                }
487            }
488            if(p < (q & 0xffffff00L)) {  // Reset the "step" bits of a range end primary.
489                limit = i;
490            } else {
491                start = i;
492            }
493        }
494        return start;
495    }
496
497    private static boolean isEndOfPrimaryRange(long q) {
498        return (q & SEC_TER_DELTA_FLAG) == 0 && (q & PRIMARY_STEP_MASK) != 0;
499    }
500
501    /**
502     * Data structure: See ICU4C source/i18n/collationrootelements.h.
503     */
504    private long[] elements;
505}
506