1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 ******************************************************************************
6 *
7 *   Copyright (C) 2009-2015, International Business Machines
8 *   Corporation and others.  All Rights Reserved.
9 *
10 ******************************************************************************
11 */
12
13package android.icu.impl;
14
15import android.icu.text.UnicodeSet.SpanCondition;
16import android.icu.util.OutputInt;
17
18/**
19 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
20 *
21 * Latin-1: Look up bytes.
22 * 2-byte characters: Bits organized vertically.
23 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
24 * Supplementary characters: Call contains() on the parent set.
25 * @hide Only a subset of ICU is exposed in Android
26 */
27public final class BMPSet {
28    public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
29
30    /**
31     * One boolean ('true' or 'false') per Latin-1 character.
32     */
33    private boolean[] latin1Contains;
34
35    /**
36     * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
37     * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
38     * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
39     *
40     * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
41     * runtime.
42     */
43    private int[] table7FF;
44
45    /**
46     * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
47     * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
48     * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
49     * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
50     * and set.contains(c) must be called.
51     *
52     * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
53     * validity checking at runtime.
54     */
55    private int[] bmpBlockBits;
56
57    /**
58     * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
59     * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
60     * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
61     */
62    private int[] list4kStarts;
63
64    /**
65     * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
66     * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
67     */
68    private final int[] list;
69    private final int listLength; // length used; list may be longer to minimize reallocs
70
71    public BMPSet(final int[] parentList, int parentListLength) {
72        list = parentList;
73        listLength = parentListLength;
74        latin1Contains = new boolean[0x100];
75        table7FF = new int[64];
76        bmpBlockBits = new int[64];
77        list4kStarts = new int[18];
78
79        /*
80         * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
81         * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
82         * indexes is for finding supplementary code points.
83         */
84        list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
85        int i;
86        for (i = 1; i <= 0x10; ++i) {
87            list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
88        }
89        list4kStarts[0x11] = listLength - 1;
90
91        initBits();
92    }
93
94    public BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength) {
95        list = newParentList;
96        listLength = newParentListLength;
97        latin1Contains = otherBMPSet.latin1Contains.clone();
98        table7FF = otherBMPSet.table7FF.clone();
99        bmpBlockBits = otherBMPSet.bmpBlockBits.clone();
100        list4kStarts = otherBMPSet.list4kStarts.clone();
101    }
102
103    public boolean contains(int c) {
104        if (c <= 0xff) {
105            return (latin1Contains[c]);
106        } else if (c <= 0x7ff) {
107            return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
108        } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
109            int lead = c >> 12;
110            int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
111            if (twoBits <= 1) {
112                // All 64 code points with the same bits 15..6
113                // are either in the set or not.
114                return (0 != twoBits);
115            } else {
116                // Look up the code point in its 4k block of code points.
117                return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
118            }
119        } else if (c <= 0x10ffff) {
120            // surrogate or supplementary code point
121            return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
122        } else {
123            // Out-of-range code points get false, consistent with long-standing
124            // behavior of UnicodeSet.contains(c).
125            return false;
126        }
127    }
128
129    /**
130     * Span the initial substring for which each character c has spanCondition==contains(c). It must be
131     * spanCondition==0 or 1.
132     *
133     * @param start The start index
134     * @param outCount If not null: Receives the number of code points in the span.
135     * @return the limit (exclusive end) of the span
136     *
137     * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
138     * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
139     * as usual in ICU.
140     */
141    public final int span(CharSequence s, int start, SpanCondition spanCondition,
142            OutputInt outCount) {
143        char c, c2;
144        int i = start;
145        int limit = s.length();
146        int numSupplementary = 0;
147        if (SpanCondition.NOT_CONTAINED != spanCondition) {
148            // span
149            while (i < limit) {
150                c = s.charAt(i);
151                if (c <= 0xff) {
152                    if (!latin1Contains[c]) {
153                        break;
154                    }
155                } else if (c <= 0x7ff) {
156                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
157                        break;
158                    }
159                } else if (c < 0xd800 ||
160                           c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
161                    int lead = c >> 12;
162                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
163                    if (twoBits <= 1) {
164                        // All 64 code points with the same bits 15..6
165                        // are either in the set or not.
166                        if (twoBits == 0) {
167                            break;
168                        }
169                    } else {
170                        // Look up the code point in its 4k block of code points.
171                        if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
172                            break;
173                        }
174                    }
175                } else {
176                    // surrogate pair
177                    int supplementary = Character.toCodePoint(c, c2);
178                    if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
179                        break;
180                    }
181                    ++numSupplementary;
182                    ++i;
183                }
184                ++i;
185            }
186        } else {
187            // span not
188            while (i < limit) {
189                c = s.charAt(i);
190                if (c <= 0xff) {
191                    if (latin1Contains[c]) {
192                        break;
193                    }
194                } else if (c <= 0x7ff) {
195                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
196                        break;
197                    }
198                } else if (c < 0xd800 ||
199                           c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
200                    int lead = c >> 12;
201                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
202                    if (twoBits <= 1) {
203                        // All 64 code points with the same bits 15..6
204                        // are either in the set or not.
205                        if (twoBits != 0) {
206                            break;
207                        }
208                    } else {
209                        // Look up the code point in its 4k block of code points.
210                        if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
211                            break;
212                        }
213                    }
214                } else {
215                    // surrogate pair
216                    int supplementary = Character.toCodePoint(c, c2);
217                    if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
218                        break;
219                    }
220                    ++numSupplementary;
221                    ++i;
222                }
223                ++i;
224            }
225        }
226        if (outCount != null) {
227            int spanLength = i - start;
228            outCount.value = spanLength - numSupplementary;  // number of code points
229        }
230        return i;
231    }
232
233    /**
234     * Symmetrical with span().
235     * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
236     * limit and spanCondition==0 or 1.
237     *
238     * @return The string index which starts the span (i.e. inclusive).
239     */
240    public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
241        char c, c2;
242
243        if (SpanCondition.NOT_CONTAINED != spanCondition) {
244            // span
245            for (;;) {
246                c = s.charAt(--limit);
247                if (c <= 0xff) {
248                    if (!latin1Contains[c]) {
249                        break;
250                    }
251                } else if (c <= 0x7ff) {
252                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
253                        break;
254                    }
255                } else if (c < 0xd800 ||
256                           c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
257                    int lead = c >> 12;
258                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
259                    if (twoBits <= 1) {
260                        // All 64 code points with the same bits 15..6
261                        // are either in the set or not.
262                        if (twoBits == 0) {
263                            break;
264                        }
265                    } else {
266                        // Look up the code point in its 4k block of code points.
267                        if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
268                            break;
269                        }
270                    }
271                } else {
272                    // surrogate pair
273                    int supplementary = Character.toCodePoint(c2, c);
274                    if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
275                        break;
276                    }
277                    --limit;
278                }
279                if (0 == limit) {
280                    return 0;
281                }
282            }
283        } else {
284            // span not
285            for (;;) {
286                c = s.charAt(--limit);
287                if (c <= 0xff) {
288                    if (latin1Contains[c]) {
289                        break;
290                    }
291                } else if (c <= 0x7ff) {
292                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
293                        break;
294                    }
295                } else if (c < 0xd800 ||
296                           c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
297                    int lead = c >> 12;
298                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
299                    if (twoBits <= 1) {
300                        // All 64 code points with the same bits 15..6
301                        // are either in the set or not.
302                        if (twoBits != 0) {
303                            break;
304                        }
305                    } else {
306                        // Look up the code point in its 4k block of code points.
307                        if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
308                            break;
309                        }
310                    }
311                } else {
312                    // surrogate pair
313                    int supplementary = Character.toCodePoint(c2, c);
314                    if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
315                        break;
316                    }
317                    --limit;
318                }
319                if (0 == limit) {
320                    return 0;
321                }
322            }
323        }
324        return limit + 1;
325    }
326
327    /**
328     * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
329     */
330    private static void set32x64Bits(int[] table, int start, int limit) {
331        assert (64 == table.length);
332        int lead = start >> 6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
333        int trail = start & 0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
334
335        // Set one bit indicating an all-one block.
336        int bits = 1 << lead;
337        if ((start + 1) == limit) { // Single-character shortcut.
338            table[trail] |= bits;
339            return;
340        }
341
342        int limitLead = limit >> 6;
343        int limitTrail = limit & 0x3f;
344
345        if (lead == limitLead) {
346            // Partial vertical bit column.
347            while (trail < limitTrail) {
348                table[trail++] |= bits;
349            }
350        } else {
351            // Partial vertical bit column,
352            // followed by a bit rectangle,
353            // followed by another partial vertical bit column.
354            if (trail > 0) {
355                do {
356                    table[trail++] |= bits;
357                } while (trail < 64);
358                ++lead;
359            }
360            if (lead < limitLead) {
361                bits = ~((1 << lead) - 1);
362                if (limitLead < 0x20) {
363                    bits &= (1 << limitLead) - 1;
364                }
365                for (trail = 0; trail < 64; ++trail) {
366                    table[trail] |= bits;
367                }
368            }
369            // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
370            // In that case, bits=1<<limitLead == 1<<0 == 1
371            // (because Java << uses only the lower 5 bits of the shift operand)
372            // but the bits value is not used because trail<limitTrail is already false.
373            bits = 1 << limitLead;
374            for (trail = 0; trail < limitTrail; ++trail) {
375                table[trail] |= bits;
376            }
377        }
378    }
379
380    private void initBits() {
381        int start, limit;
382        int listIndex = 0;
383
384        // Set latin1Contains[].
385        do {
386            start = list[listIndex++];
387            if (listIndex < listLength) {
388                limit = list[listIndex++];
389            } else {
390                limit = 0x110000;
391            }
392            if (start >= 0x100) {
393                break;
394            }
395            do {
396                latin1Contains[start++] = true;
397            } while (start < limit && start < 0x100);
398        } while (limit <= 0x100);
399
400        // Set table7FF[].
401        while (start < 0x800) {
402            set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
403            if (limit > 0x800) {
404                start = 0x800;
405                break;
406            }
407
408            start = list[listIndex++];
409            if (listIndex < listLength) {
410                limit = list[listIndex++];
411            } else {
412                limit = 0x110000;
413            }
414        }
415
416        // Set bmpBlockBits[].
417        int minStart = 0x800;
418        while (start < 0x10000) {
419            if (limit > 0x10000) {
420                limit = 0x10000;
421            }
422
423            if (start < minStart) {
424                start = minStart;
425            }
426            if (start < limit) { // Else: Another range entirely in a known mixed-value block.
427                if (0 != (start & 0x3f)) {
428                    // Mixed-value block of 64 code points.
429                    start >>= 6;
430                    bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
431                    start = (start + 1) << 6; // Round up to the next block boundary.
432                    minStart = start; // Ignore further ranges in this block.
433                }
434                if (start < limit) {
435                    if (start < (limit & ~0x3f)) {
436                        // Multiple all-ones blocks of 64 code points each.
437                        set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
438                    }
439
440                    if (0 != (limit & 0x3f)) {
441                        // Mixed-value block of 64 code points.
442                        limit >>= 6;
443                        bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
444                        limit = (limit + 1) << 6; // Round up to the next block boundary.
445                        minStart = limit; // Ignore further ranges in this block.
446                    }
447                }
448            }
449
450            if (limit == 0x10000) {
451                break;
452            }
453
454            start = list[listIndex++];
455            if (listIndex < listLength) {
456                limit = list[listIndex++];
457            } else {
458                limit = 0x110000;
459            }
460        }
461    }
462
463
464    /**
465     * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
466     * points in a certain range.
467     *
468     * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
469     * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
470     *
471     * @param c
472     *            a character in a subrange of MIN_VALUE..MAX_VALUE
473     * @param lo
474     *            The lowest index to be returned.
475     * @param hi
476     *            The highest index to be returned.
477     * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
478     */
479    private int findCodePoint(int c, int lo, int hi) {
480        /* Examples:
481                                           findCodePoint(c)
482           set              list[]         c=0 1 3 4 7 8
483           ===              ==============   ===========
484           []               [110000]         0 0 0 0 0 0
485           [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
486           [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
487           [:Any:]          [0, 110000]      1 1 1 1 1 1
488         */
489
490        // Return the smallest i such that c < list[i]. Assume
491        // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
492        if (c < list[lo])
493            return lo;
494        // High runner test. c is often after the last range, so an
495        // initial check for this condition pays off.
496        if (lo >= hi || c >= list[hi - 1])
497            return hi;
498        // invariant: c >= list[lo]
499        // invariant: c < list[hi]
500        for (;;) {
501            int i = (lo + hi) >>> 1;
502            if (i == lo) {
503                break; // Found!
504            } else if (c < list[i]) {
505                hi = i;
506            } else {
507                lo = i;
508            }
509        }
510        return hi;
511    }
512
513    private final boolean containsSlow(int c, int lo, int hi) {
514        return (0 != (findCodePoint(c, lo, hi) & 1));
515    }
516}
517
518