1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2009-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10package com.ibm.icu.impl;
11
12import java.io.DataOutputStream;
13import java.io.IOException;
14import java.io.OutputStream;
15import java.nio.ByteBuffer;
16
17
18/**
19 * @author aheninger
20 *
21 * A read-only Trie2, holding 16 bit data values.
22 *
23 * A Trie2 is a highly optimized data structure for mapping from Unicode
24 * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
25 *
26 * See class Trie2 for descriptions of the API for accessing the contents of a trie.
27 *
28 * The fundamental data access methods are declared final in this class, with
29 * the intent that applications might gain a little extra performance, when compared
30 * with calling the same methods via the abstract UTrie2 base class.
31 */
32public final class Trie2_16 extends Trie2 {
33
34
35    /**
36     *  Internal constructor, not for general use.
37     */
38    Trie2_16() {
39    }
40
41
42    /**
43     * Create a Trie2 from its serialized form.  Inverse of utrie2_serialize().
44     * The serialized format is identical between ICU4C and ICU4J, so this function
45     * will work with serialized Trie2s from either.
46     *
47     * The serialized Trie2 in the bytes may be in either little or big endian byte order.
48     * This allows using serialized Tries from ICU4C without needing to consider the
49     * byte order of the system that created them.
50     *
51     * @param bytes a byte buffer to the serialized form of a UTrie2.
52     * @return An unserialized Trie2_16, ready for use.
53     * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2.
54     * @throws IOException if a read error occurs in the buffer.
55     * @throws ClassCastException if the bytes contain a serialized Trie2_32
56     */
57    public static Trie2_16  createFromSerialized(ByteBuffer bytes) throws IOException {
58        return (Trie2_16) Trie2.createFromSerialized(bytes);
59    }
60
61    /**
62     * Get the value for a code point as stored in the Trie2.
63     *
64     * @param codePoint the code point
65     * @return the value
66     */
67    @Override
68    public final int get(int codePoint) {
69        int value;
70        int ix;
71
72        if (codePoint >= 0) {
73            if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) {
74                // Ordinary BMP code point, excluding leading surrogates.
75                // BMP uses a single level lookup.  BMP index starts at offset 0 in the Trie2 index.
76                // 16 bit data is stored in the index array itself.
77                ix = index[codePoint >> UTRIE2_SHIFT_2];
78                ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
79                value = index[ix];
80                return value;
81            }
82            if (codePoint <= 0xffff) {
83                // Lead Surrogate Code Point.  A Separate index section is stored for
84                // lead surrogate code units and code points.
85                //   The main index has the code unit data.
86                //   For this function, we need the code point data.
87                // Note: this expression could be refactored for slightly improved efficiency, but
88                //       surrogate code points will be so rare in practice that it's not worth it.
89                ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
90                ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
91                value = index[ix];
92                return value;
93            }
94            if (codePoint < highStart) {
95                // Supplemental code point, use two-level lookup.
96                ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1);
97                ix = index[ix];
98                ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK;
99                ix = index[ix];
100                ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK);
101                value = index[ix];
102                return value;
103            }
104            if (codePoint <= 0x10ffff) {
105                value = index[highValueIndex];
106                return value;
107            }
108        }
109
110        // Fall through.  The code point is outside of the legal range of 0..0x10ffff.
111        return errorValue;
112    }
113
114
115    /**
116     * Get a Trie2 value for a UTF-16 code unit.
117     *
118     * This function returns the same value as get() if the input
119     * character is outside of the lead surrogate range
120     *
121     * There are two values stored in a Trie2 for inputs in the lead
122     * surrogate range.  This function returns the alternate value,
123     * while Trie2.get() returns the main value.
124     *
125     * @param codeUnit a 16 bit code unit or lead surrogate value.
126     * @return the value
127     */
128    @Override
129    public int getFromU16SingleLead(char codeUnit) {
130        int value;
131        int ix;
132
133        // Because the input is a 16 bit char, we can skip the tests for it being in
134        // the BMP range.  It is.
135        ix = index[codeUnit >> UTRIE2_SHIFT_2];
136        ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK);
137        value = index[ix];
138        return value;
139    }
140
141
142    /**
143     * Serialize a Trie2_16 onto an OutputStream.
144     *
145     * A Trie2 can be serialized multiple times.
146     * The serialized data is compatible with ICU4C UTrie2 serialization.
147     * Trie2 serialization is unrelated to Java object serialization.
148     *
149     * @param os the stream to which the serialized Trie2 data will be written.
150     * @return the number of bytes written.
151     * @throw IOException on an error writing to the OutputStream.
152     */
153    public int serialize(OutputStream os) throws IOException {
154        DataOutputStream dos = new DataOutputStream(os);
155        int  bytesWritten = 0;
156
157        bytesWritten += serializeHeader(dos);
158        for (int i=0; i<dataLength; i++) {
159            dos.writeChar(index[data16+i]);
160        }
161        bytesWritten += dataLength*2;
162        return bytesWritten;
163    }
164
165    /**
166     * @return the number of bytes of the serialized trie
167     */
168    public int getSerializedLength() {
169        return 16+(header.indexLength+dataLength)*2;
170    }
171
172    /**
173     * Given a starting code point, find the last in a range of code points,
174     * all with the same value.
175     *
176     * This function is part of the implementation of iterating over the
177     * Trie2's contents.
178     * @param startingCP The code point at which to begin looking.
179     * @return The last code point with the same value as the starting code point.
180     */
181    @Override
182    int rangeEnd(int startingCP, int limit, int value) {
183        int   cp = startingCP;
184        int   block = 0;
185        int   index2Block = 0;
186
187        // Loop runs once for each of
188        //   - a partial data block
189        //   - a reference to the null (default) data block.
190        //   - a reference to the index2 null block
191
192      outerLoop:
193        for (;;) {
194            if (cp >= limit) {
195                break;
196            }
197            if (cp < 0x0d800 || (cp > 0x0dbff && cp <= 0x0ffff)) {
198                // Ordinary BMP code point, excluding leading surrogates.
199                // BMP uses a single level lookup.  BMP index starts at offset 0 in the Trie2 index.
200                // 16 bit data is stored in the index array itself.
201                index2Block = 0;
202                block       = index[cp >> UTRIE2_SHIFT_2] << UTRIE2_INDEX_SHIFT;
203            } else if (cp < 0xffff) {
204                // Lead Surrogate Code Point, 0xd800 <= cp < 0xdc00
205                index2Block = UTRIE2_LSCP_INDEX_2_OFFSET;
206                block       = index[index2Block + ((cp - 0xd800) >> UTRIE2_SHIFT_2)] << UTRIE2_INDEX_SHIFT;
207            } else if (cp < highStart) {
208                // Supplemental code point, use two-level lookup.
209                int ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (cp >> UTRIE2_SHIFT_1);
210                index2Block = index[ix];
211                block = index[index2Block + ((cp >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK)] << UTRIE2_INDEX_SHIFT;
212            } else  {
213                // Code point above highStart.
214                if (value == index[highValueIndex]) {
215                    cp = limit;
216                }
217                break;
218            }
219
220            if (index2Block == index2NullOffset) {
221                if (value != initialValue) {
222                    break;
223                }
224                cp += UTRIE2_CP_PER_INDEX_1_ENTRY;
225            } else if (block == dataNullOffset) {
226                // The block at dataNullOffset has all values == initialValue.
227                // Because Trie2 iteration always proceeds in ascending order, we will always
228                //   encounter a null block at its beginning, and can skip over
229                //   a number of code points equal to the length of the block.
230                if (value != initialValue) {
231                    break;
232                }
233                cp += UTRIE2_DATA_BLOCK_LENGTH;
234            } else {
235                // Current position refers to an ordinary data block.
236                // Walk over the data entries, checking the values.
237                int startIx = block + (cp & UTRIE2_DATA_MASK);
238                int limitIx = block + UTRIE2_DATA_BLOCK_LENGTH;
239                for (int ix = startIx; ix<limitIx; ix++) {
240                    if (index[ix] != value) {
241                        // We came to an entry with a different value.
242                        //   We are done.
243                        cp += (ix - startIx);
244                        break outerLoop;
245                    }
246                }
247                // The ordinary data block contained our value until its end.
248                //  Advance the current code point, and continue the outerloop.
249                cp += limitIx - startIx;
250            }
251        }
252        if (cp > limit) {
253            cp = limit;
254        }
255
256        return cp - 1;
257    }
258}
259