1/* GENERATED SOURCE. DO NOT MODIFY. */
2/*
3 ******************************************************************************
4 * Copyright (C) 1996-2015, International Business Machines Corporation and
5 * others. All Rights Reserved.
6 ******************************************************************************
7 */
8
9package android.icu.impl;
10
11import java.nio.ByteBuffer;
12import java.util.Arrays;
13
14import android.icu.lang.UCharacter;
15import android.icu.text.UTF16;
16
17/**
18 * <p>A trie is a kind of compressed, serializable table of values
19 * associated with Unicode code points (0..0x10ffff).</p>
20 * <p>This class defines the basic structure of a trie and provides methods
21 * to <b>retrieve the offsets to the actual data</b>.</p>
22 * <p>Data will be the form of an array of basic types, char or int.</p>
23 * <p>The actual data format will have to be specified by the user in the
24 * inner static interface android.icu.impl.Trie.DataManipulate.</p>
25 * <p>This trie implementation is optimized for getting offset while walking
26 * forward through a UTF-16 string.
27 * Therefore, the simplest and fastest access macros are the
28 * fromLead() and fromOffsetTrail() methods.
29 * The fromBMP() method are a little more complicated; they get offsets even
30 * for lead surrogate codepoints, while the fromLead() method get special
31 * "folded" offsets for lead surrogate code units if there is relevant data
32 * associated with them.
33 * From such a folded offsets, an offset needs to be extracted to supply
34 * to the fromOffsetTrail() methods.
35 * To handle such supplementary codepoints, some offset information are kept
36 * in the data.</p>
37 * <p>Methods in android.icu.impl.Trie.DataManipulate are called to retrieve
38 * that offset from the folded value for the lead surrogate unit.</p>
39 * <p>For examples of use, see android.icu.impl.CharTrie or
40 * android.icu.impl.IntTrie.</p>
41 * @author synwee
42 * @see android.icu.impl.CharTrie
43 * @see android.icu.impl.IntTrie
44 * @hide Only a subset of ICU is exposed in Android
45 */
46public abstract class Trie
47{
48    // public class declaration ----------------------------------------
49
50    /**
51    * Character data in com.ibm.impl.Trie have different user-specified format
52    * for different purposes.
53    * This interface specifies methods to be implemented in order for
54    * com.ibm.impl.Trie, to surrogate offset information encapsulated within
55    * the data.
56    */
57    public static interface DataManipulate
58    {
59        /**
60        * Called by android.icu.impl.Trie to extract from a lead surrogate's
61        * data
62        * the index array offset of the indexes for that lead surrogate.
63        * @param value data value for a surrogate from the trie, including the
64        *        folding offset
65        * @return data offset or 0 if there is no data for the lead surrogate
66        */
67        public int getFoldingOffset(int value);
68    }
69
70    // default implementation
71    private static class DefaultGetFoldingOffset implements DataManipulate {
72        public int getFoldingOffset(int value) {
73            return value;
74        }
75    }
76
77    // public methods --------------------------------------------------
78
79    /**
80     * Determines if this trie has a linear latin 1 array
81     * @return true if this trie has a linear latin 1 array, false otherwise
82     */
83    public final boolean isLatin1Linear()
84    {
85        return m_isLatin1Linear_;
86    }
87
88    /**
89     * Checks if the argument Trie has the same data as this Trie.
90     * Attributes are checked but not the index data.
91     * @param other Trie to check
92     * @return true if the argument Trie has the same data as this Trie, false
93     *         otherwise
94     */
95    ///CLOVER:OFF
96    public boolean equals(Object other)
97    {
98        if (other == this) {
99            return true;
100        }
101        if (!(other instanceof Trie)) {
102            return false;
103        }
104        Trie othertrie = (Trie)other;
105        return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
106               && m_options_ == othertrie.m_options_
107               && m_dataLength_ == othertrie.m_dataLength_
108               && Arrays.equals(m_index_, othertrie.m_index_);
109    }
110
111    public int hashCode() {
112        assert false : "hashCode not designed";
113        return 42;
114    }
115    ///CLOVER:ON
116
117    /**
118     * Gets the serialized data file size of the Trie. This is used during
119     * trie data reading for size checking purposes.
120     * @return size size of serialized trie data file in terms of the number
121     *              of bytes
122     */
123    public int getSerializedDataSize()
124    {
125        // includes signature, option, dataoffset and datalength output
126        int result = (4 << 2);
127        result += (m_dataOffset_ << 1);
128        if (isCharTrie()) {
129            result += (m_dataLength_ << 1);
130        }
131        else if (isIntTrie()) {
132            result += (m_dataLength_ << 2);
133        }
134        return result;
135    }
136
137    // protected constructor -------------------------------------------
138
139    /**
140     * Trie constructor for CharTrie use.
141     * @param bytes data of an ICU data file, containing the trie
142     * @param dataManipulate object containing the information to parse the
143     *                       trie data
144     */
145    protected Trie(ByteBuffer bytes, DataManipulate  dataManipulate)
146    {
147        // Magic number to authenticate the data.
148        int signature = bytes.getInt();
149        m_options_    = bytes.getInt();
150
151        if (!checkHeader(signature)) {
152            throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
153        }
154
155        if(dataManipulate != null) {
156            m_dataManipulate_ = dataManipulate;
157        } else {
158            m_dataManipulate_ = new DefaultGetFoldingOffset();
159        }
160        m_isLatin1Linear_ = (m_options_ &
161                             HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
162        m_dataOffset_     = bytes.getInt();
163        m_dataLength_     = bytes.getInt();
164        unserialize(bytes);
165    }
166
167    /**
168    * Trie constructor
169    * @param index array to be used for index
170    * @param options used by the trie
171    * @param dataManipulate object containing the information to parse the
172    *                       trie data
173    */
174    protected Trie(char index[], int options, DataManipulate dataManipulate)
175    {
176        m_options_ = options;
177        if(dataManipulate != null) {
178            m_dataManipulate_ = dataManipulate;
179        } else {
180            m_dataManipulate_ = new DefaultGetFoldingOffset();
181        }
182        m_isLatin1Linear_ = (m_options_ &
183                             HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
184        m_index_ = index;
185        m_dataOffset_ = m_index_.length;
186    }
187
188
189    // protected data members ------------------------------------------
190
191    /**
192    * Lead surrogate code points' index displacement in the index array.
193    * 0x10000-0xd800=0x2800
194    * 0x2800 >> INDEX_STAGE_1_SHIFT_
195    */
196    protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
197    /**
198    * Shift size for shifting right the input index. 1..9
199    */
200    protected static final int INDEX_STAGE_1_SHIFT_ = 5;
201    /**
202    * Shift size for shifting left the index array values.
203    * Increases possible data size with 16-bit index values at the cost
204    * of compactability.
205    * This requires blocks of stage 2 data to be aligned by
206    * DATA_GRANULARITY.
207    * 0..INDEX_STAGE_1_SHIFT
208    */
209    protected static final int INDEX_STAGE_2_SHIFT_ = 2;
210    /**
211     * Number of data values in a stage 2 (data array) block.
212     */
213    protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
214    /**
215    * Mask for getting the lower bits from the input index.
216    * DATA_BLOCK_LENGTH - 1.
217    */
218    protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
219    /** Number of bits of a trail surrogate that are used in index table lookups. */
220    protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
221    /**
222     * Number of index (stage 1) entries per lead surrogate.
223     * Same as number of index entries for 1024 trail surrogates,
224     * ==0x400>>INDEX_STAGE_1_SHIFT_
225     */
226    protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
227    /** Length of the BMP portion of the index (stage 1) array. */
228    protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
229    /**
230    * Surrogate mask to use when shifting offset to retrieve supplementary
231    * values
232    */
233    protected static final int SURROGATE_MASK_ = 0x3FF;
234    /**
235    * Index or UTF16 characters
236    */
237    protected char m_index_[];
238    /**
239    * Internal TrieValue which handles the parsing of the data value.
240    * This class is to be implemented by the user
241    */
242    protected DataManipulate m_dataManipulate_;
243    /**
244    * Start index of the data portion of the trie. CharTrie combines
245    * index and data into a char array, so this is used to indicate the
246    * initial offset to the data portion.
247    * Note this index always points to the initial value.
248    */
249    protected int m_dataOffset_;
250    /**
251    * Length of the data array
252    */
253    protected int m_dataLength_;
254
255    // protected methods -----------------------------------------------
256
257    /**
258    * Gets the offset to the data which the surrogate pair points to.
259    * @param lead lead surrogate
260    * @param trail trailing surrogate
261    * @return offset to data
262    */
263    protected abstract int getSurrogateOffset(char lead, char trail);
264
265    /**
266    * Gets the value at the argument index
267    * @param index value at index will be retrieved
268    * @return 32 bit value
269    */
270    protected abstract int getValue(int index);
271
272    /**
273    * Gets the default initial value
274    * @return 32 bit value
275    */
276    protected abstract int getInitialValue();
277
278    /**
279    * Gets the offset to the data which the index ch after variable offset
280    * points to.
281    * Note for locating a non-supplementary character data offset, calling
282    * <p>
283    * getRawOffset(0, ch);
284    * </p>
285    * will do. Otherwise if it is a supplementary character formed by
286    * surrogates lead and trail. Then we would have to call getRawOffset()
287    * with getFoldingIndexOffset(). See getSurrogateOffset().
288    * @param offset index offset which ch is to start from
289    * @param ch index to be used after offset
290    * @return offset to the data
291    */
292    protected final int getRawOffset(int offset, char ch)
293    {
294        return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
295                << INDEX_STAGE_2_SHIFT_)
296                + (ch & INDEX_STAGE_3_MASK_);
297    }
298
299    /**
300    * Gets the offset to data which the BMP character points to
301    * Treats a lead surrogate as a normal code point.
302    * @param ch BMP character
303    * @return offset to data
304    */
305    protected final int getBMPOffset(char ch)
306    {
307        return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
308                && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
309                ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
310                : getRawOffset(0, ch);
311                // using a getRawOffset(ch) makes no diff
312    }
313
314    /**
315    * Gets the offset to the data which this lead surrogate character points
316    * to.
317    * Data at the returned offset may contain folding offset information for
318    * the next trailing surrogate character.
319    * @param ch lead surrogate character
320    * @return offset to data
321    */
322    protected final int getLeadOffset(char ch)
323    {
324       return getRawOffset(0, ch);
325    }
326
327    /**
328    * Internal trie getter from a code point.
329    * Could be faster(?) but longer with
330    *   if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
331    * Gets the offset to data which the codepoint points to
332    * @param ch codepoint
333    * @return offset to data
334    */
335    protected final int getCodePointOffset(int ch)
336    {
337        // if ((ch >> 16) == 0) slower
338        if (ch < 0) {
339            return -1;
340        } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
341            // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
342            return getRawOffset(0, (char)ch);
343        } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
344            // BMP codepoint
345            return getBMPOffset((char)ch);
346        } else if (ch <= UCharacter.MAX_VALUE) {
347            // look at the construction of supplementary characters
348            // trail forms the ends of it.
349            return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
350                                      (char)(ch & SURROGATE_MASK_));
351        } else {
352            // return -1 if there is an error, in this case we return
353            return -1;
354        }
355    }
356
357    /**
358     * <p>Parses the byte buffer and creates the trie index with it.</p>
359     * <p>The position of the input ByteBuffer must be right after the trie header.</p>
360     * <p>This is overwritten by the child classes.
361     * @param bytes buffer containing trie data
362     */
363    protected void unserialize(ByteBuffer bytes)
364    {
365        m_index_ = ICUBinary.getChars(bytes, m_dataOffset_, 0);
366    }
367
368    /**
369    * Determines if this is a 32 bit trie
370    * @return true if options specifies this is a 32 bit trie
371    */
372    protected final boolean isIntTrie()
373    {
374        return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
375    }
376
377    /**
378    * Determines if this is a 16 bit trie
379    * @return true if this is a 16 bit trie
380    */
381    protected final boolean isCharTrie()
382    {
383        return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
384    }
385
386    // private data members --------------------------------------------
387
388    //  struct UTrieHeader {
389    //      int32_t   signature;
390    //      int32_t   options  (a bit field)
391    //      int32_t   indexLength
392    //      int32_t   dataLength
393
394    /**
395    * Size of Trie header in bytes
396    */
397    protected static final int HEADER_LENGTH_ = 4 * 4;
398    /**
399    * Latin 1 option mask
400    */
401    protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
402    /**
403    * Constant number to authenticate the byte block
404    */
405    protected static final int HEADER_SIGNATURE_ = 0x54726965;
406    /**
407    * Header option formatting
408    */
409    private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
410    protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
411    protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
412
413    /**
414    * Flag indicator for Latin quick access data block
415    */
416    private boolean m_isLatin1Linear_;
417
418    /**
419    * <p>Trie options field.</p>
420    * <p>options bit field:<br>
421    * 9  1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
422    * 8  0 = 16-bit data, 1=32-bit data<br>
423    * 7..4  INDEX_STAGE_1_SHIFT   // 0..INDEX_STAGE_2_SHIFT<br>
424    * 3..0  INDEX_STAGE_2_SHIFT   // 1..9<br>
425    */
426    private int m_options_;
427
428    // private methods ---------------------------------------------------
429
430    /**
431    * Authenticates raw data header.
432    * Checking the header information, signature and options.
433    * @param signature This contains the options and type of a Trie
434    * @return true if the header is authenticated valid
435    */
436    private final boolean checkHeader(int signature)
437    {
438        // check the signature
439        // Trie in big-endian US-ASCII (0x54726965).
440        // Magic number to authenticate the data.
441        if (signature != HEADER_SIGNATURE_) {
442            return false;
443        }
444
445        if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
446                                                    INDEX_STAGE_1_SHIFT_ ||
447            ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
448                                                HEADER_OPTIONS_SHIFT_MASK_)
449                                                 != INDEX_STAGE_2_SHIFT_) {
450            return false;
451        }
452        return true;
453    }
454}
455