Trie.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
1/*
2 ******************************************************************************
3 * Copyright (C) 1996-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 */
7
8package com.ibm.icu.impl;
9
10import java.nio.ByteBuffer;
11import java.util.Arrays;
12
13import com.ibm.icu.lang.UCharacter;
14import com.ibm.icu.text.UTF16;
15
16/**
17 * <p>A trie is a kind of compressed, serializable table of values
18 * associated with Unicode code points (0..0x10ffff).</p>
19 * <p>This class defines the basic structure of a trie and provides methods
20 * to <b>retrieve the offsets to the actual data</b>.</p>
21 * <p>Data will be the form of an array of basic types, char or int.</p>
22 * <p>The actual data format will have to be specified by the user in the
23 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
24 * <p>This trie implementation is optimized for getting offset while walking
25 * forward through a UTF-16 string.
26 * Therefore, the simplest and fastest access macros are the
27 * fromLead() and fromOffsetTrail() methods.
28 * The fromBMP() method are a little more complicated; they get offsets even
29 * for lead surrogate codepoints, while the fromLead() method get special
30 * "folded" offsets for lead surrogate code units if there is relevant data
31 * associated with them.
32 * From such a folded offsets, an offset needs to be extracted to supply
33 * to the fromOffsetTrail() methods.
34 * To handle such supplementary codepoints, some offset information are kept
35 * in the data.</p>
36 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
37 * that offset from the folded value for the lead surrogate unit.</p>
38 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
39 * com.ibm.icu.impl.IntTrie.</p>
40 * @author synwee
41 * @see com.ibm.icu.impl.CharTrie
42 * @see com.ibm.icu.impl.IntTrie
43 * @since release 2.1, Jan 01 2002
44 */
45public abstract class Trie
46{
47    // public class declaration ----------------------------------------
48
49    /**
50    * Character data in com.ibm.impl.Trie have different user-specified format
51    * for different purposes.
52    * This interface specifies methods to be implemented in order for
53    * com.ibm.impl.Trie, to surrogate offset information encapsulated within
54    * the data.
55    */
56    public static interface DataManipulate
57    {
58        /**
59        * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
60        * data
61        * the index array offset of the indexes for that lead surrogate.
62        * @param value data value for a surrogate from the trie, including the
63        *        folding offset
64        * @return data offset or 0 if there is no data for the lead surrogate
65        */
66        public int getFoldingOffset(int value);
67    }
68
69    // default implementation
70    private static class DefaultGetFoldingOffset implements DataManipulate {
71        public int getFoldingOffset(int value) {
72            return value;
73        }
74    }
75
76    // public methods --------------------------------------------------
77
78    /**
79     * Determines if this trie has a linear latin 1 array
80     * @return true if this trie has a linear latin 1 array, false otherwise
81     */
82    public final boolean isLatin1Linear()
83    {
84        return m_isLatin1Linear_;
85    }
86
87    /**
88     * Checks if the argument Trie has the same data as this Trie.
89     * Attributes are checked but not the index data.
90     * @param other Trie to check
91     * @return true if the argument Trie has the same data as this Trie, false
92     *         otherwise
93     */
94    ///CLOVER:OFF
95    public boolean equals(Object other)
96    {
97        if (other == this) {
98            return true;
99        }
100        if (!(other instanceof Trie)) {
101            return false;
102        }
103        Trie othertrie = (Trie)other;
104        return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
105               && m_options_ == othertrie.m_options_
106               && m_dataLength_ == othertrie.m_dataLength_
107               && Arrays.equals(m_index_, othertrie.m_index_);
108    }
109
110    public int hashCode() {
111        assert false : "hashCode not designed";
112        return 42;
113    }
114    ///CLOVER:ON
115
116    /**
117     * Gets the serialized data file size of the Trie. This is used during
118     * trie data reading for size checking purposes.
119     * @return size size of serialized trie data file in terms of the number
120     *              of bytes
121     */
122    public int getSerializedDataSize()
123    {
124        // includes signature, option, dataoffset and datalength output
125        int result = (4 << 2);
126        result += (m_dataOffset_ << 1);
127        if (isCharTrie()) {
128            result += (m_dataLength_ << 1);
129        }
130        else if (isIntTrie()) {
131            result += (m_dataLength_ << 2);
132        }
133        return result;
134    }
135
136    // protected constructor -------------------------------------------
137
138    /**
139     * Trie constructor for CharTrie use.
140     * @param bytes data of an ICU data file, containing the trie
141     * @param dataManipulate object containing the information to parse the
142     *                       trie data
143     */
144    protected Trie(ByteBuffer bytes, DataManipulate  dataManipulate)
145    {
146        // Magic number to authenticate the data.
147        int signature = bytes.getInt();
148        m_options_    = bytes.getInt();
149
150        if (!checkHeader(signature)) {
151            throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
152        }
153
154        if(dataManipulate != null) {
155            m_dataManipulate_ = dataManipulate;
156        } else {
157            m_dataManipulate_ = new DefaultGetFoldingOffset();
158        }
159        m_isLatin1Linear_ = (m_options_ &
160                             HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
161        m_dataOffset_     = bytes.getInt();
162        m_dataLength_     = bytes.getInt();
163        unserialize(bytes);
164    }
165
166    /**
167    * Trie constructor
168    * @param index array to be used for index
169    * @param options used by the trie
170    * @param dataManipulate object containing the information to parse the
171    *                       trie data
172    */
173    protected Trie(char index[], int options, DataManipulate dataManipulate)
174    {
175        m_options_ = options;
176        if(dataManipulate != null) {
177            m_dataManipulate_ = dataManipulate;
178        } else {
179            m_dataManipulate_ = new DefaultGetFoldingOffset();
180        }
181        m_isLatin1Linear_ = (m_options_ &
182                             HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
183        m_index_ = index;
184        m_dataOffset_ = m_index_.length;
185    }
186
187
188    // protected data members ------------------------------------------
189
190    /**
191    * Lead surrogate code points' index displacement in the index array.
192    * 0x10000-0xd800=0x2800
193    * 0x2800 >> INDEX_STAGE_1_SHIFT_
194    */
195    protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
196    /**
197    * Shift size for shifting right the input index. 1..9
198    */
199    protected static final int INDEX_STAGE_1_SHIFT_ = 5;
200    /**
201    * Shift size for shifting left the index array values.
202    * Increases possible data size with 16-bit index values at the cost
203    * of compactability.
204    * This requires blocks of stage 2 data to be aligned by
205    * DATA_GRANULARITY.
206    * 0..INDEX_STAGE_1_SHIFT
207    */
208    protected static final int INDEX_STAGE_2_SHIFT_ = 2;
209    /**
210     * Number of data values in a stage 2 (data array) block.
211     */
212    protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
213    /**
214    * Mask for getting the lower bits from the input index.
215    * DATA_BLOCK_LENGTH - 1.
216    */
217    protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
218    /** Number of bits of a trail surrogate that are used in index table lookups. */
219    protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
220    /**
221     * Number of index (stage 1) entries per lead surrogate.
222     * Same as number of index entries for 1024 trail surrogates,
223     * ==0x400>>INDEX_STAGE_1_SHIFT_
224     */
225    protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
226    /** Length of the BMP portion of the index (stage 1) array. */
227    protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
228    /**
229    * Surrogate mask to use when shifting offset to retrieve supplementary
230    * values
231    */
232    protected static final int SURROGATE_MASK_ = 0x3FF;
233    /**
234    * Index or UTF16 characters
235    */
236    protected char m_index_[];
237    /**
238    * Internal TrieValue which handles the parsing of the data value.
239    * This class is to be implemented by the user
240    */
241    protected DataManipulate m_dataManipulate_;
242    /**
243    * Start index of the data portion of the trie. CharTrie combines
244    * index and data into a char array, so this is used to indicate the
245    * initial offset to the data portion.
246    * Note this index always points to the initial value.
247    */
248    protected int m_dataOffset_;
249    /**
250    * Length of the data array
251    */
252    protected int m_dataLength_;
253
254    // protected methods -----------------------------------------------
255
256    /**
257    * Gets the offset to the data which the surrogate pair points to.
258    * @param lead lead surrogate
259    * @param trail trailing surrogate
260    * @return offset to data
261    */
262    protected abstract int getSurrogateOffset(char lead, char trail);
263
264    /**
265    * Gets the value at the argument index
266    * @param index value at index will be retrieved
267    * @return 32 bit value
268    */
269    protected abstract int getValue(int index);
270
271    /**
272    * Gets the default initial value
273    * @return 32 bit value
274    */
275    protected abstract int getInitialValue();
276
277    /**
278    * Gets the offset to the data which the index ch after variable offset
279    * points to.
280    * Note for locating a non-supplementary character data offset, calling
281    * <p>
282    * getRawOffset(0, ch);
283    * </p>
284    * will do. Otherwise if it is a supplementary character formed by
285    * surrogates lead and trail. Then we would have to call getRawOffset()
286    * with getFoldingIndexOffset(). See getSurrogateOffset().
287    * @param offset index offset which ch is to start from
288    * @param ch index to be used after offset
289    * @return offset to the data
290    */
291    protected final int getRawOffset(int offset, char ch)
292    {
293        return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
294                << INDEX_STAGE_2_SHIFT_)
295                + (ch & INDEX_STAGE_3_MASK_);
296    }
297
298    /**
299    * Gets the offset to data which the BMP character points to
300    * Treats a lead surrogate as a normal code point.
301    * @param ch BMP character
302    * @return offset to data
303    */
304    protected final int getBMPOffset(char ch)
305    {
306        return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
307                && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
308                ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
309                : getRawOffset(0, ch);
310                // using a getRawOffset(ch) makes no diff
311    }
312
313    /**
314    * Gets the offset to the data which this lead surrogate character points
315    * to.
316    * Data at the returned offset may contain folding offset information for
317    * the next trailing surrogate character.
318    * @param ch lead surrogate character
319    * @return offset to data
320    */
321    protected final int getLeadOffset(char ch)
322    {
323       return getRawOffset(0, ch);
324    }
325
326    /**
327    * Internal trie getter from a code point.
328    * Could be faster(?) but longer with
329    *   if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
330    * Gets the offset to data which the codepoint points to
331    * @param ch codepoint
332    * @return offset to data
333    */
334    protected final int getCodePointOffset(int ch)
335    {
336        // if ((ch >> 16) == 0) slower
337        if (ch < 0) {
338            return -1;
339        } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
340            // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
341            return getRawOffset(0, (char)ch);
342        } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
343            // BMP codepoint
344            return getBMPOffset((char)ch);
345        } else if (ch <= UCharacter.MAX_VALUE) {
346            // look at the construction of supplementary characters
347            // trail forms the ends of it.
348            return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
349                                      (char)(ch & SURROGATE_MASK_));
350        } else {
351            // return -1 if there is an error, in this case we return
352            return -1;
353        }
354    }
355
356    /**
357     * <p>Parses the byte buffer and creates the trie index with it.</p>
358     * <p>The position of the input ByteBuffer must be right after the trie header.</p>
359     * <p>This is overwritten by the child classes.
360     * @param bytes buffer containing trie data
361     */
362    protected void unserialize(ByteBuffer bytes)
363    {
364        m_index_ = new char[m_dataOffset_];
365        for (int i = 0; i < m_dataOffset_; i ++) {
366             m_index_[i] = bytes.getChar();
367        }
368    }
369
370    /**
371    * Determines if this is a 32 bit trie
372    * @return true if options specifies this is a 32 bit trie
373    */
374    protected final boolean isIntTrie()
375    {
376        return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
377    }
378
379    /**
380    * Determines if this is a 16 bit trie
381    * @return true if this is a 16 bit trie
382    */
383    protected final boolean isCharTrie()
384    {
385        return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
386    }
387
388    // private data members --------------------------------------------
389
390    //  struct UTrieHeader {
391    //      int32_t   signature;
392    //      int32_t   options  (a bit field)
393    //      int32_t   indexLength
394    //      int32_t   dataLength
395
396    /**
397    * Size of Trie header in bytes
398    */
399    protected static final int HEADER_LENGTH_ = 4 * 4;
400    /**
401    * Latin 1 option mask
402    */
403    protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
404    /**
405    * Constant number to authenticate the byte block
406    */
407    protected static final int HEADER_SIGNATURE_ = 0x54726965;
408    /**
409    * Header option formatting
410    */
411    private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
412    protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
413    protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
414
415    /**
416    * Flag indicator for Latin quick access data block
417    */
418    private boolean m_isLatin1Linear_;
419
420    /**
421    * <p>Trie options field.</p>
422    * <p>options bit field:<br>
423    * 9  1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
424    * 8  0 = 16-bit data, 1=32-bit data<br>
425    * 7..4  INDEX_STAGE_1_SHIFT   // 0..INDEX_STAGE_2_SHIFT<br>
426    * 3..0  INDEX_STAGE_2_SHIFT   // 1..9<br>
427    */
428    private int m_options_;
429
430    // private methods ---------------------------------------------------
431
432    /**
433    * Authenticates raw data header.
434    * Checking the header information, signature and options.
435    * @param signature This contains the options and type of a Trie
436    * @return true if the header is authenticated valid
437    */
438    private final boolean checkHeader(int signature)
439    {
440        // check the signature
441        // Trie in big-endian US-ASCII (0x54726965).
442        // Magic number to authenticate the data.
443        if (signature != HEADER_SIGNATURE_) {
444            return false;
445        }
446
447        if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
448                                                    INDEX_STAGE_1_SHIFT_ ||
449            ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
450                                                HEADER_OPTIONS_SHIFT_MASK_)
451                                                 != INDEX_STAGE_2_SHIFT_) {
452            return false;
453        }
454        return true;
455    }
456}
457