1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4******************************************************************************
5* Copyright (C) 1996-2010, International Business Machines Corporation and   *
6* others. All Rights Reserved.                                               *
7******************************************************************************
8*/
9
10package com.ibm.icu.impl;
11
12import java.util.Arrays;
13
14import com.ibm.icu.lang.UCharacter;
15
16/**
17 * Builder class to manipulate and generate a trie.
18 * This is useful for ICU data in primitive types.
19 * Provides a compact way to store information that is indexed by Unicode
20 * values, such as character properties, types, keyboard values, etc. This is
21 * very useful when you have a block of Unicode data that contains significant
22 * values while the rest of the Unicode data is unused in the application or
23 * when you have a lot of redundance, such as where all 21,000 Han ideographs
24 * have the same value.  However, lookup is much faster than a hash table.
25 * A trie of any primitive data type serves two purposes:
26 * <UL type = round>
27 *     <LI>Fast access of the indexed values.
28 *     <LI>Smaller memory footprint.
29 * </UL>
30 * This is a direct port from the ICU4C version
31 * @author             Syn Wee Quek
32 */
33public class TrieBuilder
34{
35    // public data member ----------------------------------------------
36
37    /**
38     * Number of data values in a stage 2 (data array) block. 2, 4, 8, ..,
39     * 0x200
40     */
41    public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_;
42
43    // public class declaration ----------------------------------------
44
45    /**
46     * Character data in com.ibm.impl.Trie have different user-specified format
47     * for different purposes.
48     * This interface specifies methods to be implemented in order for
49     * com.ibm.impl.Trie, to surrogate offset information encapsulated within
50     * the data.
51     */
52    public static interface DataManipulate
53    {
54        /**
55         * Build-time trie callback function, used with serialize().
56         * This function calculates a lead surrogate's value including a
57         * folding offset from the 1024 supplementary code points
58         * [start..start+1024[ .
59         * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
60         * The folding offset is provided by the caller.
61         * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT
62         * with n=0..1023.
63         * Instead of the offset itself, n can be stored in 10 bits - or fewer
64         * if it can be assumed that few lead surrogates have associated data.
65         * The returned value must be
66         *  - not zero if and only if there is relevant data for the
67         *                        corresponding 1024 supplementary code points
68         *  - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(...,
69         *                                                    offset))==offset
70         * @return a folded value, or 0 if there is no relevant data for the
71         *         lead surrogate.
72         */
73        public int getFoldedValue(int start, int offset);
74    }
75
76    // public methods ----------------------------------------------------
77
78    /**
79     * Checks if the character belongs to a zero block in the trie
80     * @param ch codepoint which data is to be retrieved
81     * @return true if ch is in the zero block
82     */
83    public boolean isInZeroBlock(int ch)
84    {
85        // valid, uncompacted trie and valid c?
86        if (m_isCompacted_ || ch > UCharacter.MAX_VALUE
87            || ch < UCharacter.MIN_VALUE) {
88            return true;
89        }
90
91        return m_index_[ch >> SHIFT_] == 0;
92    }
93
94    // package private method -----------------------------------------------
95
96    // protected data member -----------------------------------------------
97
98    /**
99     * Index values at build-time are 32 bits wide for easier processing.
100     * Bit 31 is set if the data block is used by multiple index values
101     * (from setRange()).
102     */
103    protected int m_index_[];
104    protected int m_indexLength_;
105    protected int m_dataCapacity_;
106    protected int m_dataLength_;
107    protected boolean m_isLatin1Linear_;
108    protected boolean m_isCompacted_;
109    /**
110     * Map of adjusted indexes, used in utrie_compact().
111     * Maps from original indexes to new ones.
112     */
113    protected int m_map_[];
114
115    /**
116     * Shift size for shifting right the input index. 1..9
117     */
118    protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_;
119    /**
120     * Length of the index (stage 1) array before folding.
121     * Maximum number of Unicode code points (0x110000) shifted right by
122     * SHIFT.
123     */
124    protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_);
125    /**
126     * Length of the BMP portion of the index (stage 1) array.
127     */
128    protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_;
129    /**
130     * Number of index (stage 1) entries per lead surrogate.
131     * Same as number of indexe entries for 1024 trail surrogates,
132     * ==0x400>>UTRIE_SHIFT
133     * 10 - SHIFT == Number of bits of a trail surrogate that are used in
134     *               index table lookups.
135     */
136    protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_);
137    /**
138     * Mask for getting the lower bits from the input index.
139     * DATA_BLOCK_LENGTH - 1.
140     */
141    protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_;
142    /**
143     * Shift size for shifting left the index array values.
144     * Increases possible data size with 16-bit index values at the cost
145     * of compactability.
146     * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
147     * 0..UTRIE_SHIFT
148     */
149    protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_;
150    /**
151     * Maximum length of the runtime data (stage 2) array.
152     * Limited by 16-bit index values that are left-shifted by INDEX_SHIFT_.
153     */
154    protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_);
155    /**
156     * Shifting to position the index value in options
157     */
158    protected static final int OPTIONS_INDEX_SHIFT_ = 4;
159    /**
160     * If set, then the data (stage 2) array is 32 bits wide.
161     */
162    protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100;
163    /**
164     * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data
165     * (stage 2) array as a simple, linear array at data + DATA_BLOCK_LENGTH.
166     */
167    protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200;
168    /**
169     * The alignment size of a stage 2 data block. Also the granularity for
170     * compaction.
171     */
172    protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_;
173
174    // protected constructor ----------------------------------------------
175
176    protected TrieBuilder()
177    {
178        m_index_ = new int[MAX_INDEX_LENGTH_];
179        m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_];
180        m_isLatin1Linear_ = false;
181        m_isCompacted_ = false;
182        m_indexLength_ = MAX_INDEX_LENGTH_;
183    }
184
185    protected TrieBuilder(TrieBuilder table)
186    {
187        m_index_ = new int[MAX_INDEX_LENGTH_];
188        m_indexLength_ = table.m_indexLength_;
189        System.arraycopy(table.m_index_, 0, m_index_, 0, m_indexLength_);
190        m_dataCapacity_ = table.m_dataCapacity_;
191        m_dataLength_ = table.m_dataLength_;
192        m_map_ = new int[table.m_map_.length];
193        System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length);
194        m_isLatin1Linear_ = table.m_isLatin1Linear_;
195        m_isCompacted_ = table.m_isCompacted_;
196    }
197
198    // protected functions ------------------------------------------------
199
200    /**
201     * Compare two sections of an array for equality.
202     */
203    protected static final boolean equal_int(int[] array, int start1, int start2, int length) {
204        while(length>0 && array[start1]==array[start2]) {
205            ++start1;
206            ++start2;
207            --length;
208        }
209        return length==0;
210    }
211
212    /**
213     * Set a value in the trie index map to indicate which data block
214     * is referenced and which one is not.
215     * utrie_compact() will remove data blocks that are not used at all.
216     * Set
217     * - 0 if it is used
218     * - -1 if it is not used
219     */
220    protected void findUnusedBlocks()
221    {
222        // fill the entire map with "not used"
223        Arrays.fill(m_map_, 0xff);
224
225        // mark each block that _is_ used with 0
226        for (int i = 0; i < m_indexLength_; ++ i) {
227            m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0;
228        }
229
230        // never move the all-initial-value block 0
231        m_map_[0] = 0;
232    }
233
234    /**
235     * Finds the same index block as the otherBlock
236     * @param index array
237     * @param indexLength size of index
238     * @param otherBlock
239     * @return same index block
240     */
241    protected static final int findSameIndexBlock(int index[], int indexLength,
242                                                  int otherBlock)
243    {
244        for (int block = BMP_INDEX_LENGTH_; block < indexLength;
245             block += SURROGATE_BLOCK_COUNT_) {
246            if(equal_int(index, block, otherBlock, SURROGATE_BLOCK_COUNT_)) {
247                return block;
248            }
249        }
250        return indexLength;
251    }
252
253    // private data member ------------------------------------------------
254
255    /**
256     * Maximum length of the build-time data (stage 2) array.
257     * The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400.
258     * (Number of Unicode code points + one all-initial-value block +
259     *  possible duplicate entries for 1024 lead surrogates.)
260     */
261    private static final int MAX_BUILD_TIME_DATA_LENGTH_ =
262        0x110000 + DATA_BLOCK_LENGTH + 0x400;
263}
264