1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2013-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* CollationDataReader.java, ported from collationdatareader.h/.cpp
9*
10* C++ version created on: 2013feb07
11* created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16import java.io.IOException;
17import java.nio.ByteBuffer;
18import java.nio.CharBuffer;
19import java.util.Arrays;
20
21import com.ibm.icu.impl.ICUBinary;
22import com.ibm.icu.impl.Trie2_32;
23import com.ibm.icu.impl.USerializedSet;
24import com.ibm.icu.text.Collator;
25import com.ibm.icu.text.UnicodeSet;
26import com.ibm.icu.util.ICUException;
27
28/**
29 * Collation binary data reader.
30 */
31final class CollationDataReader /* all static */ {
32    // The following constants are also copied into source/common/ucol_swp.cpp.
33    // Keep them in sync!
34    /**
35     * Number of int indexes.
36     *
37     * Can be 2 if there are only options.
38     * Can be 7 or 8 if there are only options and a script reordering.
39     * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
40     */
41    static final int IX_INDEXES_LENGTH = 0;
42    /**
43     * Bits 31..24: numericPrimary, for numeric collation
44     *      23..16: fast Latin format version (0 = no fast Latin table)
45     *      15.. 0: options bit set
46     */
47    static final int IX_OPTIONS = 1;
48    static final int IX_RESERVED2 = 2;
49    static final int IX_RESERVED3 = 3;
50
51    /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
52    static final int IX_JAMO_CE32S_START = 4;
53
54    // Byte offsets from the start of the data, after the generic header.
55    // The indexes[] are at byte offset 0, other data follows.
56    // Each data item is aligned properly.
57    // The data items should be in descending order of unit size,
58    // to minimize the need for padding.
59    // Each item's byte length is given by the difference between its offset and
60    // the next index/offset value.
61    /** Byte offset to int reorderCodes[]. */
62    static final int IX_REORDER_CODES_OFFSET = 5;
63    /**
64     * Byte offset to uint8_t reorderTable[].
65     * Empty table if <256 bytes (padding only).
66     * Otherwise 256 bytes or more (with padding).
67     */
68    static final int IX_REORDER_TABLE_OFFSET = 6;
69    /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
70    static final int IX_TRIE_OFFSET = 7;
71
72    static final int IX_RESERVED8_OFFSET = 8;
73    /** Byte offset to long ces[]. */
74    static final int IX_CES_OFFSET = 9;
75    static final int IX_RESERVED10_OFFSET = 10;
76    /** Byte offset to int ce32s[]. */
77    static final int IX_CE32S_OFFSET = 11;
78
79    /** Byte offset to uint32_t rootElements[]. */
80    static final int IX_ROOT_ELEMENTS_OFFSET = 12;
81    /** Byte offset to UChar *contexts[]. */
82    static final int IX_CONTEXTS_OFFSET = 13;
83    /** Byte offset to char [] with serialized unsafeBackwardSet. */
84    static final int IX_UNSAFE_BWD_OFFSET = 14;
85    /** Byte offset to char fastLatinTable[]. */
86    static final int IX_FAST_LATIN_TABLE_OFFSET = 15;
87
88    /** Byte offset to char scripts[]. */
89    static final int IX_SCRIPTS_OFFSET = 16;
90    /**
91     * Byte offset to boolean compressibleBytes[].
92     * Empty table if <256 bytes (padding only).
93     * Otherwise 256 bytes or more (with padding).
94     */
95    static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17;
96    static final int IX_RESERVED18_OFFSET = 18;
97    static final int IX_TOTAL_SIZE = 19;
98
99    static void read(CollationTailoring base, ByteBuffer inBytes,
100                     CollationTailoring tailoring) throws IOException {
101        tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
102        if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
103            throw new ICUException("Tailoring UCA version differs from base data UCA version");
104        }
105
106        int inLength = inBytes.remaining();
107        if(inLength < 8) {
108            throw new ICUException("not enough bytes");
109        }
110        int indexesLength = inBytes.getInt();  // inIndexes[IX_INDEXES_LENGTH]
111        if(indexesLength < 2 || inLength < indexesLength * 4) {
112            throw new ICUException("not enough indexes");
113        }
114        int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
115        inIndexes[0] = indexesLength;
116        for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
117            inIndexes[i] = inBytes.getInt();
118        }
119        for(int i = indexesLength; i < inIndexes.length; ++i) {
120            inIndexes[i] = -1;
121        }
122        if(indexesLength > inIndexes.length) {
123            ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
124        }
125
126        // Assume that the tailoring data is in initial state,
127        // with null pointers and 0 lengths.
128
129        // Set pointers to non-empty data parts.
130        // Do this in order of their byte offsets. (Should help porting to Java.)
131
132        int index;  // one of the indexes[] slots
133        int offset;  // byte offset for the index part
134        int length;  // number of bytes in the index part
135
136        if(indexesLength > IX_TOTAL_SIZE) {
137            length = inIndexes[IX_TOTAL_SIZE];
138        } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
139            length = inIndexes[indexesLength - 1];
140        } else {
141            length = 0;  // only indexes, and inLength was already checked for them
142        }
143        if(inLength < length) {
144            throw new ICUException("not enough bytes");
145        }
146
147        CollationData baseData = base == null ? null : base.data;
148        int[] reorderCodes;
149        int reorderCodesLength;
150        index = IX_REORDER_CODES_OFFSET;
151        offset = inIndexes[index];
152        length = inIndexes[index + 1] - offset;
153        if(length >= 4) {
154            if(baseData == null) {
155                // We assume for collation settings that
156                // the base data does not have a reordering.
157                throw new ICUException("Collation base data must not reorder scripts");
158            }
159            reorderCodesLength = length / 4;
160            reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3);
161
162            // The reorderRanges (if any) are the trailing reorderCodes entries.
163            // Split the array at the boundary.
164            // Script or reorder codes do not exceed 16-bit values.
165            // Range limits are stored in the upper 16 bits, and are never 0.
166            int reorderRangesLength = 0;
167            while(reorderRangesLength < reorderCodesLength &&
168                    (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
169                ++reorderRangesLength;
170            }
171            assert(reorderRangesLength < reorderCodesLength);
172            reorderCodesLength -= reorderRangesLength;
173        } else {
174            reorderCodes = new int[0];
175            reorderCodesLength = 0;
176            ICUBinary.skipBytes(inBytes, length);
177        }
178
179        // There should be a reorder table only if there are reorder codes.
180        // However, when there are reorder codes the reorder table may be omitted to reduce
181        // the data size.
182        byte[] reorderTable = null;
183        index = IX_REORDER_TABLE_OFFSET;
184        offset = inIndexes[index];
185        length = inIndexes[index + 1] - offset;
186        if(length >= 256) {
187            if(reorderCodesLength == 0) {
188                throw new ICUException("Reordering table without reordering codes");
189            }
190            reorderTable = new byte[256];
191            inBytes.get(reorderTable);
192            length -= 256;
193        } else {
194            // If we have reorder codes, then build the reorderTable at the end,
195            // when the CollationData is otherwise complete.
196        }
197        ICUBinary.skipBytes(inBytes, length);
198
199        if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
200            throw new ICUException("Tailoring numeric primary weight differs from base data");
201        }
202        CollationData data = null;  // Remains null if there are no mappings.
203
204        index = IX_TRIE_OFFSET;
205        offset = inIndexes[index];
206        length = inIndexes[index + 1] - offset;
207        if(length >= 8) {
208            tailoring.ensureOwnedData();
209            data = tailoring.ownedData;
210            data.base = baseData;
211            data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
212            data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
213            int trieLength = data.trie.getSerializedLength();
214            if(trieLength > length) {
215                throw new ICUException("Not enough bytes for the mappings trie");  // No mappings.
216            }
217            length -= trieLength;
218        } else if(baseData != null) {
219            // Use the base data. Only the settings are tailored.
220            tailoring.data = baseData;
221        } else {
222            throw new ICUException("Missing collation data mappings");  // No mappings.
223        }
224        ICUBinary.skipBytes(inBytes, length);
225
226        index = IX_RESERVED8_OFFSET;
227        offset = inIndexes[index];
228        length = inIndexes[index + 1] - offset;
229        ICUBinary.skipBytes(inBytes, length);
230
231        index = IX_CES_OFFSET;
232        offset = inIndexes[index];
233        length = inIndexes[index + 1] - offset;
234        if(length >= 8) {
235            if(data == null) {
236                throw new ICUException("Tailored ces without tailored trie");
237            }
238            data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7);
239        } else {
240            ICUBinary.skipBytes(inBytes, length);
241        }
242
243        index = IX_RESERVED10_OFFSET;
244        offset = inIndexes[index];
245        length = inIndexes[index + 1] - offset;
246        ICUBinary.skipBytes(inBytes, length);
247
248        index = IX_CE32S_OFFSET;
249        offset = inIndexes[index];
250        length = inIndexes[index + 1] - offset;
251        if(length >= 4) {
252            if(data == null) {
253                throw new ICUException("Tailored ce32s without tailored trie");
254            }
255            data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3);
256        } else {
257            ICUBinary.skipBytes(inBytes, length);
258        }
259
260        int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
261        if(jamoCE32sStart >= 0) {
262            if(data == null || data.ce32s == null) {
263                throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
264            }
265            data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
266            System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
267        } else if(data == null) {
268            // Nothing to do.
269        } else if(baseData != null) {
270            data.jamoCE32s = baseData.jamoCE32s;
271        } else {
272            throw new ICUException("Missing Jamo CE32s for Hangul processing");
273        }
274
275        index = IX_ROOT_ELEMENTS_OFFSET;
276        offset = inIndexes[index];
277        length = inIndexes[index + 1] - offset;
278        if(length >= 4) {
279            int rootElementsLength = length / 4;
280            if(data == null) {
281                throw new ICUException("Root elements but no mappings");
282            }
283            if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
284                throw new ICUException("Root elements array too short");
285            }
286            data.rootElements = new long[rootElementsLength];
287            for(int i = 0; i < rootElementsLength; ++i) {
288                data.rootElements[i] = inBytes.getInt() & 0xffffffffL;  // unsigned int -> long
289            }
290            long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
291            if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
292                throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
293            }
294            long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
295            if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
296                // [fixed last secondary common byte] is too low,
297                // and secondary weights would collide with compressed common secondaries.
298                throw new ICUException("[fixed last secondary common byte] is too low");
299            }
300            length &= 3;
301        }
302        ICUBinary.skipBytes(inBytes, length);
303
304        index = IX_CONTEXTS_OFFSET;
305        offset = inIndexes[index];
306        length = inIndexes[index + 1] - offset;
307        if(length >= 2) {
308            if(data == null) {
309                throw new ICUException("Tailored contexts without tailored trie");
310            }
311            data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1);
312        } else {
313            ICUBinary.skipBytes(inBytes, length);
314        }
315
316        index = IX_UNSAFE_BWD_OFFSET;
317        offset = inIndexes[index];
318        length = inIndexes[index + 1] - offset;
319        if(length >= 2) {
320            if(data == null) {
321                throw new ICUException("Unsafe-backward-set but no mappings");
322            }
323            if(baseData == null) {
324                // Create the unsafe-backward set for the root collator.
325                // Include all non-zero combining marks and trail surrogates.
326                // We do this at load time, rather than at build time,
327                // to simplify Unicode version bootstrapping:
328                // The root data builder only needs the new FractionalUCA.txt data,
329                // but it need not be built with a version of ICU already updated to
330                // the corresponding new Unicode Character Database.
331                //
332                // The following is an optimized version of
333                // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
334                // It is faster and requires fewer code dependencies.
335                tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
336                data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
337            } else {
338                // Clone the root collator's set contents.
339                tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
340            }
341            // Add the ranges from the data file to the unsafe-backward set.
342            USerializedSet sset = new USerializedSet();
343            char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1);
344            length = 0;
345            sset.getSet(unsafeData, 0);
346            int count = sset.countRanges();
347            int[] range = new int[2];
348            for(int i = 0; i < count; ++i) {
349                sset.getRange(i, range);
350                tailoring.unsafeBackwardSet.add(range[0], range[1]);
351            }
352            // Mark each lead surrogate as "unsafe"
353            // if any of its 1024 associated supplementary code points is "unsafe".
354            int c = 0x10000;
355            for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
356                if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
357                    tailoring.unsafeBackwardSet.add(lead);
358                }
359            }
360            tailoring.unsafeBackwardSet.freeze();
361            data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
362        } else if(data == null) {
363            // Nothing to do.
364        } else if(baseData != null) {
365            // No tailoring-specific data: Alias the root collator's set.
366            data.unsafeBackwardSet = baseData.unsafeBackwardSet;
367        } else {
368            throw new ICUException("Missing unsafe-backward-set");
369        }
370        ICUBinary.skipBytes(inBytes, length);
371
372        // If the fast Latin format version is different,
373        // or the version is set to 0 for "no fast Latin table",
374        // then just always use the normal string comparison path.
375        index = IX_FAST_LATIN_TABLE_OFFSET;
376        offset = inIndexes[index];
377        length = inIndexes[index + 1] - offset;
378        if(data != null) {
379            data.fastLatinTable = null;
380            data.fastLatinTableHeader = null;
381            if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
382                if(length >= 2) {
383                    char header0 = inBytes.getChar();
384                    int headerLength = header0 & 0xff;
385                    data.fastLatinTableHeader = new char[headerLength];
386                    data.fastLatinTableHeader[0] = header0;
387                    for(int i = 1; i < headerLength; ++i) {
388                        data.fastLatinTableHeader[i] = inBytes.getChar();
389                    }
390                    int tableLength = length / 2 - headerLength;
391                    data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1);
392                    length = 0;
393                    if((header0 >> 8) != CollationFastLatin.VERSION) {
394                        throw new ICUException("Fast-Latin table version differs from version in data header");
395                    }
396                } else if(baseData != null) {
397                    data.fastLatinTable = baseData.fastLatinTable;
398                    data.fastLatinTableHeader = baseData.fastLatinTableHeader;
399                }
400            }
401        }
402        ICUBinary.skipBytes(inBytes, length);
403
404        index = IX_SCRIPTS_OFFSET;
405        offset = inIndexes[index];
406        length = inIndexes[index + 1] - offset;
407        if(length >= 2) {
408            if(data == null) {
409                throw new ICUException("Script order data but no mappings");
410            }
411            int scriptsLength = length / 2;
412            CharBuffer inChars = inBytes.asCharBuffer();
413            data.numScripts = inChars.get();
414            // There must be enough entries for both arrays, including more than two range starts.
415            int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
416            if(scriptStartsLength <= 2) {
417                throw new ICUException("Script order data too short");
418            }
419            inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
420            inChars.get(data.scriptStarts = new char[scriptStartsLength]);
421            if(!(data.scriptStarts[0] == 0 &&
422                    data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
423                    data.scriptStarts[scriptStartsLength - 1] ==
424                            (Collation.TRAIL_WEIGHT_BYTE << 8))) {
425                throw new ICUException("Script order data not valid");
426            }
427        } else if(data == null) {
428            // Nothing to do.
429        } else if(baseData != null) {
430            data.numScripts = baseData.numScripts;
431            data.scriptsIndex = baseData.scriptsIndex;
432            data.scriptStarts = baseData.scriptStarts;
433        }
434        ICUBinary.skipBytes(inBytes, length);
435
436        index = IX_COMPRESSIBLE_BYTES_OFFSET;
437        offset = inIndexes[index];
438        length = inIndexes[index + 1] - offset;
439        if(length >= 256) {
440            if(data == null) {
441                throw new ICUException("Data for compressible primary lead bytes but no mappings");
442            }
443            data.compressibleBytes = new boolean[256];
444            for(int i = 0; i < 256; ++i) {
445                data.compressibleBytes[i] = inBytes.get() != 0;
446            }
447            length -= 256;
448        } else if(data == null) {
449            // Nothing to do.
450        } else if(baseData != null) {
451            data.compressibleBytes = baseData.compressibleBytes;
452        } else {
453            throw new ICUException("Missing data for compressible primary lead bytes");
454        }
455        ICUBinary.skipBytes(inBytes, length);
456
457        index = IX_RESERVED18_OFFSET;
458        offset = inIndexes[index];
459        length = inIndexes[index + 1] - offset;
460        ICUBinary.skipBytes(inBytes, length);
461
462        CollationSettings ts = tailoring.settings.readOnly();
463        int options = inIndexes[IX_OPTIONS] & 0xffff;
464        char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
465        int fastLatinOptions = CollationFastLatin.getOptions(
466                tailoring.data, ts, fastLatinPrimaries);
467        if(options == ts.options && ts.variableTop != 0 &&
468                Arrays.equals(reorderCodes, ts.reorderCodes) &&
469                fastLatinOptions == ts.fastLatinOptions &&
470                (fastLatinOptions < 0 ||
471                        Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
472            return;
473        }
474
475        CollationSettings settings = tailoring.settings.copyOnWrite();
476        settings.options = options;
477        // Set variableTop from options and scripts data.
478        settings.variableTop = tailoring.data.getLastPrimaryForGroup(
479                Collator.ReorderCodes.FIRST + settings.getMaxVariable());
480        if(settings.variableTop == 0) {
481            throw new ICUException("The maxVariable could not be mapped to a variableTop");
482        }
483
484        if(reorderCodesLength != 0) {
485            settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
486        }
487
488        settings.fastLatinOptions = CollationFastLatin.getOptions(
489            tailoring.data, settings,
490            settings.fastLatinPrimaries);
491    }
492
493    private static final class IsAcceptable implements ICUBinary.Authenticate {
494        @Override
495        public boolean isDataVersionAcceptable(byte version[]) {
496            return version[0] == 5;
497        }
498    }
499    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
500    private static final int DATA_FORMAT = 0x55436f6c;  // "UCol"
501
502    private CollationDataReader() {}  // no constructor
503}
504
505/*
506 * Format of collation data (ucadata.icu, binary data in coll/ *.res files):
507 * See ICU4C source/common/collationdatareader.h.
508 */
509