1/*
2*******************************************************************************
3* Copyright (C) 2013-2015, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* CollationDataReader.java, ported from collationdatareader.h/.cpp
7*
8* C++ version created on: 2013feb07
9* created by: Markus W. Scherer
10*/
11
12package com.ibm.icu.impl.coll;
13
14import java.io.IOException;
15import java.nio.ByteBuffer;
16import java.nio.CharBuffer;
17import java.util.Arrays;
18
19import com.ibm.icu.impl.ICUBinary;
20import com.ibm.icu.impl.Trie2_32;
21import com.ibm.icu.impl.USerializedSet;
22import com.ibm.icu.text.Collator;
23import com.ibm.icu.text.UnicodeSet;
24import com.ibm.icu.util.ICUException;
25
26/**
27 * Collation binary data reader.
28 */
29final class CollationDataReader /* all static */ {
30    // The following constants are also copied into source/common/ucol_swp.cpp.
31    // Keep them in sync!
32    /**
33     * Number of int indexes.
34     *
35     * Can be 2 if there are only options.
36     * Can be 7 or 8 if there are only options and a script reordering.
37     * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
38     */
39    static final int IX_INDEXES_LENGTH = 0;
40    /**
41     * Bits 31..24: numericPrimary, for numeric collation
42     *      23..16: fast Latin format version (0 = no fast Latin table)
43     *      15.. 0: options bit set
44     */
45    static final int IX_OPTIONS = 1;
46    static final int IX_RESERVED2 = 2;
47    static final int IX_RESERVED3 = 3;
48
49    /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
50    static final int IX_JAMO_CE32S_START = 4;
51
52    // Byte offsets from the start of the data, after the generic header.
53    // The indexes[] are at byte offset 0, other data follows.
54    // Each data item is aligned properly.
55    // The data items should be in descending order of unit size,
56    // to minimize the need for padding.
57    // Each item's byte length is given by the difference between its offset and
58    // the next index/offset value.
59    /** Byte offset to int reorderCodes[]. */
60    static final int IX_REORDER_CODES_OFFSET = 5;
61    /**
62     * Byte offset to uint8_t reorderTable[].
63     * Empty table if <256 bytes (padding only).
64     * Otherwise 256 bytes or more (with padding).
65     */
66    static final int IX_REORDER_TABLE_OFFSET = 6;
67    /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
68    static final int IX_TRIE_OFFSET = 7;
69
70    static final int IX_RESERVED8_OFFSET = 8;
71    /** Byte offset to long ces[]. */
72    static final int IX_CES_OFFSET = 9;
73    static final int IX_RESERVED10_OFFSET = 10;
74    /** Byte offset to int ce32s[]. */
75    static final int IX_CE32S_OFFSET = 11;
76
77    /** Byte offset to uint32_t rootElements[]. */
78    static final int IX_ROOT_ELEMENTS_OFFSET = 12;
79    /** Byte offset to UChar *contexts[]. */
80    static final int IX_CONTEXTS_OFFSET = 13;
81    /** Byte offset to char [] with serialized unsafeBackwardSet. */
82    static final int IX_UNSAFE_BWD_OFFSET = 14;
83    /** Byte offset to char fastLatinTable[]. */
84    static final int IX_FAST_LATIN_TABLE_OFFSET = 15;
85
86    /** Byte offset to char scripts[]. */
87    static final int IX_SCRIPTS_OFFSET = 16;
88    /**
89     * Byte offset to boolean compressibleBytes[].
90     * Empty table if <256 bytes (padding only).
91     * Otherwise 256 bytes or more (with padding).
92     */
93    static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17;
94    static final int IX_RESERVED18_OFFSET = 18;
95    static final int IX_TOTAL_SIZE = 19;
96
97    static void read(CollationTailoring base, ByteBuffer inBytes,
98                     CollationTailoring tailoring) throws IOException {
99        tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
100        if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
101            throw new ICUException("Tailoring UCA version differs from base data UCA version");
102        }
103
104        int inLength = inBytes.remaining();
105        if(inLength < 8) {
106            throw new ICUException("not enough bytes");
107        }
108        int indexesLength = inBytes.getInt();  // inIndexes[IX_INDEXES_LENGTH]
109        if(indexesLength < 2 || inLength < indexesLength * 4) {
110            throw new ICUException("not enough indexes");
111        }
112        int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
113        inIndexes[0] = indexesLength;
114        for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
115            inIndexes[i] = inBytes.getInt();
116        }
117        for(int i = indexesLength; i < inIndexes.length; ++i) {
118            inIndexes[i] = -1;
119        }
120        if(indexesLength > inIndexes.length) {
121            ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
122        }
123
124        // Assume that the tailoring data is in initial state,
125        // with null pointers and 0 lengths.
126
127        // Set pointers to non-empty data parts.
128        // Do this in order of their byte offsets. (Should help porting to Java.)
129
130        int index;  // one of the indexes[] slots
131        int offset;  // byte offset for the index part
132        int length;  // number of bytes in the index part
133
134        if(indexesLength > IX_TOTAL_SIZE) {
135            length = inIndexes[IX_TOTAL_SIZE];
136        } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
137            length = inIndexes[indexesLength - 1];
138        } else {
139            length = 0;  // only indexes, and inLength was already checked for them
140        }
141        if(inLength < length) {
142            throw new ICUException("not enough bytes");
143        }
144
145        CollationData baseData = base == null ? null : base.data;
146        int[] reorderCodes;
147        int reorderCodesLength;
148        index = IX_REORDER_CODES_OFFSET;
149        offset = inIndexes[index];
150        length = inIndexes[index + 1] - offset;
151        if(length >= 4) {
152            if(baseData == null) {
153                // We assume for collation settings that
154                // the base data does not have a reordering.
155                throw new ICUException("Collation base data must not reorder scripts");
156            }
157            reorderCodesLength = length / 4;
158            reorderCodes = new int[reorderCodesLength];
159            for(int i = 0; i < reorderCodesLength; ++i) {
160                reorderCodes[i] = inBytes.getInt();
161            }
162            length &= 3;
163
164            // The reorderRanges (if any) are the trailing reorderCodes entries.
165            // Split the array at the boundary.
166            // Script or reorder codes do not exceed 16-bit values.
167            // Range limits are stored in the upper 16 bits, and are never 0.
168            int reorderRangesLength = 0;
169            while(reorderRangesLength < reorderCodesLength &&
170                    (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
171                ++reorderRangesLength;
172            }
173            assert(reorderRangesLength < reorderCodesLength);
174            reorderCodesLength -= reorderRangesLength;
175        } else {
176            reorderCodes = new int[0];
177            reorderCodesLength = 0;
178        }
179        ICUBinary.skipBytes(inBytes, length);
180
181        // There should be a reorder table only if there are reorder codes.
182        // However, when there are reorder codes the reorder table may be omitted to reduce
183        // the data size.
184        byte[] reorderTable = null;
185        index = IX_REORDER_TABLE_OFFSET;
186        offset = inIndexes[index];
187        length = inIndexes[index + 1] - offset;
188        if(length >= 256) {
189            if(reorderCodesLength == 0) {
190                throw new ICUException("Reordering table without reordering codes");
191            }
192            reorderTable = new byte[256];
193            inBytes.get(reorderTable);
194            length -= 256;
195        } else {
196            // If we have reorder codes, then build the reorderTable at the end,
197            // when the CollationData is otherwise complete.
198        }
199        ICUBinary.skipBytes(inBytes, length);
200
201        if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
202            throw new ICUException("Tailoring numeric primary weight differs from base data");
203        }
204        CollationData data = null;  // Remains null if there are no mappings.
205
206        index = IX_TRIE_OFFSET;
207        offset = inIndexes[index];
208        length = inIndexes[index + 1] - offset;
209        if(length >= 8) {
210            tailoring.ensureOwnedData();
211            data = tailoring.ownedData;
212            data.base = baseData;
213            data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
214            data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
215            int trieLength = data.trie.getSerializedLength();
216            if(trieLength > length) {
217                throw new ICUException("Not enough bytes for the mappings trie");  // No mappings.
218            }
219            length -= trieLength;
220        } else if(baseData != null) {
221            // Use the base data. Only the settings are tailored.
222            tailoring.data = baseData;
223        } else {
224            throw new ICUException("Missing collation data mappings");  // No mappings.
225        }
226        ICUBinary.skipBytes(inBytes, length);
227
228        index = IX_RESERVED8_OFFSET;
229        offset = inIndexes[index];
230        length = inIndexes[index + 1] - offset;
231        ICUBinary.skipBytes(inBytes, length);
232
233        index = IX_CES_OFFSET;
234        offset = inIndexes[index];
235        length = inIndexes[index + 1] - offset;
236        if(length >= 8) {
237            if(data == null) {
238                throw new ICUException("Tailored ces without tailored trie");
239            }
240            data.ces = new long[length / 8];
241            for(int i = 0; i < length / 8; ++i) {
242                data.ces[i] = inBytes.getLong();
243            }
244            length &= 7;
245        }
246        ICUBinary.skipBytes(inBytes, length);
247
248        index = IX_RESERVED10_OFFSET;
249        offset = inIndexes[index];
250        length = inIndexes[index + 1] - offset;
251        ICUBinary.skipBytes(inBytes, length);
252
253        index = IX_CE32S_OFFSET;
254        offset = inIndexes[index];
255        length = inIndexes[index + 1] - offset;
256        if(length >= 4) {
257            if(data == null) {
258                throw new ICUException("Tailored ce32s without tailored trie");
259            }
260            data.ce32s = new int[length / 4];
261            for(int i = 0; i < length / 4; ++i) {
262                data.ce32s[i] = inBytes.getInt();
263            }
264            length &= 3;
265        }
266        ICUBinary.skipBytes(inBytes, length);
267
268        int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
269        if(jamoCE32sStart >= 0) {
270            if(data == null || data.ce32s == null) {
271                throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
272            }
273            data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
274            System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
275        } else if(data == null) {
276            // Nothing to do.
277        } else if(baseData != null) {
278            data.jamoCE32s = baseData.jamoCE32s;
279        } else {
280            throw new ICUException("Missing Jamo CE32s for Hangul processing");
281        }
282
283        index = IX_ROOT_ELEMENTS_OFFSET;
284        offset = inIndexes[index];
285        length = inIndexes[index + 1] - offset;
286        if(length >= 4) {
287            int rootElementsLength = length / 4;
288            if(data == null) {
289                throw new ICUException("Root elements but no mappings");
290            }
291            if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
292                throw new ICUException("Root elements array too short");
293            }
294            data.rootElements = new long[rootElementsLength];
295            for(int i = 0; i < rootElementsLength; ++i) {
296                data.rootElements[i] = inBytes.getInt() & 0xffffffffL;  // unsigned int -> long
297            }
298            long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
299            if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
300                throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
301            }
302            long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
303            if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
304                // [fixed last secondary common byte] is too low,
305                // and secondary weights would collide with compressed common secondaries.
306                throw new ICUException("[fixed last secondary common byte] is too low");
307            }
308            length &= 3;
309        }
310        ICUBinary.skipBytes(inBytes, length);
311
312        index = IX_CONTEXTS_OFFSET;
313        offset = inIndexes[index];
314        length = inIndexes[index + 1] - offset;
315        if(length >= 2) {
316            if(data == null) {
317                throw new ICUException("Tailored contexts without tailored trie");
318            }
319            StringBuilder sb = new StringBuilder(length / 2);
320            for(int i = 0; i < length / 2; ++i) {
321                sb.append(inBytes.getChar());
322            }
323            data.contexts = sb.toString();
324            length &= 1;
325        }
326        ICUBinary.skipBytes(inBytes, length);
327
328        index = IX_UNSAFE_BWD_OFFSET;
329        offset = inIndexes[index];
330        length = inIndexes[index + 1] - offset;
331        if(length >= 2) {
332            if(data == null) {
333                throw new ICUException("Unsafe-backward-set but no mappings");
334            }
335            if(baseData == null) {
336                // Create the unsafe-backward set for the root collator.
337                // Include all non-zero combining marks and trail surrogates.
338                // We do this at load time, rather than at build time,
339                // to simplify Unicode version bootstrapping:
340                // The root data builder only needs the new FractionalUCA.txt data,
341                // but it need not be built with a version of ICU already updated to
342                // the corresponding new Unicode Character Database.
343                //
344                // The following is an optimized version of
345                // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
346                // It is faster and requires fewer code dependencies.
347                tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
348                data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
349            } else {
350                // Clone the root collator's set contents.
351                tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
352            }
353            // Add the ranges from the data file to the unsafe-backward set.
354            USerializedSet sset = new USerializedSet();
355            char[] unsafeData = new char[length / 2];
356            for(int i = 0; i < length / 2; ++i) {
357                unsafeData[i] = inBytes.getChar();
358            }
359            length &= 1;
360            sset.getSet(unsafeData, 0);
361            int count = sset.countRanges();
362            int[] range = new int[2];
363            for(int i = 0; i < count; ++i) {
364                sset.getRange(i, range);
365                tailoring.unsafeBackwardSet.add(range[0], range[1]);
366            }
367            // Mark each lead surrogate as "unsafe"
368            // if any of its 1024 associated supplementary code points is "unsafe".
369            int c = 0x10000;
370            for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
371                if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
372                    tailoring.unsafeBackwardSet.add(lead);
373                }
374            }
375            tailoring.unsafeBackwardSet.freeze();
376            data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
377        } else if(data == null) {
378            // Nothing to do.
379        } else if(baseData != null) {
380            // No tailoring-specific data: Alias the root collator's set.
381            data.unsafeBackwardSet = baseData.unsafeBackwardSet;
382        } else {
383            throw new ICUException("Missing unsafe-backward-set");
384        }
385        ICUBinary.skipBytes(inBytes, length);
386
387        // If the fast Latin format version is different,
388        // or the version is set to 0 for "no fast Latin table",
389        // then just always use the normal string comparison path.
390        index = IX_FAST_LATIN_TABLE_OFFSET;
391        offset = inIndexes[index];
392        length = inIndexes[index + 1] - offset;
393        if(data != null) {
394            data.fastLatinTable = null;
395            data.fastLatinTableHeader = null;
396            if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
397                if(length >= 2) {
398                    char header0 = inBytes.getChar();
399                    int headerLength = header0 & 0xff;
400                    data.fastLatinTableHeader = new char[headerLength];
401                    data.fastLatinTableHeader[0] = header0;
402                    for(int i = 1; i < headerLength; ++i) {
403                        data.fastLatinTableHeader[i] = inBytes.getChar();
404                    }
405                    int tableLength = length / 2 - headerLength;
406                    data.fastLatinTable = new char[tableLength];
407                    for(int i = 0; i < tableLength; ++i) {
408                        data.fastLatinTable[i] = inBytes.getChar();
409                    }
410                    length &= 1;
411                    if((header0 >> 8) != CollationFastLatin.VERSION) {
412                        throw new ICUException("Fast-Latin table version differs from version in data header");
413                    }
414                } else if(baseData != null) {
415                    data.fastLatinTable = baseData.fastLatinTable;
416                    data.fastLatinTableHeader = baseData.fastLatinTableHeader;
417                }
418            }
419        }
420        ICUBinary.skipBytes(inBytes, length);
421
422        index = IX_SCRIPTS_OFFSET;
423        offset = inIndexes[index];
424        length = inIndexes[index + 1] - offset;
425        if(length >= 2) {
426            if(data == null) {
427                throw new ICUException("Script order data but no mappings");
428            }
429            int scriptsLength = length / 2;
430            CharBuffer inChars = inBytes.asCharBuffer();
431            data.numScripts = inChars.get();
432            // There must be enough entries for both arrays, including more than two range starts.
433            int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
434            if(scriptStartsLength <= 2) {
435                throw new ICUException("Script order data too short");
436            }
437            inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
438            inChars.get(data.scriptStarts = new char[scriptStartsLength]);
439            if(!(data.scriptStarts[0] == 0 &&
440                    data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
441                    data.scriptStarts[scriptStartsLength - 1] ==
442                            (Collation.TRAIL_WEIGHT_BYTE << 8))) {
443                throw new ICUException("Script order data not valid");
444            }
445        } else if(data == null) {
446            // Nothing to do.
447        } else if(baseData != null) {
448            data.numScripts = baseData.numScripts;
449            data.scriptsIndex = baseData.scriptsIndex;
450            data.scriptStarts = baseData.scriptStarts;
451        }
452        ICUBinary.skipBytes(inBytes, length);
453
454        index = IX_COMPRESSIBLE_BYTES_OFFSET;
455        offset = inIndexes[index];
456        length = inIndexes[index + 1] - offset;
457        if(length >= 256) {
458            if(data == null) {
459                throw new ICUException("Data for compressible primary lead bytes but no mappings");
460            }
461            data.compressibleBytes = new boolean[256];
462            for(int i = 0; i < 256; ++i) {
463                data.compressibleBytes[i] = inBytes.get() != 0;
464            }
465            length -= 256;
466        } else if(data == null) {
467            // Nothing to do.
468        } else if(baseData != null) {
469            data.compressibleBytes = baseData.compressibleBytes;
470        } else {
471            throw new ICUException("Missing data for compressible primary lead bytes");
472        }
473        ICUBinary.skipBytes(inBytes, length);
474
475        index = IX_RESERVED18_OFFSET;
476        offset = inIndexes[index];
477        length = inIndexes[index + 1] - offset;
478        ICUBinary.skipBytes(inBytes, length);
479
480        CollationSettings ts = tailoring.settings.readOnly();
481        int options = inIndexes[IX_OPTIONS] & 0xffff;
482        char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
483        int fastLatinOptions = CollationFastLatin.getOptions(
484                tailoring.data, ts, fastLatinPrimaries);
485        if(options == ts.options && ts.variableTop != 0 &&
486                Arrays.equals(reorderCodes, ts.reorderCodes) &&
487                fastLatinOptions == ts.fastLatinOptions &&
488                (fastLatinOptions < 0 ||
489                        Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
490            return;
491        }
492
493        CollationSettings settings = tailoring.settings.copyOnWrite();
494        settings.options = options;
495        // Set variableTop from options and scripts data.
496        settings.variableTop = tailoring.data.getLastPrimaryForGroup(
497                Collator.ReorderCodes.FIRST + settings.getMaxVariable());
498        if(settings.variableTop == 0) {
499            throw new ICUException("The maxVariable could not be mapped to a variableTop");
500        }
501
502        if(reorderCodesLength != 0) {
503            settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
504        }
505
506        settings.fastLatinOptions = CollationFastLatin.getOptions(
507            tailoring.data, settings,
508            settings.fastLatinPrimaries);
509    }
510
511    private static final class IsAcceptable implements ICUBinary.Authenticate {
512        // @Override when we switch to Java 6
513        public boolean isDataVersionAcceptable(byte version[]) {
514            return version[0] == 5;
515        }
516    }
517    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
518    private static final int DATA_FORMAT = 0x55436f6c;  // "UCol"
519
520    private CollationDataReader() {}  // no constructor
521}
522
523/*
524 * Format of collation data (ucadata.icu, binary data in coll/ *.res files):
525 * See ICU4C source/common/collationdatareader.h.
526 */
527