1/*
2*******************************************************************************
3* Copyright (C) 2013-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationdatareader.cpp
7*
8* created on: 2013feb07
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/ucol.h"
17#include "unicode/udata.h"
18#include "unicode/uscript.h"
19#include "cmemory.h"
20#include "collation.h"
21#include "collationdata.h"
22#include "collationdatareader.h"
23#include "collationfastlatin.h"
24#include "collationkeys.h"
25#include "collationrootelements.h"
26#include "collationsettings.h"
27#include "collationtailoring.h"
28#include "normalizer2impl.h"
29#include "uassert.h"
30#include "ucmndata.h"
31#include "utrie2.h"
32
// Number of elements of a C array (must be a real array, not a pointer), as an int32_t.
// The whole replacement list is parenthesized so that the expansion always
// behaves like a single operand, e.g. in "sizeof LENGTHOF(a)".
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
34
35U_NAMESPACE_BEGIN
36
namespace {

/**
 * Returns the value of one slot of an indexes[] array.
 *
 * @param indexes the array of index values
 * @param length  the number of slots that are actually present in indexes[]
 * @param i       the slot to read
 * @return indexes[i] if 0 <= i < length, otherwise -1
 *         (a slot outside the array is treated as absent rather than read)
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    return (0 <= i && i < length) ? indexes[i] : -1;
}

}  // namespace
44
45void
46CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
47                          CollationTailoring &tailoring, UErrorCode &errorCode) {
48    if(U_FAILURE(errorCode)) { return; }
49    if(base != NULL) {
50        if(inBytes == NULL || (0 <= inLength && inLength < 24)) {
51            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
52            return;
53        }
54        const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
55        if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
56                isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
57            errorCode = U_INVALID_FORMAT_ERROR;
58            return;
59        }
60        if(base->getUCAVersion() != tailoring.getUCAVersion()) {
61            errorCode = U_COLLATOR_VERSION_MISMATCH;
62            return;
63        }
64        int32_t headerLength = header->dataHeader.headerSize;
65        inBytes += headerLength;
66        if(inLength >= 0) {
67            inLength -= headerLength;
68        }
69    }
70
71    if(inBytes == NULL || (0 <= inLength && inLength < 8)) {
72        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
73        return;
74    }
75    const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
76    int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
77    if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
78        errorCode = U_INVALID_FORMAT_ERROR;  // Not enough indexes.
79        return;
80    }
81
82    // Assume that the tailoring data is in initial state,
83    // with NULL pointers and 0 lengths.
84
85    // Set pointers to non-empty data parts.
86    // Do this in order of their byte offsets. (Should help porting to Java.)
87
88    int32_t index;  // one of the indexes[] slots
89    int32_t offset;  // byte offset for the index part
90    int32_t length;  // number of bytes in the index part
91
92    if(indexesLength > IX_TOTAL_SIZE) {
93        length = inIndexes[IX_TOTAL_SIZE];
94    } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
95        length = inIndexes[indexesLength - 1];
96    } else {
97        length = 0;  // only indexes, and inLength was already checked for them
98    }
99    if(0 <= inLength && inLength < length) {
100        errorCode = U_INVALID_FORMAT_ERROR;
101        return;
102    }
103
104    const CollationData *baseData = base == NULL ? NULL : base->data;
105    const int32_t *reorderCodes = NULL;
106    int32_t reorderCodesLength = 0;
107    index = IX_REORDER_CODES_OFFSET;
108    offset = getIndex(inIndexes, indexesLength, index);
109    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
110    if(length >= 4) {
111        if(baseData == NULL) {
112            // We assume for collation settings that
113            // the base data does not have a reordering.
114            errorCode = U_INVALID_FORMAT_ERROR;
115            return;
116        }
117        reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
118        reorderCodesLength = length / 4;
119    }
120
121    // There should be a reorder table only if there are reorder codes.
122    // However, when there are reorder codes the reorder table may be omitted to reduce
123    // the data size.
124    const uint8_t *reorderTable = NULL;
125    index = IX_REORDER_TABLE_OFFSET;
126    offset = getIndex(inIndexes, indexesLength, index);
127    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
128    if(length >= 256) {
129        if(reorderCodesLength == 0) {
130            errorCode = U_INVALID_FORMAT_ERROR;  // Reordering table without reordering codes.
131            return;
132        }
133        reorderTable = inBytes + offset;
134    } else {
135        // If we have reorder codes, then build the reorderTable at the end,
136        // when the CollationData is otherwise complete.
137    }
138
139    if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
140        errorCode = U_INVALID_FORMAT_ERROR;
141        return;
142    }
143    CollationData *data = NULL;  // Remains NULL if there are no mappings.
144
145    index = IX_TRIE_OFFSET;
146    offset = getIndex(inIndexes, indexesLength, index);
147    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
148    if(length >= 8) {
149        if(!tailoring.ensureOwnedData(errorCode)) { return; }
150        data = tailoring.ownedData;
151        data->base = baseData;
152        data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
153        data->trie = tailoring.trie = utrie2_openFromSerialized(
154            UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
155            &errorCode);
156        if(U_FAILURE(errorCode)) { return; }
157    } else if(baseData != NULL) {
158        // Use the base data. Only the settings are tailored.
159        tailoring.data = baseData;
160    } else {
161        errorCode = U_INVALID_FORMAT_ERROR;  // No mappings.
162        return;
163    }
164
165    index = IX_CES_OFFSET;
166    offset = getIndex(inIndexes, indexesLength, index);
167    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
168    if(length >= 8) {
169        if(data == NULL) {
170            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ces without tailored trie.
171            return;
172        }
173        data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
174        data->cesLength = length / 8;
175    }
176
177    index = IX_CE32S_OFFSET;
178    offset = getIndex(inIndexes, indexesLength, index);
179    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
180    if(length >= 4) {
181        if(data == NULL) {
182            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ce32s without tailored trie.
183            return;
184        }
185        data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
186        data->ce32sLength = length / 4;
187    }
188
189    int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
190    if(jamoCE32sStart >= 0) {
191        if(data == NULL || data->ce32s == NULL) {
192            errorCode = U_INVALID_FORMAT_ERROR;  // Index into non-existent ce32s[].
193            return;
194        }
195        data->jamoCE32s = data->ce32s + jamoCE32sStart;
196    } else if(data == NULL) {
197        // Nothing to do.
198    } else if(baseData != NULL) {
199        data->jamoCE32s = baseData->jamoCE32s;
200    } else {
201        errorCode = U_INVALID_FORMAT_ERROR;  // No Jamo CE32s for Hangul processing.
202        return;
203    }
204
205    index = IX_ROOT_ELEMENTS_OFFSET;
206    offset = getIndex(inIndexes, indexesLength, index);
207    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
208    if(length >= 4) {
209        length /= 4;
210        if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
211            errorCode = U_INVALID_FORMAT_ERROR;
212            return;
213        }
214        data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
215        data->rootElementsLength = length;
216        uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
217        if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
218            errorCode = U_INVALID_FORMAT_ERROR;
219            return;
220        }
221        uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
222        if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
223            // [fixed last secondary common byte] is too low,
224            // and secondary weights would collide with compressed common secondaries.
225            errorCode = U_INVALID_FORMAT_ERROR;
226            return;
227        }
228    }
229
230    index = IX_CONTEXTS_OFFSET;
231    offset = getIndex(inIndexes, indexesLength, index);
232    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
233    if(length >= 2) {
234        if(data == NULL) {
235            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored contexts without tailored trie.
236            return;
237        }
238        data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
239        data->contextsLength = length / 2;
240    }
241
242    index = IX_UNSAFE_BWD_OFFSET;
243    offset = getIndex(inIndexes, indexesLength, index);
244    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
245    if(length >= 2) {
246        if(data == NULL) {
247            errorCode = U_INVALID_FORMAT_ERROR;
248            return;
249        }
250        if(baseData == NULL) {
251            // Create the unsafe-backward set for the root collator.
252            // Include all non-zero combining marks and trail surrogates.
253            // We do this at load time, rather than at build time,
254            // to simplify Unicode version bootstrapping:
255            // The root data builder only needs the new FractionalUCA.txt data,
256            // but it need not be built with a version of ICU already updated to
257            // the corresponding new Unicode Character Database.
258            //
259            // The following is an optimized version of
260            // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
261            // It is faster and requires fewer code dependencies.
262            tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
263            if(tailoring.unsafeBackwardSet == NULL) {
264                errorCode = U_MEMORY_ALLOCATION_ERROR;
265                return;
266            }
267            data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
268        } else {
269            // Clone the root collator's set contents.
270            tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
271                baseData->unsafeBackwardSet->cloneAsThawed());
272            if(tailoring.unsafeBackwardSet == NULL) {
273                errorCode = U_MEMORY_ALLOCATION_ERROR;
274                return;
275            }
276        }
277        // Add the ranges from the data file to the unsafe-backward set.
278        USerializedSet sset;
279        const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
280        if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
281            errorCode = U_INVALID_FORMAT_ERROR;
282            return;
283        }
284        int32_t count = uset_getSerializedRangeCount(&sset);
285        for(int32_t i = 0; i < count; ++i) {
286            UChar32 start, end;
287            uset_getSerializedRange(&sset, i, &start, &end);
288            tailoring.unsafeBackwardSet->add(start, end);
289        }
290        // Mark each lead surrogate as "unsafe"
291        // if any of its 1024 associated supplementary code points is "unsafe".
292        UChar32 c = 0x10000;
293        for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
294            if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
295                tailoring.unsafeBackwardSet->add(lead);
296            }
297        }
298        tailoring.unsafeBackwardSet->freeze();
299        data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
300    } else if(data == NULL) {
301        // Nothing to do.
302    } else if(baseData != NULL) {
303        // No tailoring-specific data: Alias the root collator's set.
304        data->unsafeBackwardSet = baseData->unsafeBackwardSet;
305    } else {
306        errorCode = U_INVALID_FORMAT_ERROR;  // No unsafeBackwardSet.
307        return;
308    }
309
310    // If the fast Latin format version is different,
311    // or the version is set to 0 for "no fast Latin table",
312    // then just always use the normal string comparison path.
313    if(data != NULL) {
314        data->fastLatinTable = NULL;
315        data->fastLatinTableLength = 0;
316        if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
317            index = IX_FAST_LATIN_TABLE_OFFSET;
318            offset = getIndex(inIndexes, indexesLength, index);
319            length = getIndex(inIndexes, indexesLength, index + 1) - offset;
320            if(length >= 2) {
321                data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
322                data->fastLatinTableLength = length / 2;
323                if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
324                    errorCode = U_INVALID_FORMAT_ERROR;  // header vs. table version mismatch
325                    return;
326                }
327            } else if(baseData != NULL) {
328                data->fastLatinTable = baseData->fastLatinTable;
329                data->fastLatinTableLength = baseData->fastLatinTableLength;
330            }
331        }
332    }
333
334    index = IX_SCRIPTS_OFFSET;
335    offset = getIndex(inIndexes, indexesLength, index);
336    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
337    if(length >= 2) {
338        if(data == NULL) {
339            errorCode = U_INVALID_FORMAT_ERROR;
340            return;
341        }
342        data->scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
343        data->scriptsLength = length / 2;
344    } else if(data == NULL) {
345        // Nothing to do.
346    } else if(baseData != NULL) {
347        data->scripts = baseData->scripts;
348        data->scriptsLength = baseData->scriptsLength;
349    }
350
351    index = IX_COMPRESSIBLE_BYTES_OFFSET;
352    offset = getIndex(inIndexes, indexesLength, index);
353    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
354    if(length >= 256) {
355        if(data == NULL) {
356            errorCode = U_INVALID_FORMAT_ERROR;
357            return;
358        }
359        data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
360    } else if(data == NULL) {
361        // Nothing to do.
362    } else if(baseData != NULL) {
363        data->compressibleBytes = baseData->compressibleBytes;
364    } else {
365        errorCode = U_INVALID_FORMAT_ERROR;  // No compressibleBytes[].
366        return;
367    }
368
369    const CollationSettings &ts = *tailoring.settings;
370    int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
371    uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
372    int32_t fastLatinOptions = CollationFastLatin::getOptions(
373            tailoring.data, ts, fastLatinPrimaries, LENGTHOF(fastLatinPrimaries));
374    if(options == ts.options && ts.variableTop != 0 &&
375            reorderCodesLength == ts.reorderCodesLength &&
376            uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 &&
377            fastLatinOptions == ts.fastLatinOptions &&
378            (fastLatinOptions < 0 ||
379                uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
380                            sizeof(fastLatinPrimaries)) == 0)) {
381        return;
382    }
383
384    CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
385    if(settings == NULL) {
386        errorCode = U_MEMORY_ALLOCATION_ERROR;
387        return;
388    }
389    settings->options = options;
390    // Set variableTop from options and scripts data.
391    settings->variableTop = tailoring.data->getLastPrimaryForGroup(
392            UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
393    if(settings->variableTop == 0) {
394        errorCode = U_INVALID_FORMAT_ERROR;
395        return;
396    }
397
398    if(reorderCodesLength == 0 || reorderTable != NULL) {
399        settings->aliasReordering(reorderCodes, reorderCodesLength, reorderTable);
400    } else {
401        uint8_t table[256];
402        baseData->makeReorderTable(reorderCodes, reorderCodesLength, table, errorCode);
403        if(U_FAILURE(errorCode)) { return; }
404        if(!settings->setReordering(reorderCodes, reorderCodesLength,table)) {
405            errorCode = U_MEMORY_ALLOCATION_ERROR;
406            return;
407        }
408    }
409
410    settings->fastLatinOptions = CollationFastLatin::getOptions(
411        tailoring.data, *settings,
412        settings->fastLatinPrimaries, LENGTHOF(settings->fastLatinPrimaries));
413}
414
415UBool U_CALLCONV
416CollationDataReader::isAcceptable(void *context,
417                                  const char * /* type */, const char * /*name*/,
418                                  const UDataInfo *pInfo) {
419    if(
420        pInfo->size >= 20 &&
421        pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
422        pInfo->charsetFamily == U_CHARSET_FAMILY &&
423        pInfo->dataFormat[0] == 0x55 &&  // dataFormat="UCol"
424        pInfo->dataFormat[1] == 0x43 &&
425        pInfo->dataFormat[2] == 0x6f &&
426        pInfo->dataFormat[3] == 0x6c &&
427        pInfo->formatVersion[0] == 4
428    ) {
429        UVersionInfo *version = static_cast<UVersionInfo *>(context);
430        if(version != NULL) {
431            uprv_memcpy(version, pInfo->dataVersion, 4);
432        }
433        return TRUE;
434    } else {
435        return FALSE;
436    }
437}
438
439U_NAMESPACE_END
440
441#endif  // !UCONFIG_NO_COLLATION
442