1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2013-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* collationdatawriter.cpp
9*
10* created on: 2013aug06
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/tblcoll.h"
19#include "unicode/udata.h"
20#include "unicode/uniset.h"
21#include "cmemory.h"
22#include "collationdata.h"
23#include "collationdatabuilder.h"
24#include "collationdatareader.h"
25#include "collationdatawriter.h"
26#include "collationfastlatin.h"
27#include "collationsettings.h"
28#include "collationtailoring.h"
29#include "uassert.h"
30#include "ucmndata.h"
31
32U_NAMESPACE_BEGIN
33
34uint8_t *
35RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
36    if(U_FAILURE(errorCode)) { return NULL; }
37    LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
38    if(buffer.isNull()) {
39        errorCode = U_MEMORY_ALLOCATION_ERROR;
40        return NULL;
41    }
42    length = cloneBinary(buffer.getAlias(), 20000, errorCode);
43    if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
44        if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
45            errorCode = U_MEMORY_ALLOCATION_ERROR;
46            return NULL;
47        }
48        errorCode = U_ZERO_ERROR;
49        length = cloneBinary(buffer.getAlias(), length, errorCode);
50    }
51    if(U_FAILURE(errorCode)) { return NULL; }
52    return buffer.orphan();
53}
54
55int32_t
56RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
57    int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
58    return CollationDataWriter::writeTailoring(
59            *tailoring, *settings, indexes, dest, capacity,
60            errorCode);
61}
62
63static const UDataInfo dataInfo = {
64    sizeof(UDataInfo),
65    0,
66
67    U_IS_BIG_ENDIAN,
68    U_CHARSET_FAMILY,
69    U_SIZEOF_UCHAR,
70    0,
71
72    { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
73    { 5, 0, 0, 0 },                     // formatVersion
74    { 6, 3, 0, 0 }                      // dataVersion
75};
76
77int32_t
78CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
79                               const void *rootElements, int32_t rootElementsLength,
80                               int32_t indexes[], uint8_t *dest, int32_t capacity,
81                               UErrorCode &errorCode) {
82    return write(TRUE, NULL,
83                 data, settings,
84                 rootElements, rootElementsLength,
85                 indexes, dest, capacity, errorCode);
86}
87
88int32_t
89CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
90                                    int32_t indexes[], uint8_t *dest, int32_t capacity,
91                                    UErrorCode &errorCode) {
92    return write(FALSE, t.version,
93                 *t.data, settings,
94                 NULL, 0,
95                 indexes, dest, capacity, errorCode);
96}
97
98int32_t
99CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
100                           const CollationData &data, const CollationSettings &settings,
101                           const void *rootElements, int32_t rootElementsLength,
102                           int32_t indexes[], uint8_t *dest, int32_t capacity,
103                           UErrorCode &errorCode) {
104    if(U_FAILURE(errorCode)) { return 0; }
105    if(capacity < 0 || (capacity > 0 && dest == NULL)) {
106        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
107        return 0;
108    }
109
110    // Figure out which data items to write before settling on
111    // the indexes length and writing offsets.
112    // For any data item, we need to write the start and limit offsets,
113    // so the indexes length must be at least index-of-start-offset + 2.
114    int32_t indexesLength;
115    UBool hasMappings;
116    UnicodeSet unsafeBackwardSet;
117    const CollationData *baseData = data.base;
118
119    int32_t fastLatinVersion;
120    if(data.fastLatinTable != NULL) {
121        fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
122    } else {
123        fastLatinVersion = 0;
124    }
125    int32_t fastLatinTableLength = 0;
126
127    if(isBase) {
128        // For the root collator, we write an even number of indexes
129        // so that we start with an 8-aligned offset.
130        indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
131        U_ASSERT(settings.reorderCodesLength == 0);
132        hasMappings = TRUE;
133        unsafeBackwardSet = *data.unsafeBackwardSet;
134        fastLatinTableLength = data.fastLatinTableLength;
135    } else if(baseData == NULL) {
136        hasMappings = FALSE;
137        if(settings.reorderCodesLength == 0) {
138            // only options
139            indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
140        } else {
141            // only options, reorder codes, and the reorder table
142            indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
143        }
144    } else {
145        hasMappings = TRUE;
146        // Tailored mappings, and what else?
147        // Check in ascending order of optional tailoring data items.
148        indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
149        if(data.contextsLength != 0) {
150            indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
151        }
152        unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
153        if(!unsafeBackwardSet.isEmpty()) {
154            indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
155        }
156        if(data.fastLatinTable != baseData->fastLatinTable) {
157            fastLatinTableLength = data.fastLatinTableLength;
158            indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
159        }
160    }
161
162    UVector32 codesAndRanges(errorCode);
163    const int32_t *reorderCodes = settings.reorderCodes;
164    int32_t reorderCodesLength = settings.reorderCodesLength;
165    if(settings.hasReordering() &&
166            CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
167        // Rebuild the full list of reorder ranges.
168        // The list in the settings is truncated for efficiency.
169        data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
170        // Write the codes, then the ranges.
171        for(int32_t i = 0; i < reorderCodesLength; ++i) {
172            codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
173        }
174        if(U_FAILURE(errorCode)) { return 0; }
175        reorderCodes = codesAndRanges.getBuffer();
176        reorderCodesLength = codesAndRanges.size();
177    }
178
179    int32_t headerSize;
180    if(isBase) {
181        headerSize = 0;  // udata_create() writes the header
182    } else {
183        DataHeader header;
184        header.dataHeader.magic1 = 0xda;
185        header.dataHeader.magic2 = 0x27;
186        uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
187        uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
188        headerSize = (int32_t)sizeof(header);
189        U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
190        if(hasMappings && data.cesLength != 0) {
191            // Sum of the sizes of the data items which are
192            // not automatically multiples of 8 bytes and which are placed before the CEs.
193            int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
194            if((sum & 7) != 0) {
195                // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
196                // We add to the header size here.
197                // Alternatively, we could increment the indexesLength
198                // or add a few bytes to the reorderTable.
199                headerSize += 4;
200            }
201        }
202        header.dataHeader.headerSize = (uint16_t)headerSize;
203        if(headerSize <= capacity) {
204            uprv_memcpy(dest, &header, sizeof(header));
205            // Write 00 bytes so that the padding is not mistaken for a copyright string.
206            uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
207            dest += headerSize;
208            capacity -= headerSize;
209        } else {
210            dest = NULL;
211            capacity = 0;
212        }
213    }
214
215    indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
216    U_ASSERT((settings.options & ~0xffff) == 0);
217    indexes[CollationDataReader::IX_OPTIONS] =
218            data.numericPrimary | fastLatinVersion | settings.options;
219    indexes[CollationDataReader::IX_RESERVED2] = 0;
220    indexes[CollationDataReader::IX_RESERVED3] = 0;
221
222    // Byte offsets of data items all start from the start of the indexes.
223    // We add the headerSize at the very end.
224    int32_t totalSize = indexesLength * 4;
225
226    if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
227        indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
228    } else {
229        indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
230    }
231
232    indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
233    totalSize += reorderCodesLength * 4;
234
235    indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
236    if(settings.reorderTable != NULL) {
237        totalSize += 256;
238    }
239
240    indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
241    if(hasMappings) {
242        UErrorCode errorCode2 = U_ZERO_ERROR;
243        int32_t length;
244        if(totalSize < capacity) {
245            length = utrie2_serialize(data.trie, dest + totalSize,
246                                      capacity - totalSize, &errorCode2);
247        } else {
248            length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
249        }
250        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
251            errorCode = errorCode2;
252            return 0;
253        }
254        // The trie size should be a multiple of 8 bytes due to the way
255        // compactIndex2(UNewTrie2 *trie) currently works.
256        U_ASSERT((length & 7) == 0);
257        totalSize += length;
258    }
259
260    indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
261    indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
262    if(hasMappings && data.cesLength != 0) {
263        U_ASSERT(((headerSize + totalSize) & 7) == 0);
264        totalSize += data.cesLength * 8;
265    }
266
267    indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
268    indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
269    if(hasMappings) {
270        totalSize += data.ce32sLength * 4;
271    }
272
273    indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
274    totalSize += rootElementsLength * 4;
275
276    indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
277    if(hasMappings) {
278        totalSize += data.contextsLength * 2;
279    }
280
281    indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
282    if(hasMappings && !unsafeBackwardSet.isEmpty()) {
283        UErrorCode errorCode2 = U_ZERO_ERROR;
284        int32_t length;
285        if(totalSize < capacity) {
286            uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
287            length = unsafeBackwardSet.serialize(
288                    p, (capacity - totalSize) / 2, errorCode2);
289        } else {
290            length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
291        }
292        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
293            errorCode = errorCode2;
294            return 0;
295        }
296        totalSize += length * 2;
297    }
298
299    indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
300    totalSize += fastLatinTableLength * 2;
301
302    UnicodeString scripts;
303    indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
304    if(isBase) {
305        scripts.append((UChar)data.numScripts);
306        scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
307        scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
308        totalSize += scripts.length() * 2;
309    }
310
311    indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
312    if(isBase) {
313        totalSize += 256;
314    }
315
316    indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
317    indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
318
319    if(totalSize > capacity) {
320        errorCode = U_BUFFER_OVERFLOW_ERROR;
321        return headerSize + totalSize;
322    }
323
324    uprv_memcpy(dest, indexes, indexesLength * 4);
325    copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
326    copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
327    // The trie has already been serialized into the dest buffer.
328    copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
329    copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
330    copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
331    copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
332    // The unsafeBackwardSet has already been serialized into the dest buffer.
333    copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
334    copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
335    copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
336
337    return headerSize + totalSize;
338}
339
340void
341CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
342                              const void *src, uint8_t *dest) {
343    int32_t start = indexes[startIndex];
344    int32_t limit = indexes[startIndex + 1];
345    if(start < limit) {
346        uprv_memcpy(dest + start, src, limit - start);
347    }
348}
349
350U_NAMESPACE_END
351
352#endif  // !UCONFIG_NO_COLLATION
353