1/*
2*******************************************************************************
3* Copyright (C) 2012-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationbasedatabuilder.cpp
7*
8* created on: 2012aug11
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/localpointer.h"
17#include "unicode/ucharstriebuilder.h"
18#include "unicode/uniset.h"
19#include "unicode/unistr.h"
20#include "unicode/utf16.h"
21#include "collation.h"
22#include "collationbasedatabuilder.h"
23#include "collationdata.h"
24#include "collationdatabuilder.h"
25#include "collationrootelements.h"
26#include "normalizer2impl.h"
27#include "uassert.h"
28#include "utrie2.h"
29#include "uvectr32.h"
30#include "uvectr64.h"
31#include "uvector.h"
32
33U_NAMESPACE_BEGIN
34
35namespace {
36
37/**
38 * Compare two signed int64_t values as if they were unsigned.
39 */
40int32_t
41compareInt64AsUnsigned(int64_t a, int64_t b) {
42    if((uint64_t)a < (uint64_t)b) {
43        return -1;
44    } else if((uint64_t)a > (uint64_t)b) {
45        return 1;
46    } else {
47        return 0;
48    }
49}
50
51// TODO: Try to merge this with the binarySearch in alphaindex.cpp.
52/**
53 * Like Java Collections.binarySearch(List, String, Comparator).
54 *
55 * @return the index>=0 where the item was found,
56 *         or the index<0 for inserting the string at ~index in sorted order
57 */
58int32_t
59binarySearch(const UVector64 &list, int64_t ce) {
60    if (list.size() == 0) { return ~0; }
61    int32_t start = 0;
62    int32_t limit = list.size();
63    for (;;) {
64        int32_t i = (start + limit) / 2;
65        int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i));
66        if (cmp == 0) {
67            return i;
68        } else if (cmp < 0) {
69            if (i == start) {
70                return ~start;  // insert ce before i
71            }
72            limit = i;
73        } else {
74            if (i == start) {
75                return ~(start + 1);  // insert ce after i
76            }
77            start = i;
78        }
79    }
80}
81
82}  // namespace
83
84CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode)
85        : CollationDataBuilder(errorCode),
86          numericPrimary(0x12000000),
87          firstHanPrimary(0), lastHanPrimary(0), hanStep(2),
88          rootElements(errorCode) {
89}
90
91CollationBaseDataBuilder::~CollationBaseDataBuilder() {
92}
93
94void
95CollationBaseDataBuilder::init(UErrorCode &errorCode) {
96    if(U_FAILURE(errorCode)) { return; }
97    if(trie != NULL) {
98        errorCode = U_INVALID_STATE_ERROR;
99        return;
100    }
101
102    // Not compressible:
103    // - digits
104    // - Latin
105    // - Hani
106    // - trail weights
107    // Some scripts are compressible, some are not.
108    uprv_memset(compressibleBytes, FALSE, 256);
109    compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = TRUE;
110
111    // For a base, the default is to compute an unassigned-character implicit CE.
112    // This includes surrogate code points; see the last option in
113    // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences.
114    trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode);
115
116    // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches.
117    for(UChar32 c = 0; c < 0x180; ++c) {
118        utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode);
119    }
120
121    utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode);
122    // No root element for the merge separator which has 02 weights.
123    // Some code assumes that the root first primary CE is the "space first primary"
124    // from FractionalUCA.txt.
125
126    uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
127    utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
128
129    // Add a mapping for the first-unassigned boundary,
130    // which is the AlphabeticIndex overflow boundary.
131    UnicodeString s((UChar)0xfdd1);  // Script boundary contractions start with U+FDD1.
132    s.append((UChar)0xfdd0);  // Zzzz script sample character U+FDD0.
133    int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY);
134    add(UnicodeString(), s, &ce, 1, errorCode);
135
136    // Add a tailoring boundary, but not a mapping, for [first trailing].
137    ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
138    rootElements.addElement(ce, errorCode);
139
140    // U+FFFD maps to a CE with the third-highest primary weight,
141    // for predictable handling of ill-formed UTF-8.
142    uint32_t ce32 = Collation::FFFD_CE32;
143    utrie2_set32(trie, 0xfffd, ce32, &errorCode);
144    addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
145
146    // U+FFFF maps to a CE with the highest primary weight.
147    ce32 = Collation::MAX_REGULAR_CE32;
148    utrie2_set32(trie, 0xffff, ce32, &errorCode);
149    addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode);
150}
151
152void
153CollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length,
154                                        UErrorCode &errorCode) {
155    if(U_FAILURE(errorCode) || length == 0) { return; }
156    if((length & 1) != 0) {  // incomplete start/end pairs
157        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
158        return;
159    }
160    if(isAssigned(0x4e00)) {  // already set
161        errorCode = U_INVALID_STATE_ERROR;
162        return;
163    }
164    int32_t numHanCodePoints = 0;
165    for(int32_t i = 0; i < length; i += 2) {
166        UChar32 start = ranges[i];
167        UChar32 end = ranges[i + 1];
168        numHanCodePoints += end - start + 1;
169    }
170    // Multiply the number of code points by (gap+1).
171    // Add hanStep+2 for tailoring after the last Han character.
172    int32_t gap = 1;
173    hanStep = gap + 1;
174    int32_t numHan = numHanCodePoints * hanStep + hanStep + 2;
175    // Numbers of Han primaries per lead byte determined by
176    // numbers of 2nd (not compressible) times 3rd primary byte values.
177    int32_t numHanPerLeadByte = 254 * 254;
178    int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadByte;
179    uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHanLeadBytes) << 24;
180    hanPrimary |= 0x20200;
181    firstHanPrimary = hanPrimary;
182    for(int32_t i = 0; i < length; i += 2) {
183        UChar32 start = ranges[i];
184        UChar32 end = ranges[i + 1];
185        hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanStep, errorCode);
186    }
187    // One past the actual last one, but that is harmless for tailoring.
188    // It saves us from subtracting "hanStep" and handling underflows.
189    lastHanPrimary = hanPrimary;
190}
191
192UBool
193CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const {
194    return compressibleBytes[b];
195}
196
197void
198CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) {
199    compressibleBytes[b] = TRUE;
200}
201
202int32_t
203CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
204    if((p1 & 0xff000000) == (p2 & 0xff000000)) {
205        // Same lead bytes.
206        return (int32_t)(p2 - p1) >> 16;
207    } else {
208        int32_t linear1;
209        int32_t linear2;
210        int32_t factor;
211        if(isCompressible) {
212            // Second byte for compressible lead byte: 251 bytes 04..FE
213            linear1 = (int32_t)((p1 >> 16) & 0xff) - 4;
214            linear2 = (int32_t)((p2 >> 16) & 0xff) - 4;
215            factor = 251;
216        } else {
217            // Second byte for incompressible lead byte: 254 bytes 02..FF
218            linear1 = (int32_t)((p1 >> 16) & 0xff) - 2;
219            linear2 = (int32_t)((p2 >> 16) & 0xff) - 2;
220            factor = 254;
221        }
222        linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
223        linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
224        return linear2 - linear1;
225    }
226}
227
228int32_t
229CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) {
230    if((p1 & 0xffff0000) == (p2 & 0xffff0000)) {
231        // Same first two bytes.
232        return (int32_t)(p2 - p1) >> 8;
233    } else {
234        // Third byte: 254 bytes 02..FF
235        int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2;
236        int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2;
237        int32_t factor;
238        if(isCompressible) {
239            // Second byte for compressible lead byte: 251 bytes 04..FE
240            linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4);
241            linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4);
242            factor = 251 * 254;
243        } else {
244            // Second byte for incompressible lead byte: 254 bytes 02..FF
245            linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2);
246            linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2);
247            factor = 254 * 254;
248        }
249        linear1 += factor * (int32_t)((p1 >> 24) & 0xff);
250        linear2 += factor * (int32_t)((p2 >> 24) & 0xff);
251        return linear2 - linear1;
252    }
253}
254
255uint32_t
256CollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode) {
257    addRootElements(ces, cesLength, errorCode);
258    return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode);
259}
260
261void
262CollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength,
263                                          UErrorCode &errorCode) {
264    if(U_FAILURE(errorCode)) { return; }
265    for(int32_t i = 0; i < cesLength; ++i) {
266        addRootElement(ces[i], errorCode);
267    }
268}
269
270void
271CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) {
272    if(U_FAILURE(errorCode) || ce == 0) { return; }
273    // Remove case bits.
274    ce &= INT64_C(0xffffffffffff3fff);
275    U_ASSERT((ce & 0xc0) == 0);  // quaternary==0
276    // Ignore the CE if it has a Han primary weight and common secondary/tertiary weights.
277    // We will add it later, as part of the Han ranges.
278    uint32_t p = (uint32_t)(ce >> 32);
279    uint32_t secTer = (uint32_t)ce;
280    if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
281        if(firstHanPrimary <= p && p <= lastHanPrimary) {
282            return;
283        }
284    } else {
285        // Check that secondary and tertiary weights are >= "common".
286        uint32_t s = secTer >> 16;
287        uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK;
288        if((s != 0 && s < Collation::COMMON_WEIGHT16) || (t != 0 && t < Collation::COMMON_WEIGHT16)) {
289            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
290            return;
291        }
292    }
293    // Check that primaries have at most 3 bytes.
294    if((p & 0xff) != 0) {
295        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
296        return;
297    }
298    int32_t i = binarySearch(rootElements, ce);
299    if(i < 0) {
300        rootElements.insertElementAt(ce, ~i, errorCode);
301    }
302}
303
304void
305CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte,
306                                             const UnicodeString &groupScripts,
307                                             UErrorCode &errorCode) {
308    if(U_FAILURE(errorCode)) { return; }
309    if(groupScripts.isEmpty()) {
310        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
311        return;
312    }
313    if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) {
314        // Zzzz must not occur.
315        // It is the code used in the API to separate low and high scripts.
316        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
317        return;
318    }
319    // Note: We are mostly trusting the input data,
320    // rather than verifying that reordering groups do not intersect
321    // with their lead byte ranges nor their sets of scripts,
322    // and that all script codes are valid.
323    scripts.append((UChar)((firstByte << 8) | lastByte));
324    scripts.append((UChar)groupScripts.length());
325    scripts.append(groupScripts);
326}
327
328void
329CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
330    buildMappings(data, errorCode);
331    data.numericPrimary = numericPrimary;
332    data.compressibleBytes = compressibleBytes;
333    data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer());
334    data.scriptsLength = scripts.length();
335    buildFastLatinTable(data, errorCode);
336}
337
338void
339CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) {
340    if(U_FAILURE(errorCode)) { return; }
341    uint32_t nextHanPrimary = firstHanPrimary;  // Set to 0xffffffff after the last Han range.
342    uint32_t prevPrimary = 0;  // Start with primary ignorable CEs.
343    UBool tryRange = FALSE;
344    for(int32_t i = 0; i < rootElements.size(); ++i) {
345        int64_t ce = rootElements.elementAti(i);
346        uint32_t p = (uint32_t)(ce >> 32);
347        uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
348        if(p != prevPrimary) {
349            U_ASSERT((p & 0xff) == 0);
350            int32_t end;
351            if(p >= nextHanPrimary) {
352                // Add a Han primary weight or range.
353                // We omitted them initially, and omitted all CEs with Han primaries
354                // and common secondary/tertiary weights.
355                U_ASSERT(p > lastHanPrimary || secTer != Collation::COMMON_SEC_AND_TER_CE);
356                if(p == nextHanPrimary) {
357                    // One single Han primary with non-common secondary/tertiary weights.
358                    table.addElement((int32_t)p, errorCode);
359                    if(p < lastHanPrimary) {
360                        // Prepare for the next Han range.
361                        nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep);
362                    } else {
363                        // p is the last Han primary.
364                        nextHanPrimary = 0xffffffff;
365                    }
366                } else {
367                    // p > nextHanPrimary: Add a Han primary range, starting with nextHanPrimary.
368                    table.addElement((int32_t)nextHanPrimary, errorCode);
369                    if(nextHanPrimary == lastHanPrimary) {
370                        // nextHanPrimary == lastHanPrimary < p
371                        // We just wrote the single last Han primary.
372                        nextHanPrimary = 0xffffffff;
373                    } else if(p < lastHanPrimary) {
374                        // nextHanPrimary < p < lastHanPrimary
375                        // End the Han range on p, prepare for the next range.
376                        table.addElement((int32_t)p | hanStep, errorCode);
377                        nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep);
378                    } else if(p == lastHanPrimary) {
379                        // nextHanPrimary < p == lastHanPrimary
380                        // End the last Han range on p.
381                        table.addElement((int32_t)p | hanStep, errorCode);
382                        nextHanPrimary = 0xffffffff;
383                    } else {
384                        // nextHanPrimary < lastHanPrimary < p
385                        // End the last Han range, then write p.
386                        table.addElement((int32_t)lastHanPrimary | hanStep, errorCode);
387                        nextHanPrimary = 0xffffffff;
388                        table.addElement((int32_t)p, errorCode);
389                    }
390                }
391            } else if(tryRange && secTer == Collation::COMMON_SEC_AND_TER_CE &&
392                    (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) {
393                // Multiple CEs with only common secondary/tertiary weights were
394                // combined into a primary range.
395                // The range end was written, ending with the primary of rootElements[end].
396                ce = rootElements.elementAti(end);
397                p = (uint32_t)(ce >> 32);
398                secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
399                i = end;
400            } else {
401                // Write the primary weight of a normal CE.
402                table.addElement((int32_t)p, errorCode);
403            }
404            prevPrimary = p;
405        }
406        if(secTer == Collation::COMMON_SEC_AND_TER_CE) {
407            // The common secondar/tertiary weights are implied in the primary unit.
408            // If there is no intervening delta unit, then we will try to combine
409            // the next several primaries into a range.
410            tryRange = TRUE;
411        } else {
412            // For each new set of secondary/tertiary weights we write a delta unit.
413            table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode);
414            tryRange = FALSE;
415        }
416    }
417
418    // Limit sentinel for root elements.
419    // This allows us to reduce range checks at runtime.
420    table.addElement(CollationRootElements::PRIMARY_SENTINEL, errorCode);
421}
422
423int32_t
424CollationBaseDataBuilder::writeRootElementsRange(
425        uint32_t prevPrimary, uint32_t p, int32_t i,
426        UVector32 &table, UErrorCode &errorCode) {
427    if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; }
428    U_ASSERT(prevPrimary < p);
429    // No ranges of single-byte primaries.
430    if((p & prevPrimary & 0xff0000) == 0) { return 0; }
431    // Lead bytes of compressible primaries must match.
432    UBool isCompressible = isCompressiblePrimary(p);
433    if((isCompressible || isCompressiblePrimary(prevPrimary)) &&
434            (p & 0xff000000) != (prevPrimary & 0xff000000)) {
435        return 0;
436    }
437    // Number of bytes in the primaries.
438    UBool twoBytes;
439    // Number of primaries from prevPrimary to p.
440    int32_t step;
441    if((p & 0xff00) == 0) {
442        // 2-byte primary
443        if((prevPrimary & 0xff00) != 0) { return 0; }  // length mismatch
444        twoBytes = TRUE;
445        step = diffTwoBytePrimaries(prevPrimary, p, isCompressible);
446    } else {
447        // 3-byte primary
448        if((prevPrimary & 0xff00) == 0) { return 0; }  // length mismatch
449        twoBytes = FALSE;
450        step = diffThreeBytePrimaries(prevPrimary, p, isCompressible);
451    }
452    if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; }
453    // See if there are more than two CEs with primaries increasing by "step"
454    // and with only common secondary/tertiary weights on all but the last one.
455    int32_t end = 0;  // Initially 0: No range for just two primaries.
456    for(;;) {
457        prevPrimary = p;
458        // Calculate which primary we expect next.
459        uint32_t nextPrimary;  // = p + step
460        if(twoBytes) {
461            nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible, step);
462        } else {
463            nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressible, step);
464        }
465        // Fetch the actual next CE.
466        int64_t ce = rootElements.elementAti(i);
467        p = (uint32_t)(ce >> 32);
468        uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK;
469        // Does this primary increase by "step" from the last one?
470        if(p != nextPrimary ||
471                // Do not cross into a new lead byte if either is compressible.
472                ((p & 0xff000000) != (prevPrimary & 0xff000000) &&
473                    (isCompressible || isCompressiblePrimary(p)))) {
474            // The range ends with the previous CE.
475            p = prevPrimary;
476            break;
477        }
478        // Extend the range to include this primary.
479        end = i++;
480        // This primary is the last in the range if it has non-common weights
481        // or if we are at the end of the list.
482        if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size()) { break; }
483    }
484    if(end != 0) {
485        table.addElement((int32_t)p | step, errorCode);
486    }
487    return end;
488}
489
490U_NAMESPACE_END
491
492#endif  // !UCONFIG_NO_COLLATION
493