1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2012-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* collationdata.cpp
9*
10* created on: 2012jul28
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/ucol.h"
19#include "unicode/udata.h"
20#include "unicode/uscript.h"
21#include "cmemory.h"
22#include "collation.h"
23#include "collationdata.h"
24#include "uassert.h"
25#include "utrie2.h"
26#include "uvectr32.h"
27
28U_NAMESPACE_BEGIN
29
30uint32_t
31CollationData::getIndirectCE32(uint32_t ce32) const {
32    U_ASSERT(Collation::isSpecialCE32(ce32));
33    int32_t tag = Collation::tagFromCE32(ce32);
34    if(tag == Collation::DIGIT_TAG) {
35        // Fetch the non-numeric-collation CE32.
36        ce32 = ce32s[Collation::indexFromCE32(ce32)];
37    } else if(tag == Collation::LEAD_SURROGATE_TAG) {
38        ce32 = Collation::UNASSIGNED_CE32;
39    } else if(tag == Collation::U0000_TAG) {
40        // Fetch the normal ce32 for U+0000.
41        ce32 = ce32s[0];
42    }
43    return ce32;
44}
45
46uint32_t
47CollationData::getFinalCE32(uint32_t ce32) const {
48    if(Collation::isSpecialCE32(ce32)) {
49        ce32 = getIndirectCE32(ce32);
50    }
51    return ce32;
52}
53
54int64_t
55CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
56    if(U_FAILURE(errorCode)) { return 0; }
57    // Keep parallel with CollationDataBuilder::getSingleCE().
58    const CollationData *d;
59    uint32_t ce32 = getCE32(c);
60    if(ce32 == Collation::FALLBACK_CE32) {
61        d = base;
62        ce32 = base->getCE32(c);
63    } else {
64        d = this;
65    }
66    while(Collation::isSpecialCE32(ce32)) {
67        switch(Collation::tagFromCE32(ce32)) {
68        case Collation::LATIN_EXPANSION_TAG:
69        case Collation::BUILDER_DATA_TAG:
70        case Collation::PREFIX_TAG:
71        case Collation::CONTRACTION_TAG:
72        case Collation::HANGUL_TAG:
73        case Collation::LEAD_SURROGATE_TAG:
74            errorCode = U_UNSUPPORTED_ERROR;
75            return 0;
76        case Collation::FALLBACK_TAG:
77        case Collation::RESERVED_TAG_3:
78            errorCode = U_INTERNAL_PROGRAM_ERROR;
79            return 0;
80        case Collation::LONG_PRIMARY_TAG:
81            return Collation::ceFromLongPrimaryCE32(ce32);
82        case Collation::LONG_SECONDARY_TAG:
83            return Collation::ceFromLongSecondaryCE32(ce32);
84        case Collation::EXPANSION32_TAG:
85            if(Collation::lengthFromCE32(ce32) == 1) {
86                ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
87                break;
88            } else {
89                errorCode = U_UNSUPPORTED_ERROR;
90                return 0;
91            }
92        case Collation::EXPANSION_TAG: {
93            if(Collation::lengthFromCE32(ce32) == 1) {
94                return d->ces[Collation::indexFromCE32(ce32)];
95            } else {
96                errorCode = U_UNSUPPORTED_ERROR;
97                return 0;
98            }
99        }
100        case Collation::DIGIT_TAG:
101            // Fetch the non-numeric-collation CE32 and continue.
102            ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
103            break;
104        case Collation::U0000_TAG:
105            U_ASSERT(c == 0);
106            // Fetch the normal ce32 for U+0000 and continue.
107            ce32 = d->ce32s[0];
108            break;
109        case Collation::OFFSET_TAG:
110            return d->getCEFromOffsetCE32(c, ce32);
111        case Collation::IMPLICIT_TAG:
112            return Collation::unassignedCEFromCodePoint(c);
113        }
114    }
115    return Collation::ceFromSimpleCE32(ce32);
116}
117
118uint32_t
119CollationData::getFirstPrimaryForGroup(int32_t script) const {
120    int32_t index = getScriptIndex(script);
121    return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
122}
123
124uint32_t
125CollationData::getLastPrimaryForGroup(int32_t script) const {
126    int32_t index = getScriptIndex(script);
127    if(index == 0) {
128        return 0;
129    }
130    uint32_t limit = scriptStarts[index + 1];
131    return (limit << 16) - 1;
132}
133
134int32_t
135CollationData::getGroupForPrimary(uint32_t p) const {
136    p >>= 16;
137    if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
138        return -1;
139    }
140    int32_t index = 1;
141    while(p >= scriptStarts[index + 1]) { ++index; }
142    for(int32_t i = 0; i < numScripts; ++i) {
143        if(scriptsIndex[i] == index) {
144            return i;
145        }
146    }
147    for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
148        if(scriptsIndex[numScripts + i] == index) {
149            return UCOL_REORDER_CODE_FIRST + i;
150        }
151    }
152    return -1;
153}
154
155int32_t
156CollationData::getScriptIndex(int32_t script) const {
157    if(script < 0) {
158        return 0;
159    } else if(script < numScripts) {
160        return scriptsIndex[script];
161    } else if(script < UCOL_REORDER_CODE_FIRST) {
162        return 0;
163    } else {
164        script -= UCOL_REORDER_CODE_FIRST;
165        if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
166            return scriptsIndex[numScripts + script];
167        } else {
168            return 0;
169        }
170    }
171}
172
173int32_t
174CollationData::getEquivalentScripts(int32_t script,
175                                    int32_t dest[], int32_t capacity,
176                                    UErrorCode &errorCode) const {
177    if(U_FAILURE(errorCode)) { return 0; }
178    int32_t index = getScriptIndex(script);
179    if(index == 0) { return 0; }
180    if(script >= UCOL_REORDER_CODE_FIRST) {
181        // Special groups have no aliases.
182        if(capacity > 0) {
183            dest[0] = script;
184        } else {
185            errorCode = U_BUFFER_OVERFLOW_ERROR;
186        }
187        return 1;
188    }
189
190    int32_t length = 0;
191    for(int32_t i = 0; i < numScripts; ++i) {
192        if(scriptsIndex[i] == index) {
193            if(length < capacity) {
194                dest[length] = i;
195            }
196            ++length;
197        }
198    }
199    if(length > capacity) {
200        errorCode = U_BUFFER_OVERFLOW_ERROR;
201    }
202    return length;
203}
204
205void
206CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
207                                 UVector32 &ranges, UErrorCode &errorCode) const {
208    makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
209}
210
211void
212CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
213                                 UBool latinMustMove,
214                                 UVector32 &ranges, UErrorCode &errorCode) const {
215    if(U_FAILURE(errorCode)) { return; }
216    ranges.removeAllElements();
217    if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
218        return;
219    }
220
221    // Maps each script-or-group range to a new lead byte.
222    uint8_t table[MAX_NUM_SCRIPT_RANGES];
223    uprv_memset(table, 0, sizeof(table));
224
225    {
226        // Set "don't care" values for reserved ranges.
227        int32_t index = scriptsIndex[
228                numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
229        if(index != 0) {
230            table[index] = 0xff;
231        }
232        index = scriptsIndex[
233                numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
234        if(index != 0) {
235            table[index] = 0xff;
236        }
237    }
238
239    // Never reorder special low and high primary lead bytes.
240    U_ASSERT(scriptStartsLength >= 2);
241    U_ASSERT(scriptStarts[0] == 0);
242    int32_t lowStart = scriptStarts[1];
243    U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
244    int32_t highLimit = scriptStarts[scriptStartsLength - 1];
245    U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
246
247    // Get the set of special reorder codes in the input list.
248    // This supports a fixed number of special reorder codes;
249    // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
250    uint32_t specials = 0;
251    for(int32_t i = 0; i < length; ++i) {
252        int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
253        if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
254            specials |= (uint32_t)1 << reorderCode;
255        }
256    }
257
258    // Start the reordering with the special low reorder codes that do not occur in the input.
259    for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
260        int32_t index = scriptsIndex[numScripts + i];
261        if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
262            lowStart = addLowScriptRange(table, index, lowStart);
263        }
264    }
265
266    // Skip the reserved range before Latin if Latin is the first script,
267    // so that we do not move it unnecessarily.
268    int32_t skippedReserved = 0;
269    if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
270        int32_t index = scriptsIndex[USCRIPT_LATIN];
271        U_ASSERT(index != 0);
272        int32_t start = scriptStarts[index];
273        U_ASSERT(lowStart <= start);
274        skippedReserved = start - lowStart;
275        lowStart = start;
276    }
277
278    // Reorder according to the input scripts, continuing from the bottom of the primary range.
279    int32_t originalLength = length;  // length will be decremented if "others" is in the list.
280    UBool hasReorderToEnd = FALSE;
281    for(int32_t i = 0; i < length;) {
282        int32_t script = reorder[i++];
283        if(script == USCRIPT_UNKNOWN) {
284            // Put the remaining scripts at the top.
285            hasReorderToEnd = TRUE;
286            while(i < length) {
287                script = reorder[--length];
288                if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
289                        script == UCOL_REORDER_CODE_DEFAULT) {
290                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
291                    return;
292                }
293                int32_t index = getScriptIndex(script);
294                if(index == 0) { continue; }
295                if(table[index] != 0) {  // Duplicate or equivalent script.
296                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
297                    return;
298                }
299                highLimit = addHighScriptRange(table, index, highLimit);
300            }
301            break;
302        }
303        if(script == UCOL_REORDER_CODE_DEFAULT) {
304            // The default code must be the only one in the list, and that is handled by the caller.
305            // Otherwise it must not be used.
306            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
307            return;
308        }
309        int32_t index = getScriptIndex(script);
310        if(index == 0) { continue; }
311        if(table[index] != 0) {  // Duplicate or equivalent script.
312            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
313            return;
314        }
315        lowStart = addLowScriptRange(table, index, lowStart);
316    }
317
318    // Put all remaining scripts into the middle.
319    for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
320        int32_t leadByte = table[i];
321        if(leadByte != 0) { continue; }
322        int32_t start = scriptStarts[i];
323        if(!hasReorderToEnd && start > lowStart) {
324            // No need to move this script.
325            lowStart = start;
326        }
327        lowStart = addLowScriptRange(table, i, lowStart);
328    }
329    if(lowStart > highLimit) {
330        if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
331            // Try not skipping the before-Latin reserved range.
332            makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
333            return;
334        }
335        // We need more primary lead bytes than available, despite the reserved ranges.
336        errorCode = U_BUFFER_OVERFLOW_ERROR;
337        return;
338    }
339
340    // Turn lead bytes into a list of (limit, offset) pairs.
341    // Encode each pair in one list element:
342    // Upper 16 bits = limit, lower 16 = signed lead byte offset.
343    int32_t offset = 0;
344    for(int32_t i = 1;; ++i) {
345        int32_t nextOffset = offset;
346        while(i < scriptStartsLength - 1) {
347            int32_t newLeadByte = table[i];
348            if(newLeadByte == 0xff) {
349                // "Don't care" lead byte for reserved range, continue with current offset.
350            } else {
351                nextOffset = newLeadByte - (scriptStarts[i] >> 8);
352                if(nextOffset != offset) { break; }
353            }
354            ++i;
355        }
356        if(offset != 0 || i < scriptStartsLength - 1) {
357            ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
358        }
359        if(i == scriptStartsLength - 1) { break; }
360        offset = nextOffset;
361    }
362}
363
364int32_t
365CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
366    int32_t start = scriptStarts[index];
367    if((start & 0xff) < (lowStart & 0xff)) {
368        lowStart += 0x100;
369    }
370    table[index] = (uint8_t)(lowStart >> 8);
371    int32_t limit = scriptStarts[index + 1];
372    lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
373    return lowStart;
374}
375
376int32_t
377CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
378    int32_t limit = scriptStarts[index + 1];
379    if((limit & 0xff) > (highLimit & 0xff)) {
380        highLimit -= 0x100;
381    }
382    int32_t start = scriptStarts[index];
383    highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
384    table[index] = (uint8_t)(highLimit >> 8);
385    return highLimit;
386}
387
388U_NAMESPACE_END
389
390#endif  // !UCONFIG_NO_COLLATION
391