1/*
2*******************************************************************************
3* Copyright (C) 2012-2015, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationdata.cpp
7*
8* created on: 2012jul28
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/ucol.h"
17#include "unicode/udata.h"
18#include "unicode/uscript.h"
19#include "cmemory.h"
20#include "collation.h"
21#include "collationdata.h"
22#include "uassert.h"
23#include "utrie2.h"
24#include "uvectr32.h"
25
26U_NAMESPACE_BEGIN
27
28uint32_t
29CollationData::getIndirectCE32(uint32_t ce32) const {
30    U_ASSERT(Collation::isSpecialCE32(ce32));
31    int32_t tag = Collation::tagFromCE32(ce32);
32    if(tag == Collation::DIGIT_TAG) {
33        // Fetch the non-numeric-collation CE32.
34        ce32 = ce32s[Collation::indexFromCE32(ce32)];
35    } else if(tag == Collation::LEAD_SURROGATE_TAG) {
36        ce32 = Collation::UNASSIGNED_CE32;
37    } else if(tag == Collation::U0000_TAG) {
38        // Fetch the normal ce32 for U+0000.
39        ce32 = ce32s[0];
40    }
41    return ce32;
42}
43
44uint32_t
45CollationData::getFinalCE32(uint32_t ce32) const {
46    if(Collation::isSpecialCE32(ce32)) {
47        ce32 = getIndirectCE32(ce32);
48    }
49    return ce32;
50}
51
52int64_t
53CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
54    if(U_FAILURE(errorCode)) { return 0; }
55    // Keep parallel with CollationDataBuilder::getSingleCE().
56    const CollationData *d;
57    uint32_t ce32 = getCE32(c);
58    if(ce32 == Collation::FALLBACK_CE32) {
59        d = base;
60        ce32 = base->getCE32(c);
61    } else {
62        d = this;
63    }
64    while(Collation::isSpecialCE32(ce32)) {
65        switch(Collation::tagFromCE32(ce32)) {
66        case Collation::LATIN_EXPANSION_TAG:
67        case Collation::BUILDER_DATA_TAG:
68        case Collation::PREFIX_TAG:
69        case Collation::CONTRACTION_TAG:
70        case Collation::HANGUL_TAG:
71        case Collation::LEAD_SURROGATE_TAG:
72            errorCode = U_UNSUPPORTED_ERROR;
73            return 0;
74        case Collation::FALLBACK_TAG:
75        case Collation::RESERVED_TAG_3:
76            errorCode = U_INTERNAL_PROGRAM_ERROR;
77            return 0;
78        case Collation::LONG_PRIMARY_TAG:
79            return Collation::ceFromLongPrimaryCE32(ce32);
80        case Collation::LONG_SECONDARY_TAG:
81            return Collation::ceFromLongSecondaryCE32(ce32);
82        case Collation::EXPANSION32_TAG:
83            if(Collation::lengthFromCE32(ce32) == 1) {
84                ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
85                break;
86            } else {
87                errorCode = U_UNSUPPORTED_ERROR;
88                return 0;
89            }
90        case Collation::EXPANSION_TAG: {
91            if(Collation::lengthFromCE32(ce32) == 1) {
92                return d->ces[Collation::indexFromCE32(ce32)];
93            } else {
94                errorCode = U_UNSUPPORTED_ERROR;
95                return 0;
96            }
97        }
98        case Collation::DIGIT_TAG:
99            // Fetch the non-numeric-collation CE32 and continue.
100            ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
101            break;
102        case Collation::U0000_TAG:
103            U_ASSERT(c == 0);
104            // Fetch the normal ce32 for U+0000 and continue.
105            ce32 = d->ce32s[0];
106            break;
107        case Collation::OFFSET_TAG:
108            return d->getCEFromOffsetCE32(c, ce32);
109        case Collation::IMPLICIT_TAG:
110            return Collation::unassignedCEFromCodePoint(c);
111        }
112    }
113    return Collation::ceFromSimpleCE32(ce32);
114}
115
116uint32_t
117CollationData::getFirstPrimaryForGroup(int32_t script) const {
118    int32_t index = getScriptIndex(script);
119    return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
120}
121
122uint32_t
123CollationData::getLastPrimaryForGroup(int32_t script) const {
124    int32_t index = getScriptIndex(script);
125    if(index == 0) {
126        return 0;
127    }
128    uint32_t limit = scriptStarts[index + 1];
129    return (limit << 16) - 1;
130}
131
132int32_t
133CollationData::getGroupForPrimary(uint32_t p) const {
134    p >>= 16;
135    if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
136        return -1;
137    }
138    int32_t index = 1;
139    while(p >= scriptStarts[index + 1]) { ++index; }
140    for(int32_t i = 0; i < numScripts; ++i) {
141        if(scriptsIndex[i] == index) {
142            return i;
143        }
144    }
145    for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
146        if(scriptsIndex[numScripts + i] == index) {
147            return UCOL_REORDER_CODE_FIRST + i;
148        }
149    }
150    return -1;
151}
152
153int32_t
154CollationData::getScriptIndex(int32_t script) const {
155    if(script < 0) {
156        return 0;
157    } else if(script < numScripts) {
158        return scriptsIndex[script];
159    } else if(script < UCOL_REORDER_CODE_FIRST) {
160        return 0;
161    } else {
162        script -= UCOL_REORDER_CODE_FIRST;
163        if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
164            return scriptsIndex[numScripts + script];
165        } else {
166            return 0;
167        }
168    }
169}
170
171int32_t
172CollationData::getEquivalentScripts(int32_t script,
173                                    int32_t dest[], int32_t capacity,
174                                    UErrorCode &errorCode) const {
175    if(U_FAILURE(errorCode)) { return 0; }
176    int32_t index = getScriptIndex(script);
177    if(index == 0) { return 0; }
178    if(script >= UCOL_REORDER_CODE_FIRST) {
179        // Special groups have no aliases.
180        if(capacity > 0) {
181            dest[0] = script;
182        } else {
183            errorCode = U_BUFFER_OVERFLOW_ERROR;
184        }
185        return 1;
186    }
187
188    int32_t length = 0;
189    for(int32_t i = 0; i < numScripts; ++i) {
190        if(scriptsIndex[i] == index) {
191            if(length < capacity) {
192                dest[length] = i;
193            }
194            ++length;
195        }
196    }
197    if(length > capacity) {
198        errorCode = U_BUFFER_OVERFLOW_ERROR;
199    }
200    return length;
201}
202
203void
204CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
205                                 UVector32 &ranges, UErrorCode &errorCode) const {
206    makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
207}
208
209void
210CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
211                                 UBool latinMustMove,
212                                 UVector32 &ranges, UErrorCode &errorCode) const {
213    if(U_FAILURE(errorCode)) { return; }
214    ranges.removeAllElements();
215    if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
216        return;
217    }
218
219    // Maps each script-or-group range to a new lead byte.
220    uint8_t table[MAX_NUM_SCRIPT_RANGES];
221    uprv_memset(table, 0, sizeof(table));
222
223    {
224        // Set "don't care" values for reserved ranges.
225        int32_t index = scriptsIndex[
226                numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
227        if(index != 0) {
228            table[index] = 0xff;
229        }
230        index = scriptsIndex[
231                numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
232        if(index != 0) {
233            table[index] = 0xff;
234        }
235    }
236
237    // Never reorder special low and high primary lead bytes.
238    U_ASSERT(scriptStartsLength >= 2);
239    U_ASSERT(scriptStarts[0] == 0);
240    int32_t lowStart = scriptStarts[1];
241    U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
242    int32_t highLimit = scriptStarts[scriptStartsLength - 1];
243    U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
244
245    // Get the set of special reorder codes in the input list.
246    // This supports a fixed number of special reorder codes;
247    // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
248    uint32_t specials = 0;
249    for(int32_t i = 0; i < length; ++i) {
250        int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
251        if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
252            specials |= (uint32_t)1 << reorderCode;
253        }
254    }
255
256    // Start the reordering with the special low reorder codes that do not occur in the input.
257    for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
258        int32_t index = scriptsIndex[numScripts + i];
259        if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
260            lowStart = addLowScriptRange(table, index, lowStart);
261        }
262    }
263
264    // Skip the reserved range before Latin if Latin is the first script,
265    // so that we do not move it unnecessarily.
266    int32_t skippedReserved = 0;
267    if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
268        int32_t index = scriptsIndex[USCRIPT_LATIN];
269        U_ASSERT(index != 0);
270        int32_t start = scriptStarts[index];
271        U_ASSERT(lowStart <= start);
272        skippedReserved = start - lowStart;
273        lowStart = start;
274    }
275
276    // Reorder according to the input scripts, continuing from the bottom of the primary range.
277    int32_t originalLength = length;  // length will be decremented if "others" is in the list.
278    UBool hasReorderToEnd = FALSE;
279    for(int32_t i = 0; i < length;) {
280        int32_t script = reorder[i++];
281        if(script == USCRIPT_UNKNOWN) {
282            // Put the remaining scripts at the top.
283            hasReorderToEnd = TRUE;
284            while(i < length) {
285                script = reorder[--length];
286                if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
287                        script == UCOL_REORDER_CODE_DEFAULT) {
288                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
289                    return;
290                }
291                int32_t index = getScriptIndex(script);
292                if(index == 0) { continue; }
293                if(table[index] != 0) {  // Duplicate or equivalent script.
294                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
295                    return;
296                }
297                highLimit = addHighScriptRange(table, index, highLimit);
298            }
299            break;
300        }
301        if(script == UCOL_REORDER_CODE_DEFAULT) {
302            // The default code must be the only one in the list, and that is handled by the caller.
303            // Otherwise it must not be used.
304            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
305            return;
306        }
307        int32_t index = getScriptIndex(script);
308        if(index == 0) { continue; }
309        if(table[index] != 0) {  // Duplicate or equivalent script.
310            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
311            return;
312        }
313        lowStart = addLowScriptRange(table, index, lowStart);
314    }
315
316    // Put all remaining scripts into the middle.
317    for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
318        int32_t leadByte = table[i];
319        if(leadByte != 0) { continue; }
320        int32_t start = scriptStarts[i];
321        if(!hasReorderToEnd && start > lowStart) {
322            // No need to move this script.
323            lowStart = start;
324        }
325        lowStart = addLowScriptRange(table, i, lowStart);
326    }
327    if(lowStart > highLimit) {
328        if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
329            // Try not skipping the before-Latin reserved range.
330            makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
331            return;
332        }
333        // We need more primary lead bytes than available, despite the reserved ranges.
334        errorCode = U_BUFFER_OVERFLOW_ERROR;
335        return;
336    }
337
338    // Turn lead bytes into a list of (limit, offset) pairs.
339    // Encode each pair in one list element:
340    // Upper 16 bits = limit, lower 16 = signed lead byte offset.
341    int32_t offset = 0;
342    for(int32_t i = 1;; ++i) {
343        int32_t nextOffset = offset;
344        while(i < scriptStartsLength - 1) {
345            int32_t newLeadByte = table[i];
346            if(newLeadByte == 0xff) {
347                // "Don't care" lead byte for reserved range, continue with current offset.
348            } else {
349                nextOffset = newLeadByte - (scriptStarts[i] >> 8);
350                if(nextOffset != offset) { break; }
351            }
352            ++i;
353        }
354        if(offset != 0 || i < scriptStartsLength - 1) {
355            ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
356        }
357        if(i == scriptStartsLength - 1) { break; }
358        offset = nextOffset;
359    }
360}
361
362int32_t
363CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
364    int32_t start = scriptStarts[index];
365    if((start & 0xff) < (lowStart & 0xff)) {
366        lowStart += 0x100;
367    }
368    table[index] = (uint8_t)(lowStart >> 8);
369    int32_t limit = scriptStarts[index + 1];
370    lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
371    return lowStart;
372}
373
374int32_t
375CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
376    int32_t limit = scriptStarts[index + 1];
377    if((limit & 0xff) > (highLimit & 0xff)) {
378        highLimit -= 0x100;
379    }
380    int32_t start = scriptStarts[index];
381    highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
382    table[index] = (uint8_t)(highLimit >> 8);
383    return highLimit;
384}
385
386U_NAMESPACE_END
387
388#endif  // !UCONFIG_NO_COLLATION
389