1/*
2*******************************************************************************
3*
4*   Copyright (C) 2001-2013, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucol_bld.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created 02/22/2001
14*   created by: Vladimir Weinstein
15*
16* This module builds a collator based on the rule set.
17*
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_COLLATION
23
24#include "unicode/ucoleitr.h"
25#include "unicode/udata.h"
26#include "unicode/uchar.h"
27#include "unicode/uniset.h"
28#include "unicode/uscript.h"
29#include "unicode/ustring.h"
30#include "unicode/utf16.h"
31#include "normalizer2impl.h"
32#include "uassert.h"
33#include "ucol_bld.h"
34#include "ucol_elm.h"
35#include "ucol_cnt.h"
36#include "ucln_in.h"
37#include "umutex.h"
38#include "cmemory.h"
39#include "cstring.h"
40
41#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
42
43static const InverseUCATableHeader* _staticInvUCA = NULL;
44static UDataMemory* invUCA_DATA_MEM = NULL;
45static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER;
46
47U_CDECL_BEGIN
48static UBool U_CALLCONV
49isAcceptableInvUCA(void * /*context*/,
50                   const char * /*type*/, const char * /*name*/,
51                   const UDataInfo *pInfo)
52{
53    /* context, type & name are intentionally not used */
54    if( pInfo->size>=20 &&
55        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
56        pInfo->charsetFamily==U_CHARSET_FAMILY &&
57        pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
58        pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
59        pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
60        pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
61        pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
62        pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
63        //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
64        //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
65        //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
66        )
67    {
68        // TODO: Check that the invuca data version (pInfo->dataVersion)
69        // matches the ucadata version.
70        return TRUE;
71    } else {
72        return FALSE;
73    }
74}
75U_CDECL_END
76
77/*
78* Takes two CEs (lead and continuation) and
79* compares them as CEs should be compared:
80* primary vs. primary, secondary vs. secondary
81* tertiary vs. tertiary
82*/
83static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
84    uint32_t s1 = source0, s2, t1 = target0, t2;
85    if(isContinuation(source1)) {
86        s2 = source1;
87    } else {
88        s2 = 0;
89    }
90    if(isContinuation(target1)) {
91        t2 = target1;
92    } else {
93        t2 = 0;
94    }
95
96    uint32_t s = 0, t = 0;
97    if(s1 == t1 && s2 == t2) {
98        return 0;
99    }
100    s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
101    t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
102    if(s < t) {
103        return -1;
104    } else if(s > t) {
105        return 1;
106    } else {
107        s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
108        t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
109        if(s < t) {
110            return -1;
111        } else if(s > t) {
112            return 1;
113        } else {
114            s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
115            t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
116            if(s < t) {
117                return -1;
118            } else {
119                return 1;
120            }
121        }
122    }
123}
124
125static
126int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
127    uint32_t bottom = 0, top = src->invUCA->tableSize;
128    uint32_t i = 0;
129    uint32_t first = 0, second = 0;
130    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
131    int32_t res = 0;
132
133    while(bottom < top-1) {
134        i = (top+bottom)/2;
135        first = *(CETable+3*i);
136        second = *(CETable+3*i+1);
137        res = compareCEs(first, second, CE, SecondCE);
138        if(res > 0) {
139            top = i;
140        } else if(res < 0) {
141            bottom = i;
142        } else {
143            break;
144        }
145    }
146
147    /* weiv:                                                  */
148    /* in searching for elements, I have removed the failure  */
149    /* The reason for this is that the builder does not rely  */
150    /* on search mechanism telling it that it didn't find an  */
151    /* element. However, indirect positioning relies on being */
152    /* able to find the elements around any CE, even if it is */
153    /* not defined in the UCA. */
154    return i;
155    /*
156    if((first == CE && second == SecondCE)) {
157    return i;
158    } else {
159    return -1;
160    }
161    */
162}
163
164static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
165    0xFFFF0000,
166    0xFFFFFF00,
167    0xFFFFFFFF
168};
169
170U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
171                                            uint32_t CE, uint32_t contCE,
172                                            uint32_t *nextCE, uint32_t *nextContCE,
173                                            uint32_t strength)
174{
175    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
176    int32_t iCE;
177
178    iCE = ucol_inv_findCE(src, CE, contCE);
179
180    if(iCE<0) {
181        *nextCE = UCOL_NOT_FOUND;
182        return -1;
183    }
184
185    CE &= strengthMask[strength];
186    contCE &= strengthMask[strength];
187
188    *nextCE = CE;
189    *nextContCE = contCE;
190
191    while((*nextCE  & strengthMask[strength]) == CE
192        && (*nextContCE  & strengthMask[strength]) == contCE)
193    {
194        *nextCE = (*(CETable+3*(++iCE)));
195        *nextContCE = (*(CETable+3*(iCE)+1));
196    }
197
198    return iCE;
199}
200
201U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
202                                            uint32_t CE, uint32_t contCE,
203                                            uint32_t *prevCE, uint32_t *prevContCE,
204                                            uint32_t strength)
205{
206    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
207    int32_t iCE;
208
209    iCE = ucol_inv_findCE(src, CE, contCE);
210
211    if(iCE<0) {
212        *prevCE = UCOL_NOT_FOUND;
213        return -1;
214    }
215
216    CE &= strengthMask[strength];
217    contCE &= strengthMask[strength];
218
219    *prevCE = CE;
220    *prevContCE = contCE;
221
222    while((*prevCE  & strengthMask[strength]) == CE
223        && (*prevContCE  & strengthMask[strength])== contCE
224        && iCE > 0) /* this condition should prevent falling off the edge of the world */
225    {
226        /* here, we end up in a singularity - zero */
227        *prevCE = (*(CETable+3*(--iCE)));
228        *prevContCE = (*(CETable+3*(iCE)+1));
229    }
230
231    return iCE;
232}
233
234U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
235                                                       uint32_t prevCE, uint32_t prevContCE)
236{
237    if(prevCE == CE && prevContCE == contCE) {
238        return UCOL_IDENTICAL;
239    }
240    if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
241        || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
242    {
243        return UCOL_PRIMARY;
244    }
245    if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
246        || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
247    {
248        return UCOL_SECONDARY;
249    }
250    return UCOL_TERTIARY;
251}
252
253
254/*static
255inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
256
257    uint32_t CE = lh->baseCE;
258    uint32_t SecondCE = lh->baseContCE;
259
260    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
261    uint32_t previousCE, previousContCE;
262    int32_t iCE;
263
264    iCE = ucol_inv_findCE(src, CE, SecondCE);
265
266    if(iCE<0) {
267        return -1;
268    }
269
270    CE &= strengthMask[strength];
271    SecondCE &= strengthMask[strength];
272
273    previousCE = CE;
274    previousContCE = SecondCE;
275
276    while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
277        previousCE = (*(CETable+3*(--iCE)));
278        previousContCE = (*(CETable+3*(iCE)+1));
279    }
280    lh->previousCE = previousCE;
281    lh->previousContCE = previousContCE;
282
283    return iCE;
284}*/
285
286static
287inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
288    uint32_t CE = lh->baseCE;
289    uint32_t SecondCE = lh->baseContCE;
290
291    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
292    uint32_t nextCE, nextContCE;
293    int32_t iCE;
294
295    iCE = ucol_inv_findCE(src, CE, SecondCE);
296
297    if(iCE<0) {
298        return -1;
299    }
300
301    CE &= strengthMask[strength];
302    SecondCE &= strengthMask[strength];
303
304    nextCE = CE;
305    nextContCE = SecondCE;
306
307    while((nextCE  & strengthMask[strength]) == CE
308        && (nextContCE  & strengthMask[strength]) == SecondCE)
309    {
310        nextCE = (*(CETable+3*(++iCE)));
311        nextContCE = (*(CETable+3*(iCE)+1));
312    }
313
314    lh->nextCE = nextCE;
315    lh->nextContCE = nextContCE;
316
317    return iCE;
318}
319
320static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
321    /* reset all the gaps */
322    int32_t i = 0;
323    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
324    uint32_t st = 0;
325    uint32_t t1, t2;
326    int32_t pos;
327
328    UColToken *tok = lh->first;
329    uint32_t tokStrength = tok->strength;
330
331    for(i = 0; i<3; i++) {
332        lh->gapsHi[3*i] = 0;
333        lh->gapsHi[3*i+1] = 0;
334        lh->gapsHi[3*i+2] = 0;
335        lh->gapsLo[3*i] = 0;
336        lh->gapsLo[3*i+1] = 0;
337        lh->gapsLo[3*i+2] = 0;
338        lh->numStr[i] = 0;
339        lh->fStrToken[i] = NULL;
340        lh->lStrToken[i] = NULL;
341        lh->pos[i] = -1;
342    }
343
344    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
345
346    if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
347        //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
348        lh->pos[0] = 0;
349        t1 = lh->baseCE;
350        t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
351        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
352        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
353        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
354        uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
355        primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
356
357        t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
358        t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
359
360        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
361        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
362        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
363    } else if(lh->indirect == TRUE && lh->nextCE != 0) {
364        //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
365        lh->pos[0] = 0;
366        t1 = lh->baseCE;
367        t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
368        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
369        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
370        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
371        t1 = lh->nextCE;
372        t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
373        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
374        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
375        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
376    } else {
377        for(;;) {
378            if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
379                if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
380                    lh->fStrToken[tokStrength] = tok;
381                } else { /* The CE must be implicit, since it's not in the table */
382                    /* Error */
383                    *status = U_INTERNAL_PROGRAM_ERROR;
384                }
385            }
386
387            while(tok != NULL && tok->strength >= tokStrength) {
388                if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
389                    lh->lStrToken[tokStrength] = tok;
390                }
391                tok = tok->next;
392            }
393            if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
394                /* check if previous interval is the same and merge the intervals if it is so */
395                if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
396                    lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
397                    lh->fStrToken[tokStrength+1] = NULL;
398                    lh->lStrToken[tokStrength+1] = NULL;
399                    lh->pos[tokStrength+1] = -1;
400                }
401            }
402            if(tok != NULL) {
403                tokStrength = tok->strength;
404            } else {
405                break;
406            }
407        }
408        for(st = 0; st < 3; st++) {
409            if((pos = lh->pos[st]) >= 0) {
410                t1 = *(CETable+3*(pos));
411                t2 = *(CETable+3*(pos)+1);
412                lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
413                lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
414                //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
415                lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
416                //pos--;
417                //t1 = *(CETable+3*(pos));
418                //t2 = *(CETable+3*(pos)+1);
419                t1 = lh->baseCE;
420                t2 = lh->baseContCE;
421                lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
422                lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
423                lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
424            }
425        }
426    }
427}
428
429
430#define ucol_countBytes(value, noOfBytes)   \
431{                               \
432    uint32_t mask = 0xFFFFFFFF;   \
433    (noOfBytes) = 0;              \
434    while(mask != 0) {            \
435    if(((value) & mask) != 0) { \
436    (noOfBytes)++;            \
437    }                           \
438    mask >>= 8;                 \
439    }                             \
440}
441
442static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
443    if(U_SUCCESS(*status)) {
444        g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
445    }
446    return g->current;
447}
448
449static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
450    /* TODO: rename to enum names */
451    uint32_t high, low, count=1;
452    uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
453
454    if(strength == UCOL_SECONDARY) {
455        low = UCOL_COMMON_TOP2<<24;
456        high = 0xFFFFFFFF;
457        count = 0xFF - UCOL_COMMON_TOP2;
458    } else {
459        low = UCOL_BYTE_COMMON << 24; //0x05000000;
460        high = 0x40000000;
461        count = 0x40 - UCOL_BYTE_COMMON;
462    }
463
464    if(tok->next != NULL && tok->next->strength == strength) {
465        count = tok->next->toInsert;
466    }
467
468    g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
469    g->current = UCOL_BYTE_COMMON<<24;
470
471    if(g->noOfRanges == 0) {
472        *status = U_INTERNAL_PROGRAM_ERROR;
473    }
474    return g->current;
475}
476
477static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
478    uint32_t strength = tok->strength;
479    uint32_t low = lows[fStrength*3+strength];
480    uint32_t high = highs[fStrength*3+strength];
481    uint32_t maxByte = 0;
482    if(strength == UCOL_TERTIARY) {
483        maxByte = 0x3F;
484    } else if(strength == UCOL_PRIMARY) {
485        maxByte = 0xFE;
486    } else {
487        maxByte = 0xFF;
488    }
489
490    uint32_t count = tok->toInsert;
491
492    if(low >= high && strength > UCOL_PRIMARY) {
493        int32_t s = strength;
494        for(;;) {
495            s--;
496            if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
497                if(strength == UCOL_SECONDARY) {
498                    if (low < UCOL_COMMON_TOP2<<24 ) {
499                       // Override if low range is less than UCOL_COMMON_TOP2.
500		        low = UCOL_COMMON_TOP2<<24;
501                    }
502                    high = 0xFFFFFFFF;
503                } else {
504                    // Override if low range is less than UCOL_COMMON_BOT3.
505		    if ( low < UCOL_COMMON_BOT3<<24 ) {
506                        low = UCOL_COMMON_BOT3<<24;
507		    }
508                    high = 0x40000000;
509                }
510                break;
511            }
512            if(s<0) {
513                *status = U_INTERNAL_PROGRAM_ERROR;
514                return 0;
515            }
516        }
517    }
518
519    if(low < 0x02000000) {
520        // We must not use CE weight byte 02, so we set it as the minimum lower bound.
521        // See http://site.icu-project.org/design/collation/bytes
522        low = 0x02000000;
523    }
524
525    if(strength == UCOL_SECONDARY) { /* similar as simple */
526        if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
527            low = UCOL_COMMON_TOP2<<24;
528        }
529        if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
530            high = UCOL_COMMON_TOP2<<24;
531        }
532        if(low < (UCOL_COMMON_BOT2<<24)) {
533            g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
534            g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
535            //g->current = UCOL_COMMON_BOT2<<24;
536            return g->current;
537        }
538    }
539
540    g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
541    if(g->noOfRanges == 0) {
542        *status = U_INTERNAL_PROGRAM_ERROR;
543    }
544    g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
545    return g->current;
546}
547
548static
549uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
550    uint32_t i = 0;
551    UChar c;
552
553    if(U_FAILURE(*status)) {
554        return 0;
555    }
556
557    if(sourceLen > resLen) {
558        *status = U_MEMORY_ALLOCATION_ERROR;
559        return 0;
560    }
561
562    for(i = 0; i < sourceLen; i++) {
563        c = source[i];
564        if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
565            switch(c - 0x3000) {
566            case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
567            case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
568                c++;
569                break;
570            case 0xF5:
571                c = 0x30AB;
572                break;
573            case 0xF6:
574                c = 0x30B1;
575                break;
576            }
577        }
578        resBuf[i] = c;
579    }
580    return sourceLen;
581}
582
583static
584uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
585    uint32_t i = 0;
586    UChar c;
587
588    if(U_FAILURE(*status)) {
589        return 0;
590    }
591
592    if(sourceLen > resLen) {
593        *status = U_MEMORY_ALLOCATION_ERROR;
594        return 0;
595    }
596
597    for(i = 0; i < sourceLen; i++) {
598        c = source[i];
599        if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
600            switch(c - 0x3000) {
601            case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
602            case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
603                c--;
604                break;
605            case 0xAB:
606                c = 0x30F5;
607                break;
608            case 0xB1:
609                c = 0x30F6;
610                break;
611            }
612        }
613        resBuf[i] = c;
614    }
615    return sourceLen;
616}
617
618U_NAMESPACE_BEGIN
619
620static
621uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
622    uint32_t i = 0;
623    UChar n[128];
624    uint32_t nLen = 0;
625    uint32_t uCount = 0, lCount = 0;
626
627    collIterate s;
628    uint32_t order = 0;
629
630    if(U_FAILURE(*status)) {
631        return UCOL_LOWER_CASE;
632    }
633
634    nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
635    if(U_SUCCESS(*status)) {
636        for(i = 0; i < nLen; i++) {
637            uprv_init_collIterate(UCA, &n[i], 1, &s, status);
638            order = ucol_getNextCE(UCA, &s, status);
639            if(isContinuation(order)) {
640                *status = U_INTERNAL_PROGRAM_ERROR;
641                return UCOL_LOWER_CASE;
642            }
643            if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
644                uCount++;
645            } else {
646                if(u_islower(n[i])) {
647                    lCount++;
648                } else if(U_SUCCESS(*status)) {
649                    UChar sk[1], lk[1];
650                    u_toSmallKana(&n[i], 1, sk, 1, status);
651                    u_toLargeKana(&n[i], 1, lk, 1, status);
652                    if(sk[0] == n[i] && lk[0] != n[i]) {
653                        lCount++;
654                    }
655                }
656            }
657        }
658    }
659
660    if(uCount != 0 && lCount != 0) {
661        return UCOL_MIXED_CASE;
662    } else if(uCount != 0) {
663        return UCOL_UPPER_CASE;
664    } else {
665        return UCOL_LOWER_CASE;
666    }
667}
668
669
670U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
671    /* this one makes the table and stuff */
672    uint32_t noOfBytes[3];
673    uint32_t i;
674
675    for(i = 0; i<3; i++) {
676        ucol_countBytes(CEparts[i], noOfBytes[i]);
677    }
678
679    /* Here we have to pack CEs from parts */
680
681    uint32_t CEi = 0;
682    uint32_t value = 0;
683
684    while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
685        if(CEi > 0) {
686            value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
687        } else {
688            value = 0;
689        }
690
691        if(2*CEi<noOfBytes[0]) {
692            value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
693        }
694        if(CEi<noOfBytes[1]) {
695            value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
696        }
697        if(CEi<noOfBytes[2]) {
698            value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
699        }
700        tok->CEs[CEi] = value;
701        CEi++;
702    }
703    if(CEi == 0) { /* totally ignorable */
704        tok->noOfCEs = 1;
705        tok->CEs[0] = 0;
706    } else { /* there is at least something */
707        tok->noOfCEs = CEi;
708    }
709
710
711    // we want to set case bits here and now, not later.
712    // Case bits handling
713    if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
714        tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
715        int32_t cSize = (tok->source & 0xFF000000) >> 24;
716        UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
717
718        if(cSize > 1) {
719            // Do it manually
720            tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
721        } else {
722            // Copy it from the UCA
723            uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
724            tok->CEs[0] |= (caseCE & 0xC0);
725        }
726    }
727
728#if UCOL_DEBUG==2
729    fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
730    for(i = 0; i<tok->noOfCEs; i++) {
731        fprintf(stderr, "%08X ", tok->CEs[i]);
732    }
733    fprintf(stderr, "\n");
734#endif
735}
736
737U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
738    ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
739    uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
740
741    UColToken *tok = lh->last;
742    uint32_t t[UCOL_STRENGTH_LIMIT];
743
744    uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
745
746    /* must initialize ranges to avoid memory check warnings */
747    for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
748        uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
749    }
750
751    tok->toInsert = 1;
752    t[tok->strength] = 1;
753
754    while(tok->previous != NULL) {
755        if(tok->previous->strength < tok->strength) { /* going up */
756            t[tok->strength] = 0;
757            t[tok->previous->strength]++;
758        } else if(tok->previous->strength > tok->strength) { /* going down */
759            t[tok->previous->strength] = 1;
760        } else {
761            t[tok->strength]++;
762        }
763        tok=tok->previous;
764        tok->toInsert = t[tok->strength];
765    }
766
767    tok->toInsert = t[tok->strength];
768    ucol_inv_getGapPositions(src, lh, status);
769
770#if UCOL_DEBUG
771    fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
772    int32_t j = 2;
773    for(j = 2; j >= 0; j--) {
774        fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
775        fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
776    }
777    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
778
779    do {
780        fprintf(stderr,"%i", tok->strength);
781        tok = tok->next;
782    } while(tok != NULL);
783    fprintf(stderr, "\n");
784
785    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
786
787    do {
788        fprintf(stderr,"%i", tok->toInsert);
789        tok = tok->next;
790    } while(tok != NULL);
791#endif
792
793    tok = lh->first;
794    uint32_t fStrength = UCOL_IDENTICAL;
795    uint32_t initStrength = UCOL_IDENTICAL;
796
797
798    CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
799    CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
800    CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
801
802    while (tok != NULL && U_SUCCESS(*status)) {
803        fStrength = tok->strength;
804        if(fStrength < initStrength) {
805            initStrength = fStrength;
806            if(lh->pos[fStrength] == -1) {
807                while(lh->pos[fStrength] == -1 && fStrength > 0) {
808                    fStrength--;
809                }
810                if(lh->pos[fStrength] == -1) {
811                    *status = U_INTERNAL_PROGRAM_ERROR;
812                    return;
813                }
814            }
815            if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
816                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
817                CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
818                /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
819                CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
820            } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
821                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
822                /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
823                CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
824                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
825            } else { /* primaries */
826                /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
827                CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
828                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
829                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
830            }
831        } else {
832            if(tok->strength == UCOL_TERTIARY) {
833                CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
834            } else if(tok->strength == UCOL_SECONDARY) {
835                CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
836                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
837            } else if(tok->strength == UCOL_PRIMARY) {
838                CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
839                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
840                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
841            }
842        }
843        ucol_doCE(src, CEparts, tok, status);
844        tok = tok->next;
845    }
846}
847
848U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
849    UCAElements el;
850    UColToken *tok = lh->first;
851    UColToken *expt = NULL;
852    uint32_t i = 0, j = 0;
853    const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
854
855    while(tok != NULL && U_SUCCESS(*status)) {
856        /* first, check if there are any expansions */
857        /* if there are expansions, we need to do a little bit more processing */
858        /* since parts of expansion can be tailored, while others are not */
859        if(tok->expansion != 0) {
860            uint32_t len = tok->expansion >> 24;
861            uint32_t currentSequenceLen = len;
862            uint32_t expOffset = tok->expansion & 0x00FFFFFF;
863            //uint32_t exp = currentSequenceLen | expOffset;
864            UColToken exp;
865            exp.source = currentSequenceLen | expOffset;
866            exp.rulesToParseHdl = &(src->source);
867
868            while(len > 0) {
869                currentSequenceLen = len;
870                while(currentSequenceLen > 0) {
871                    exp.source = (currentSequenceLen << 24) | expOffset;
872                    if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
873                        uint32_t noOfCEsToCopy = expt->noOfCEs;
874                        for(j = 0; j<noOfCEsToCopy; j++) {
875                            tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
876                        }
877                        tok->noOfExpCEs += noOfCEsToCopy;
878                        // Smart people never try to add codepoints and CEs.
879                        // For some odd reason, it won't work.
880                        expOffset += currentSequenceLen; //noOfCEsToCopy;
881                        len -= currentSequenceLen; //noOfCEsToCopy;
882                        break;
883                    } else {
884                        currentSequenceLen--;
885                    }
886                }
887                if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
888                    /* will have to get one from UCA */
889                    /* first, get the UChars from the rules */
890                    /* then pick CEs out until there is no more and stuff them into expansion */
891                    collIterate s;
892                    uint32_t order = 0;
893                    uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
894
895                    for(;;) {
896                        order = ucol_getNextCE(src->UCA, &s, status);
897                        if(order == UCOL_NO_MORE_CES) {
898                            break;
899                        }
900                        tok->expCEs[tok->noOfExpCEs++] = order;
901                    }
902                    expOffset++;
903                    len--;
904                }
905            }
906        } else {
907            tok->noOfExpCEs = 0;
908        }
909
910        /* set the ucaelement with obtained values */
911        el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
912        /* copy CEs */
913        for(i = 0; i<tok->noOfCEs; i++) {
914            el.CEs[i] = tok->CEs[i];
915        }
916        for(i = 0; i<tok->noOfExpCEs; i++) {
917            el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
918        }
919
920        /* copy UChars */
921        // We kept prefix and source kind of together, as it is a kind of a contraction.
922        // However, now we have to slice the prefix off the main thing -
923        el.prefix = el.prefixChars;
924        el.cPoints = el.uchars;
925        if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
926            // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
927            // decomposed elements to the unsaf table.
928            el.prefixSize = tok->prefix>>24;
929            uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
930
931            el.cSize = (tok->source >> 24)-(tok->prefix>>24);
932            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
933        } else {
934            el.prefixSize = 0;
935            *el.prefix = 0;
936
937            el.cSize = (tok->source >> 24);
938            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
939        }
940        if(src->UCA != NULL) {
941            for(i = 0; i<el.cSize; i++) {
942                if(UCOL_ISJAMO(el.cPoints[i])) {
943                    t->image->jamoSpecial = TRUE;
944                }
945            }
946            if (!src->buildCCTabFlag && el.cSize > 0) {
947                // Check the trailing canonical combining class (tccc) of the last character.
948                const UChar *s = el.cPoints + el.cSize;
949                uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
950                if ((fcd & 0xff) != 0) {
951                    src->buildCCTabFlag = TRUE;
952                }
953            }
954        }
955
956        /* and then, add it */
957#if UCOL_DEBUG==2
958        fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
959#endif
960        uprv_uca_addAnElement(t, &el, status);
961
962#if UCOL_DEBUG_DUPLICATES
963        if(*status != U_ZERO_ERROR) {
964            fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
965            *status = U_ZERO_ERROR;
966        }
967#endif
968
969        tok = tok->next;
970    }
971}
972
973U_CDECL_BEGIN
974static UBool U_CALLCONV
975_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
976    UErrorCode status = U_ZERO_ERROR;
977    tempUCATable *t = (tempUCATable *)context;
978    if(value == 0) {
979        while(start < limit) {
980            uint32_t CE = utrie_get32(t->mapping, start, NULL);
981            if(CE == UCOL_NOT_FOUND) {
982                UCAElements el;
983                el.isThai = FALSE;
984                el.prefixSize = 0;
985                el.prefixChars[0] = 0;
986                el.prefix = el.prefixChars;
987                el.cPoints = el.uchars;
988
989                el.cSize = 0;
990                U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
991
992                el.noOfCEs = 1;
993                el.CEs[0] = 0;
994                uprv_uca_addAnElement(t, &el, &status);
995
996            }
997            start++;
998        }
999    }
1000    if(U_FAILURE(status)) {
1001        return FALSE;
1002    } else {
1003        return TRUE;
1004    }
1005}
1006U_CDECL_END
1007
1008static void
1009ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1010                               UChar32 start, UChar32 end,
1011                               UErrorCode *status)
1012{
1013    //UChar decomp[256];
1014    uint32_t CE = UCOL_NOT_FOUND;
1015    UChar32 u = 0;
1016    UCAElements el;
1017    el.isThai = FALSE;
1018    el.prefixSize = 0;
1019    el.prefixChars[0] = 0;
1020    collIterate colIt;
1021
1022    if(U_SUCCESS(*status)) {
1023        for(u = start; u<=end; u++) {
1024            if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1025                /* this test is for contractions that are missing the starting element. */
1026                || ((isCntTableElement(CE)) &&
1027                (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1028                )
1029            {
1030                el.cSize = 0;
1031                U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1032                //decomp[0] = (UChar)u;
1033                //el.uchars[0] = (UChar)u;
1034                el.cPoints = el.uchars;
1035                //el.cSize = 1;
1036                el.noOfCEs = 0;
1037                el.prefix = el.prefixChars;
1038                el.prefixSize = 0;
1039                //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1040                // We actually want to check whether this element is a special
1041                // If it is an implicit element (hangul, CJK - we want to copy the
1042                // special, not the resolved CEs) - for hangul, copying resolved
1043                // would just make things the same (there is an expansion and it
1044                // takes approximately the same amount of time to resolve as
1045                // falling back to the UCA).
1046                /*
1047                UTRIE_GET32(src->UCA->mapping, u, CE);
1048                tag = getCETag(CE);
1049                if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1050                || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1051                || tag == LEAD_SURROGATE_TAG) {
1052                el.CEs[el.noOfCEs++] = CE;
1053                } else {
1054                */
1055                // It turns out that it does not make sense to keep implicits
1056                // unresolved. The cost of resolving them is big enough so that
1057                // it doesn't make any difference whether we have to go to the UCA
1058                // or not.
1059                {
1060                    uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1061                    while(CE != UCOL_NO_MORE_CES) {
1062                        CE = ucol_getNextCE(src->UCA, &colIt, status);
1063                        if(CE != UCOL_NO_MORE_CES) {
1064                            el.CEs[el.noOfCEs++] = CE;
1065                        }
1066                    }
1067                }
1068                uprv_uca_addAnElement(t, &el, status);
1069            }
1070        }
1071    }
1072}
1073
1074U_NAMESPACE_END
1075
1076U_CFUNC UCATableHeader *
1077ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1078    U_NAMESPACE_USE
1079
1080    uint32_t i = 0;
1081    if(U_FAILURE(*status)) {
1082        return NULL;
1083    }
1084    /*
1085    2.  Eliminate the negative lists by doing the following for each non-null negative list:
1086    o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1087    create new ListHeader X
1088    o   reverse the list, add to the end of X's positive list. Reset the strength of the
1089    first item you add, based on the stronger strength levels of the two lists.
1090    */
1091    /*
1092    3.  For each ListHeader with a non-null positive list:
1093    */
1094    /*
1095    o   Find all character strings with CEs between the baseCE and the
1096    next/previous CE, at the strength of the first token. Add these to the
1097    tailoring.
1098    ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1099    tailoring has & x < z...
1100    ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1101    */
1102    /* It is possible that this part should be done even while constructing list */
1103    /* The problem is that it is unknown what is going to be the strongest weight */
1104    /* So we might as well do it here */
1105
1106    /*
1107    o   Allocate CEs for each token in the list, based on the total number N of the
1108    largest level difference, and the gap G between baseCE and nextCE at that
1109    level. The relation * between the last item and nextCE is the same as the
1110    strongest strength.
1111    o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1112    ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1113    Then fit b and c into the secondary gap between a and d, then fit q
1114    into the tertiary gap between b and c.
1115
1116    o   Example: baseCE << b <<< q << c * nextCE(X,2)
1117    ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1118    Then fit q into the tertiary gap between b and c.
1119    o   When incrementing primary values, we will not cross high byte
1120    boundaries except where there is only a single-byte primary. That is to
1121    ensure that the script reordering will continue to work.
1122    */
1123    UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1124    /* test for NULL */
1125    if (image == NULL) {
1126        *status = U_MEMORY_ALLOCATION_ERROR;
1127        return NULL;
1128    }
1129    uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1130
1131    for(i = 0; i<src->resultLen; i++) {
1132        /* now we need to generate the CEs */
1133        /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1134        /* According to strength                                                          */
1135        if(U_SUCCESS(*status)) {
1136            if(src->lh[i].first) { // if there are any elements
1137                // due to the way parser works, subsequent tailorings
1138                // may remove all the elements from a sequence, therefore
1139                // leaving an empty tailoring sequence.
1140                ucol_initBuffers(src, &src->lh[i], status);
1141            }
1142        }
1143        if(U_FAILURE(*status)) {
1144            uprv_free(image);
1145            return NULL;
1146        }
1147    }
1148
1149    if(src->varTop != NULL) { /* stuff the variable top value */
1150        src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1151        /* remove it from the list */
1152        if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1153            src->varTop->listHeader->first = src->varTop->next;
1154        }
1155        if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1156            src->varTop->listHeader->last = src->varTop->previous;
1157        }
1158        if(src->varTop->next != NULL) {
1159            src->varTop->next->previous = src->varTop->previous;
1160        }
1161        if(src->varTop->previous != NULL) {
1162            src->varTop->previous->next = src->varTop->next;
1163        }
1164    }
1165
1166
1167    tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1168    if(U_FAILURE(*status)) {
1169        uprv_free(image);
1170        return NULL;
1171    }
1172
1173
1174    /* After this, we have assigned CE values to all regular CEs      */
1175    /* now we will go through list once more and resolve expansions,  */
1176    /* make UCAElements structs and add them to table                 */
1177    for(i = 0; i<src->resultLen; i++) {
1178        /* now we need to generate the CEs */
1179        /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1180        /* According to strength                                                          */
1181        if(U_SUCCESS(*status)) {
1182            ucol_createElements(src, t, &src->lh[i], status);
1183        }
1184    }
1185
1186    UCAElements el;
1187    el.isThai = FALSE;
1188    el.prefixSize = 0;
1189    el.prefixChars[0] = 0;
1190
1191    /* add latin-1 stuff */
1192    ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1193
1194    /* add stuff for copying */
1195    if(src->copySet != NULL) {
1196        int32_t i = 0;
1197        UnicodeSet *set = (UnicodeSet *)src->copySet;
1198        for(i = 0; i < set->getRangeCount(); i++) {
1199            ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1200        }
1201    }
1202
1203    if(U_SUCCESS(*status)) {
1204        /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1205
1206        uint32_t tailoredCE = UCOL_NOT_FOUND;
1207        UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1208        int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
1209        UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1210        // Check for null pointer
1211        if (ucaEl == NULL) {
1212            *status = U_MEMORY_ALLOCATION_ERROR;
1213            return NULL;
1214        }
1215        while(*conts != 0) {
1216            // A continuation is NUL-terminated and NUL-padded
1217            // except if it has the maximum length.
1218            int32_t contractionLength = maxUCAContractionLength;
1219            while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1220                --contractionLength;
1221            }
1222            UChar32 first;
1223            int32_t firstLength = 0;
1224            U16_NEXT(conts, firstLength, contractionLength, first);
1225            tailoredCE = utrie_get32(t->mapping, first, NULL);
1226            if(tailoredCE != UCOL_NOT_FOUND) {
1227                UBool needToAdd = TRUE;
1228                if(isCntTableElement(tailoredCE)) {
1229                    if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
1230                        needToAdd = FALSE;
1231                    }
1232                }
1233                if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1234                    UCAElements elm;
1235                    elm.cPoints = el.uchars;
1236                    elm.noOfCEs = 0;
1237                    elm.uchars[0] = *conts;
1238                    elm.uchars[1] = 0;
1239                    elm.cSize = 1;
1240                    elm.prefixChars[0] = *(conts+2);
1241                    elm.isThai = FALSE;
1242                    elm.prefix = elm.prefixChars;
1243                    elm.prefixSize = 1;
1244                    UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1245                    if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1246                        needToAdd = TRUE;
1247                    }
1248                }
1249                if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1250                    needToAdd = FALSE;
1251                }
1252
1253                if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1254                    if (*(conts+1) != 0) {  // contractions
1255                        el.prefix = el.prefixChars;
1256                        el.prefixSize = 0;
1257                        el.cPoints = el.uchars;
1258                        el.noOfCEs = 0;
1259                        u_memcpy(el.uchars, conts, contractionLength);
1260                        el.cSize = contractionLength;
1261                        ucol_setText(ucaEl, el.uchars, el.cSize, status);
1262                    }
1263                    else { // pre-context character
1264                        UChar str[4] = { 0 };
1265                        int32_t len=0;
1266                        int32_t preKeyLen=0;
1267
1268                        el.cPoints = el.uchars;
1269                        el.noOfCEs = 0;
1270                        el.uchars[0] = *conts;
1271                        el.uchars[1] = 0;
1272                        el.cSize = 1;
1273                        el.prefixChars[0] = *(conts+2);
1274                        el.prefix = el.prefixChars;
1275                        el.prefixSize = 1;
1276                        if (el.prefixChars[0]!=0) {
1277                            // get CE of prefix character first
1278                            str[0]=el.prefixChars[0];
1279                            str[1]=0;
1280                            ucol_setText(ucaEl, str, 1, status);
1281                            while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1282                                    != UCOL_NULLORDER) {
1283                                preKeyLen++;  // count number of keys for prefix character
1284                            }
1285                            str[len++] = el.prefixChars[0];
1286                        }
1287
1288                        str[len++] = el.uchars[0];
1289                        str[len]=0;
1290                        ucol_setText(ucaEl, str, len, status);
1291                        // Skip the keys for prefix character, then copy the rest to el.
1292                        while ((preKeyLen-->0) &&
1293                               (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1294                            continue;
1295                        }
1296
1297                    }
1298                    while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1299                        el.noOfCEs++;
1300                    }
1301                    uprv_uca_addAnElement(t, &el, status);
1302                }
1303
1304            } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1305                ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1306            }
1307            conts+=maxUCAContractionLength;
1308        }
1309        ucol_closeElements(ucaEl);
1310    }
1311
1312    // Add completely ignorable elements
1313    utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1314
1315    // add tailoring characters related canonical closures
1316    uprv_uca_canonicalClosure(t, src, NULL, status);
1317
1318    /* still need to produce compatibility closure */
1319
1320    UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1321
1322    uprv_uca_closeTempTable(t);
1323    uprv_free(image);
1324
1325    return myData;
1326}
1327
1328U_CDECL_BEGIN
1329static UBool U_CALLCONV
1330ucol_bld_cleanup(void)
1331{
1332    udata_close(invUCA_DATA_MEM);
1333    invUCA_DATA_MEM = NULL;
1334    _staticInvUCA = NULL;
1335    gStaticInvUCAInitOnce.reset();
1336    return TRUE;
1337}
1338U_CDECL_END
1339
1340static void U_CALLCONV initInverseUCA(UErrorCode &status) {
1341    U_ASSERT(invUCA_DATA_MEM == NULL);
1342    U_ASSERT(_staticInvUCA == NULL);
1343    ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1344    InverseUCATableHeader *newInvUCA = NULL;
1345    UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status);
1346
1347    if(U_FAILURE(status)) {
1348        if (result) {
1349            udata_close(result);
1350        }
1351        // This is not needed, as we are talking about
1352        // memory we got from UData
1353        //uprv_free(newInvUCA);
1354        return;
1355    }
1356
1357    if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1358        newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1359        UCollator *UCA = ucol_initUCA(&status);
1360        // UCA versions of UCA and inverse UCA should match
1361        if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1362            status = U_INVALID_FORMAT_ERROR;
1363            udata_close(result);
1364            return;
1365        }
1366
1367        invUCA_DATA_MEM = result;
1368        _staticInvUCA = newInvUCA;
1369    }
1370}
1371
1372
1373U_CAPI const InverseUCATableHeader * U_EXPORT2
1374ucol_initInverseUCA(UErrorCode *status)
1375{
1376    umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status);
1377    return _staticInvUCA;
1378}
1379
1380/* This is the data that is used for non-script reordering codes. These _must_ be kept
1381 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
1382 */
1383static const char * const ReorderingTokenNames[] = {
1384    "SPACE",
1385    "PUNCT",
1386    "SYMBOL",
1387    "CURRENCY",
1388    "DIGIT"
1389};
1390
1391static void toUpper(const char* src, char* dst, uint32_t length) {
1392   for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1393       *dst = uprv_toupper(*src);
1394   }
1395   *dst = '\0';
1396}
1397
1398U_INTERNAL int32_t U_EXPORT2
1399ucol_findReorderingEntry(const char* name) {
1400    char buffer[32];
1401    toUpper(name, buffer, 32);
1402    for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
1403        if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1404            return entry + UCOL_REORDER_CODE_FIRST;
1405        }
1406    }
1407    return USCRIPT_INVALID_CODE;
1408}
1409
1410#endif /* #if !UCONFIG_NO_COLLATION */
1411