1/*
2*******************************************************************************
3*
4*   Copyright (C) 2001-2011, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucol_bld.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created 02/22/2001
14*   created by: Vladimir Weinstein
15*
16* This module builds a collator based on the rule set.
17*
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_COLLATION
23
24#include "unicode/ucoleitr.h"
25#include "unicode/udata.h"
26#include "unicode/uchar.h"
27#include "unicode/uniset.h"
28#include "unicode/uscript.h"
29#include "unicode/ustring.h"
30#include "normalizer2impl.h"
31#include "ucol_bld.h"
32#include "ucol_elm.h"
33#include "ucol_cnt.h"
34#include "ucln_in.h"
35#include "umutex.h"
36#include "cmemory.h"
37#include "cstring.h"
38
39static const InverseUCATableHeader* _staticInvUCA = NULL;
40static UDataMemory* invUCA_DATA_MEM = NULL;
41
42U_CDECL_BEGIN
43static UBool U_CALLCONV
44isAcceptableInvUCA(void * /*context*/,
45                   const char * /*type*/, const char * /*name*/,
46                   const UDataInfo *pInfo)
47{
48    /* context, type & name are intentionally not used */
49    if( pInfo->size>=20 &&
50        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
51        pInfo->charsetFamily==U_CHARSET_FAMILY &&
52        pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
53        pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
54        pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
55        pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
56        pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
57        pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
58        //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
59        //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
60        //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
61        )
62    {
63        UVersionInfo UCDVersion;
64        u_getUnicodeVersion(UCDVersion);
65        return (pInfo->dataVersion[0]==UCDVersion[0] &&
66            pInfo->dataVersion[1]==UCDVersion[1]);
67            //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
68            //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
69            //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
70    } else {
71        return FALSE;
72    }
73}
74U_CDECL_END
75
76/*
77* Takes two CEs (lead and continuation) and
78* compares them as CEs should be compared:
79* primary vs. primary, secondary vs. secondary
80* tertiary vs. tertiary
81*/
82static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
83    uint32_t s1 = source0, s2, t1 = target0, t2;
84    if(isContinuation(source1)) {
85        s2 = source1;
86    } else {
87        s2 = 0;
88    }
89    if(isContinuation(target1)) {
90        t2 = target1;
91    } else {
92        t2 = 0;
93    }
94
95    uint32_t s = 0, t = 0;
96    if(s1 == t1 && s2 == t2) {
97        return 0;
98    }
99    s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
100    t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
101    if(s < t) {
102        return -1;
103    } else if(s > t) {
104        return 1;
105    } else {
106        s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
107        t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
108        if(s < t) {
109            return -1;
110        } else if(s > t) {
111            return 1;
112        } else {
113            s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
114            t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
115            if(s < t) {
116                return -1;
117            } else {
118                return 1;
119            }
120        }
121    }
122}
123
124static
125int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
126    uint32_t bottom = 0, top = src->invUCA->tableSize;
127    uint32_t i = 0;
128    uint32_t first = 0, second = 0;
129    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
130    int32_t res = 0;
131
132    while(bottom < top-1) {
133        i = (top+bottom)/2;
134        first = *(CETable+3*i);
135        second = *(CETable+3*i+1);
136        res = compareCEs(first, second, CE, SecondCE);
137        if(res > 0) {
138            top = i;
139        } else if(res < 0) {
140            bottom = i;
141        } else {
142            break;
143        }
144    }
145
146    /* weiv:                                                  */
147    /* in searching for elements, I have removed the failure  */
148    /* The reason for this is that the builder does not rely  */
149    /* on search mechanism telling it that it didn't find an  */
150    /* element. However, indirect positioning relies on being */
151    /* able to find the elements around any CE, even if it is */
152    /* not defined in the UCA. */
153    return i;
154    /*
155    if((first == CE && second == SecondCE)) {
156    return i;
157    } else {
158    return -1;
159    }
160    */
161}
162
163static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
164    0xFFFF0000,
165    0xFFFFFF00,
166    0xFFFFFFFF
167};
168
169U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
170                                            uint32_t CE, uint32_t contCE,
171                                            uint32_t *nextCE, uint32_t *nextContCE,
172                                            uint32_t strength)
173{
174    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
175    int32_t iCE;
176
177    iCE = ucol_inv_findCE(src, CE, contCE);
178
179    if(iCE<0) {
180        *nextCE = UCOL_NOT_FOUND;
181        return -1;
182    }
183
184    CE &= strengthMask[strength];
185    contCE &= strengthMask[strength];
186
187    *nextCE = CE;
188    *nextContCE = contCE;
189
190    while((*nextCE  & strengthMask[strength]) == CE
191        && (*nextContCE  & strengthMask[strength]) == contCE)
192    {
193        *nextCE = (*(CETable+3*(++iCE)));
194        *nextContCE = (*(CETable+3*(iCE)+1));
195    }
196
197    return iCE;
198}
199
200U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
201                                            uint32_t CE, uint32_t contCE,
202                                            uint32_t *prevCE, uint32_t *prevContCE,
203                                            uint32_t strength)
204{
205    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
206    int32_t iCE;
207
208    iCE = ucol_inv_findCE(src, CE, contCE);
209
210    if(iCE<0) {
211        *prevCE = UCOL_NOT_FOUND;
212        return -1;
213    }
214
215    CE &= strengthMask[strength];
216    contCE &= strengthMask[strength];
217
218    *prevCE = CE;
219    *prevContCE = contCE;
220
221    while((*prevCE  & strengthMask[strength]) == CE
222        && (*prevContCE  & strengthMask[strength])== contCE
223        && iCE > 0) /* this condition should prevent falling off the edge of the world */
224    {
225        /* here, we end up in a singularity - zero */
226        *prevCE = (*(CETable+3*(--iCE)));
227        *prevContCE = (*(CETable+3*(iCE)+1));
228    }
229
230    return iCE;
231}
232
233U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
234                                                       uint32_t prevCE, uint32_t prevContCE)
235{
236    if(prevCE == CE && prevContCE == contCE) {
237        return UCOL_IDENTICAL;
238    }
239    if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
240        || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
241    {
242        return UCOL_PRIMARY;
243    }
244    if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
245        || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
246    {
247        return UCOL_SECONDARY;
248    }
249    return UCOL_TERTIARY;
250}
251
252
253/*static
254inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
255
256    uint32_t CE = lh->baseCE;
257    uint32_t SecondCE = lh->baseContCE;
258
259    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
260    uint32_t previousCE, previousContCE;
261    int32_t iCE;
262
263    iCE = ucol_inv_findCE(src, CE, SecondCE);
264
265    if(iCE<0) {
266        return -1;
267    }
268
269    CE &= strengthMask[strength];
270    SecondCE &= strengthMask[strength];
271
272    previousCE = CE;
273    previousContCE = SecondCE;
274
275    while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
276        previousCE = (*(CETable+3*(--iCE)));
277        previousContCE = (*(CETable+3*(iCE)+1));
278    }
279    lh->previousCE = previousCE;
280    lh->previousContCE = previousContCE;
281
282    return iCE;
283}*/
284
285static
286inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
287    uint32_t CE = lh->baseCE;
288    uint32_t SecondCE = lh->baseContCE;
289
290    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
291    uint32_t nextCE, nextContCE;
292    int32_t iCE;
293
294    iCE = ucol_inv_findCE(src, CE, SecondCE);
295
296    if(iCE<0) {
297        return -1;
298    }
299
300    CE &= strengthMask[strength];
301    SecondCE &= strengthMask[strength];
302
303    nextCE = CE;
304    nextContCE = SecondCE;
305
306    while((nextCE  & strengthMask[strength]) == CE
307        && (nextContCE  & strengthMask[strength]) == SecondCE)
308    {
309        nextCE = (*(CETable+3*(++iCE)));
310        nextContCE = (*(CETable+3*(iCE)+1));
311    }
312
313    lh->nextCE = nextCE;
314    lh->nextContCE = nextContCE;
315
316    return iCE;
317}
318
319static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
320    /* reset all the gaps */
321    int32_t i = 0;
322    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
323    uint32_t st = 0;
324    uint32_t t1, t2;
325    int32_t pos;
326
327    UColToken *tok = lh->first;
328    uint32_t tokStrength = tok->strength;
329
330    for(i = 0; i<3; i++) {
331        lh->gapsHi[3*i] = 0;
332        lh->gapsHi[3*i+1] = 0;
333        lh->gapsHi[3*i+2] = 0;
334        lh->gapsLo[3*i] = 0;
335        lh->gapsLo[3*i+1] = 0;
336        lh->gapsLo[3*i+2] = 0;
337        lh->numStr[i] = 0;
338        lh->fStrToken[i] = NULL;
339        lh->lStrToken[i] = NULL;
340        lh->pos[i] = -1;
341    }
342
343    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
344
345    if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
346        //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
347        lh->pos[0] = 0;
348        t1 = lh->baseCE;
349        t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
350        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
351        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
352        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
353        uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
354        primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
355
356        t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
357        t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
358
359        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
360        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
361        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
362    } else if(lh->indirect == TRUE && lh->nextCE != 0) {
363        //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
364        lh->pos[0] = 0;
365        t1 = lh->baseCE;
366        t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
367        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
368        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
369        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
370        t1 = lh->nextCE;
371        t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
372        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
373        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
374        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
375    } else {
376        for(;;) {
377            if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
378                if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
379                    lh->fStrToken[tokStrength] = tok;
380                } else { /* The CE must be implicit, since it's not in the table */
381                    /* Error */
382                    *status = U_INTERNAL_PROGRAM_ERROR;
383                }
384            }
385
386            while(tok != NULL && tok->strength >= tokStrength) {
387                if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
388                    lh->lStrToken[tokStrength] = tok;
389                }
390                tok = tok->next;
391            }
392            if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
393                /* check if previous interval is the same and merge the intervals if it is so */
394                if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
395                    lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
396                    lh->fStrToken[tokStrength+1] = NULL;
397                    lh->lStrToken[tokStrength+1] = NULL;
398                    lh->pos[tokStrength+1] = -1;
399                }
400            }
401            if(tok != NULL) {
402                tokStrength = tok->strength;
403            } else {
404                break;
405            }
406        }
407        for(st = 0; st < 3; st++) {
408            if((pos = lh->pos[st]) >= 0) {
409                t1 = *(CETable+3*(pos));
410                t2 = *(CETable+3*(pos)+1);
411                lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
412                lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
413                //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
414                lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
415                //pos--;
416                //t1 = *(CETable+3*(pos));
417                //t2 = *(CETable+3*(pos)+1);
418                t1 = lh->baseCE;
419                t2 = lh->baseContCE;
420                lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
421                lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
422                lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
423            }
424        }
425    }
426}
427
428
429#define ucol_countBytes(value, noOfBytes)   \
430{                               \
431    uint32_t mask = 0xFFFFFFFF;   \
432    (noOfBytes) = 0;              \
433    while(mask != 0) {            \
434    if(((value) & mask) != 0) { \
435    (noOfBytes)++;            \
436    }                           \
437    mask >>= 8;                 \
438    }                             \
439}
440
441static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
442    if(U_SUCCESS(*status)) {
443        g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
444    }
445    return g->current;
446}
447
448static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
449    /* TODO: rename to enum names */
450    uint32_t high, low, count=1;
451    uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
452
453    if(strength == UCOL_SECONDARY) {
454        low = UCOL_COMMON_TOP2<<24;
455        high = 0xFFFFFFFF;
456        count = 0xFF - UCOL_COMMON_TOP2;
457    } else {
458        low = UCOL_BYTE_COMMON << 24; //0x05000000;
459        high = 0x40000000;
460        count = 0x40 - UCOL_BYTE_COMMON;
461    }
462
463    if(tok->next != NULL && tok->next->strength == strength) {
464        count = tok->next->toInsert;
465    }
466
467    g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
468    g->current = UCOL_BYTE_COMMON<<24;
469
470    if(g->noOfRanges == 0) {
471        *status = U_INTERNAL_PROGRAM_ERROR;
472    }
473    return g->current;
474}
475
476static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
477    uint32_t strength = tok->strength;
478    uint32_t low = lows[fStrength*3+strength];
479    uint32_t high = highs[fStrength*3+strength];
480    uint32_t maxByte = 0;
481    if(strength == UCOL_TERTIARY) {
482        maxByte = 0x3F;
483    } else if(strength == UCOL_PRIMARY) {
484        maxByte = 0xFE;
485    } else {
486        maxByte = 0xFF;
487    }
488
489    uint32_t count = tok->toInsert;
490
491    if(low >= high && strength > UCOL_PRIMARY) {
492        int32_t s = strength;
493        for(;;) {
494            s--;
495            if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
496                if(strength == UCOL_SECONDARY) {
497                    if (low < UCOL_COMMON_TOP2<<24 ) {
498                       // Override if low range is less than UCOL_COMMON_TOP2.
499		        low = UCOL_COMMON_TOP2<<24;
500                    }
501                    high = 0xFFFFFFFF;
502                } else {
503                    // Override if low range is less than UCOL_COMMON_BOT3.
504		    if ( low < UCOL_COMMON_BOT3<<24 ) {
505                        low = UCOL_COMMON_BOT3<<24;
506		    }
507                    high = 0x40000000;
508                }
509                break;
510            }
511            if(s<0) {
512                *status = U_INTERNAL_PROGRAM_ERROR;
513                return 0;
514            }
515        }
516    }
517
518    if(low < 0x02000000) {
519        // We must not use CE weight byte 02, so we set it as the minimum lower bound.
520        // See http://site.icu-project.org/design/collation/bytes
521        low = 0x02000000;
522    }
523
524    if(strength == UCOL_SECONDARY) { /* similar as simple */
525        if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
526            low = UCOL_COMMON_TOP2<<24;
527        }
528        if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
529            high = UCOL_COMMON_TOP2<<24;
530        }
531        if(low < (UCOL_COMMON_BOT2<<24)) {
532            g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
533            g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
534            //g->current = UCOL_COMMON_BOT2<<24;
535            return g->current;
536        }
537    }
538
539    g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
540    if(g->noOfRanges == 0) {
541        *status = U_INTERNAL_PROGRAM_ERROR;
542    }
543    g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
544    return g->current;
545}
546
547static
548uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
549    uint32_t i = 0;
550    UChar c;
551
552    if(U_FAILURE(*status)) {
553        return 0;
554    }
555
556    if(sourceLen > resLen) {
557        *status = U_MEMORY_ALLOCATION_ERROR;
558        return 0;
559    }
560
561    for(i = 0; i < sourceLen; i++) {
562        c = source[i];
563        if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
564            switch(c - 0x3000) {
565            case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
566            case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
567                c++;
568                break;
569            case 0xF5:
570                c = 0x30AB;
571                break;
572            case 0xF6:
573                c = 0x30B1;
574                break;
575            }
576        }
577        resBuf[i] = c;
578    }
579    return sourceLen;
580}
581
582static
583uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
584    uint32_t i = 0;
585    UChar c;
586
587    if(U_FAILURE(*status)) {
588        return 0;
589    }
590
591    if(sourceLen > resLen) {
592        *status = U_MEMORY_ALLOCATION_ERROR;
593        return 0;
594    }
595
596    for(i = 0; i < sourceLen; i++) {
597        c = source[i];
598        if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
599            switch(c - 0x3000) {
600            case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
601            case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
602                c--;
603                break;
604            case 0xAB:
605                c = 0x30F5;
606                break;
607            case 0xB1:
608                c = 0x30F6;
609                break;
610            }
611        }
612        resBuf[i] = c;
613    }
614    return sourceLen;
615}
616
617U_NAMESPACE_BEGIN
618
619static
620uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
621    uint32_t i = 0;
622    UChar n[128];
623    uint32_t nLen = 0;
624    uint32_t uCount = 0, lCount = 0;
625
626    collIterate s;
627    uint32_t order = 0;
628
629    if(U_FAILURE(*status)) {
630        return UCOL_LOWER_CASE;
631    }
632
633    nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
634    if(U_SUCCESS(*status)) {
635        for(i = 0; i < nLen; i++) {
636            uprv_init_collIterate(UCA, &n[i], 1, &s, status);
637            order = ucol_getNextCE(UCA, &s, status);
638            if(isContinuation(order)) {
639                *status = U_INTERNAL_PROGRAM_ERROR;
640                return UCOL_LOWER_CASE;
641            }
642            if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
643                uCount++;
644            } else {
645                if(u_islower(n[i])) {
646                    lCount++;
647                } else if(U_SUCCESS(*status)) {
648                    UChar sk[1], lk[1];
649                    u_toSmallKana(&n[i], 1, sk, 1, status);
650                    u_toLargeKana(&n[i], 1, lk, 1, status);
651                    if(sk[0] == n[i] && lk[0] != n[i]) {
652                        lCount++;
653                    }
654                }
655            }
656        }
657    }
658
659    if(uCount != 0 && lCount != 0) {
660        return UCOL_MIXED_CASE;
661    } else if(uCount != 0) {
662        return UCOL_UPPER_CASE;
663    } else {
664        return UCOL_LOWER_CASE;
665    }
666}
667
668
669U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
670    /* this one makes the table and stuff */
671    uint32_t noOfBytes[3];
672    uint32_t i;
673
674    for(i = 0; i<3; i++) {
675        ucol_countBytes(CEparts[i], noOfBytes[i]);
676    }
677
678    /* Here we have to pack CEs from parts */
679
680    uint32_t CEi = 0;
681    uint32_t value = 0;
682
683    while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
684        if(CEi > 0) {
685            value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
686        } else {
687            value = 0;
688        }
689
690        if(2*CEi<noOfBytes[0]) {
691            value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
692        }
693        if(CEi<noOfBytes[1]) {
694            value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
695        }
696        if(CEi<noOfBytes[2]) {
697            value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
698        }
699        tok->CEs[CEi] = value;
700        CEi++;
701    }
702    if(CEi == 0) { /* totally ignorable */
703        tok->noOfCEs = 1;
704        tok->CEs[0] = 0;
705    } else { /* there is at least something */
706        tok->noOfCEs = CEi;
707    }
708
709
710    // we want to set case bits here and now, not later.
711    // Case bits handling
712    if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
713        tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
714        int32_t cSize = (tok->source & 0xFF000000) >> 24;
715        UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
716
717        if(cSize > 1) {
718            // Do it manually
719            tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
720        } else {
721            // Copy it from the UCA
722            uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
723            tok->CEs[0] |= (caseCE & 0xC0);
724        }
725    }
726
727#if UCOL_DEBUG==2
728    fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
729    for(i = 0; i<tok->noOfCEs; i++) {
730        fprintf(stderr, "%08X ", tok->CEs[i]);
731    }
732    fprintf(stderr, "\n");
733#endif
734}
735
736U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
737    ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
738    uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
739
740    UColToken *tok = lh->last;
741    uint32_t t[UCOL_STRENGTH_LIMIT];
742
743    uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
744
745    tok->toInsert = 1;
746    t[tok->strength] = 1;
747
748    while(tok->previous != NULL) {
749        if(tok->previous->strength < tok->strength) { /* going up */
750            t[tok->strength] = 0;
751            t[tok->previous->strength]++;
752        } else if(tok->previous->strength > tok->strength) { /* going down */
753            t[tok->previous->strength] = 1;
754        } else {
755            t[tok->strength]++;
756        }
757        tok=tok->previous;
758        tok->toInsert = t[tok->strength];
759    }
760
761    tok->toInsert = t[tok->strength];
762    ucol_inv_getGapPositions(src, lh, status);
763
764#if UCOL_DEBUG
765    fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
766    int32_t j = 2;
767    for(j = 2; j >= 0; j--) {
768        fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
769        fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
770    }
771    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
772
773    do {
774        fprintf(stderr,"%i", tok->strength);
775        tok = tok->next;
776    } while(tok != NULL);
777    fprintf(stderr, "\n");
778
779    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
780
781    do {
782        fprintf(stderr,"%i", tok->toInsert);
783        tok = tok->next;
784    } while(tok != NULL);
785#endif
786
787    tok = lh->first;
788    uint32_t fStrength = UCOL_IDENTICAL;
789    uint32_t initStrength = UCOL_IDENTICAL;
790
791
792    CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
793    CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
794    CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
795
796    while (tok != NULL && U_SUCCESS(*status)) {
797        fStrength = tok->strength;
798        if(fStrength < initStrength) {
799            initStrength = fStrength;
800            if(lh->pos[fStrength] == -1) {
801                while(lh->pos[fStrength] == -1 && fStrength > 0) {
802                    fStrength--;
803                }
804                if(lh->pos[fStrength] == -1) {
805                    *status = U_INTERNAL_PROGRAM_ERROR;
806                    return;
807                }
808            }
809            if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
810                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
811                CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
812                /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
813                CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
814            } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
815                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
816                /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
817                CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
818                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
819            } else { /* primaries */
820                /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
821                CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
822                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
823                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
824            }
825        } else {
826            if(tok->strength == UCOL_TERTIARY) {
827                CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
828            } else if(tok->strength == UCOL_SECONDARY) {
829                CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
830                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
831            } else if(tok->strength == UCOL_PRIMARY) {
832                CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
833                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
834                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
835            }
836        }
837        ucol_doCE(src, CEparts, tok, status);
838        tok = tok->next;
839    }
840}
841
842U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
843    UCAElements el;
844    UColToken *tok = lh->first;
845    UColToken *expt = NULL;
846    uint32_t i = 0, j = 0;
847    UChar32 fcdHighStart;
848    const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
849
850    while(tok != NULL && U_SUCCESS(*status)) {
851        /* first, check if there are any expansions */
852        /* if there are expansions, we need to do a little bit more processing */
853        /* since parts of expansion can be tailored, while others are not */
854        if(tok->expansion != 0) {
855            uint32_t len = tok->expansion >> 24;
856            uint32_t currentSequenceLen = len;
857            uint32_t expOffset = tok->expansion & 0x00FFFFFF;
858            //uint32_t exp = currentSequenceLen | expOffset;
859            UColToken exp;
860            exp.source = currentSequenceLen | expOffset;
861            exp.rulesToParseHdl = &(src->source);
862
863            while(len > 0) {
864                currentSequenceLen = len;
865                while(currentSequenceLen > 0) {
866                    exp.source = (currentSequenceLen << 24) | expOffset;
867                    if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
868                        uint32_t noOfCEsToCopy = expt->noOfCEs;
869                        for(j = 0; j<noOfCEsToCopy; j++) {
870                            tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
871                        }
872                        tok->noOfExpCEs += noOfCEsToCopy;
873                        // Smart people never try to add codepoints and CEs.
874                        // For some odd reason, it won't work.
875                        expOffset += currentSequenceLen; //noOfCEsToCopy;
876                        len -= currentSequenceLen; //noOfCEsToCopy;
877                        break;
878                    } else {
879                        currentSequenceLen--;
880                    }
881                }
882                if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
883                    /* will have to get one from UCA */
884                    /* first, get the UChars from the rules */
885                    /* then pick CEs out until there is no more and stuff them into expansion */
886                    collIterate s;
887                    uint32_t order = 0;
888                    uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
889
890                    for(;;) {
891                        order = ucol_getNextCE(src->UCA, &s, status);
892                        if(order == UCOL_NO_MORE_CES) {
893                            break;
894                        }
895                        tok->expCEs[tok->noOfExpCEs++] = order;
896                    }
897                    expOffset++;
898                    len--;
899                }
900            }
901        } else {
902            tok->noOfExpCEs = 0;
903        }
904
905        /* set the ucaelement with obtained values */
906        el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
907        /* copy CEs */
908        for(i = 0; i<tok->noOfCEs; i++) {
909            el.CEs[i] = tok->CEs[i];
910        }
911        for(i = 0; i<tok->noOfExpCEs; i++) {
912            el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
913        }
914
915        /* copy UChars */
916        // We kept prefix and source kind of together, as it is a kind of a contraction.
917        // However, now we have to slice the prefix off the main thing -
918        el.prefix = el.prefixChars;
919        el.cPoints = el.uchars;
920        if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
921            // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
922            // decomposed elements to the unsaf table.
923            el.prefixSize = tok->prefix>>24;
924            uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
925
926            el.cSize = (tok->source >> 24)-(tok->prefix>>24);
927            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
928        } else {
929            el.prefixSize = 0;
930            *el.prefix = 0;
931
932            el.cSize = (tok->source >> 24);
933            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
934        }
935        if(src->UCA != NULL) {
936            for(i = 0; i<el.cSize; i++) {
937                if(UCOL_ISJAMO(el.cPoints[i])) {
938                    t->image->jamoSpecial = TRUE;
939                }
940            }
941            if (!src->buildCCTabFlag && el.cSize > 0) {
942                // Check the trailing canonical combining class (tccc) of the last character.
943                const UChar *s = el.cPoints + el.cSize;
944                uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
945                if ((fcd & 0xff) != 0) {
946                    src->buildCCTabFlag = TRUE;
947                }
948            }
949        }
950
951        /* and then, add it */
952#if UCOL_DEBUG==2
953        fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
954#endif
955        uprv_uca_addAnElement(t, &el, status);
956
957#if UCOL_DEBUG_DUPLICATES
958        if(*status != U_ZERO_ERROR) {
959            fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
960            *status = U_ZERO_ERROR;
961        }
962#endif
963
964        tok = tok->next;
965    }
966}
967
968U_CDECL_BEGIN
969static UBool U_CALLCONV
970_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
971    UErrorCode status = U_ZERO_ERROR;
972    tempUCATable *t = (tempUCATable *)context;
973    if(value == 0) {
974        while(start < limit) {
975            uint32_t CE = utrie_get32(t->mapping, start, NULL);
976            if(CE == UCOL_NOT_FOUND) {
977                UCAElements el;
978                el.isThai = FALSE;
979                el.prefixSize = 0;
980                el.prefixChars[0] = 0;
981                el.prefix = el.prefixChars;
982                el.cPoints = el.uchars;
983
984                el.cSize = 0;
985                UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
986
987                el.noOfCEs = 1;
988                el.CEs[0] = 0;
989                uprv_uca_addAnElement(t, &el, &status);
990
991            }
992            start++;
993        }
994    }
995    if(U_FAILURE(status)) {
996        return FALSE;
997    } else {
998        return TRUE;
999    }
1000}
1001U_CDECL_END
1002
1003static void
1004ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1005                               UChar32 start, UChar32 end,
1006                               UErrorCode *status)
1007{
1008    //UChar decomp[256];
1009    uint32_t CE = UCOL_NOT_FOUND;
1010    UChar32 u = 0;
1011    UCAElements el;
1012    el.isThai = FALSE;
1013    el.prefixSize = 0;
1014    el.prefixChars[0] = 0;
1015    collIterate colIt;
1016
1017    if(U_SUCCESS(*status)) {
1018        for(u = start; u<=end; u++) {
1019            if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1020                /* this test is for contractions that are missing the starting element. */
1021                || ((isCntTableElement(CE)) &&
1022                (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1023                )
1024            {
1025                el.cSize = 0;
1026                U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1027                //decomp[0] = (UChar)u;
1028                //el.uchars[0] = (UChar)u;
1029                el.cPoints = el.uchars;
1030                //el.cSize = 1;
1031                el.noOfCEs = 0;
1032                el.prefix = el.prefixChars;
1033                el.prefixSize = 0;
1034                //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035                // We actually want to check whether this element is a special
1036                // If it is an implicit element (hangul, CJK - we want to copy the
1037                // special, not the resolved CEs) - for hangul, copying resolved
1038                // would just make things the same (there is an expansion and it
1039                // takes approximately the same amount of time to resolve as
1040                // falling back to the UCA).
1041                /*
1042                UTRIE_GET32(src->UCA->mapping, u, CE);
1043                tag = getCETag(CE);
1044                if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045                || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046                || tag == LEAD_SURROGATE_TAG) {
1047                el.CEs[el.noOfCEs++] = CE;
1048                } else {
1049                */
1050                // It turns out that it does not make sense to keep implicits
1051                // unresolved. The cost of resolving them is big enough so that
1052                // it doesn't make any difference whether we have to go to the UCA
1053                // or not.
1054                {
1055                    uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1056                    while(CE != UCOL_NO_MORE_CES) {
1057                        CE = ucol_getNextCE(src->UCA, &colIt, status);
1058                        if(CE != UCOL_NO_MORE_CES) {
1059                            el.CEs[el.noOfCEs++] = CE;
1060                        }
1061                    }
1062                }
1063                uprv_uca_addAnElement(t, &el, status);
1064            }
1065        }
1066    }
1067}
1068
1069U_NAMESPACE_END
1070
1071U_CFUNC UCATableHeader *
1072ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1073    U_NAMESPACE_USE
1074
1075    uint32_t i = 0;
1076    if(U_FAILURE(*status)) {
1077        return NULL;
1078    }
1079    /*
1080    2.  Eliminate the negative lists by doing the following for each non-null negative list:
1081    o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1082    create new ListHeader X
1083    o   reverse the list, add to the end of X's positive list. Reset the strength of the
1084    first item you add, based on the stronger strength levels of the two lists.
1085    */
1086    /*
1087    3.  For each ListHeader with a non-null positive list:
1088    */
1089    /*
1090    o   Find all character strings with CEs between the baseCE and the
1091    next/previous CE, at the strength of the first token. Add these to the
1092    tailoring.
1093    ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1094    tailoring has & x < z...
1095    ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1096    */
1097    /* It is possible that this part should be done even while constructing list */
1098    /* The problem is that it is unknown what is going to be the strongest weight */
1099    /* So we might as well do it here */
1100
1101    /*
1102    o   Allocate CEs for each token in the list, based on the total number N of the
1103    largest level difference, and the gap G between baseCE and nextCE at that
1104    level. The relation * between the last item and nextCE is the same as the
1105    strongest strength.
1106    o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1107    ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1108    Then fit b and c into the secondary gap between a and d, then fit q
1109    into the tertiary gap between b and c.
1110
1111    o   Example: baseCE << b <<< q << c * nextCE(X,2)
1112    ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1113    Then fit q into the tertiary gap between b and c.
1114    o   When incrementing primary values, we will not cross high byte
1115    boundaries except where there is only a single-byte primary. That is to
1116    ensure that the script reordering will continue to work.
1117    */
1118    UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1119    /* test for NULL */
1120    if (image == NULL) {
1121        *status = U_MEMORY_ALLOCATION_ERROR;
1122        return NULL;
1123    }
1124    uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1125
1126    for(i = 0; i<src->resultLen; i++) {
1127        /* now we need to generate the CEs */
1128        /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1129        /* According to strength                                                          */
1130        if(U_SUCCESS(*status)) {
1131            if(src->lh[i].first) { // if there are any elements
1132                // due to the way parser works, subsequent tailorings
1133                // may remove all the elements from a sequence, therefore
1134                // leaving an empty tailoring sequence.
1135                ucol_initBuffers(src, &src->lh[i], status);
1136            }
1137        }
1138        if(U_FAILURE(*status)) {
1139            uprv_free(image);
1140            return NULL;
1141        }
1142    }
1143
1144    if(src->varTop != NULL) { /* stuff the variable top value */
1145        src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1146        /* remove it from the list */
1147        if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1148            src->varTop->listHeader->first = src->varTop->next;
1149        }
1150        if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1151            src->varTop->listHeader->last = src->varTop->previous;
1152        }
1153        if(src->varTop->next != NULL) {
1154            src->varTop->next->previous = src->varTop->previous;
1155        }
1156        if(src->varTop->previous != NULL) {
1157            src->varTop->previous->next = src->varTop->next;
1158        }
1159    }
1160
1161
1162    tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1163    if(U_FAILURE(*status)) {
1164        uprv_free(image);
1165        return NULL;
1166    }
1167
1168
1169    /* After this, we have assigned CE values to all regular CEs      */
1170    /* now we will go through list once more and resolve expansions,  */
1171    /* make UCAElements structs and add them to table                 */
1172    for(i = 0; i<src->resultLen; i++) {
1173        /* now we need to generate the CEs */
1174        /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1175        /* According to strength                                                          */
1176        if(U_SUCCESS(*status)) {
1177            ucol_createElements(src, t, &src->lh[i], status);
1178        }
1179    }
1180
1181    UCAElements el;
1182    el.isThai = FALSE;
1183    el.prefixSize = 0;
1184    el.prefixChars[0] = 0;
1185
1186    /* add latin-1 stuff */
1187    ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1188
1189    /* add stuff for copying */
1190    if(src->copySet != NULL) {
1191        int32_t i = 0;
1192        UnicodeSet *set = (UnicodeSet *)src->copySet;
1193        for(i = 0; i < set->getRangeCount(); i++) {
1194            ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1195        }
1196    }
1197
1198    if(U_SUCCESS(*status)) {
1199        /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1200
1201        uint32_t tailoredCE = UCOL_NOT_FOUND;
1202        //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1203        UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1204        UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1205        // Check for null pointer
1206        if (ucaEl == NULL) {
1207        	*status = U_MEMORY_ALLOCATION_ERROR;
1208        	return NULL;
1209        }
1210        while(*conts != 0) {
1211            /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1212            tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1213            if(tailoredCE != UCOL_NOT_FOUND) {
1214                UBool needToAdd = TRUE;
1215                if(isCntTableElement(tailoredCE)) {
1216                    if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1217                        needToAdd = FALSE;
1218                    }
1219                }
1220                if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1221                    UCAElements elm;
1222                    elm.cPoints = el.uchars;
1223                    elm.noOfCEs = 0;
1224                    elm.uchars[0] = *conts;
1225                    elm.uchars[1] = 0;
1226                    elm.cSize = 1;
1227                    elm.prefixChars[0] = *(conts+2);
1228                    elm.isThai = FALSE;
1229                    elm.prefix = elm.prefixChars;
1230                    elm.prefixSize = 1;
1231                    UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1232                    if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1233                        needToAdd = TRUE;
1234                    }
1235                }
1236                if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1237                    needToAdd = FALSE;
1238                }
1239
1240                if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1241                    if (*(conts+1) != 0) {  // contractions
1242                        el.prefix = el.prefixChars;
1243                        el.prefixSize = 0;
1244                        el.cPoints = el.uchars;
1245                        el.noOfCEs = 0;
1246                        el.uchars[0] = *conts;
1247                        el.uchars[1] = *(conts+1);
1248                        if(*(conts+2)!=0) {
1249                            el.uchars[2] = *(conts+2);
1250                            el.cSize = 3;
1251                        } else {
1252                            el.cSize = 2;
1253                        }
1254                        ucol_setText(ucaEl, el.uchars, el.cSize, status);
1255                    }
1256                    else { // pre-context character
1257                        UChar str[4] = { 0 };
1258                        int32_t len=0;
1259                        int32_t preKeyLen=0;
1260
1261                        el.cPoints = el.uchars;
1262                        el.noOfCEs = 0;
1263                        el.uchars[0] = *conts;
1264                        el.uchars[1] = 0;
1265                        el.cSize = 1;
1266                        el.prefixChars[0] = *(conts+2);
1267                        el.prefix = el.prefixChars;
1268                        el.prefixSize = 1;
1269                        if (el.prefixChars[0]!=0) {
1270                            // get CE of prefix character first
1271                            str[0]=el.prefixChars[0];
1272                            str[1]=0;
1273                            ucol_setText(ucaEl, str, 1, status);
1274                            while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1275                                    != UCOL_NULLORDER) {
1276                                preKeyLen++;  // count number of keys for prefix character
1277                            }
1278                            str[len++] = el.prefixChars[0];
1279                        }
1280
1281                        str[len++] = el.uchars[0];
1282                        str[len]=0;
1283                        ucol_setText(ucaEl, str, len, status);
1284                        // Skip the keys for prefix character, then copy the rest to el.
1285                        while ((preKeyLen-->0) &&
1286                               (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1287                            continue;
1288                        }
1289
1290                    }
1291                    while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1292                        el.noOfCEs++;
1293                    }
1294                    uprv_uca_addAnElement(t, &el, status);
1295                }
1296
1297            } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1298                ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1299            }
1300            conts+=3;
1301        }
1302        ucol_closeElements(ucaEl);
1303    }
1304
1305    // Add completely ignorable elements
1306    utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1307
1308    // add tailoring characters related canonical closures
1309    uprv_uca_canonicalClosure(t, src, NULL, status);
1310
1311    /* still need to produce compatibility closure */
1312
1313    UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1314
1315    uprv_uca_closeTempTable(t);
1316    uprv_free(image);
1317
1318    return myData;
1319}
1320
1321U_CDECL_BEGIN
1322static UBool U_CALLCONV
1323ucol_bld_cleanup(void)
1324{
1325    udata_close(invUCA_DATA_MEM);
1326    invUCA_DATA_MEM = NULL;
1327    _staticInvUCA = NULL;
1328    return TRUE;
1329}
1330U_CDECL_END
1331
1332U_CAPI const InverseUCATableHeader * U_EXPORT2
1333ucol_initInverseUCA(UErrorCode *status)
1334{
1335    if(U_FAILURE(*status)) return NULL;
1336
1337    UBool needsInit;
1338    UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1339
1340    if(needsInit) {
1341        InverseUCATableHeader *newInvUCA = NULL;
1342        UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1343
1344        if(U_FAILURE(*status)) {
1345            if (result) {
1346                udata_close(result);
1347            }
1348            // This is not needed, as we are talking about
1349            // memory we got from UData
1350            //uprv_free(newInvUCA);
1351        }
1352
1353        if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1354            newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1355            UCollator *UCA = ucol_initUCA(status);
1356            // UCA versions of UCA and inverse UCA should match
1357            if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1358                *status = U_INVALID_FORMAT_ERROR;
1359                udata_close(result);
1360                return NULL;
1361            }
1362
1363            umtx_lock(NULL);
1364            if(_staticInvUCA == NULL) {
1365                invUCA_DATA_MEM = result;
1366                _staticInvUCA = newInvUCA;
1367                result = NULL;
1368                newInvUCA = NULL;
1369            }
1370            umtx_unlock(NULL);
1371
1372            if(newInvUCA != NULL) {
1373                udata_close(result);
1374                // This is not needed, as we are talking about
1375                // memory we got from UData
1376                //uprv_free(newInvUCA);
1377            }
1378            else {
1379                ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1380            }
1381        }
1382    }
1383    return _staticInvUCA;
1384}
1385
1386/* This is the data that is used for non-script reordering codes. These _must_ be kept
1387 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
1388 */
1389static const char* ReorderingTokenNames[] = {
1390    "SPACE",
1391    "PUNCT",
1392    "SYMBOL",
1393    "CURRENCY",
1394    "DIGIT",
1395    NULL
1396};
1397
1398static void toUpper(const char* src, char* dst, uint32_t length) {
1399   for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1400       *dst = toupper(*src);
1401   }
1402   *dst = '\0';
1403}
1404
1405U_INTERNAL int32_t U_EXPORT2
1406ucol_findReorderingEntry(const char* name) {
1407    char buffer[32];
1408    toUpper(name, buffer, 32);
1409    for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
1410        if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1411            return entry + UCOL_REORDER_CODE_FIRST;
1412        }
1413    }
1414    return USCRIPT_INVALID_CODE;
1415}
1416
1417#endif /* #if !UCONFIG_NO_COLLATION */
1418