1/*
2*******************************************************************************
3*   Copyright (C) 1996-2009, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ucol.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11* Modification history
12* Date        Name      Comments
13* 1996-1999   various members of ICU team maintained C API for collation framework
14* 02/16/2001  synwee    Added internal method getPrevSpecialCE
15* 03/01/2001  synwee    Added maxexpansion functionality.
16* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_COLLATION
22
23#include "unicode/coleitr.h"
24#include "unicode/unorm.h"
25#include "unicode/udata.h"
26#include "unicode/ustring.h"
27
28#include "ucol_imp.h"
29#include "bocsu.h"
30
31#include "unormimp.h"
32#include "unorm_it.h"
33#include "umutex.h"
34#include "cmemory.h"
35#include "ucln_in.h"
36#include "cstring.h"
37#include "utracimp.h"
38#include "putilimp.h"
39#include "uassert.h"
40
41#ifdef UCOL_DEBUG
42#include <stdio.h>
43#endif
44
45U_NAMESPACE_USE
46
47#define LAST_BYTE_MASK_           0xFF
48#define SECOND_LAST_BYTE_SHIFT_   8
49
50#define ZERO_CC_LIMIT_            0xC0
51
52// this is static pointer to the normalizer fcdTrieIndex
53// it is always the same between calls to u_cleanup
54// and therefore writing to it is not synchronized.
55// It is cleaned in ucol_cleanup
56static const uint16_t *fcdTrieIndex=NULL;
57// Code points at fcdHighStart and above have a zero FCD value.
58static UChar32 fcdHighStart = 0;
59
// These are values from the UCA that are required for
// implicit CE generation and for suppressing sort key compression.
// They should normally come from the UCA data, but if one
// is running without the UCA, that could be a problem.
64static const int32_t maxRegularPrimary  = 0xA0;
65static const int32_t minImplicitPrimary = 0xE0;
66static const int32_t maxImplicitPrimary = 0xE4;
67
68U_CDECL_BEGIN
/*
 * Cleanup callback for this file; ucol_initCollator() registers it via
 * ucln_i18n_registerCleanup(). It drops the cached FCD trie index pointer so
 * that the next ucol_initCollator() call fetches it again.
 * Always returns TRUE.
 */
static UBool U_CALLCONV
ucol_cleanup(void)
{
    fcdTrieIndex = NULL;
    return TRUE;
}
75
76static int32_t U_CALLCONV
77_getFoldingOffset(uint32_t data) {
78    return (int32_t)(data&0xFFFFFF);
79}
80
81U_CDECL_END
82
83static
84inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
85                              int32_t sourceLen, collIterate *s)
86{
87    (s)->string = (s)->pos = (UChar *)(sourceString);
88    (s)->origFlags = 0;
89    (s)->flags = 0;
90    if (sourceLen >= 0) {
91        s->flags |= UCOL_ITER_HASLEN;
92        (s)->endp = (UChar *)sourceString+sourceLen;
93    }
94    else {
95        /* change to enable easier checking for end of string for fcdpositon */
96        (s)->endp = NULL;
97    }
98    (s)->extendCEs = NULL;
99    (s)->extendCEsSize = 0;
100    (s)->CEpos = (s)->toReturn = (s)->CEs;
101    (s)->offsetBuffer = NULL;
102    (s)->offsetBufferSize = 0;
103    (s)->offsetReturn = (s)->offsetStore = NULL;
104    (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
105    (s)->writableBuffer = (s)->stackWritableBuffer;
106    (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
107    (s)->coll = (collator);
108    (s)->fcdPosition = 0;
109    if(collator->normalizationMode == UCOL_ON) {
110        (s)->flags |= UCOL_ITER_NORM;
111    }
112    if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
113        (s)->flags |= UCOL_HIRAGANA_Q;
114    }
115    (s)->iterator = NULL;
116    //(s)->iteratorIndex = 0;
117}
118
/*
 * Exported, out-of-line entry point for initializing a collIterate.
 * Forwards all arguments unchanged to the inline IInit_collIterate().
 */
U_CAPI void  U_EXPORT2
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
                             int32_t sourceLen, collIterate *s){
    /* Out-of-line version for use from other files. */
    IInit_collIterate(collator, sourceString, sourceLen, s);
}
125
/**
* Backup the state of the collIterate struct data.
* Saves the position, flags, FCD position and the address/size of the
* writable buffer. When a character iterator is attached, its state is
* saved as well.
* @param data collIterate to backup
* @param backup storage
*/
static
inline void backupState(const collIterate *data, collIterateState *backup)
{
    backup->fcdPosition = data->fcdPosition;
    backup->flags       = data->flags;
    backup->origFlags   = data->origFlags;
    backup->pos         = data->pos;
    // remember which buffer was in use, so that loadState() can detect a reallocation
    backup->bufferaddress = data->writableBuffer;
    backup->buffersize    = data->writableBufSize;
    backup->iteratorMove = 0;
    backup->iteratorIndex = 0;
    if(data->iterator != NULL) {
        //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
        backup->iteratorIndex = data->iterator->getState(data->iterator);
        // now we try to fix up if we're using a normalizing iterator and we get UITER_NO_STATE
        if(backup->iteratorIndex == UITER_NO_STATE) {
            // step backwards one unit at a time until the iterator reports a state,
            // counting the steps in iteratorMove
            while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
                backup->iteratorMove++;
                data->iterator->move(data->iterator, -1, UITER_CURRENT);
            }
            // move forward again by the same number of steps;
            // loadState() replays iteratorMove after restoring the saved state
            data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
        }
    }
}
155
156/**
157* Loads the state into the collIterate struct data
158* @param data collIterate to backup
159* @param backup storage
160* @param forwards boolean to indicate if forwards iteration is used,
161*        false indicates backwards iteration
162*/
163static
164inline void loadState(collIterate *data, const collIterateState *backup,
165                      UBool        forwards)
166{
167    UErrorCode status = U_ZERO_ERROR;
168    data->flags       = backup->flags;
169    data->origFlags   = backup->origFlags;
170    if(data->iterator != NULL) {
171        //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
172        data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
173        if(backup->iteratorMove != 0) {
174            data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
175        }
176    }
177    data->pos         = backup->pos;
178
179    if ((data->flags & UCOL_ITER_INNORMBUF) &&
180        data->writableBuffer != backup->bufferaddress) {
181        /*
182        this is when a new buffer has been reallocated and we'll have to
183        calculate the new position.
184        note the new buffer has to contain the contents of the old buffer.
185        */
186        if (forwards) {
187            data->pos = data->writableBuffer +
188                                         (data->pos - backup->bufferaddress);
189        }
190        else {
191            /* backwards direction */
192            uint32_t temp = backup->buffersize -
193                                  (data->pos - backup->bufferaddress);
194            data->pos = data->writableBuffer + (data->writableBufSize - temp);
195        }
196    }
197    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
198        /*
199        this is alittle tricky.
200        if we are initially not in the normalization buffer, even if we
201        normalize in the later stage, the data in the buffer will be
202        ignored, since we skip back up to the data string.
203        however if we are already in the normalization buffer, any
204        further normalization will pull data into the normalization
205        buffer and modify the fcdPosition.
206        since we are keeping the data in the buffer for use, the
207        fcdPosition can not be reverted back.
208        arrgghh....
209        */
210        data->fcdPosition = backup->fcdPosition;
211    }
212}
213
214
215/*
216* collIter_eos()
217*     Checks for a collIterate being positioned at the end of
218*     its source string.
219*
220*/
221static
222inline UBool collIter_eos(collIterate *s) {
223    if(s->flags & UCOL_USE_ITERATOR) {
224      return !(s->iterator->hasNext(s->iterator));
225    }
226    if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
227        // Null terminated string, but not at null, so not at end.
228        //   Whether in main or normalization buffer doesn't matter.
229        return FALSE;
230    }
231
232    // String with length.  Can't be in normalization buffer, which is always
233    //  null termintated.
234    if (s->flags & UCOL_ITER_HASLEN) {
235        return (s->pos == s->endp);
236    }
237
238    // We are at a null termination, could be either normalization buffer or main string.
239    if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
240        // At null at end of main string.
241        return TRUE;
242    }
243
244    // At null at end of normalization buffer.  Need to check whether there there are
245    //   any characters left in the main buffer.
246    if(s->origFlags & UCOL_USE_ITERATOR) {
247      return !(s->iterator->hasNext(s->iterator));
248    } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
249        // Null terminated main string.  fcdPosition is the 'return' position into main buf.
250        return (*s->fcdPosition == 0);
251    }
252    else {
253        // Main string with an end pointer.
254        return s->fcdPosition == s->endp;
255    }
256}
257
258/*
259* collIter_bos()
260*     Checks for a collIterate being positioned at the start of
261*     its source string.
262*
263*/
264static
265inline UBool collIter_bos(collIterate *source) {
266  // if we're going backwards, we need to know whether there is more in the
267  // iterator, even if we are in the side buffer
268  if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
269    return !source->iterator->hasPrevious(source->iterator);
270  }
271  if (source->pos <= source->string ||
272      ((source->flags & UCOL_ITER_INNORMBUF) &&
273      *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
274    return TRUE;
275  }
276  return FALSE;
277}
278
279/*static
280inline UBool collIter_SimpleBos(collIterate *source) {
281  // if we're going backwards, we need to know whether there is more in the
282  // iterator, even if we are in the side buffer
283  if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
284    return !source->iterator->hasPrevious(source->iterator);
285  }
286  if (source->pos == source->string) {
287    return TRUE;
288  }
289  return FALSE;
290}*/
291    //return (data->pos == data->string) ||
292
293
294/**
295* Checks and free writable buffer if it is not the original stack buffer
296* in collIterate. This function does not reassign the writable buffer.
297* @param data collIterate struct to determine and free the writable buffer
298*/
299static
300inline void freeHeapWritableBuffer(collIterate *data)
301{
302    if (data->writableBuffer != data->stackWritableBuffer) {
303        uprv_free(data->writableBuffer);
304    }
305}
306
307
308/****************************************************************************/
309/* Following are the open/close functions                                   */
310/*                                                                          */
311/****************************************************************************/
312
313static UCollator*
314ucol_initFromBinary(const uint8_t *bin, int32_t length,
315                const UCollator *base,
316                UCollator *fillIn,
317                UErrorCode *status)
318{
319    UCollator *result = fillIn;
320    if(U_FAILURE(*status)) {
321        return NULL;
322    }
323    /*
324    if(base == NULL) {
325        // we don't support null base yet
326        *status = U_ILLEGAL_ARGUMENT_ERROR;
327        return NULL;
328    }
329    */
330    // We need these and we could be running without UCA
331    uprv_uca_initImplicitConstants(status);
332    UCATableHeader *colData = (UCATableHeader *)bin;
333    // do we want version check here? We're trying to figure out whether collators are compatible
334    if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
335        uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
336        colData->version[0] != UCOL_BUILDER_VERSION)
337    {
338        *status = U_COLLATOR_VERSION_MISMATCH;
339        return NULL;
340    }
341    else {
342        if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
343            result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
344            if(U_FAILURE(*status)){
345                return NULL;
346            }
347            result->hasRealData = TRUE;
348        }
349        else {
350            if(base) {
351                result = ucol_initCollator(base->image, result, base, status);
352                ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
353                if(U_FAILURE(*status)){
354                    return NULL;
355                }
356                result->hasRealData = FALSE;
357            }
358            else {
359                *status = U_USELESS_COLLATOR_ERROR;
360                return NULL;
361            }
362        }
363        result->freeImageOnClose = FALSE;
364    }
365    result->actualLocale = NULL;
366    result->validLocale = NULL;
367    result->requestedLocale = NULL;
368    result->rules = NULL;
369    result->rulesLength = 0;
370    result->freeRulesOnClose = FALSE;
371    result->ucaRules = NULL;
372    return result;
373}
374
/*
 * Opens a collator from a binary image.
 * Forwards to ucol_initFromBinary() with a NULL fill-in struct, so the
 * returned collator is newly set up by that function rather than placed in
 * caller-provided memory.
 */
U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t *bin, int32_t length,
                const UCollator *base,
                UErrorCode *status)
{
    return ucol_initFromBinary(bin, length, base, NULL, status);
}
382
383U_CAPI int32_t U_EXPORT2
384ucol_cloneBinary(const UCollator *coll,
385                 uint8_t *buffer, int32_t capacity,
386                 UErrorCode *status)
387{
388    int32_t length = 0;
389    if(U_FAILURE(*status)) {
390        return length;
391    }
392    if(capacity < 0) {
393        *status = U_ILLEGAL_ARGUMENT_ERROR;
394        return length;
395    }
396    if(coll->hasRealData == TRUE) {
397        length = coll->image->size;
398        if(length <= capacity) {
399            uprv_memcpy(buffer, coll->image, length);
400        } else {
401            *status = U_BUFFER_OVERFLOW_ERROR;
402        }
403    } else {
404        length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
405        if(length <= capacity) {
406            /* build the UCATableHeader with minimal entries */
407            /* do not copy the header from the UCA file because its values are wrong! */
408            /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
409
410            /* reset everything */
411            uprv_memset(buffer, 0, length);
412
413            /* set the tailoring-specific values */
414            UCATableHeader *myData = (UCATableHeader *)buffer;
415            myData->size = length;
416
417            /* offset for the options, the only part of the data that is present after the header */
418            myData->options = sizeof(UCATableHeader);
419
420            /* need to always set the expansion value for an upper bound of the options */
421            myData->expansion = myData->options + sizeof(UColOptionSet);
422
423            myData->magic = UCOL_HEADER_MAGIC;
424            myData->isBigEndian = U_IS_BIG_ENDIAN;
425            myData->charSetFamily = U_CHARSET_FAMILY;
426
427            /* copy UCA's version; genrb will override all but the builder version with tailoring data */
428            uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
429
430            uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
431            uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
432            uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
433            myData->jamoSpecial = coll->image->jamoSpecial;
434
435            /* copy the collator options */
436            uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
437        } else {
438            *status = U_BUFFER_OVERFLOW_ERROR;
439        }
440    }
441    return length;
442}
443
444U_CAPI UCollator* U_EXPORT2
445ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
446{
447    UCollator * localCollator;
448    int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
449    char *stackBufferChars = (char *)stackBuffer;
450    int32_t imageSize = 0;
451    int32_t rulesSize = 0;
452    int32_t rulesPadding = 0;
453    uint8_t *image;
454    UChar *rules;
455    UBool colAllocated = FALSE;
456    UBool imageAllocated = FALSE;
457
458    if (status == NULL || U_FAILURE(*status)){
459        return 0;
460    }
461    if ((stackBuffer && !pBufferSize) || !coll){
462       *status = U_ILLEGAL_ARGUMENT_ERROR;
463        return 0;
464    }
465    if (coll->rules && coll->freeRulesOnClose) {
466        rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
467        rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
468        bufferSizeNeeded += rulesSize + rulesPadding;
469    }
470
471    if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
472        *pBufferSize =  bufferSizeNeeded;
473        return 0;
474    }
475
476    /* Pointers on 64-bit platforms need to be aligned
477     * on a 64-bit boundry in memory.
478     */
479    if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
480        int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
481        if (*pBufferSize > offsetUp) {
482            *pBufferSize -= offsetUp;
483            stackBufferChars += offsetUp;
484        }
485        else {
486            /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
487            *pBufferSize = 1;
488        }
489    }
490    stackBuffer = (void *)stackBufferChars;
491
492    if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
493        /* allocate one here...*/
494        stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
495        // Null pointer check.
496        if (stackBufferChars == NULL) {
497            *status = U_MEMORY_ALLOCATION_ERROR;
498            return NULL;
499        }
500        colAllocated = TRUE;
501        if (U_SUCCESS(*status)) {
502            *status = U_SAFECLONE_ALLOCATED_WARNING;
503        }
504    }
505    localCollator = (UCollator *)stackBufferChars;
506    rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
507    {
508        UErrorCode tempStatus = U_ZERO_ERROR;
509        imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
510    }
511    if (coll->freeImageOnClose) {
512        image = (uint8_t *)uprv_malloc(imageSize);
513        // Null pointer check
514        if (image == NULL) {
515            *status = U_MEMORY_ALLOCATION_ERROR;
516            return NULL;
517        }
518        ucol_cloneBinary(coll, image, imageSize, status);
519        imageAllocated = TRUE;
520    }
521    else {
522        image = (uint8_t *)coll->image;
523    }
524    localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
525    if (U_FAILURE(*status)) {
526        return NULL;
527    }
528
529    if (coll->rules) {
530        if (coll->freeRulesOnClose) {
531            localCollator->rules = u_strcpy(rules, coll->rules);
532            //bufferEnd += rulesSize;
533        }
534        else {
535            localCollator->rules = coll->rules;
536        }
537        localCollator->freeRulesOnClose = FALSE;
538        localCollator->rulesLength = coll->rulesLength;
539    }
540
541    int32_t i;
542    for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
543        ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
544    }
545    // zero copies of pointers
546    localCollator->actualLocale = NULL;
547    localCollator->validLocale = NULL;
548    localCollator->requestedLocale = NULL;
549    localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
550    localCollator->freeOnClose = colAllocated;
551    localCollator->freeImageOnClose = imageAllocated;
552    return localCollator;
553}
554
555U_CAPI void U_EXPORT2
556ucol_close(UCollator *coll)
557{
558    UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
559    UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
560    if(coll != NULL) {
561        // these are always owned by each UCollator struct,
562        // so we always free them
563        if(coll->validLocale != NULL) {
564            uprv_free(coll->validLocale);
565        }
566        if(coll->actualLocale != NULL) {
567            uprv_free(coll->actualLocale);
568        }
569        if(coll->requestedLocale != NULL) {
570            uprv_free(coll->requestedLocale);
571        }
572        if(coll->latinOneCEs != NULL) {
573            uprv_free(coll->latinOneCEs);
574        }
575        if(coll->options != NULL && coll->freeOptionsOnClose) {
576            uprv_free(coll->options);
577        }
578        if(coll->rules != NULL && coll->freeRulesOnClose) {
579            uprv_free((UChar *)coll->rules);
580        }
581        if(coll->image != NULL && coll->freeImageOnClose) {
582            uprv_free((UCATableHeader *)coll->image);
583        }
584
585        /* Here, it would be advisable to close: */
586        /* - UData for UCA (unless we stuff it in the root resb */
587        /* Again, do we need additional housekeeping... HMMM! */
588        UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
589        if(coll->freeOnClose){
590            /* for safeClone, if freeOnClose is FALSE,
591            don't free the other instance data */
592            uprv_free(coll);
593        }
594    }
595    UTRACE_EXIT();
596}
597
598/* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
599/* you should be able to get the binary chunk to write out...  Doesn't look very full now */
600U_CFUNC uint8_t* U_EXPORT2
601ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
602{
603    uint8_t *result = NULL;
604    if(U_FAILURE(*status)) {
605        return NULL;
606    }
607    if(coll->hasRealData == TRUE) {
608        *length = coll->image->size;
609        result = (uint8_t *)uprv_malloc(*length);
610        /* test for NULL */
611        if (result == NULL) {
612            *status = U_MEMORY_ALLOCATION_ERROR;
613            return NULL;
614        }
615        uprv_memcpy(result, coll->image, *length);
616    } else {
617        *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
618        result = (uint8_t *)uprv_malloc(*length);
619        /* test for NULL */
620        if (result == NULL) {
621            *status = U_MEMORY_ALLOCATION_ERROR;
622            return NULL;
623        }
624
625        /* build the UCATableHeader with minimal entries */
626        /* do not copy the header from the UCA file because its values are wrong! */
627        /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
628
629        /* reset everything */
630        uprv_memset(result, 0, *length);
631
632        /* set the tailoring-specific values */
633        UCATableHeader *myData = (UCATableHeader *)result;
634        myData->size = *length;
635
636        /* offset for the options, the only part of the data that is present after the header */
637        myData->options = sizeof(UCATableHeader);
638
639        /* need to always set the expansion value for an upper bound of the options */
640        myData->expansion = myData->options + sizeof(UColOptionSet);
641
642        myData->magic = UCOL_HEADER_MAGIC;
643        myData->isBigEndian = U_IS_BIG_ENDIAN;
644        myData->charSetFamily = U_CHARSET_FAMILY;
645
646        /* copy UCA's version; genrb will override all but the builder version with tailoring data */
647        uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
648
649        uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
650        uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
651        uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
652        myData->jamoSpecial = coll->image->jamoSpecial;
653
654        /* copy the collator options */
655        uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
656    }
657    return result;
658}
659
660void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
661    if(U_FAILURE(*status)) {
662        return;
663    }
664    result->caseFirst = (UColAttributeValue)opts->caseFirst;
665    result->caseLevel = (UColAttributeValue)opts->caseLevel;
666    result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
667    result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
668    result->strength = (UColAttributeValue)opts->strength;
669    result->variableTopValue = opts->variableTopValue;
670    result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
671    result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
672    result->numericCollation = (UColAttributeValue)opts->numericCollation;
673
674    result->caseFirstisDefault = TRUE;
675    result->caseLevelisDefault = TRUE;
676    result->frenchCollationisDefault = TRUE;
677    result->normalizationModeisDefault = TRUE;
678    result->strengthisDefault = TRUE;
679    result->variableTopValueisDefault = TRUE;
680    result->hiraganaQisDefault = TRUE;
681    result->numericCollationisDefault = TRUE;
682
683    ucol_updateInternalState(result, status);
684
685    result->options = opts;
686}
687
688
689/**
690* Approximate determination if a character is at a contraction end.
691* Guaranteed to be TRUE if a character is at the end of a contraction,
692* otherwise it is not deterministic.
693* @param c character to be determined
694* @param coll collator
695*/
696static
697inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
698    if (c < coll->minContrEndCP) {
699        return FALSE;
700    }
701
702    int32_t  hash = c;
703    uint8_t  htbyte;
704    if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
705        if (U16_IS_TRAIL(c)) {
706            return TRUE;
707        }
708        hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
709    }
710    htbyte = coll->contrEndCP[hash>>3];
711    return (((htbyte >> (hash & 7)) & 1) == 1);
712}
713
714
715
716/*
717*   i_getCombiningClass()
718*        A fast, at least partly inline version of u_getCombiningClass()
719*        This is a candidate for further optimization.  Used heavily
720*        in contraction processing.
721*/
722static
723inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
724    uint8_t sCC = 0;
725    if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
726        sCC = u_getCombiningClass(c);
727    }
728    return sCC;
729}
730
/*
 * Initializes a UCollator on top of a binary collation image.
 *
 * image  - collation image; the collator keeps pointers into it, nothing is copied.
 *          A NULL image makes the function return NULL without touching status.
 * fillIn - collator struct to initialize, or NULL to allocate one with uprv_malloc().
 *          freeOnClose records which of the two happened.
 * UCA    - stored in result->UCA as is.
 * status - in/out error code; U_MEMORY_ALLOCATION_ERROR if the struct cannot be allocated.
 *
 * Returns the initialized collator. If unserializing the mapping trie fails,
 * a struct allocated here is freed and NULL is returned, while a caller-provided
 * fillIn is returned as is with status indicating the failure.
 */
UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    UChar c;
    UCollator *result = fillIn;
    if(U_FAILURE(*status) || image == NULL) {
        return NULL;
    }

    if(result == NULL) {
        result = (UCollator *)uprv_malloc(sizeof(UCollator));
        if(result == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return result;
        }
        result->freeOnClose = TRUE;
    } else {
        result->freeOnClose = FALSE;
    }

    // init FCD data
    // (file-level cache shared by all collators; ucol_cleanup() resets it)
    if (fcdTrieIndex == NULL) {
        // The result is constant, until the library is reloaded.
        fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
        ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
    }

    result->image = image;
    result->mapping.getFoldingOffset = _getFoldingOffset;
    // the serialized trie is taken to span from mappingPosition up to endExpansionCE
    const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
    utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
    if(U_FAILURE(*status)) {
        // only free what was allocated here; a caller-provided struct is handed back
        if(result->freeOnClose == TRUE) {
            uprv_free(result);
            result = NULL;
        }
        return result;
    }

    // the tables below are addressed as byte offsets from the start of the image
    /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
    result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
    result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
    result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);

    // options live inside the image, so they must not be freed with the collator
    result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
    result->freeOptionsOnClose = FALSE;

    /* set attributes */
    result->caseFirst = (UColAttributeValue)result->options->caseFirst;
    result->caseLevel = (UColAttributeValue)result->options->caseLevel;
    result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
    result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
    result->strength = (UColAttributeValue)result->options->strength;
    result->variableTopValue = result->options->variableTopValue;
    result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
    result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
    result->numericCollation = (UColAttributeValue)result->options->numericCollation;

    /* all attributes were just read from the image, so all of them are at their default */
    result->caseFirstisDefault = TRUE;
    result->caseLevelisDefault = TRUE;
    result->frenchCollationisDefault = TRUE;
    result->normalizationModeisDefault = TRUE;
    result->strengthisDefault = TRUE;
    result->variableTopValueisDefault = TRUE;
    result->alternateHandlingisDefault = TRUE;
    result->hiraganaQisDefault = TRUE;
    result->numericCollationisDefault = TRUE;

    /*result->scriptOrder = NULL;*/

    result->rules = NULL;
    result->rulesLength = 0;
    result->freeRulesOnClose = FALSE;

    /* get the version info from UCATableHeader and populate the Collator struct*/
    result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
    result->dataVersion[2] = 0;
    result->dataVersion[3] = 0;

    result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
    // minUnsafeCP is zeroed before the scan and set to the scan result afterwards;
    // NOTE(review): presumably ucol_unsafeCP() consults minUnsafeCP, the way
    // ucol_contractionEndCP() consults minContrEndCP - confirm in ucol_imp.h
    result->minUnsafeCP = 0;
    for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
        if (ucol_unsafeCP(c, result)) break;
    }
    result->minUnsafeCP = c;

    result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
    // ucol_contractionEndCP() returns FALSE for anything below minContrEndCP,
    // so it has to be zero while the table is being scanned
    result->minContrEndCP = 0;
    for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
        if (ucol_contractionEndCP(c, result)) break;
    }
    result->minContrEndCP = c;

    /* max expansion tables */
    result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
                                         result->image->endExpansionCE);
    result->lastEndExpansionCE = result->endExpansionCE +
                                 result->image->endExpansionCECount - 1;
    result->expansionCESize = (uint8_t*)result->image +
                                               result->image->expansionCESize;


    //result->errorCode = *status;

    // the Latin-1 CE table starts out empty (NULL)
    result->latinOneCEs = NULL;

    result->latinOneRegenTable = FALSE;
    result->latinOneFailed = FALSE;
    result->UCA = UCA;

    ucol_updateInternalState(result, status);

    /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    result->ucaRules = NULL;
    result->actualLocale = NULL;
    result->validLocale = NULL;
    result->requestedLocale = NULL;
    result->hasRealData = FALSE; // real data lives in .dat file...
    result->freeImageOnClose = FALSE;

    return result;
}
853
854/* new Mark's code */
855
856/**
857 * For generation of Implicit CEs
858 * @author Davis
859 *
860 * Cleaned up so that changes can be made more easily.
861 * Old values:
862# First Implicit: E26A792D
863# Last Implicit: E3DC70C0
864# First CJK: E0030300
865# Last CJK: E0A9DD00
866# First CJK_A: E0A9DF00
867# Last CJK_A: E0DE3100
868 */
/* The following is a port of Mark's code for the new treatment of implicits.
 * It is positioned here because ucol_initUCA needs to initialize the
 * variables below according to the data in the fractional UCA.
 */
873
874/**
875 * Function used to:
876 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
877 * b) bump any non-CJK characters by 10FFFF.
878 * The relevant blocks are:
879 * A:    4E00..9FFF; CJK Unified Ideographs
880 *       F900..FAFF; CJK Compatibility Ideographs
881 * B:    3400..4DBF; CJK Unified Ideographs Extension A
882 *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
883 * As long as
884 *   no new B characters are allocated between 4E00 and FAFF, and
885 *   no new A characters are outside of this range,
886 * (very high probability) this simple code will work.
887 * The reordered blocks are:
888 * Block1 is CJK
889 * Block2 is CJK_COMPAT_USED
890 * Block3 is CJK_A
891 * (all contiguous)
892 * Any other CJK gets its normal code point
893 * Any non-CJK gets +10FFFF
894 * When we reorder Block1, we make sure that it is at the very start,
895 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
898 */
899
900// CONSTANTS
901static const UChar32
902    NON_CJK_OFFSET = 0x110000,
903    UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
904
905/**
906 * Precomputed by initImplicitConstants()
907 */
// All of the values below stay 0 until initImplicitConstants() has succeeded.

// Stride between consecutive final-byte values in the 3-byte form (gap3 + 1).
static int32_t final3Multiplier = 0;
// Stride between consecutive final-byte values in the 4-byte form (gap4 + 1).
static int32_t final4Multiplier = 0;
// Number of distinct final-byte values used by the 3-byte form.
static int32_t final3Count = 0;
// Number of distinct final-byte values used by the 4-byte form.
static int32_t final4Count = 0;
// Number of values a middle (non-final trail) byte can take: maxTrail - minTrail + 1.
static int32_t medialCount = 0;
// Smallest lead byte of a 3-byte implicit weight.
static int32_t min3Primary = 0;
// Smallest lead byte of a 4-byte implicit weight.
static int32_t min4Primary = 0;
// Largest lead byte of a 4-byte implicit weight.
static int32_t max4Primary = 0;
// Smallest value allowed in any trail byte.
static int32_t minTrail = 0;
// Largest value allowed in any trail byte.
static int32_t maxTrail = 0;
// Largest final byte actually produced by the 3-byte form.
static int32_t max3Trail = 0;
// Largest final byte actually produced by the 4-byte form.
static int32_t max4Trail = 0;
// First raw value that is encoded with the 4-byte form.
static int32_t min4Boundary = 0;
922
923static const UChar32
924    CJK_BASE = 0x4E00,
925    CJK_LIMIT = 0x9FFF+1,
926    CJK_COMPAT_USED_BASE = 0xFA0E,
927    CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
928    CJK_A_BASE = 0x3400,
929    CJK_A_LIMIT = 0x4DBF+1,
930    CJK_B_BASE = 0x20000,
931    CJK_B_LIMIT = 0x2A6DF+1;
932
933static UChar32 swapCJK(UChar32 i) {
934
935    if (i >= CJK_BASE) {
936        if (i < CJK_LIMIT)              return i - CJK_BASE;
937
938        if (i < CJK_COMPAT_USED_BASE)   return i + NON_CJK_OFFSET;
939
940        if (i < CJK_COMPAT_USED_LIMIT)  return i - CJK_COMPAT_USED_BASE
941                                                + (CJK_LIMIT - CJK_BASE);
942        if (i < CJK_B_BASE)             return i + NON_CJK_OFFSET;
943
944        if (i < CJK_B_LIMIT)            return i; // non-BMP-CJK
945
946        return i + NON_CJK_OFFSET;  // non-CJK
947    }
948    if (i < CJK_A_BASE)                 return i + NON_CJK_OFFSET;
949
950    if (i < CJK_A_LIMIT)                return i - CJK_A_BASE
951                                                + (CJK_LIMIT - CJK_BASE)
952                                                + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
953    return i + NON_CJK_OFFSET; // non-CJK
954}
955
956U_CAPI UChar32 U_EXPORT2
957uprv_uca_getRawFromCodePoint(UChar32 i) {
958    return swapCJK(i)+1;
959}
960
961U_CAPI UChar32 U_EXPORT2
962uprv_uca_getCodePointFromRaw(UChar32 i) {
963    i--;
964    UChar32 result = 0;
965    if(i >= NON_CJK_OFFSET) {
966        result = i - NON_CJK_OFFSET;
967    } else if(i >= CJK_B_BASE) {
968        result = i;
969    } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
970        if(i < CJK_LIMIT - CJK_BASE) {
971            result = i + CJK_BASE;
972        } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
973            result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
974        } else {
975            result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
976        }
977    } else {
978        result = -1;
979    }
980    return result;
981}
982
983// GET IMPLICIT PRIMARY WEIGHTS
984// Return value is left justified primary key
985U_CAPI uint32_t U_EXPORT2
986uprv_uca_getImplicitFromRaw(UChar32 cp) {
987    /*
988    if (cp < 0 || cp > UCOL_MAX_INPUT) {
989        throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
990    }
991    */
992    int32_t last0 = cp - min4Boundary;
993    if (last0 < 0) {
994        int32_t last1 = cp / final3Count;
995        last0 = cp % final3Count;
996
997        int32_t last2 = last1 / medialCount;
998        last1 %= medialCount;
999
1000        last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1001        last1 = minTrail + last1; // offset
1002        last2 = min3Primary + last2; // offset
1003        /*
1004        if (last2 >= min4Primary) {
1005            throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1006        }
1007        */
1008        return (last2 << 24) + (last1 << 16) + (last0 << 8);
1009    } else {
1010        int32_t last1 = last0 / final4Count;
1011        last0 %= final4Count;
1012
1013        int32_t last2 = last1 / medialCount;
1014        last1 %= medialCount;
1015
1016        int32_t last3 = last2 / medialCount;
1017        last2 %= medialCount;
1018
1019        last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1020        last1 = minTrail + last1; // offset
1021        last2 = minTrail + last2; // offset
1022        last3 = min4Primary + last3; // offset
1023        /*
1024        if (last3 > max4Primary) {
1025            throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1026        }
1027        */
1028        return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1029    }
1030}
1031
1032static uint32_t U_EXPORT2
1033uprv_uca_getImplicitPrimary(UChar32 cp) {
1034    //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1035
1036    cp = swapCJK(cp);
1037    cp++;
1038    // we now have a range of numbers from 0 to 21FFFF.
1039
1040    //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1041
1042    return uprv_uca_getImplicitFromRaw(cp);
1043}
1044
1045/**
1046 * Converts implicit CE into raw integer ("code point")
1047 * @param implicit
1048 * @return -1 if illegal format
1049 */
1050U_CAPI UChar32 U_EXPORT2
1051uprv_uca_getRawFromImplicit(uint32_t implicit) {
1052    UChar32 result;
1053    UChar32 b3 = implicit & 0xFF;
1054    UChar32 b2 = (implicit >> 8) & 0xFF;
1055    UChar32 b1 = (implicit >> 16) & 0xFF;
1056    UChar32 b0 = (implicit >> 24) & 0xFF;
1057
1058    // simple parameter checks
1059    if (b0 < min3Primary || b0 > max4Primary
1060        || b1 < minTrail || b1 > maxTrail)
1061        return -1;
1062    // normal offsets
1063    b1 -= minTrail;
1064
1065    // take care of the final values, and compose
1066    if (b0 < min4Primary) {
1067        if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1068            return -1;
1069        b2 -= minTrail;
1070        UChar32 remainder = b2 % final3Multiplier;
1071        if (remainder != 0)
1072            return -1;
1073        b0 -= min3Primary;
1074        b2 /= final3Multiplier;
1075        result = ((b0 * medialCount) + b1) * final3Count + b2;
1076    } else {
1077        if (b2 < minTrail || b2 > maxTrail
1078            || b3 < minTrail || b3 > max4Trail)
1079            return -1;
1080        b2 -= minTrail;
1081        b3 -= minTrail;
1082        UChar32 remainder = b3 % final4Multiplier;
1083        if (remainder != 0)
1084            return -1;
1085        b3 /= final4Multiplier;
1086        b0 -= min4Primary;
1087        result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1088    }
1089    // final check
1090    if (result < 0 || result > UCOL_MAX_INPUT)
1091        return -1;
1092    return result;
1093}
1094
1095
/*
 * Returns a / b rounded up, for positive a and b.
 * Computed as 1 + (a - 1) / b; because integer division truncates, a == 0
 * yields 1 rather than 0.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeBelow = (a - 1) / b;
    return wholeBelow + 1;
}
1099
1100/* this function is either called from initUCA or from genUCA before
1101 * doing canonical closure for the UCA.
1102 */
1103
1104/**
1105 * Set up to generate implicits.
1106 * Maintenance Note:  this function may end up being called more than once, due
1107 *                    to threading races during initialization.  Make sure that
1108 *                    none of the Constants is ever transiently assigned an
1109 *                    incorrect value.
1110 * @param minPrimary
1111 * @param maxPrimary
1112 * @param minTrail final byte
1113 * @param maxTrail final byte
1114 * @param gap3 the gap we leave for tailoring for 3-byte forms
1115 * @param gap4 the gap we leave for tailoring for 4-byte forms
1116 */
1117static void initImplicitConstants(int minPrimary, int maxPrimary,
1118                                    int minTrailIn, int maxTrailIn,
1119                                    int gap3, int primaries3count,
1120                                    UErrorCode *status) {
1121    // some simple parameter checks
1122    if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1123        || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1124        || (primaries3count < 1))
1125    {
1126        *status = U_ILLEGAL_ARGUMENT_ERROR;
1127        return;
1128    };
1129
1130    minTrail = minTrailIn;
1131    maxTrail = maxTrailIn;
1132
1133    min3Primary = minPrimary;
1134    max4Primary = maxPrimary;
1135    // compute constants for use later.
1136    // number of values we can use in trailing bytes
1137    // leave room for empty values between AND above, e.g. if gap = 2
1138    // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1139    // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1140    // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1141    final3Multiplier = gap3 + 1;
1142    final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1143    max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1144
1145    // medials can use full range
1146    medialCount = (maxTrail - minTrail + 1);
1147    // find out how many values fit in each form
1148    int32_t threeByteCount = medialCount * final3Count;
1149    // now determine where the 3/4 boundary is.
1150    // we use 3 bytes below the boundary, and 4 above
1151    int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1152    int32_t primaries4count = primariesAvailable - primaries3count;
1153
1154
1155    int32_t min3ByteCoverage = primaries3count * threeByteCount;
1156    min4Primary = minPrimary + primaries3count;
1157    min4Boundary = min3ByteCoverage;
1158    // Now expand out the multiplier for the 4 bytes, and redo.
1159
1160    int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1161    int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1162    int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1163    int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1164    if (gap4 < 1) {
1165        *status = U_ILLEGAL_ARGUMENT_ERROR;
1166        return;
1167    }
1168    final4Multiplier = gap4 + 1;
1169    final4Count = neededPerFinalByte;
1170    max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1171}
1172
1173    /**
1174     * Supply parameters for generating implicit CEs
1175     */
1176U_CAPI void U_EXPORT2
1177uprv_uca_initImplicitConstants(UErrorCode *status) {
1178    // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1179    //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1180    initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1181}
1182
1183
1184/*    collIterNormalize     Incremental Normalization happens here.                       */
1185/*                          pick up the range of chars identifed by FCD,                  */
1186/*                          normalize it into the collIterate's writable buffer,          */
1187/*                          switch the collIterate's state to use the writable buffer.    */
1188/*                                                                                        */
1189static
1190void collIterNormalize(collIterate *collationSource)
1191{
1192    UErrorCode  status = U_ZERO_ERROR;
1193
1194    int32_t    normLen;
1195    UChar      *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1196    UChar      *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1197
1198    normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1199                              srcP, (int32_t)(endP - srcP),
1200                              FALSE, 0,
1201                              &status);
1202    if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1203        // reallocate and terminate
1204        if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1205                                   &collationSource->writableBuffer,
1206                                   (int32_t *)&collationSource->writableBufSize, normLen + 1,
1207                                   0)
1208        ) {
1209#ifdef UCOL_DEBUG
1210            fprintf(stderr, "collIterNormalize(), out of memory\n");
1211#endif
1212            return;
1213        }
1214        status = U_ZERO_ERROR;
1215        normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1216                                  srcP, (int32_t)(endP - srcP),
1217                                  FALSE, 0,
1218                                  &status);
1219    }
1220    if (U_FAILURE(status)) {
1221#ifdef UCOL_DEBUG
1222        fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1223#endif
1224        return;
1225    }
1226
1227    if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1228        collationSource->flags |= UCOL_ITER_ALLOCATED;
1229    }
1230    collationSource->pos        = collationSource->writableBuffer;
1231    collationSource->origFlags  = collationSource->flags;
1232    collationSource->flags     |= UCOL_ITER_INNORMBUF;
1233    collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1234}
1235
1236
1237// This function takes the iterator and extracts normalized stuff up to the next boundary
1238// It is similar in the end results to the collIterNormalize, but for the cases when we
1239// use an iterator
1240/*static
1241inline void normalizeIterator(collIterate *collationSource) {
1242  UErrorCode status = U_ZERO_ERROR;
1243  UBool wasNormalized = FALSE;
1244  //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1245  uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1246  int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1247    (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1248  if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1249    // reallocate and terminate
1250    if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1251                               &collationSource->writableBuffer,
1252                               (int32_t *)&collationSource->writableBufSize, normLen + 1,
1253                               0)
1254    ) {
1255    #ifdef UCOL_DEBUG
1256        fprintf(stderr, "normalizeIterator(), out of memory\n");
1257    #endif
1258        return;
1259    }
1260    status = U_ZERO_ERROR;
1261    //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1262    collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1263    normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1264    (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1265  }
1266  // Terminate the buffer - we already checked that it is big enough
1267  collationSource->writableBuffer[normLen] = 0;
1268  if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1269      collationSource->flags |= UCOL_ITER_ALLOCATED;
1270  }
1271  collationSource->pos        = collationSource->writableBuffer;
1272  collationSource->origFlags  = collationSource->flags;
1273  collationSource->flags     |= UCOL_ITER_INNORMBUF;
1274  collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1275}*/
1276
1277
1278/* Incremental FCD check and normalize                                                    */
1279/*   Called from getNextCE when normalization state is suspect.                           */
1280/*   When entering, the state is known to be this:                                        */
1281/*      o   We are working in the main buffer of the collIterate, not the side            */
1282/*          writable buffer.  When in the side buffer, normalization mode is always off,  */
1283/*          so we won't get here.                                                         */
1284/*      o   The leading combining class from the current character is 0 or                */
1285/*          the trailing combining class of the previous char was zero.                   */
1286/*          True because the previous call to this function will have always exited       */
1287/*          that way, and we get called for every char where cc might be non-zero.        */
1288static
1289inline UBool collIterFCD(collIterate *collationSource) {
1290    const UChar *srcP, *endP;
1291    uint8_t     leadingCC;
1292    uint8_t     prevTrailingCC = 0;
1293    uint16_t    fcd;
1294    UBool       needNormalize = FALSE;
1295
1296    srcP = collationSource->pos-1;
1297
1298    if (collationSource->flags & UCOL_ITER_HASLEN) {
1299        endP = collationSource->endp;
1300    } else {
1301        endP = NULL;
1302    }
1303
1304    // Get the trailing combining class of the current character.  If it's zero,
1305    //   we are OK.
1306    /* trie access */
1307    fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1308    if (fcd != 0) {
1309        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1310
1311        if (prevTrailingCC != 0) {
1312            // The current char has a non-zero trailing CC.  Scan forward until we find
1313            //   a char with a leading cc of zero.
1314            while (endP == NULL || srcP != endP)
1315            {
1316                const UChar *savedSrcP = srcP;
1317
1318                /* trie access */
1319                fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1320                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1321                if (leadingCC == 0) {
1322                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
1323                                           //   back up over it.  (Could be surrogate pair!)
1324                    break;
1325                }
1326
1327                if (leadingCC < prevTrailingCC) {
1328                    needNormalize = TRUE;
1329                }
1330
1331                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1332            }
1333        }
1334    }
1335
1336    collationSource->fcdPosition = (UChar *)srcP;
1337
1338    return needNormalize;
1339}
1340
1341/****************************************************************************/
1342/* Following are the CE retrieval functions                                 */
1343/*                                                                          */
1344/****************************************************************************/
1345
1346static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1347static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1348
1349/* there should be a macro version of this function in the header file */
1350/* This is the first function that tries to fetch a collation element  */
1351/* If it's not succesfull or it encounters a more difficult situation  */
1352/* some more sofisticated and slower functions are invoked             */
1353static
1354inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1355    uint32_t order = 0;
1356    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
1357        order = *(collationSource->toReturn++);                         /* if so, return them */
1358        if(collationSource->CEpos == collationSource->toReturn) {
1359            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1360        }
1361        return order;
1362    }
1363
1364    UChar ch = 0;
1365    collationSource->offsetReturn = NULL;
1366
1367    for (;;)                           /* Loop handles case when incremental normalize switches   */
1368    {                                  /*   to or from the side buffer / original string, and we  */
1369        /*   need to start again to get the next character.        */
1370
1371        if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1372        {
1373            // The source string is null terminated and we're not working from the side buffer,
1374            //   and we're not normalizing.  This is the fast path.
1375            //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1376            ch = *collationSource->pos++;
1377            if (ch != 0) {
1378                break;
1379            }
1380            else {
1381                return UCOL_NO_MORE_CES;
1382            }
1383        }
1384
1385        if (collationSource->flags & UCOL_ITER_HASLEN) {
1386            // Normal path for strings when length is specified.
1387            //   (We can't be in side buffer because it is always null terminated.)
1388            if (collationSource->pos >= collationSource->endp) {
1389                // Ran off of the end of the main source string.  We're done.
1390                return UCOL_NO_MORE_CES;
1391            }
1392            ch = *collationSource->pos++;
1393        }
1394        else if(collationSource->flags & UCOL_USE_ITERATOR) {
1395            UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1396            if(iterCh == U_SENTINEL) {
1397                return UCOL_NO_MORE_CES;
1398            }
1399            ch = (UChar)iterCh;
1400        }
1401        else
1402        {
1403            // Null terminated string.
1404            ch = *collationSource->pos++;
1405            if (ch == 0) {
1406                // Ran off end of buffer.
1407                if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1408                    // Ran off end of main string. backing up one character.
1409                    collationSource->pos--;
1410                    return UCOL_NO_MORE_CES;
1411                }
1412                else
1413                {
1414                    // Hit null in the normalize side buffer.
1415                    // Usually this means the end of the normalized data,
1416                    // except for one odd case: a null followed by combining chars,
1417                    //   which is the case if we are at the start of the buffer.
1418                    if (collationSource->pos == collationSource->writableBuffer+1) {
1419                        break;
1420                    }
1421
1422                    //  Null marked end of side buffer.
1423                    //   Revert to the main string and
1424                    //   loop back to top to try again to get a character.
1425                    collationSource->pos   = collationSource->fcdPosition;
1426                    collationSource->flags = collationSource->origFlags;
1427                    continue;
1428                }
1429            }
1430        }
1431
1432        if(collationSource->flags&UCOL_HIRAGANA_Q) {
1433            /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1434             * based on whether the previous codepoint was Hiragana or Katakana.
1435             */
1436            if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1437                    ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1438                collationSource->flags |= UCOL_WAS_HIRAGANA;
1439            } else {
1440                collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1441            }
1442        }
1443
1444        // We've got a character.  See if there's any fcd and/or normalization stuff to do.
1445        //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1446        if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1447            break;
1448        }
1449
1450        if (collationSource->fcdPosition >= collationSource->pos) {
1451            // An earlier FCD check has already covered the current character.
1452            // We can go ahead and process this char.
1453            break;
1454        }
1455
1456        if (ch < ZERO_CC_LIMIT_ ) {
1457            // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
1458            break;
1459        }
1460
1461        if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1462            // We need to peek at the next character in order to tell if we are FCD
1463            if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1464                // We are at the last char of source string.
1465                //  It is always OK for FCD check.
1466                break;
1467            }
1468
1469            // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
1470            if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1471                break;
1472            }
1473        }
1474
1475
1476        // Need a more complete FCD check and possible normalization.
1477        if (collIterFCD(collationSource)) {
1478            collIterNormalize(collationSource);
1479        }
1480        if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1481            //  No normalization was needed.  Go ahead and process the char we already had.
1482            break;
1483        }
1484
1485        // Some normalization happened.  Next loop iteration will pick up a char
1486        //   from the normalization buffer.
1487
1488    }   // end for (;;)
1489
1490
1491    if (ch <= 0xFF) {
1492        /*  For latin-1 characters we never need to fall back to the UCA table        */
1493        /*    because all of the UCA data is replicated in the latinOneMapping array  */
1494        order = coll->latinOneMapping[ch];
1495        if (order > UCOL_NOT_FOUND) {
1496            order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1497        }
1498    }
1499    else
1500    {
1501        // Always use UCA for Han, Hangul
1502        // (Han extension A is before main Han block)
1503        // **** Han compatibility chars ?? ****
1504        if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1505            (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1506            if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1507                // between the two target ranges; do normal lookup
1508                // **** this range is YI, Modifier tone letters, ****
1509                // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
1510                // **** Latin-D might be tailored, so we need to ****
1511                // **** do the normal lookup for these guys.     ****
1512                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1513            } else {
1514                // in one of the target ranges; use UCA
1515                order = UCOL_NOT_FOUND;
1516            }
1517        } else {
1518            order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1519        }
1520
1521        if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
1522            order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
1523        }
1524
1525        if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
1526            /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1527            order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1528
1529            if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1530                order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1531            }
1532        }
1533    }
1534    if(order == UCOL_NOT_FOUND) {
1535        order = getImplicit(ch, collationSource);
1536    }
1537    return order; /* return the CE */
1538}
1539
1540/* ucol_getNextCE, out-of-line version for use from other files.   */
1541U_CAPI uint32_t  U_EXPORT2
1542ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1543    return ucol_IGetNextCE(coll, collationSource, status);
1544}
1545
1546
1547/**
1548* Incremental previous normalization happens here. Pick up the range of chars
* identified by FCD, normalize it into the collIterate's writable buffer,
1550* switch the collIterate's state to use the writable buffer.
1551* @param data collation iterator data
1552*/
1553static
1554void collPrevIterNormalize(collIterate *data)
1555{
1556    UErrorCode status  = U_ZERO_ERROR;
1557    UChar      *pEnd   = data->pos;         /* End normalize + 1 */
1558    UChar      *pStart;
1559    uint32_t    normLen;
1560    UChar      *pStartNorm;
1561
1562    /* Start normalize */
1563    if (data->fcdPosition == NULL) {
1564        pStart = data->string;
1565    }
1566    else {
1567        pStart = data->fcdPosition + 1;
1568    }
1569
1570    normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1571                              data->writableBuffer, 0, &status);
1572
1573    if (data->writableBufSize <= normLen) {
1574            freeHeapWritableBuffer(data);
1575            data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1576                                                        sizeof(UChar));
1577            if(data->writableBuffer == NULL) { // something is wrong here, return
1578                data->writableBufSize = 0;     // Reset writableBufSize
1579                return;
1580            }
1581            data->flags |= UCOL_ITER_ALLOCATED;
1582            /* to handle the zero termination */
1583            data->writableBufSize = normLen + 1;
1584    }
1585            status = U_ZERO_ERROR;
1586    /*
1587    this puts the null termination infront of the normalized string instead
1588    of the end
1589    */
1590    pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1591    *(pStartNorm - 1) = 0;
1592    unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1593                    normLen, &status);
1594
1595    if (data->offsetBuffer == NULL) {
1596        int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE;
1597
1598        data->offsetBufferSize = len;
1599        data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len);
1600        data->offsetStore = data->offsetBuffer;
1601    } else if(data->offsetBufferSize < (int32_t) normLen) {
1602        int32_t storeIX = data->offsetStore - data->offsetBuffer;
1603        int32_t *tob    = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1));
1604
1605        if (tob != NULL) {
1606            data->offsetBuffer = tob;
1607            data->offsetStore = &data->offsetBuffer[storeIX];
1608            data->offsetBufferSize = normLen + 1;
1609        }
1610    }
1611
1612    /*
1613     * The usual case at this point is that we've got a base
1614     * character followed by marks that were normalized. If
1615     * fcdPosition is NULL, that means that we backed up to
1616     * the beginning of the string and there's no base character.
1617     *
1618     * Forward processing will usually normalize when it sees
1619     * the first mark, so that mark will get it's natural offset
1620     * and the rest will get the offset of the character following
1621     * the marks. The base character will also get its natural offset.
1622     *
1623     * We write the offset of the base character, if there is one,
1624     * followed by the offset of the first mark and then the offsets
1625     * of the rest of the marks.
1626     */
1627    int32_t firstMarkOffset = 0;
1628    int32_t trailOffset     = data->pos - data->string + 1;
1629    int32_t trailCount      = normLen - 1;
1630
1631    if (data->fcdPosition != NULL) {
1632        int32_t baseOffset = data->fcdPosition - data->string;
1633        UChar   baseChar   = *data->fcdPosition;
1634
1635        firstMarkOffset = baseOffset + 1;
1636
1637        /*
1638	     * If the base character is the start of a contraction, forward processing
1639	     * will normalize the marks while checking for the contraction, which means
1640	     * that the offset of the first mark will the same as the other marks.
1641	     *
1642	     * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1643	     */
1644	    if (baseChar >= 0x100) {
1645		    uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1646
1647		    if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1648			    baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1649		    }
1650
1651		    if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1652			    firstMarkOffset = trailOffset;
1653		    }
1654	    }
1655
1656        *(data->offsetStore++) = baseOffset;
1657    }
1658
1659    *(data->offsetStore++) = firstMarkOffset;
1660
1661    for (int32_t i = 0; i < trailCount; i += 1) {
1662        *(data->offsetStore++) = trailOffset;
1663    }
1664
1665    data->offsetRepeatValue = trailOffset;
1666
1667    data->offsetReturn = data->offsetStore - 1;
1668    if (data->offsetReturn == data->offsetBuffer) {
1669        data->offsetStore = data->offsetBuffer;
1670    }
1671
1672    data->pos        = data->writableBuffer + data->writableBufSize;
1673    data->origFlags  = data->flags;
1674    data->flags     |= UCOL_ITER_INNORMBUF;
1675    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1676}
1677
1678
1679/**
1680* Incremental FCD check for previous iteration and normalize. Called from
1681* getPrevCE when normalization state is suspect.
1682* When entering, the state is known to be this:
1683* o  We are working in the main buffer of the collIterate, not the side
1684*    writable buffer. When in the side buffer, normalization mode is always
1685*    off, so we won't get here.
1686* o  The leading combining class from the current character is 0 or the
1687*    trailing combining class of the previous char was zero.
1688*    True because the previous call to this function will have always exited
1689*    that way, and we get called for every char where cc might be non-zero.
1690* @param data collation iterate struct
1691* @return normalization status, TRUE for normalization to be done, FALSE
1692*         otherwise
1693*/
1694static
1695inline UBool collPrevIterFCD(collIterate *data)
1696{
1697    const UChar *src, *start;
1698    uint8_t     leadingCC;
1699    uint8_t     trailingCC = 0;
1700    uint16_t    fcd;
1701    UBool       result = FALSE;
1702
1703    start = data->string;
1704    src = data->pos + 1;
1705
1706    /* Get the trailing combining class of the current character. */
1707    fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1708
1709    leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1710
1711    if (leadingCC != 0) {
1712        /*
1713        The current char has a non-zero leading combining class.
1714        Scan backward until we find a char with a trailing cc of zero.
1715        */
1716        for (;;)
1717        {
1718            if (start == src) {
1719                data->fcdPosition = NULL;
1720                return result;
1721            }
1722
1723            fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1724
1725            trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1726
1727            if (trailingCC == 0) {
1728                break;
1729            }
1730
1731            if (leadingCC < trailingCC) {
1732                result = TRUE;
1733            }
1734
1735            leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1736        }
1737    }
1738
1739    data->fcdPosition = (UChar *)src;
1740
1741    return result;
1742}
1743
1744/** gets a character from the string at a given offset
1745 *  Handles both normal and iterative cases.
1746 *  No error checking - caller beware!
1747 */
1748inline static
1749UChar peekCharacter(collIterate *source, int32_t offset) {
1750    if(source->pos != NULL) {
1751        return *(source->pos + offset);
1752    } else if(source->iterator != NULL) {
1753        if(offset != 0) {
1754            source->iterator->move(source->iterator, offset, UITER_CURRENT);
1755            UChar toReturn = (UChar)source->iterator->next(source->iterator);
1756            source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1757            return toReturn;
1758        } else {
1759            return (UChar)source->iterator->current(source->iterator);
1760        }
1761    } else {
1762        return (UChar)U_SENTINEL;
1763    }
1764}
1765
1766/**
1767* Determines if we are at the start of the data string in the backwards
1768* collation iterator
1769* @param data collation iterator
1770* @return TRUE if we are at the start
1771*/
1772static
1773inline UBool isAtStartPrevIterate(collIterate *data) {
1774    if(data->pos == NULL && data->iterator != NULL) {
1775        return !data->iterator->hasPrevious(data->iterator);
1776    }
1777    //return (collIter_bos(data)) ||
1778    return (data->pos == data->string) ||
1779              ((data->flags & UCOL_ITER_INNORMBUF) &&
1780              *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1781}
1782
1783static
1784inline void goBackOne(collIterate *data) {
1785# if 0
1786    // somehow, it looks like we need to keep iterator synced up
1787    // at all times, as above.
1788    if(data->pos) {
1789        data->pos--;
1790    }
1791    if(data->iterator) {
1792        data->iterator->previous(data->iterator);
1793    }
1794#endif
1795    if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1796        data->iterator->previous(data->iterator);
1797    }
1798    if(data->pos) {
1799        data->pos --;
1800    }
1801}
1802
/**
* Inline function that gets a simple CE while iterating backwards.
* So what it does is that it will first check the expansion buffer. If the
* expansion buffer is not empty, ie the return pointer is still above the
* start of the CE buffer, we decrement the return pointer and return the
* collation element it then points to.
* Otherwise the previous character is fetched (from the string, the character
* iterator or the normalization side buffer), an incremental FCD
* check/normalization is done if needed, and the character is looked up.
* For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
* @param coll collator data
* @param data collation iterator struct
* @param status error status
* @return the collation element, or UCOL_NO_MORE_CES when the start of the
*         source has been reached
*/
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /*
    Offset bookkeeping: consume one pending repeat if there is one, otherwise
    step the offset read pointer back by one entry. When the read pointer is
    already at the start of the offset buffer, the buffer is reset.
    */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
                data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
				data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /*
    CEs are still pending in the CE buffer (extendCEs when present, otherwise
    CEs): hand out the one just below toReturn.
    */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* Back at the start of the buffer: move CEpos there as well. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /*
        Loop handles case when incremental normalize switches to or from the
        side buffer / original string, and we need to start again to get the
        next character.
        */
        for (;;) {
            if (data->flags & UCOL_ITER_HASLEN) {
                /*
                Normal path for strings when length is specified.
                Not in side buffer because it is always null terminated.
                */
                if (data->pos <= data->string) {
                    /* End of the main source string */
                    return UCOL_NO_MORE_CES;
                }
                data->pos --;
                ch = *data->pos;
            }
            // we are using an iterator to go back. Pray for us!
            else if (data->flags & UCOL_USE_ITERATOR) {
              UChar32 iterCh = data->iterator->previous(data->iterator);
              if(iterCh == U_SENTINEL) {
                return UCOL_NO_MORE_CES;
              } else {
                ch = (UChar)iterCh;
              }
            }
            else {
                data->pos --;
                ch = *data->pos;
                /* we are in the side buffer. */
                if (ch == 0) {
                    /*
                    At the start of the normalize side buffer.
                    Go back to string.
                    Because pointer points to the last accessed character,
                    hence we have to increment it by one here.
                    */
                    data->flags = data->origFlags;
                    data->offsetRepeatValue = 0;

                     if (data->fcdPosition == NULL) {
                        /* The normalized range started at the beginning of the string. */
                        data->pos = data->string;
                        return UCOL_NO_MORE_CES;
                    }
                    else {
                        data->pos   = data->fcdPosition + 1;
                    }

                   continue;
                }
            }

            /* Track whether the character just read is in U+3040..U+309F. */
            if(data->flags&UCOL_HIRAGANA_Q) {
              if(ch>=0x3040 && ch<=0x309f) {
                data->flags |= UCOL_WAS_HIRAGANA;
              } else {
                data->flags &= ~UCOL_WAS_HIRAGANA;
              }
            }

            /*
            * got a character to determine if there's fcd and/or normalization
            * stuff to do.
            * if the current character is not fcd.
            * if current character is at the start of the string
            * Trailing combining class == 0.
            * Note if pos is in the writablebuffer, norm is always 0
            */
            if (ch < ZERO_CC_LIMIT_ ||
              // this should propel us out of the loop in the iterator case
                (data->flags & UCOL_ITER_NORM) == 0 ||
                (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                || data->string == data->pos) {
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                /* if next character is FCD */
                if (data->pos == data->string) {
                    /* First char of string is always OK for FCD check */
                    break;
                }

                /* Not first char of string, do the FCD fast test */
                if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }

            /* Need a more complete FCD check and possible normalization. */
            if (collPrevIterFCD(data)) {
                collPrevIterNormalize(data);
            }

            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /*  No normalization. Go ahead and process the char. */
                break;
            }

            /*
            Some normalization happened.
            Next loop picks up a char from the normalization buffer.
            */
        }

        /* attempt to handle contractions, after removal of the backwards
        contraction
        */
        if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
            result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
        } else {
            if (ch <= 0xFF) {
                /* Direct table lookup for U+0000..U+00FF. */
                result = coll->latinOneMapping[ch];
            }
            else {
                // Always use UCA for [3400..9FFF], [AC00..D7AF]
                // **** [FA0E..FA2F] ?? ****
                if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                    (ch >= 0x3400 && ch <= 0xD7AF)) {
                    if (ch > 0x9FFF && ch < 0xAC00) {
                        // between the two target ranges; do normal lookup
                        // **** this range is YI, Modifier tone letters, ****
                        // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                        // **** Latin-D might be tailored, so we need to ****
                        // **** do the normal lookup for these guys.     ****
                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    } else {
                        /* Skip the tailoring lookup; handled by the UCOL_NOT_FOUND path below. */
                        result = UCOL_NOT_FOUND;
                    }
                } else {
                    result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                }
            }
            /* Values above UCOL_NOT_FOUND are special CEs that need further processing. */
            if (result > UCOL_NOT_FOUND) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
            }
            if (result == UCOL_NOT_FOUND) { // Not found in master list
                if (!isAtStartPrevIterate(data) &&
                    ucol_contractionEndCP(ch, data->coll))
                {
                    result = UCOL_CONTRACTION;
                } else {
                    /* Fall back to the UCA mapping when a UCA collator is attached. */
                    if(coll->UCA) {
                        result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                    }
                }

                if (result > UCOL_NOT_FOUND) {
                    if(coll->UCA) {
                        result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                    }
                }
            }
        }

        /* Still nothing found: generate an implicit CE for the character. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}
2008
2009
2010/*   ucol_getPrevCE, out-of-line version for use from other files.  */
2011U_CFUNC uint32_t  U_EXPORT2
2012ucol_getPrevCE(const UCollator *coll, collIterate *data,
2013                        UErrorCode *status) {
2014    return ucol_IGetPrevCE(coll, data, status);
2015}
2016
2017
2018/* this should be connected to special Jamo handling */
2019U_CFUNC uint32_t  U_EXPORT2
2020ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2021    collIterate colIt;
2022    uint32_t order;
2023    IInit_collIterate(coll, &u, 1, &colIt);
2024    order = ucol_IGetNextCE(coll, &colIt, status);
2025    /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
2026    return order;
2027}
2028
2029/**
2030* Inserts the argument character into the end of the buffer pushing back the
2031* null terminator.
2032* @param data collIterate struct data
2033* @param pNull pointer to the null termination
2034* @param ch character to be appended
2035* @return the position of the new addition
2036*/
2037static
2038inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
2039{
2040    uint32_t  size    = data->writableBufSize;
2041    UChar    *newbuffer;
2042    static const uint32_t  INCSIZE = 5;
2043
2044    if ((data->writableBuffer + size) > (pNull + 1)) {
2045        *pNull = ch;
2046        *(pNull + 1) = 0;
2047        return pNull;
2048    }
2049
2050    /*
2051    buffer will always be null terminated at the end.
2052    giving extra space since it is likely that more characters will be added.
2053    */
2054    size += INCSIZE;
2055    newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2056    if(newbuffer != NULL) { // something wrong, but no status
2057        uprv_memcpy(newbuffer, data->writableBuffer,
2058            data->writableBufSize * sizeof(UChar));
2059
2060        freeHeapWritableBuffer(data);
2061        data->writableBufSize = size;
2062        data->writableBuffer  = newbuffer;
2063
2064        newbuffer        = newbuffer + data->writableBufSize;
2065        *newbuffer       = ch;
2066        *(newbuffer + 1) = 0;
2067    }
2068    return newbuffer;
2069}
2070
2071/**
2072* Inserts the argument string into the end of the buffer pushing back the
2073* null terminator.
2074* @param data collIterate struct data
2075* @param pNull pointer to the null termination
2076* @param string to be appended
2077* @param length of the string to be appended
2078* @return the position of the new addition
2079*/
2080static
2081inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
2082                               int32_t length)
2083{
2084    uint32_t  size = pNull - data->writableBuffer;
2085    UChar    *newbuffer;
2086
2087    if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
2088        uprv_memcpy(pNull, str, length * sizeof(UChar));
2089        *(pNull + length) = 0;
2090        return pNull;
2091    }
2092
2093    /*
2094    buffer will always be null terminated at the end.
2095    giving extra space since it is likely that more characters will be added.
2096    */
2097    newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
2098    if(newbuffer != NULL) {
2099      uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
2100      uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
2101
2102      freeHeapWritableBuffer(data);
2103      data->writableBufSize = size + length + 1;
2104      data->writableBuffer  = newbuffer;
2105    }
2106
2107    return newbuffer;
2108}
2109
2110/**
2111* Special normalization function for contraction in the forwards iterator.
2112* This normalization sequence will place the current character at source->pos
2113* and its following normalized sequence into the buffer.
2114* The fcd position, pos will be changed.
2115* pos will now point to positions in the buffer.
2116* Flags will be changed accordingly.
2117* @param data collation iterator data
2118*/
2119static
2120inline void normalizeNextContraction(collIterate *data)
2121{
2122    UChar      *buffer     = data->writableBuffer;
2123    uint32_t    buffersize = data->writableBufSize;
2124    uint32_t    strsize;
2125    UErrorCode  status     = U_ZERO_ERROR;
2126    /* because the pointer points to the next character */
2127    UChar      *pStart     = data->pos - 1;
2128    UChar      *pEnd;
2129    uint32_t    normLen;
2130    UChar      *pStartNorm;
2131
2132    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2133        *data->writableBuffer = *(pStart - 1);
2134        strsize               = 1;
2135    }
2136    else {
2137        strsize = u_strlen(data->writableBuffer);
2138    }
2139
2140    pEnd = data->fcdPosition;
2141
2142    normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2143                              &status);
2144
2145    if (buffersize <= normLen + strsize) {
2146        uint32_t  size = strsize + normLen + 1;
2147        UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2148        if(temp != NULL) {
2149          uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
2150          freeHeapWritableBuffer(data);
2151          data->writableBuffer = temp;
2152          data->writableBufSize = size;
2153          data->flags |= UCOL_ITER_ALLOCATED;
2154        } else {
2155            return; // Avoid writing past bound of buffer->writableBuffer.
2156        }
2157    }
2158
2159    status            = U_ZERO_ERROR;
2160    pStartNorm        = buffer + strsize;
2161    /* null-termination will be added here */
2162    unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
2163                    normLen + 1, &status);
2164
2165    data->pos        = data->writableBuffer + strsize;
2166    data->origFlags  = data->flags;
2167    data->flags     |= UCOL_ITER_INNORMBUF;
2168    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2169}
2170
2171/**
2172* Contraction character management function that returns the next character
2173* for the forwards iterator.
2174* Does nothing if the next character is in buffer and not the first character
2175* in it.
2176* Else it checks next character in data string to see if it is normalizable.
2177* If it is not, the character is simply copied into the buffer, else
2178* the whole normalized substring is copied into the buffer, including the
2179* current character.
2180* @param data collation element iterator data
2181* @return next character
2182*/
2183static
2184inline UChar getNextNormalizedChar(collIterate *data)
2185{
2186    UChar  nextch;
2187    UChar  ch;
2188    // Here we need to add the iterator code. One problem is the way
2189    // end of string is handled. If we just return next char, it could
2190    // be the sentinel. Most of the cases already check for this, but we
2191    // need to be sure.
2192    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2193         /* if no normalization and not in buffer. */
2194      if(data->flags & UCOL_USE_ITERATOR) {
2195         return (UChar)data->iterator->next(data->iterator);
2196      } else {
2197         return *(data->pos ++);
2198      }
2199    }
2200
2201    //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2202      //normalizeIterator(data);
2203    //}
2204
2205    UChar  *pEndWritableBuffer = NULL;
2206    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2207    if ((innormbuf && *data->pos != 0) ||
2208        (data->fcdPosition != NULL && !innormbuf &&
2209        data->pos < data->fcdPosition)) {
2210        /*
2211        if next character is in normalized buffer, no further normalization
2212        is required
2213        */
2214        return *(data->pos ++);
2215    }
2216
2217    if (data->flags & UCOL_ITER_HASLEN) {
2218        /* in data string */
2219        if (data->pos + 1 == data->endp) {
2220            return *(data->pos ++);
2221        }
2222    }
2223    else {
2224        if (innormbuf) {
2225          // inside the normalization buffer, but at the end
2226          // (since we encountered zero). This means, in the
2227          // case we're using char iterator, that we need to
2228          // do another round of normalization.
2229          //if(data->origFlags & UCOL_USE_ITERATOR) {
2230            // we need to restore original flags,
2231            // otherwise, we'll lose them
2232            //data->flags = data->origFlags;
2233            //normalizeIterator(data);
2234            //return *(data->pos++);
2235          //} else {
2236            /*
2237            in writable buffer, at this point fcdPosition can not be
2238            pointing to the end of the data string. see contracting tag.
2239            */
2240          if(data->fcdPosition) {
2241            if (*(data->fcdPosition + 1) == 0 ||
2242                data->fcdPosition + 1 == data->endp) {
2243                /* at the end of the string, dump it into the normalizer */
2244                data->pos = insertBufferEnd(data, data->pos,
2245                                            *(data->fcdPosition)) + 1;
2246                // Check if data->pos received a null pointer
2247                if (data->pos == NULL) {
2248                    return (UChar)-1; // Return to indicate error.
2249                }
2250                return *(data->fcdPosition ++);
2251            }
2252            pEndWritableBuffer = data->pos;
2253            data->pos = data->fcdPosition;
2254          } else if(data->origFlags & UCOL_USE_ITERATOR) {
2255            // if we are here, we're using a normalizing iterator.
2256            // we should just continue further.
2257            data->flags = data->origFlags;
2258            data->pos = NULL;
2259            return (UChar)data->iterator->next(data->iterator);
2260          }
2261          //}
2262        }
2263        else {
2264            if (*(data->pos + 1) == 0) {
2265                return *(data->pos ++);
2266            }
2267        }
2268    }
2269
2270    ch = *data->pos ++;
2271    nextch = *data->pos;
2272
2273    /*
2274    * if the current character is not fcd.
2275    * Trailing combining class == 0.
2276    */
2277    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2278        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2279         ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2280            /*
2281            Need a more complete FCD check and possible normalization.
2282            normalize substring will be appended to buffer
2283            */
2284        if (collIterFCD(data)) {
2285            normalizeNextContraction(data);
2286            return *(data->pos ++);
2287        }
2288        else if (innormbuf) {
2289            /* fcdposition shifted even when there's no normalization, if we
2290            don't input the rest into this, we'll get the wrong position when
2291            we reach the end of the writableBuffer */
2292            int32_t length = data->fcdPosition - data->pos + 1;
2293            data->pos = insertBufferEnd(data, pEndWritableBuffer,
2294                                        data->pos - 1, length);
2295            // Check if data->pos received a null pointer
2296            if (data->pos == NULL) {
2297                return (UChar)-1; // Return to indicate error.
2298            }
2299            return *(data->pos ++);
2300        }
2301    }
2302
2303    if (innormbuf) {
2304        /*
2305        no normalization is to be done hence only one character will be
2306        appended to the buffer.
2307        */
2308        data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
2309        // Check if data->pos received a null pointer
2310        if (data->pos == NULL) {
2311            return (UChar)-1; // Return to indicate error.
2312        }
2313    }
2314
2315    /* points back to the pos in string */
2316    return ch;
2317}
2318
2319
2320
2321/**
2322* Function to copy the buffer into writableBuffer and sets the fcd position to
2323* the correct position
2324* @param source data string source
2325* @param buffer character buffer
2326* @param tempdb current position in buffer that has been used up
2327*/
2328static
2329inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2330                                     UChar *tempdb)
2331{
2332    /* okay confusing part here. to ensure that the skipped characters are
2333    considered later, we need to place it in the appropriate position in the
2334    normalization buffer and reassign the pos pointer. simple case if pos
2335    reside in string, simply copy to normalization buffer and
2336    fcdposition = pos, pos = start of normalization buffer. if pos in
2337    normalization buffer, we'll insert the copy infront of pos and point pos
2338    to the start of the normalization buffer. why am i doing these copies?
2339    well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2340    not require any changes, which be really painful. */
2341    uint32_t length = u_strlen(buffer);;
2342    if (source->flags & UCOL_ITER_INNORMBUF) {
2343        u_strcpy(tempdb, source->pos);
2344    }
2345    else {
2346        source->fcdPosition  = source->pos;
2347        source->origFlags    = source->flags;
2348        source->flags       |= UCOL_ITER_INNORMBUF;
2349        source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2350    }
2351
2352    if (length >= source->writableBufSize) {
2353        freeHeapWritableBuffer(source);
2354        source->writableBuffer =
2355                     (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2356        if(source->writableBuffer == NULL) {
2357            source->writableBufSize = 0; // Reset size
2358            return;
2359        }
2360        source->writableBufSize = length;
2361    }
2362
2363    u_strcpy(source->writableBuffer, buffer);
2364    source->pos = source->writableBuffer;
2365}
2366
2367/**
2368* Function to get the discontiguos collation element within the source.
2369* Note this function will set the position to the appropriate places.
2370* @param coll current collator used
2371* @param source data string source
2372* @param constart index to the start character in the contraction table
2373* @return discontiguos collation element offset
2374*/
2375static
2376uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2377                                const UChar *constart)
2378{
2379    /* source->pos currently points to the second combining character after
2380       the start character */
2381          UChar   *temppos      = source->pos;
2382          UChar    buffer[4*UCOL_MAX_BUFFER];
2383          UChar   *tempdb       = buffer;
2384    const UChar   *tempconstart = constart;
2385          uint8_t  tempflags    = source->flags;
2386          UBool    multicontraction = FALSE;
2387          UChar   *tempbufferpos = 0;
2388          collIterateState discState;
2389
2390          backupState(source, &discState);
2391
2392    //*tempdb = *(source->pos - 1);
2393    *tempdb = peekCharacter(source, -1);
2394    tempdb++;
2395    for (;;) {
2396        UChar    *UCharOffset;
2397        UChar     schar,
2398                  tchar;
2399        uint32_t  result;
2400
2401        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2402            || (peekCharacter(source, 0) == 0  &&
2403            //|| (*source->pos == 0  &&
2404                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2405                 source->fcdPosition == NULL ||
2406                 source->fcdPosition == source->endp ||
2407                 *(source->fcdPosition) == 0 ||
2408                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2409                 /* end of string in null terminated string or stopped by a
2410                 null character, note fcd does not always point to a base
2411                 character after the discontiguos change */
2412                 u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2413                 //u_getCombiningClass(*(source->pos)) == 0) {
2414            //constart = (UChar *)coll->image + getContractOffset(CE);
2415            if (multicontraction) {
2416                *tempbufferpos = 0;
2417                source->pos    = temppos - 1;
2418                setDiscontiguosAttribute(source, buffer, tempdb);
2419                return *(coll->contractionCEs +
2420                                    (tempconstart - coll->contractionIndex));
2421            }
2422            constart = tempconstart;
2423            break;
2424        }
2425
2426        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2427        schar = getNextNormalizedChar(source);
2428
2429        while (schar > (tchar = *UCharOffset)) {
2430            UCharOffset++;
2431        }
2432
2433        if (schar != tchar) {
2434            /* not the correct codepoint. we stuff the current codepoint into
2435            the discontiguos buffer and try the next character */
2436            *tempdb = schar;
2437            tempdb ++;
2438            continue;
2439        }
2440        else {
2441            if (u_getCombiningClass(schar) ==
2442                u_getCombiningClass(peekCharacter(source, -2))) {
2443                //u_getCombiningClass(*(source->pos - 2))) {
2444                *tempdb = schar;
2445                tempdb ++;
2446                continue;
2447            }
2448            result = *(coll->contractionCEs +
2449                                      (UCharOffset - coll->contractionIndex));
2450        }
2451        *tempdb = 0;
2452
2453        if (result == UCOL_NOT_FOUND) {
2454          break;
2455        } else if (isContraction(result)) {
2456            /* this is a multi-contraction*/
2457            tempconstart = (UChar *)coll->image + getContractOffset(result);
2458            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2459                != UCOL_NOT_FOUND) {
2460                multicontraction = TRUE;
2461                temppos       = source->pos + 1;
2462                tempbufferpos = buffer + u_strlen(buffer);
2463            }
2464        } else {
2465            setDiscontiguosAttribute(source, buffer, tempdb);
2466            return result;
2467        }
2468    }
2469
2470    /* no problems simply reverting just like that,
2471    if we are in string before getting into this function, points back to
2472    string hence no problem.
2473    if we are in normalization buffer before getting into this function,
2474    since we'll never use another normalization within this function, we
2475    know that fcdposition points to a base character. the normalization buffer
2476    never change, hence this revert works. */
2477    loadState(source, &discState, TRUE);
2478    goBackOne(source);
2479
2480    //source->pos   = temppos - 1;
2481    source->flags = tempflags;
2482    return *(coll->contractionCEs + (constart - coll->contractionIndex));
2483}
2484
2485static
2486inline UBool isNonChar(UChar32 cp) {
2487    return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF));
2488}
2489
2490/* now uses Mark's getImplicitPrimary code */
2491static
2492inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2493    if(isNonChar(cp)) {
2494        return 0;
2495    }
2496    uint32_t r = uprv_uca_getImplicitPrimary(cp);
2497    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2498    collationSource->offsetRepeatCount += 1;
2499    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2500}
2501
2502/**
2503* Inserts the argument character into the front of the buffer replacing the
2504* front null terminator.
2505* @param data collation element iterator data
2506* @param pNull pointer to the null terminator
2507* @param ch character to be appended
2508* @return positon of added character
2509*/
2510static
2511inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2512{
2513    uint32_t  size    = data->writableBufSize;
2514    UChar    *end;
2515    UChar    *newbuffer;
2516    static const uint32_t  INCSIZE = 5;
2517
2518    if (pNull > data->writableBuffer + 1) {
2519        *pNull       = ch;
2520        *(pNull - 1) = 0;
2521        return pNull;
2522    }
2523
2524    /*
2525    buffer will always be null terminated infront.
2526    giving extra space since it is likely that more characters will be added.
2527    */
2528    size += INCSIZE;
2529    newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2530    if(newbuffer == NULL) {
2531        return NULL;
2532    }
2533    end = newbuffer + INCSIZE;
2534    uprv_memcpy(end, data->writableBuffer,
2535                data->writableBufSize * sizeof(UChar));
2536    *end       = ch;
2537    *(end - 1) = 0;
2538
2539    freeHeapWritableBuffer(data);
2540
2541    data->writableBufSize = size;
2542    data->writableBuffer  = newbuffer;
2543    return end;
2544}
2545
2546/**
2547* Special normalization function for contraction in the previous iterator.
2548* This normalization sequence will place the current character at source->pos
2549* and its following normalized sequence into the buffer.
2550* The fcd position, pos will be changed.
2551* pos will now point to positions in the buffer.
2552* Flags will be changed accordingly.
2553* @param data collation iterator data
2554*/
2555static
2556inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2557{
2558    uint32_t    nulltermsize;
2559    UErrorCode  localstatus = U_ZERO_ERROR;
2560    UChar      *pEnd       = data->pos + 1;         /* End normalize + 1 */
2561    UChar      *pStart;
2562    uint32_t    normLen;
2563    UChar      *pStartNorm;
2564
2565    if (data->flags & UCOL_ITER_HASLEN) {
2566        /*
2567        normalization buffer not used yet, we'll pull down the next
2568        character into the end of the buffer
2569        */
2570        *(data->writableBuffer + (data->writableBufSize - 1)) = *(data->pos + 1);
2571        nulltermsize                  = data->writableBufSize - 1;
2572    }
2573    else {
2574        nulltermsize = data->writableBufSize;
2575        UChar *temp = data->writableBuffer + (nulltermsize - 1);
2576        while (*(temp --) != 0) {
2577            nulltermsize --;
2578        }
2579    }
2580
2581    /* Start normalize */
2582    if (data->fcdPosition == NULL) {
2583        pStart = data->string;
2584    }
2585    else {
2586        pStart = data->fcdPosition + 1;
2587    }
2588
2589    normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, data->writableBuffer, 0,
2590                              &localstatus);
2591
2592    if (nulltermsize <= normLen) {
2593        uint32_t  size = data->writableBufSize - nulltermsize + normLen + 1;
2594        UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2595        if (temp == NULL) {
2596            *status = U_MEMORY_ALLOCATION_ERROR;
2597            return;
2598        }
2599        nulltermsize   = normLen + 1;
2600        uprv_memcpy(temp + normLen, data->writableBuffer,
2601                    sizeof(UChar) * (data->writableBufSize - nulltermsize));
2602        freeHeapWritableBuffer(data);
2603        data->writableBuffer = temp;
2604        data->writableBufSize = size;
2605    }
2606
2607    /*
2608    this puts the null termination infront of the normalized string instead
2609    of the end
2610    */
2611    pStartNorm   = data->writableBuffer + (nulltermsize - normLen);
2612    *(pStartNorm - 1) = 0;
2613    unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2614                    status);
2615
2616    data->pos        = data->writableBuffer + nulltermsize;
2617    data->origFlags  = data->flags;
2618    data->flags     |= UCOL_ITER_INNORMBUF;
2619    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2620}
2621
2622/**
2623* Contraction character management function that returns the previous character
2624* for the backwards iterator.
2625* Does nothing if the previous character is in buffer and not the first
2626* character in it.
2627* Else it checks previous character in data string to see if it is
2628* normalizable.
2629* If it is not, the character is simply copied into the buffer, else
2630* the whole normalized substring is copied into the buffer, including the
2631* current character.
2632* @param data collation element iterator data
2633* @return previous character
2634*/
2635static
2636inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2637{
2638    UChar  prevch;
2639    UChar  ch;
2640    UChar *start;
2641    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2642    UChar *pNull = NULL;
2643    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2644        (innormbuf && *(data->pos - 1) != 0)) {
2645        /*
2646        if no normalization.
2647        if previous character is in normalized buffer, no further normalization
2648        is required
2649        */
2650      if(data->flags & UCOL_USE_ITERATOR) {
2651        data->iterator->move(data->iterator, -1, UITER_CURRENT);
2652        return (UChar)data->iterator->next(data->iterator);
2653      } else {
2654        return *(data->pos - 1);
2655      }
2656    }
2657
2658    start = data->pos;
2659    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2660        /* in data string */
2661        if ((start - 1) == data->string) {
2662            return *(start - 1);
2663        }
2664        start --;
2665        ch     = *start;
2666        prevch = *(start - 1);
2667    }
2668    else {
2669        /*
2670        in writable buffer, at this point fcdPosition can not be NULL.
2671        see contracting tag.
2672        */
2673        if (data->fcdPosition == data->string) {
2674            /* at the start of the string, just dump it into the normalizer */
2675            insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
2676            data->fcdPosition = NULL;
2677            return *(data->pos - 1);
2678        }
2679        pNull  = data->pos - 1;
2680        start  = data->fcdPosition;
2681        ch     = *start;
2682        prevch = *(start - 1);
2683    }
2684    /*
2685    * if the current character is not fcd.
2686    * Trailing combining class == 0.
2687    */
2688    if (data->fcdPosition > start &&
2689       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2690    {
2691        /*
2692        Need a more complete FCD check and possible normalization.
2693        normalize substring will be appended to buffer
2694        */
2695        UChar *backuppos = data->pos;
2696        data->pos = start;
2697        if (collPrevIterFCD(data)) {
2698            normalizePrevContraction(data, status);
2699            return *(data->pos - 1);
2700        }
2701        data->pos = backuppos;
2702        data->fcdPosition ++;
2703    }
2704
2705    if (innormbuf) {
2706    /*
2707    no normalization is to be done hence only one character will be
2708    appended to the buffer.
2709    */
2710        insertBufferFront(data, pNull, ch);
2711        data->fcdPosition --;
2712    }
2713
2714    return ch;
2715}
2716
2717/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2718/* It is called by getNextCE */
2719
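/* Limit on the number of digits handled as one number by the DIGIT_TAG code below. */
/* It must be even: digits are combined in pairs into numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]. */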
2721#define UCOL_MAX_DIGITS_FOR_NUMBER 254
2722
2723uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2724    collIterateState entryState;
2725    backupState(source, &entryState);
2726    UChar32 cp = ch;
2727
2728    for (;;) {
2729        // This loop will repeat only in the case of contractions, and only when a contraction
2730        //   is found and the first CE resulting from that contraction is itself a special
2731        //   (an expansion, for example.)  All other special CE types are fully handled the
2732        //   first time through, and the loop exits.
2733
2734        const uint32_t *CEOffset = NULL;
2735        switch(getCETag(CE)) {
2736        case NOT_FOUND_TAG:
2737            /* This one is not found, and we'll let somebody else bother about it... no more games */
2738            return CE;
2739        case SPEC_PROC_TAG:
2740            {
2741                // Special processing is getting a CE that is preceded by a certain prefix
2742                // Currently this is only needed for optimizing Japanese length and iteration marks.
2743                // When we encouter a special processing tag, we go backwards and try to see if
2744                // we have a match.
2745                // Contraction tables are used - so the whole process is not unlike contraction.
2746                // prefix data is stored backwards in the table.
2747                const UChar *UCharOffset;
2748                UChar schar, tchar;
2749                collIterateState prefixState;
2750                backupState(source, &prefixState);
2751                loadState(source, &entryState, TRUE);
2752                goBackOne(source); // We want to look at the point where we entered - actually one
2753                // before that...
2754
2755                for(;;) {
2756                    // This loop will run once per source string character, for as long as we
2757                    //  are matching a potential contraction sequence
2758
2759                    // First we position ourselves at the begining of contraction sequence
2760                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2761                    if (collIter_bos(source)) {
2762                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2763                        break;
2764                    }
2765                    schar = getPrevNormalizedChar(source, status);
2766                    goBackOne(source);
2767
2768                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2769                        UCharOffset++;
2770                    }
2771
2772                    if (schar == tchar) {
2773                        // Found the source string char in the table.
2774                        //  Pick up the corresponding CE from the table.
2775                        CE = *(coll->contractionCEs +
2776                            (UCharOffset - coll->contractionIndex));
2777                    }
2778                    else
2779                    {
2780                        // Source string char was not in the table.
2781                        //   We have not found the prefix.
2782                        CE = *(coll->contractionCEs +
2783                            (ContractionStart - coll->contractionIndex));
2784                    }
2785
2786                    if(!isPrefix(CE)) {
2787                        // The source string char was in the contraction table, and the corresponding
2788                        //   CE is not a prefix CE.  We found the prefix, break
2789                        //   out of loop, this CE will end up being returned.  This is the normal
2790                        //   way out of prefix handling when the source actually contained
2791                        //   the prefix.
2792                        break;
2793                    }
2794                }
2795                if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2796                    loadState(source, &prefixState, TRUE);
2797                    if(source->origFlags & UCOL_USE_ITERATOR) {
2798                        source->flags = source->origFlags;
2799                    }
2800                } else { // prefix search was a failure, we have to backup all the way to the start
2801                    loadState(source, &entryState, TRUE);
2802                }
2803                break;
2804            }
2805        case CONTRACTION_TAG:
2806            {
2807                /* This should handle contractions */
2808                collIterateState state;
2809                backupState(source, &state);
2810                uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2811                const UChar *UCharOffset;
2812                UChar schar, tchar;
2813
2814                for (;;) {
2815                    /* This loop will run once per source string character, for as long as we     */
2816                    /*  are matching a potential contraction sequence                  */
2817
2818                    /* First we position ourselves at the begining of contraction sequence */
2819                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2820
2821                    if (collIter_eos(source)) {
2822                        // Ran off the end of the source string.
2823                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2824                        // So we'll pick whatever we have at the point...
2825                        if (CE == UCOL_NOT_FOUND) {
2826                            // back up the source over all the chars we scanned going into this contraction.
2827                            CE = firstCE;
2828                            loadState(source, &state, TRUE);
2829                            if(source->origFlags & UCOL_USE_ITERATOR) {
2830                                source->flags = source->origFlags;
2831                            }
2832                        }
2833                        break;
2834                    }
2835
2836                    uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2837                    uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2838
2839                    schar = getNextNormalizedChar(source);
2840                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2841                        UCharOffset++;
2842                    }
2843
2844                    if (schar == tchar) {
2845                        // Found the source string char in the contraction table.
2846                        //  Pick up the corresponding CE from the table.
2847                        CE = *(coll->contractionCEs +
2848                            (UCharOffset - coll->contractionIndex));
2849                    }
2850                    else
2851                    {
2852                        // Source string char was not in contraction table.
2853                        //   Unless we have a discontiguous contraction, we have finished
2854                        //   with this contraction.
2855                        // in order to do the proper detection, we
2856                        // need to see if we're dealing with a supplementary
2857                        /* We test whether the next two char are surrogate pairs.
2858                        * This test is done if the iterator is not NULL.
2859                        * If there is no surrogate pair, the iterator
2860                        * goes back one if needed. */
2861                        UChar32 miss = schar;
2862                        if (source->iterator) {
2863                            UChar32 surrNextChar; /* the next char in the iteration to test */
2864                            int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2865                            if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2866                                prevPos = source->iterator->index;
2867                                surrNextChar = getNextNormalizedChar(source);
2868                                if (U16_IS_TRAIL(surrNextChar)) {
2869                                    miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2870                                } else if (prevPos < source->iterator->index){
2871                                    goBackOne(source);
2872                                }
2873                            }
2874                        } else if (U16_IS_LEAD(schar)) {
2875                            miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2876                        }
2877
2878                        uint8_t sCC;
2879                        if (miss < 0x300 ||
2880                            maxCC == 0 ||
2881                            (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2882                            sCC>maxCC ||
2883                            (allSame != 0 && sCC == maxCC) ||
2884                            collIter_eos(source))
2885                        {
2886                            //  Contraction can not be discontiguous.
2887                            goBackOne(source);  // back up the source string by one,
2888                            //  because  the character we just looked at was
2889                            //  not part of the contraction.   */
2890                            if(U_IS_SUPPLEMENTARY(miss)) {
2891                                goBackOne(source);
2892                            }
2893                            CE = *(coll->contractionCEs +
2894                                (ContractionStart - coll->contractionIndex));
2895                        } else {
2896                            //
2897                            // Contraction is possibly discontiguous.
2898                            //   Scan more of source string looking for a match
2899                            //
2900                            UChar tempchar;
2901                            /* find the next character if schar is not a base character
2902                            and we are not yet at the end of the string */
2903                            tempchar = getNextNormalizedChar(source);
2904                            // probably need another supplementary thingie here
2905                            goBackOne(source);
2906                            if (i_getCombiningClass(tempchar, coll) == 0) {
2907                                goBackOne(source);
2908                                if(U_IS_SUPPLEMENTARY(miss)) {
2909                                    goBackOne(source);
2910                                }
2911                                /* Spit out the last char of the string, wasn't tasty enough */
2912                                CE = *(coll->contractionCEs +
2913                                    (ContractionStart - coll->contractionIndex));
2914                            } else {
2915                                CE = getDiscontiguous(coll, source, ContractionStart);
2916                            }
2917                        }
2918                    } // else after if(schar == tchar)
2919
2920                    if(CE == UCOL_NOT_FOUND) {
2921                        /* The Source string did not match the contraction that we were checking.  */
2922                        /*  Back up the source position to undo the effects of having partially    */
2923                        /*   scanned through what ultimately proved to not be a contraction.       */
2924                        loadState(source, &state, TRUE);
2925                        CE = firstCE;
2926                        break;
2927                    }
2928
2929                    if(!isContraction(CE)) {
2930                        // The source string char was in the contraction table, and the corresponding
2931                        //   CE is not a contraction CE.  We completed the contraction, break
2932                        //   out of loop, this CE will end up being returned.  This is the normal
2933                        //   way out of contraction handling when the source actually contained
2934                        //   the contraction.
2935                        break;
2936                    }
2937
2938
2939                    // The source string char was in the contraction table, and the corresponding
2940                    //   CE is IS  a contraction CE.  We will continue looping to check the source
2941                    //   string for the remaining chars in the contraction.
2942                    uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2943                    if(tempCE != UCOL_NOT_FOUND) {
2944                        // We have scanned a a section of source string for which there is a
2945                        //  CE from the contraction table.  Remember the CE and scan position, so
2946                        //  that we can return to this point if further scanning fails to
2947                        //  match a longer contraction sequence.
2948                        firstCE = tempCE;
2949
2950                        goBackOne(source);
2951                        backupState(source, &state);
2952                        getNextNormalizedChar(source);
2953
2954                        // Another way to do this is:
2955                        //collIterateState tempState;
2956                        //backupState(source, &tempState);
2957                        //goBackOne(source);
2958                        //backupState(source, &state);
2959                        //loadState(source, &tempState, TRUE);
2960
2961                        // The problem is that for incomplete contractions we have to remember the previous
2962                        // position. Before, the only thing I needed to do was state.pos--;
2963                        // After iterator introduction and especially after introduction of normalizing
2964                        // iterators, it became much more difficult to decrease the saved state.
2965                        // I'm not yet sure which of the two methods above is faster.
2966                    }
2967                } // for(;;)
2968                break;
2969            } // case CONTRACTION_TAG:
2970        case LONG_PRIMARY_TAG:
2971            {
2972                *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2973                CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2974                source->offsetRepeatCount += 1;
2975                return CE;
2976            }
2977        case EXPANSION_TAG:
2978            {
2979                /* This should handle expansion. */
2980                /* NOTE: we can encounter both continuations and expansions in an expansion! */
2981                /* I have to decide where continuations are going to be dealt with */
2982                uint32_t size;
2983                uint32_t i;    /* general counter */
2984
2985                CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2986                size = getExpansionCount(CE);
2987                CE = *CEOffset++;
2988			  //source->offsetRepeatCount = -1;
2989
2990                if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2991                    for(i = 1; i<size; i++) {
2992                        *(source->CEpos++) = *CEOffset++;
2993						source->offsetRepeatCount += 1;
2994                    }
2995                } else { /* else, we do */
2996                    while(*CEOffset != 0) {
2997                        *(source->CEpos++) = *CEOffset++;
2998						source->offsetRepeatCount += 1;
2999                    }
3000                }
3001
3002                return CE;
3003            }
3004        case DIGIT_TAG:
3005            {
3006                /*
3007                We do a check to see if we want to collate digits as numbers; if so we generate
3008                a custom collation key. Otherwise we pull out the value stored in the expansion table.
3009                */
3010                //uint32_t size;
3011                uint32_t i;    /* general counter */
3012
3013                if (source->coll->numericCollation == UCOL_ON){
3014                    collIterateState digitState = {0,0,0,0,0,0,0,0,0};
3015                    UChar32 char32 = 0;
3016                    int32_t digVal = 0;
3017
3018                    uint32_t digIndx = 0;
3019                    uint32_t endIndex = 0;
3020                    uint32_t trailingZeroIndex = 0;
3021
3022                    uint8_t collateVal = 0;
3023
3024                    UBool nonZeroValReached = FALSE;
3025
3026                    uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
3027                    /*
3028                         We parse the source string until we hit a char that's NOT a digit.
3029                        Use this u_charDigitValue. This might be slow because we have to
3030                        handle surrogates...
3031                    */
3032            /*
3033                    if (U16_IS_LEAD(ch)){
3034                      if (!collIter_eos(source)) {
3035                        backupState(source, &digitState);
3036                        UChar trail = getNextNormalizedChar(source);
3037                        if(U16_IS_TRAIL(trail)) {
3038                          char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3039                        } else {
3040                          loadState(source, &digitState, TRUE);
3041                          char32 = ch;
3042                        }
3043                      } else {
3044                        char32 = ch;
3045                      }
3046                    } else {
3047                      char32 = ch;
3048                    }
3049                    digVal = u_charDigitValue(char32);
3050            */
3051                    digVal = u_charDigitValue(cp); // if we have arrived here, we have
3052                    // already processed possible supplementaries that trigered the digit tag -
3053                    // all supplementaries are marked in the UCA.
3054                    /*
3055                        We  pad a zero in front of the first element anyways. This takes
3056                        care of the (probably) most common case where people are sorting things followed
3057                        by a single digit
3058                    */
3059                    digIndx++;
3060                    for(;;){
3061                        // Make sure we have enough space. No longer needed;
3062                        // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3063                        // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3064                        // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3065
3066                        // Skipping over leading zeroes.
3067                        if (digVal != 0) {
3068                            nonZeroValReached = TRUE;
3069                        }
3070                        if (nonZeroValReached) {
3071                            /*
3072                            We parse the digit string into base 100 numbers (this fits into a byte).
3073                            We only add to the buffer in twos, thus if we are parsing an odd character,
3074                            that serves as the 'tens' digit while the if we are parsing an even one, that
3075                            is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3076                            a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3077                            overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3078                            than all the other bytes.
3079                            */
3080
3081                            if (digIndx % 2 == 1){
3082                                collateVal += (uint8_t)digVal;
3083
3084                                // We don't enter the low-order-digit case unless we've already seen
3085                                // the high order, or for the first digit, which is always non-zero.
3086                                if (collateVal != 0)
3087                                    trailingZeroIndex = 0;
3088
3089                                numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3090                                collateVal = 0;
3091                            }
3092                            else{
3093                                // We drop the collation value into the buffer so if we need to do
3094                                // a "front patch" we don't have to check to see if we're hitting the
3095                                // last element.
3096                                collateVal = (uint8_t)(digVal * 10);
3097
3098                                // Check for trailing zeroes.
3099                                if (collateVal == 0)
3100                                {
3101                                    if (!trailingZeroIndex)
3102                                        trailingZeroIndex = (digIndx/2) + 2;
3103                                }
3104                                else
3105                                    trailingZeroIndex = 0;
3106
3107                                numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3108                            }
3109                            digIndx++;
3110                        }
3111
3112                        // Get next character.
3113                        if (!collIter_eos(source)){
3114                            ch = getNextNormalizedChar(source);
3115                            if (U16_IS_LEAD(ch)){
3116                                if (!collIter_eos(source)) {
3117                                    backupState(source, &digitState);
3118                                    UChar trail = getNextNormalizedChar(source);
3119                                    if(U16_IS_TRAIL(trail)) {
3120                                        char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3121                                    } else {
3122                                        loadState(source, &digitState, TRUE);
3123                                        char32 = ch;
3124                                    }
3125                                }
3126                            } else {
3127                                char32 = ch;
3128                            }
3129
3130                            if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3131                                // Resetting position to point to the next unprocessed char. We
3132                                // overshot it when doing our test/set for numbers.
3133                                if (char32 > 0xFFFF) { // For surrogates.
3134                                    loadState(source, &digitState, TRUE);
3135                                    //goBackOne(source);
3136                                }
3137                                goBackOne(source);
3138                                break;
3139                            }
3140                        } else {
3141                            break;
3142                        }
3143                    }
3144
3145                    if (nonZeroValReached == FALSE){
3146                        digIndx = 2;
3147                        numTempBuf[2] = 6;
3148                    }
3149
3150                    endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3151                    if (digIndx % 2 != 0){
3152                        /*
3153                        We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3154                        we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3155                        Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3156                        single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3157                        */
3158
3159                        for(i = 2; i < endIndex; i++){
3160                            numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3161                                (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3162                        }
3163                        --digIndx;
3164                    }
3165
3166                    // Subtract one off of the last byte.
3167                    numTempBuf[endIndex-1] -= 1;
3168
3169                    /*
3170                    We want to skip over the first two slots in the buffer. The first slot
3171                    is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3172                    sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3173                    */
3174                    numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3175                    numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3176
3177                    // Now transfer the collation key to our collIterate struct.
3178                    // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3179                    //size = ((endIndex+1) & ~1)/2;
3180                    CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3181                        (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3182                        UCOL_BYTE_COMMON; // Tertiary weight.
3183                    i = 2; // Reset the index into the buffer.
3184                    while(i < endIndex)
3185                    {
3186                        uint32_t primWeight = numTempBuf[i++] << 8;
3187                        if ( i < endIndex)
3188                            primWeight |= numTempBuf[i++];
3189                        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3190                    }
3191
3192                } else {
3193                    // no numeric mode, we'll just switch to whatever we stashed and continue
3194                    CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3195                    CE = *CEOffset++;
3196                    break;
3197                }
3198                return CE;
3199            }
3200            /* various implicits optimization */
3201        case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3202            /* UCA is filled with these. Tailorings are NOT_FOUND */
3203            return getImplicit(cp, source);
3204        case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3205            // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3206            return getImplicit(cp, source);
3207        case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3208            {
3209                static const uint32_t
3210                    SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3211                //const uint32_t LCount = 19;
3212                static const uint32_t VCount = 21;
3213                static const uint32_t TCount = 28;
3214                //const uint32_t NCount = VCount * TCount;   // 588
3215                //const uint32_t SCount = LCount * NCount;   // 11172
3216                uint32_t L = ch - SBase;
3217
3218                // divide into pieces
3219
3220                uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3221                L /= TCount;
3222                uint32_t V = L % VCount;
3223                L /= VCount;
3224
3225                // offset them
3226
3227                L += LBase;
3228                V += VBase;
3229                T += TBase;
3230
3231                // return the first CE, but first put the rest into the expansion buffer
3232                if (!source->coll->image->jamoSpecial) { // FAST PATH
3233
3234                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3235                    if (T != TBase) {
3236                        *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3237                    }
3238
3239                    return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3240
3241                } else { // Jamo is Special
3242                    // Since Hanguls pass the FCD check, it is
3243                    // guaranteed that we won't be in
3244                    // the normalization buffer if something like this happens
3245                    // However, if we are using a uchar iterator and normalization
3246                    // is ON, the Hangul that lead us here is going to be in that
3247                    // normalization buffer. Here we want to restore the uchar
3248                    // iterator state and pull out of the normalization buffer
3249                    if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3250                        source->flags = source->origFlags; // restore the iterator
3251                        source->pos = NULL;
3252                    }
3253                    // Move Jamos into normalization buffer
3254                    source->writableBuffer[0] = (UChar)L;
3255                    source->writableBuffer[1] = (UChar)V;
3256                    if (T != TBase) {
3257                        source->writableBuffer[2] = (UChar)T;
3258                        source->writableBuffer[3] = 0;
3259                    } else {
3260                        source->writableBuffer[2] = 0;
3261                    }
3262
3263                    source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
3264                    //   after exhausting the writableBuffer
3265                    source->pos   = source->writableBuffer;
3266                    source->origFlags   = source->flags;
3267                    source->flags       |= UCOL_ITER_INNORMBUF;
3268                    source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3269
3270                    return(UCOL_IGNORABLE);
3271                }
3272            }
3273        case SURROGATE_TAG:
3274            /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3275            /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3276            /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3277            /* we return 0 (completely ignorable - per UCA specification */
3278            {
3279                UChar trail;
3280                collIterateState state;
3281                backupState(source, &state);
3282                if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3283                    // we chould have stepped one char forward and it might have turned that it
3284                    // was not a trail surrogate. In that case, we have to backup.
3285                    loadState(source, &state, TRUE);
3286                    return 0;
3287                } else {
3288                    /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3289                    CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3290                    if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3291                        // We need to backup
3292                        loadState(source, &state, TRUE);
3293                        return CE;
3294                    }
3295                    // calculate the supplementary code point value, if surrogate was not tailored
3296                    cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3297                }
3298            }
3299            break;
3300        case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3301            UChar nextChar;
3302            if( source->flags & UCOL_USE_ITERATOR) {
3303                if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3304                    cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3305                    source->iterator->next(source->iterator);
3306                    return getImplicit(cp, source);
3307                } else {
3308                    return 0;
3309                }
3310            } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3311                U_IS_TRAIL((nextChar=*source->pos))) {
3312                    cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3313                    source->pos++;
3314                    return getImplicit(cp, source);
3315            } else {
3316                return 0; /* completely ignorable */
3317            }
3318        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3319            return 0; /* broken surrogate sequence */
3320        case CHARSET_TAG:
3321            /* not yet implemented */
3322            /* probably after 1.8 */
3323            return UCOL_NOT_FOUND;
3324        default:
3325            *status = U_INTERNAL_PROGRAM_ERROR;
3326            CE=0;
3327            break;
3328    }
3329    if (CE <= UCOL_NOT_FOUND) break;
3330  }
3331  return CE;
3332}
3333
3334
3335/* now uses Mark's getImplicitPrimary code */
3336static
3337inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3338    if(isNonChar(cp)) {
3339        return 0;
3340    }
3341
3342    uint32_t r = uprv_uca_getImplicitPrimary(cp);
3343
3344    *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3345    collationSource->toReturn = collationSource->CEpos;
3346
3347	if (collationSource->offsetBuffer == NULL) {
3348		collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3349		collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3350		collationSource->offsetStore = collationSource->offsetBuffer;
3351	}
3352
3353	// **** doesn't work if using iterator ****
3354	if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3355	  collationSource->offsetRepeatCount = 1;
3356	} else {
3357	  int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3358
3359	  *(collationSource->offsetStore++) = firstOffset;
3360	  *(collationSource->offsetStore++) = firstOffset + 1;
3361
3362		collationSource->offsetReturn = collationSource->offsetStore - 1;
3363		*(collationSource->offsetBuffer) = firstOffset;
3364		if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3365			collationSource->offsetStore = collationSource->offsetBuffer;
3366		}
3367	}
3368
3369    return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3370}
3371
/**
 * This function handles the special CEs like contractions, expansions,
 * surrogates, Thai.
 * It is called by getPrevCE.
 */
3377uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3378                          collIterate *source,
3379                          UErrorCode *status)
3380{
3381    const uint32_t *CEOffset    = NULL;
3382          UChar    *UCharOffset = NULL;
3383          UChar    schar;
3384    const UChar    *constart    = NULL;
3385          uint32_t size;
3386          UChar    buffer[UCOL_MAX_BUFFER];
3387          uint32_t *endCEBuffer;
3388          UChar   *strbuffer;
3389          int32_t noChars = 0;
3390          int32_t CECount = 0;
3391
3392    for(;;)
3393    {
3394        /* the only ces that loops are thai and contractions */
3395        switch (getCETag(CE))
3396        {
3397        case NOT_FOUND_TAG:  /* this tag always returns */
3398            return CE;
3399
3400        case SPEC_PROC_TAG:
3401            {
3402                // Special processing is getting a CE that is preceded by a certain prefix
3403                // Currently this is only needed for optimizing Japanese length and iteration marks.
3404                // When we encouter a special processing tag, we go backwards and try to see if
3405                // we have a match.
3406                // Contraction tables are used - so the whole process is not unlike contraction.
3407                // prefix data is stored backwards in the table.
3408                const UChar *UCharOffset;
3409                UChar schar, tchar;
3410                collIterateState prefixState;
3411                backupState(source, &prefixState);
3412                for(;;) {
3413                    // This loop will run once per source string character, for as long as we
3414                    //  are matching a potential contraction sequence
3415
3416                    // First we position ourselves at the begining of contraction sequence
3417                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3418
3419                    if (collIter_bos(source)) {
3420                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3421                        break;
3422                    }
3423                    schar = getPrevNormalizedChar(source, status);
3424                    goBackOne(source);
3425
3426                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3427                        UCharOffset++;
3428                    }
3429
3430                    if (schar == tchar) {
3431                        // Found the source string char in the table.
3432                        //  Pick up the corresponding CE from the table.
3433                        CE = *(coll->contractionCEs +
3434                            (UCharOffset - coll->contractionIndex));
3435                    }
3436                    else
3437                    {
3438                        // if there is a completely ignorable code point in the middle of
3439                        // a prefix, we need to act as if it's not there
3440                        // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3441                        // lone surrogates cannot be set to zero as it would break other processing
3442                        uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3443                        // it's easy for BMP code points
3444                        if(isZeroCE == 0) {
3445                            continue;
3446                        } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
3447                            // for supplementary code points, we have to check the next one
3448                            // situations where we are going to ignore
3449                            // 1. beginning of the string: schar is a lone surrogate
3450                            // 2. schar is a lone surrogate
3451                            // 3. schar is a trail surrogate in a valid surrogate sequence
3452                            //    that is explicitly set to zero.
3453                            if (!collIter_bos(source)) {
3454                                UChar lead;
3455                                if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3456                                    isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3457                                    if(getCETag(isZeroCE) == SURROGATE_TAG) {
3458                                        uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3459                                        if(finalCE == 0) {
3460                                            // this is a real, assigned completely ignorable code point
3461                                            goBackOne(source);
3462                                            continue;
3463                                        }
3464                                    }
3465                                } else {
3466                                    // lone surrogate, completely ignorable
3467                                    continue;
3468                                }
3469                            } else {
3470                                // lone surrogate at the beggining, completely ignorable
3471                                continue;
3472                            }
3473                        }
3474                        // Source string char was not in the table.
3475                        //   We have not found the prefix.
3476                        CE = *(coll->contractionCEs +
3477                            (ContractionStart - coll->contractionIndex));
3478                    }
3479
3480                    if(!isPrefix(CE)) {
3481                        // The source string char was in the contraction table, and the corresponding
3482                        //   CE is not a prefix CE.  We found the prefix, break
3483                        //   out of loop, this CE will end up being returned.  This is the normal
3484                        //   way out of prefix handling when the source actually contained
3485                        //   the prefix.
3486                        break;
3487                    }
3488                }
3489                loadState(source, &prefixState, TRUE);
3490                break;
3491            }
3492
3493        case CONTRACTION_TAG:
3494            /* to ensure that the backwards and forwards iteration matches, we
3495            take the current region of most possible match and pass it through
3496            the forward iteration. this will ensure that the obstinate problem of
3497            overlapping contractions will not occur.
3498            */
3499            schar = peekCharacter(source, 0);
3500            constart = (UChar *)coll->image + getContractOffset(CE);
3501            if (isAtStartPrevIterate(source)
3502                /* commented away contraction end checks after adding the checks
3503                in getPrevCE  */) {
3504                    /* start of string or this is not the end of any contraction */
3505                    CE = *(coll->contractionCEs +
3506                        (constart - coll->contractionIndex));
3507                    break;
3508            }
3509            strbuffer = buffer;
3510            UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3511            *(UCharOffset --) = 0;
3512            noChars = 0;
3513            // have to swap thai characters
3514            while (ucol_unsafeCP(schar, coll)) {
3515                *(UCharOffset) = schar;
3516                noChars++;
3517                UCharOffset --;
3518                schar = getPrevNormalizedChar(source, status);
3519                goBackOne(source);
3520                // TODO: when we exhaust the contraction buffer,
3521                // it needs to get reallocated. The problem is
3522                // that the size depends on the string which is
3523                // not iterated over. However, since we're travelling
3524                // backwards, we already had to set the iterator at
3525                // the end - so we might as well know where we are?
3526                if (UCharOffset + 1 == buffer) {
3527                    /* we have exhausted the buffer */
3528                    int32_t newsize = 0;
3529                    if(source->pos) { // actually dealing with a position
3530                        newsize = source->pos - source->string + 1;
3531                    } else { // iterator
3532                        newsize = 4 * UCOL_MAX_BUFFER;
3533                    }
3534                    strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3535                        (newsize + UCOL_MAX_BUFFER));
3536                    /* test for NULL */
3537                    if (strbuffer == NULL) {
3538                        *status = U_MEMORY_ALLOCATION_ERROR;
3539                        return UCOL_NO_MORE_CES;
3540                    }
3541                    UCharOffset = strbuffer + newsize;
3542                    uprv_memcpy(UCharOffset, buffer,
3543                        UCOL_MAX_BUFFER * sizeof(UChar));
3544                    UCharOffset --;
3545                }
3546                if ((source->pos && (source->pos == source->string ||
3547                    ((source->flags & UCOL_ITER_INNORMBUF) &&
3548                    *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3549                    || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3550                        break;
3551                }
3552            }
3553            /* adds the initial base character to the string */
3554            *(UCharOffset) = schar;
3555            noChars++;
3556
3557            int32_t offsetBias;
3558
3559            // **** doesn't work if using iterator ****
3560            if (source->flags & UCOL_ITER_INNORMBUF) {
3561                offsetBias = -1;
3562            } else {
3563                offsetBias = (int32_t)(source->pos - source->string);
3564            }
3565
3566            /* a new collIterate is used to simplify things, since using the current
3567            collIterate will mean that the forward and backwards iteration will
3568            share and change the same buffers. we don't want to get into that. */
3569            collIterate temp;
3570            int32_t rawOffset;
3571
3572            IInit_collIterate(coll, UCharOffset, noChars, &temp);
3573            temp.flags &= ~UCOL_ITER_NORM;
3574            temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3575
3576            rawOffset = temp.pos - temp.string; // should always be zero?
3577            CE = ucol_IGetNextCE(coll, &temp, status);
3578
3579            if (source->extendCEs) {
3580                endCEBuffer = source->extendCEs + source->extendCEsSize;
3581                CECount = (source->CEpos - source->extendCEs)/sizeof(uint32_t);
3582            } else {
3583                endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3584                CECount = (source->CEpos - source->CEs)/sizeof(uint32_t);
3585            }
3586
3587            if (source->offsetBuffer == NULL) {
3588                source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3589                source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3590                source->offsetStore = source->offsetBuffer;
3591            }
3592
3593            while (CE != UCOL_NO_MORE_CES) {
3594                *(source->CEpos ++) = CE;
3595
3596                if (offsetBias >= 0) {
3597                    *(source->offsetStore ++) = rawOffset + offsetBias;
3598                }
3599
3600                CECount++;
3601                if (source->CEpos == endCEBuffer) {
3602                    /* ran out of CE space, reallocate to new buffer.
3603                    If reallocation fails, reset pointers and bail out,
3604                    there's no guarantee of the right character position after
3605                    this bail*/
3606                    if (source->extendCEs == NULL) {
3607                        source->extendCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t) *
3608                            (source->extendCEsSize =UCOL_EXPAND_CE_BUFFER_SIZE + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3609                        if (source->extendCEs == NULL) {
3610                            // Handle error later.
3611                            CECount = -1;
3612                        } else {
3613                            source->extendCEs = (uint32_t *)uprv_memcpy(source->extendCEs, source->CEs, UCOL_EXPAND_CE_BUFFER_SIZE * sizeof(uint32_t));
3614                        }
3615                    } else {
3616                        uint32_t *tempBufCE = (uint32_t *)uprv_realloc(source->extendCEs,
3617                            sizeof(uint32_t) * (source->extendCEsSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3618                        if (tempBufCE == NULL) {
3619                            // Handle error later.
3620                            CECount = -1;
3621                        }
3622                        else {
3623                            source->extendCEs = tempBufCE;
3624                        }
3625                    }
3626
3627                    if (CECount == -1) {
3628                        *status = U_MEMORY_ALLOCATION_ERROR;
3629                        source->extendCEsSize = 0;
3630                        source->CEpos = source->CEs;
3631                        freeHeapWritableBuffer(&temp);
3632
3633                        if (strbuffer != buffer) {
3634                            uprv_free(strbuffer);
3635                        }
3636
3637                        return (uint32_t)UCOL_NULLORDER;
3638                    }
3639
3640                    source->CEpos = source->extendCEs + CECount;
3641                    endCEBuffer = source->extendCEs + source->extendCEsSize;
3642                }
3643
3644                if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) {
3645                    int32_t  storeIX = source->offsetStore - source->offsetBuffer;
3646                    int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer,
3647                        sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3648
3649                    if (tob != NULL) {
3650                        source->offsetBuffer = tob;
3651                        source->offsetStore = &source->offsetBuffer[storeIX];
3652                        source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE;
3653                    } else {
3654                        // memory error...
3655                        *status = U_MEMORY_ALLOCATION_ERROR;
3656                        source->CEpos = source->CEs;
3657                        freeHeapWritableBuffer(&temp);
3658
3659                        if (strbuffer != buffer) {
3660                            uprv_free(strbuffer);
3661                        }
3662
3663                        return (uint32_t) UCOL_NULLORDER;
3664                    }
3665                }
3666
3667                if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3668                    rawOffset = temp.fcdPosition - temp.string;
3669                } else {
3670                    rawOffset = temp.pos - temp.string;
3671                }
3672
3673                CE = ucol_IGetNextCE(coll, &temp, status);
3674            }
3675
3676			if (source->offsetRepeatValue != 0) {
3677                if (CECount > noChars) {
3678				    source->offsetRepeatCount += temp.offsetRepeatCount;
3679                } else {
3680                    // **** does this really skip the right offsets? ****
3681                    source->offsetReturn -= (noChars - CECount);
3682                }
3683			}
3684
3685            freeHeapWritableBuffer(&temp);
3686
3687            if (strbuffer != buffer) {
3688                uprv_free(strbuffer);
3689            }
3690
3691            if (offsetBias >= 0) {
3692                source->offsetReturn = source->offsetStore - 1;
3693                if (source->offsetReturn == source->offsetBuffer) {
3694                    source->offsetStore = source->offsetBuffer;
3695                }
3696            }
3697
3698            source->toReturn = source->CEpos - 1;
3699            if (source->toReturn == source->CEs) {
3700                source->CEpos = source->CEs;
3701            }
3702
3703            return *(source->toReturn);
3704
3705        case LONG_PRIMARY_TAG:
3706            {
3707                *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3708                *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3709                source->toReturn = source->CEpos - 1;
3710
3711				if (source->offsetBuffer == NULL) {
3712					source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3713					source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3714					source->offsetStore = source->offsetBuffer;
3715				}
3716
3717				if (source->flags & UCOL_ITER_INNORMBUF) {
3718                    source->offsetRepeatCount = 1;
3719				} else {
3720				  int32_t firstOffset = (int32_t)(source->pos - source->string);
3721
3722				  *(source->offsetStore++) = firstOffset;
3723				  *(source->offsetStore++) = firstOffset + 1;
3724
3725					source->offsetReturn = source->offsetStore - 1;
3726					*(source->offsetBuffer) = firstOffset;
3727					if (source->offsetReturn == source->offsetBuffer) {
3728						source->offsetStore = source->offsetBuffer;
3729					}
3730				}
3731
3732
3733                return *(source->toReturn);
3734            }
3735
3736        case EXPANSION_TAG: /* this tag always returns */
3737            {
3738            /*
3739            This should handle expansion.
3740            NOTE: we can encounter both continuations and expansions in an expansion!
3741            I have to decide where continuations are going to be dealt with
3742            */
3743            int32_t firstOffset = (int32_t)(source->pos - source->string);
3744
3745            // **** doesn't work if using iterator ****
3746            if (source->offsetReturn != NULL) {
3747                if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3748                    source->offsetStore = source->offsetBuffer;
3749                }else {
3750                  firstOffset = -1;
3751                }
3752            }
3753
3754            if (source->offsetBuffer == NULL) {
3755                source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3756                source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3757                source->offsetStore = source->offsetBuffer;
3758            }
3759
3760            /* find the offset to expansion table */
3761            CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3762            size     = getExpansionCount(CE);
3763            if (size != 0) {
3764                /*
3765                if there are less than 16 elements in expansion, we don't terminate
3766                */
3767                uint32_t count;
3768
3769                for (count = 0; count < size; count++) {
3770                    *(source->CEpos ++) = *CEOffset++;
3771
3772                    if (firstOffset >= 0) {
3773                        *(source->offsetStore ++) = firstOffset + 1;
3774                    }
3775                }
3776            } else {
3777                /* else, we do */
3778                while (*CEOffset != 0) {
3779                    *(source->CEpos ++) = *CEOffset ++;
3780
3781                    if (firstOffset >= 0) {
3782                        *(source->offsetStore ++) = firstOffset + 1;
3783                    }
3784                }
3785            }
3786
3787            if (firstOffset >= 0) {
3788                source->offsetReturn = source->offsetStore - 1;
3789                *(source->offsetBuffer) = firstOffset;
3790                if (source->offsetReturn == source->offsetBuffer) {
3791                    source->offsetStore = source->offsetBuffer;
3792                }
3793            } else {
3794                source->offsetRepeatCount += size - 1;
3795            }
3796
3797            source->toReturn = source->CEpos - 1;
3798            // in case of one element expansion, we
3799            // want to immediately return CEpos
3800            if(source->toReturn == source->CEs) {
3801                source->CEpos = source->CEs;
3802            }
3803
3804            return *(source->toReturn);
3805            }
3806
3807        case DIGIT_TAG:
3808            {
3809                /*
3810                We do a check to see if we want to collate digits as numbers; if so we generate
3811                a custom collation key. Otherwise we pull out the value stored in the expansion table.
3812                */
3813                //uint32_t size;
3814                uint32_t i;    /* general counter */
3815
3816                if (source->coll->numericCollation == UCOL_ON){
3817                    uint32_t digIndx = 0;
3818                    uint32_t endIndex = 0;
3819                    uint32_t leadingZeroIndex = 0;
3820                    uint32_t trailingZeroCount = 0;
3821
3822                    uint8_t collateVal = 0;
3823
3824                    UBool nonZeroValReached = FALSE;
3825
3826                    uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3827                    /*
3828                    We parse the source string until we hit a char that's NOT a digit.
3829                    Use this u_charDigitValue. This might be slow because we have to
3830                    handle surrogates...
3831                    */
3832                    /*
3833                    We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3834                    with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3835                    element we process when going backward. To determine how long that chunk might be, we may need to make
3836                    two passes through the loop that collects digits - one to see how long the string is (and how much is
3837                    leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3838                    more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3839                    element chunk after resetting the state to the initialState at the right side of the digit string.
3840                    */
3841                    uint32_t ceLimit = 0;
3842                    UChar initial_ch = ch;
3843                    collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3844                    backupState(source, &initialState);
3845
3846                    for(;;) {
3847                        collIterateState state = {0,0,0,0,0,0,0,0,0};
3848                        UChar32 char32 = 0;
3849                        int32_t digVal = 0;
3850
3851                        if (U16_IS_TRAIL (ch)) {
3852                            if (!collIter_bos(source)){
3853                                UChar lead = getPrevNormalizedChar(source, status);
3854                                if(U16_IS_LEAD(lead)) {
3855                                    char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3856                                    goBackOne(source);
3857                                } else {
3858                                    char32 = ch;
3859                                }
3860                            } else {
3861                                char32 = ch;
3862                            }
3863                        } else {
3864                            char32 = ch;
3865                        }
3866                        digVal = u_charDigitValue(char32);
3867
3868                        for(;;) {
3869                            // Make sure we have enough space. No longer needed;
3870                            // at this point the largest value of digIndx when we need to save data in numTempBuf
3871                            // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3872                            // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3873
3874                            // Skip over trailing zeroes, and keep a count of them.
3875                            if (digVal != 0)
3876                                nonZeroValReached = TRUE;
3877
3878                            if (nonZeroValReached) {
3879                                /*
3880                                We parse the digit string into base 100 numbers (this fits into a byte).
3881                                We only add to the buffer in twos, thus if we are parsing an odd character,
3882                                that serves as the 'tens' digit while the if we are parsing an even one, that
3883                                is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3884                                a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3885                                overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3886                                than all the other bytes.
3887
3888                                Since we're doing in this reverse we want to put the first digit encountered into the
3889                                ones place and the second digit encountered into the tens place.
3890                                */
3891
3892                                if ((digIndx + trailingZeroCount) % 2 == 1) {
3893                                    // High-order digit case (tens place)
3894                                    collateVal += (uint8_t)(digVal * 10);
3895
3896                                    // We cannot set leadingZeroIndex unless it has been set for the
3897                                    // low-order digit. Therefore, all we can do for the high-order
3898                                    // digit is turn it off, never on.
3899                                    // The only time we will have a high digit without a low is for
3900                                    // the very first non-zero digit, so no zero check is necessary.
3901                                    if (collateVal != 0)
3902                                        leadingZeroIndex = 0;
3903
3904                                    // The first pass through, digIndx may exceed the limit, but in that case
3905                                    // we no longer care about numTempBuf contents since they will be discarded
3906                                    if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3907                                        numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3908                                    }
3909                                    collateVal = 0;
3910                                } else {
3911                                    // Low-order digit case (ones place)
3912                                    collateVal = (uint8_t)digVal;
3913
3914                                    // Check for leading zeroes.
3915                                    if (collateVal == 0) {
3916                                        if (!leadingZeroIndex)
3917                                            leadingZeroIndex = (digIndx/2) + 2;
3918                                    } else
3919                                        leadingZeroIndex = 0;
3920
3921                                    // No need to write to buffer; the case of a last odd digit
3922                                    // is handled below.
3923                                }
3924                                ++digIndx;
3925                            } else
3926                                ++trailingZeroCount;
3927
3928                            if (!collIter_bos(source)) {
3929                                ch = getPrevNormalizedChar(source, status);
3930                                //goBackOne(source);
3931                                if (U16_IS_TRAIL(ch)) {
3932                                    backupState(source, &state);
3933                                    if (!collIter_bos(source)) {
3934                                        goBackOne(source);
3935                                        UChar lead = getPrevNormalizedChar(source, status);
3936
3937                                        if(U16_IS_LEAD(lead)) {
3938                                            char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3939                                        } else {
3940                                            loadState(source, &state, FALSE);
3941                                            char32 = ch;
3942                                        }
3943                                    }
3944                                } else
3945                                    char32 = ch;
3946
3947                                if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3948                                    if (char32 > 0xFFFF) {// For surrogates.
3949                                        loadState(source, &state, FALSE);
3950                                    }
3951                                    // Don't need to "reverse" the goBackOne call,
3952                                    // as this points to the next position to process..
3953                                    //if (char32 > 0xFFFF) // For surrogates.
3954                                    //getNextNormalizedChar(source);
3955                                    break;
3956                                }
3957
3958                                goBackOne(source);
3959                            }else
3960                                break;
3961                        }
3962
3963                        if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3964                            // our collation element is not too big, go ahead and finish with it
3965                            break;
3966                        }
3967                        // our digit string is too long for a collation element;
3968                        // set the limit for it, reset the state and begin again
3969                        ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3970                        if ( ceLimit == 0 ) {
3971                            ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3972                        }
3973                        ch = initial_ch;
3974                        loadState(source, &initialState, FALSE);
3975                        digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3976                        collateVal = 0;
3977                        nonZeroValReached = FALSE;
3978                    }
3979
3980                    if (! nonZeroValReached) {
3981                        digIndx = 2;
3982                        trailingZeroCount = 0;
3983                        numTempBuf[2] = 6;
3984                    }
3985
3986                    if ((digIndx + trailingZeroCount) % 2 != 0) {
3987                        numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3988                        digIndx += 1;       // The implicit leading zero
3989                    }
3990                    if (trailingZeroCount % 2 != 0) {
3991                        // We had to consume one trailing zero for the low digit
3992                        // of the least significant byte
3993                        digIndx += 1;       // The trailing zero not in the exponent
3994                        trailingZeroCount -= 1;
3995                    }
3996
3997                    endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3998
3999                    // Subtract one off of the last byte. Really the first byte here, but it's reversed...
4000                    numTempBuf[2] -= 1;
4001
4002                    /*
4003                    We want to skip over the first two slots in the buffer. The first slot
4004                    is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
4005                    sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
4006                    The exponent must be adjusted by the number of leading zeroes, and the number of
4007                    trailing zeroes.
4008                    */
4009                    numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
4010                    uint32_t exponent = (digIndx+trailingZeroCount)/2;
4011                    if (leadingZeroIndex)
4012                        exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
4013                    numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
4014
4015                    // Now transfer the collation key to our collIterate struct.
4016                    // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
4017                    //size = ((endIndex+1) & ~1)/2;
4018                    *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
4019                        (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
4020                        UCOL_BYTE_COMMON; // Tertiary weight.
4021                    i = endIndex - 1; // Reset the index into the buffer.
4022                    while(i >= 2) {
4023                        uint32_t primWeight = numTempBuf[i--] << 8;
4024                        if ( i >= 2)
4025                            primWeight |= numTempBuf[i--];
4026                        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
4027                    }
4028
4029                    source->toReturn = source->CEpos -1;
4030                    return *(source->toReturn);
4031                } else {
4032                    CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
4033                    CE = *(CEOffset++);
4034                    break;
4035                }
4036            }
4037
4038        case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
4039            {
4040                static const uint32_t
4041                    SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
4042                //const uint32_t LCount = 19;
4043                static const uint32_t VCount = 21;
4044                static const uint32_t TCount = 28;
4045                //const uint32_t NCount = VCount * TCount;   /* 588 */
4046                //const uint32_t SCount = LCount * NCount;   /* 11172 */
4047
4048                uint32_t L = ch - SBase;
4049                /*
4050                divide into pieces.
4051                we do it in this order since some compilers can do % and / in one
4052                operation
4053                */
4054                uint32_t T = L % TCount;
4055                L /= TCount;
4056                uint32_t V = L % VCount;
4057                L /= VCount;
4058
4059                /* offset them */
4060                L += LBase;
4061                V += VBase;
4062                T += TBase;
4063
4064				if (source->offsetBuffer == NULL) {
4065					source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
4066					source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
4067					source->offsetStore = source->offsetBuffer;
4068				}
4069
4070			  int32_t firstOffset = (int32_t)(source->pos - source->string);
4071
4072			  *(source->offsetStore++) = firstOffset;
4073
4074                /*
4075                 * return the first CE, but first put the rest into the expansion buffer
4076                 */
4077                if (!source->coll->image->jamoSpecial) {
4078                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
4079                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
4080					*(source->offsetStore++) = firstOffset + 1;
4081
4082					if (T != TBase) {
4083                        *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
4084					    *(source->offsetStore++) = firstOffset + 1;
4085					}
4086
4087                    source->toReturn = source->CEpos - 1;
4088
4089					source->offsetReturn = source->offsetStore - 1;
4090					if (source->offsetReturn == source->offsetBuffer) {
4091						source->offsetStore = source->offsetBuffer;
4092					}
4093
4094					return *(source->toReturn);
4095                } else {
4096                    // Since Hanguls pass the FCD check, it is
4097                    // guaranteed that we won't be in
4098                    // the normalization buffer if something like this happens
4099                    // Move Jamos into normalization buffer
4100                    /*
4101                    Move the Jamos into the
4102                    normalization buffer
4103                    */
4104                    UChar *tempbuffer = source->writableBuffer +
4105                        (source->writableBufSize - 1);
4106                    *(tempbuffer) = 0;
4107                    if (T != TBase) {
4108                        *(tempbuffer - 1) = (UChar)T;
4109                        *(tempbuffer - 2) = (UChar)V;
4110                        *(tempbuffer - 3) = (UChar)L;
4111                        *(tempbuffer - 4) = 0;
4112                    } else {
4113                        *(tempbuffer - 1) = (UChar)V;
4114                        *(tempbuffer - 2) = (UChar)L;
4115                        *(tempbuffer - 3) = 0;
4116                    }
4117
4118                    /*
4119                    Indicate where to continue in main input string after exhausting
4120                    the writableBuffer
4121                    */
4122                    if (source->pos  == source->string) {
4123                        source->fcdPosition = NULL;
4124                    } else {
4125                        source->fcdPosition       = source->pos-1;
4126                    }
4127
4128                    source->pos               = tempbuffer;
4129                    source->origFlags         = source->flags;
4130                    source->flags            |= UCOL_ITER_INNORMBUF;
4131                    source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4132
4133                    return(UCOL_IGNORABLE);
4134                }
4135            }
4136
4137        case IMPLICIT_TAG:        /* everything that is not defined otherwise */
4138            return getPrevImplicit(ch, source);
4139
4140            // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4141        case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4142            return getPrevImplicit(ch, source);
4143
4144        case SURROGATE_TAG:  /* This is a surrogate pair */
4145            /* essentialy an engaged lead surrogate. */
4146            /* if you have encountered it here, it means that a */
4147            /* broken sequence was encountered and this is an error */
4148            return 0;
4149
4150        case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
4151            return 0; /* broken surrogate sequence */
4152
4153        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4154            {
4155                UChar32 cp = 0;
4156                UChar  prevChar;
4157                UChar *prev;
4158                if (isAtStartPrevIterate(source)) {
4159                    /* we are at the start of the string, wrong place to be at */
4160                    return 0;
4161                }
4162                if (source->pos != source->writableBuffer) {
4163                    prev     = source->pos - 1;
4164                } else {
4165                    prev     = source->fcdPosition;
4166                }
4167                prevChar = *prev;
4168
4169                /* Handles Han and Supplementary characters here.*/
4170                if (U16_IS_LEAD(prevChar)) {
4171                    cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4172                    source->pos = prev;
4173                } else {
4174                    return 0; /* completely ignorable */
4175                }
4176
4177                return getPrevImplicit(cp, source);
4178            }
4179
4180            /* UCA is filled with these. Tailorings are NOT_FOUND */
4181            /* not yet implemented */
4182        case CHARSET_TAG:  /* this tag always returns */
4183            /* probably after 1.8 */
4184            return UCOL_NOT_FOUND;
4185
4186        default:           /* this tag always returns */
4187            *status = U_INTERNAL_PROGRAM_ERROR;
4188            CE=0;
4189            break;
4190        }
4191
4192        if (CE <= UCOL_NOT_FOUND) {
4193            break;
4194        }
4195    }
4196
4197    return CE;
4198}
4199
4200/* This should really be a macro        */
4201/* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
4202/* anyway */
4203static
4204uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
4205#ifdef UCOL_DEBUG
4206    fprintf(stderr, ".");
4207#endif
4208    uint8_t *newStart = NULL;
4209    uint32_t offset = *secondaries-secStart;
4210
4211    if(secStart==second) {
4212        newStart=(uint8_t*)uprv_malloc(newSize);
4213        if(newStart==NULL) {
4214            *status = U_MEMORY_ALLOCATION_ERROR;
4215            return NULL;
4216        }
4217        uprv_memcpy(newStart, secStart, *secondaries-secStart);
4218    } else {
4219        newStart=(uint8_t*)uprv_realloc(secStart, newSize);
4220        if(newStart==NULL) {
4221            *status = U_MEMORY_ALLOCATION_ERROR;
4222            /* Since we're reallocating, return original reference so we don't loose it. */
4223            return secStart;
4224        }
4225    }
4226    *secondaries=newStart+offset;
4227    *secSize=newSize;
4228    return newStart;
4229}
4230
4231
/*
 * Reverses, in place, the elements between two pointers (both inclusive).
 * This operation is needed when doing continuation secondaries in French.
 *
 * TYPE   element type the pointers refer to
 * start  modifiable pointer to the first element
 * end    modifiable pointer to the last element
 *
 * Both pointer arguments are evaluated several times and are advanced by the
 * macro itself: start moves forward and end moves backward until they meet
 * or cross, so pass scratch copies if the original positions are still
 * needed afterwards.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    while((start)<(end)) { \
        TYPE reversedTmp_ = *(start); \
        *(start)++ = *(end); \
        *(end)-- = reversedTmp_; \
    } \
}
4254
4255/****************************************************************************/
4256/* Following are the sortkey generation functions                           */
4257/*                                                                          */
4258/****************************************************************************/
4259
4260/**
4261 * Merge two sort keys.
4262 * This is useful, for example, to combine sort keys from first and last names
4263 * to sort such pairs.
4264 * Merged sort keys consider on each collation level the first part first entirely,
4265 * then the second one.
4266 * It is possible to merge multiple sort keys by consecutively merging
4267 * another one with the intermediate result.
4268 *
4269 * The length of the merge result is the sum of the lengths of the input sort keys
4270 * minus 1.
4271 *
4272 * @param src1 the first sort key
4273 * @param src1Length the length of the first sort key, including the zero byte at the end;
4274 *        can be -1 if the function is to find the length
4275 * @param src2 the second sort key
4276 * @param src2Length the length of the second sort key, including the zero byte at the end;
4277 *        can be -1 if the function is to find the length
4278 * @param dest the buffer where the merged sort key is written,
4279 *        can be NULL if destCapacity==0
4280 * @param destCapacity the number of bytes in the dest buffer
4281 * @return the length of the merged sort key, src1Length+src2Length-1;
4282 *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4283 *         in which cases the contents of dest is undefined
4284 *
4285 * @draft
4286 */
4287U_CAPI int32_t U_EXPORT2
4288ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4289                   const uint8_t *src2, int32_t src2Length,
4290                   uint8_t *dest, int32_t destCapacity) {
4291    int32_t destLength;
4292    uint8_t b;
4293
4294    /* check arguments */
4295    if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4296        src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4297        destCapacity<0 || (destCapacity>0 && dest==NULL)
4298    ) {
4299        /* error, attempt to write a zero byte and return 0 */
4300        if(dest!=NULL && destCapacity>0) {
4301            *dest=0;
4302        }
4303        return 0;
4304    }
4305
4306    /* check lengths and capacity */
4307    if(src1Length<0) {
4308        src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4309    }
4310    if(src2Length<0) {
4311        src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4312    }
4313
4314    destLength=src1Length+src2Length-1;
4315    if(destLength>destCapacity) {
4316        /* the merged sort key does not fit into the destination */
4317        return destLength;
4318    }
4319
4320    /* merge the sort keys with the same number of levels */
4321    while(*src1!=0 && *src2!=0) { /* while both have another level */
4322        /* copy level from src1 not including 00 or 01 */
4323        while((b=*src1)>=2) {
4324            ++src1;
4325            *dest++=b;
4326        }
4327
4328        /* add a 02 merge separator */
4329        *dest++=2;
4330
4331        /* copy level from src2 not including 00 or 01 */
4332        while((b=*src2)>=2) {
4333            ++src2;
4334            *dest++=b;
4335        }
4336
4337        /* if both sort keys have another level, then add a 01 level separator and continue */
4338        if(*src1==1 && *src2==1) {
4339            ++src1;
4340            ++src2;
4341            *dest++=1;
4342        }
4343    }
4344
4345    /*
4346     * here, at least one sort key is finished now, but the other one
4347     * might have some contents left from containing more levels;
4348     * that contents is just appended to the result
4349     */
4350    if(*src1!=0) {
4351        /* src1 is not finished, therefore *src2==0, and src1 is appended */
4352        src2=src1;
4353    }
4354    /* append src2, "the other, unfinished sort key" */
4355    uprv_strcpy((char *)dest, (const char *)src2);
4356
4357    /* trust that neither sort key contained illegally embedded zero bytes */
4358    return destLength;
4359}
4360
4361/* sortkey API */
4362U_CAPI int32_t U_EXPORT2
4363ucol_getSortKey(const    UCollator    *coll,
4364        const    UChar        *source,
4365        int32_t        sourceLength,
4366        uint8_t        *result,
4367        int32_t        resultLength)
4368{
4369    UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4370    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4371        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4372            ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4373    }
4374
4375    UErrorCode status = U_ZERO_ERROR;
4376    int32_t keySize   = 0;
4377
4378    if(source != NULL) {
4379        // source == NULL is actually an error situation, but we would need to
4380        // have an error code to return it. Until we introduce a new
4381        // API, it stays like this
4382
4383        /* this uses the function pointer that is set in updateinternalstate */
4384        /* currently, there are two funcs: */
4385        /*ucol_calcSortKey(...);*/
4386        /*ucol_calcSortKeySimpleTertiary(...);*/
4387
4388        keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4389        //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
4390            // That's not good. Something unusual happened.
4391            // We don't know how much we initialized before we failed.
4392            // NULL terminate for safety.
4393            // We have no way say that we have generated a partial sort key.
4394            //result[0] = 0;
4395            //keySize = 0;
4396        //}
4397    }
4398    UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4399    UTRACE_EXIT_STATUS(status);
4400    return keySize;
4401}
4402
4403/* this function is called by the C++ API for sortkey generation */
4404U_CFUNC int32_t
4405ucol_getSortKeyWithAllocation(const UCollator *coll,
4406                              const UChar *source, int32_t sourceLength,
4407                              uint8_t **pResult,
4408                              UErrorCode *pErrorCode) {
4409    *pResult = 0;
4410    return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4411}
4412
4413#define UCOL_FSEC_BUF_SIZE 256
4414
4415/* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
4416/* or if we run out of space while making a sortkey and want to return ASAP                                   */
4417int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4418    UErrorCode status = U_ZERO_ERROR;
4419    //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4420    uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4421    uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4422    uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4423    UBool  compareIdent = (strength == UCOL_IDENTICAL);
4424    UBool  doCase = (coll->caseLevel == UCOL_ON);
4425    UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4426    //UBool  qShifted = shifted  && (compareQuad == 0);
4427    UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4428    UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4429    uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4430    uint8_t *fSecs = fSecsBuff;
4431    uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4432    uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4433
4434    uint32_t variableTopValue = coll->variableTopValue;
4435    uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4436    if(doHiragana) {
4437        UCOL_COMMON_BOT4++;
4438        /* allocate one more space for hiragana */
4439    }
4440    uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4441
4442    uint32_t order = UCOL_NO_MORE_CES;
4443    uint8_t primary1 = 0;
4444    uint8_t primary2 = 0;
4445    uint8_t secondary = 0;
4446    uint8_t tertiary = 0;
4447    int32_t caseShift = 0;
4448    uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4449
4450    uint8_t caseSwitch = coll->caseSwitch;
4451    uint8_t tertiaryMask = coll->tertiaryMask;
4452    uint8_t tertiaryCommon = coll->tertiaryCommon;
4453
4454    UBool wasShifted = FALSE;
4455    UBool notIsContinuation = FALSE;
4456    uint8_t leadPrimary = 0;
4457
4458
4459    for(;;) {
4460        order = ucol_IGetNextCE(coll, s, &status);
4461        if(order == UCOL_NO_MORE_CES) {
4462            break;
4463        }
4464
4465        if(order == 0) {
4466            continue;
4467        }
4468
4469        notIsContinuation = !isContinuation(order);
4470
4471
4472        if(notIsContinuation) {
4473            tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4474        } else {
4475            tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4476        }
4477        secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4478        primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4479        primary1 = (uint8_t)(order >> 8);
4480
4481
4482        if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4483            || (!notIsContinuation && wasShifted))
4484            || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4485                /* and other ignorables should be removed if following a shifted code point */
4486                if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4487                    /* we should just completely ignore it */
4488                    continue;
4489                }
4490                if(compareQuad == 0) {
4491                    if(c4 > 0) {
4492                        currentSize += (c2/UCOL_BOT_COUNT4)+1;
4493                        c4 = 0;
4494                    }
4495                    currentSize++;
4496                    if(primary2 != 0) {
4497                        currentSize++;
4498                    }
4499                }
4500                wasShifted = TRUE;
4501        } else {
4502            wasShifted = FALSE;
4503            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4504            /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
4505            /* calculate sortkey size */
4506            if(primary1 != UCOL_IGNORABLE) {
4507                if(notIsContinuation) {
4508                    if(leadPrimary == primary1) {
4509                        currentSize++;
4510                    } else {
4511                        if(leadPrimary != 0) {
4512                            currentSize++;
4513                        }
4514                        if(primary2 == UCOL_IGNORABLE) {
4515                            /* one byter, not compressed */
4516                            currentSize++;
4517                            leadPrimary = 0;
4518                        }
4519                        else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4520                            //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4521                            //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4522                            (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary))
4523                        {
4524                            /* not compressible */
4525                            leadPrimary = 0;
4526                            currentSize+=2;
4527                        }
4528                        else { /* compress */
4529                            leadPrimary = primary1;
4530                            currentSize+=2;
4531                        }
4532                    }
4533                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4534                    currentSize++;
4535                    if(primary2 != UCOL_IGNORABLE) {
4536                        currentSize++;
4537                    }
4538                }
4539            }
4540
4541            if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4542                if(!isFrenchSec){
4543                    if (secondary == UCOL_COMMON2 && notIsContinuation) {
4544                        c2++;
4545                    } else {
4546                        if(c2 > 0) {
4547                            if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4548                                currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4549                            } else {
4550                                currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4551                            }
4552                            c2 = 0;
4553                        }
4554                        currentSize++;
4555                    }
4556                } else {
4557                    fSecs[fSecsLen++] = secondary;
4558                    if(fSecsLen == fSecsMaxLen) {
4559                        uint8_t *fSecsTemp;
4560                        if(fSecs == fSecsBuff) {
4561                            fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen);
4562                        } else {
4563                            fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4564                        }
4565                        if(fSecsTemp == NULL) {
4566                            status = U_MEMORY_ALLOCATION_ERROR;
4567                            return 0;
4568                        }
4569                        fSecs = fSecsTemp;
4570                        fSecsMaxLen *= 2;
4571                    }
4572                    if(notIsContinuation) {
4573                        if (frenchStartPtr != NULL) {
4574                            /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4575                            uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4576                            frenchStartPtr = NULL;
4577                        }
4578                    } else {
4579                        if (frenchStartPtr == NULL) {
4580                            frenchStartPtr = fSecs+fSecsLen-2;
4581                        }
4582                        frenchEndPtr = fSecs+fSecsLen-1;
4583                    }
4584                }
4585            }
4586
4587            if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4588                // do the case level if we need to do it. We don't want to calculate
4589                // case level for primary ignorables if we have only primary strength and case level
4590                // otherwise we would break well formedness of CEs
4591                if (caseShift  == 0) {
4592                    currentSize++;
4593                    caseShift = UCOL_CASE_SHIFT_START;
4594                }
4595                if((tertiary&0x3F) > 0 && notIsContinuation) {
4596                    caseShift--;
4597                    if((tertiary &0xC0) != 0) {
4598                        if (caseShift  == 0) {
4599                            currentSize++;
4600                            caseShift = UCOL_CASE_SHIFT_START;
4601                        }
4602                        caseShift--;
4603                    }
4604                }
4605            } else {
4606                if(notIsContinuation) {
4607                    tertiary ^= caseSwitch;
4608                }
4609            }
4610
4611            tertiary &= tertiaryMask;
4612            if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4613                if (tertiary == tertiaryCommon && notIsContinuation) {
4614                    c3++;
4615                } else {
4616                    if(c3 > 0) {
4617                        if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4618                            || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4619                                currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4620                        } else {
4621                            currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4622                        }
4623                        c3 = 0;
4624                    }
4625                    currentSize++;
4626                }
4627            }
4628
4629            if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4630                if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4631                    if(c4>0) { // Close this part
4632                        currentSize += (c4/UCOL_BOT_COUNT4)+1;
4633                        c4 = 0;
4634                    }
4635                    currentSize++; // Add the Hiragana
4636                } else { // This wasn't Hiragana, so we can continue adding stuff
4637                    c4++;
4638                }
4639            }
4640        }
4641    }
4642
4643    if(!isFrenchSec){
4644        if(c2 > 0) {
4645            currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4646        }
4647    } else {
4648        uint32_t i = 0;
4649        if(frenchStartPtr != NULL) {
4650            uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4651        }
4652        for(i = 0; i<fSecsLen; i++) {
4653            secondary = *(fSecs+fSecsLen-i-1);
4654            /* This is compression code. */
4655            if (secondary == UCOL_COMMON2) {
4656                ++c2;
4657            } else {
4658                if(c2 > 0) {
4659                    if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4660                        currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4661                    } else {
4662                        currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4663                    }
4664                    c2 = 0;
4665                }
4666                currentSize++;
4667            }
4668        }
4669        if(c2 > 0) {
4670            currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4671        }
4672        if(fSecs != fSecsBuff) {
4673            uprv_free(fSecs);
4674        }
4675    }
4676
4677    if(c3 > 0) {
4678        currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4679    }
4680
4681    if(c4 > 0  && compareQuad == 0) {
4682        currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4683    }
4684
4685    if(compareIdent) {
4686        currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4687    }
4688    return currentSize;
4689}
4690
4691static
4692inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4693    if (caseShift  == 0) {
4694        *(*cases)++ = UCOL_CASE_BYTE_START;
4695        caseShift = UCOL_CASE_SHIFT_START;
4696    }
4697}
4698
// Appends one byte to a bounded buffer and counts it.
// The counter is always bumped, so it reflects how many bytes were requested
// even when the buffer is already full; the byte itself is stored (and the
// write cursor advanced) only while the cursor is still below the limit.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        // no room left: count only
        return;
    }
    *primaries = value;
    ++primaries;
}
4708
4709// Packs the secondary buffer when processing French locale. Adds the terminator.
4710static
4711inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4712    uint8_t secondary;
4713    int32_t count2 = 0;
4714    uint32_t i = 0, size = 0;
4715    // we use i here since the key size already accounts for terminators, so we'll discard the increment
4716    addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4717    /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4718    if(frenchStartPtr != NULL) {
4719        uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4720    }
4721    for(i = 0; i<*secsize; i++) {
4722        secondary = *(secondaries-i-1);
4723        /* This is compression code. */
4724        if (secondary == UCOL_COMMON2) {
4725            ++count2;
4726        } else {
4727            if (count2 > 0) {
4728                if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4729                    while (count2 > UCOL_TOP_COUNT2) {
4730                        addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4731                        count2 -= (uint32_t)UCOL_TOP_COUNT2;
4732                    }
4733                    addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4734                } else {
4735                    while (count2 > UCOL_BOT_COUNT2) {
4736                        addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4737                        count2 -= (uint32_t)UCOL_BOT_COUNT2;
4738                    }
4739                    addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4740                }
4741                count2 = 0;
4742            }
4743            addWithIncrement(primaries, primEnd, size, secondary);
4744        }
4745    }
4746    if (count2 > 0) {
4747        while (count2 > UCOL_BOT_COUNT2) {
4748            addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4749            count2 -= (uint32_t)UCOL_BOT_COUNT2;
4750        }
4751        addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4752    }
4753    *secsize = size;
4754    return primaries;
4755}
4756
4757#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4758
4759/* This is the sortkey work horse function */
4760U_CFUNC int32_t U_CALLCONV
4761ucol_calcSortKey(const    UCollator    *coll,
4762        const    UChar        *source,
4763        int32_t        sourceLength,
4764        uint8_t        **result,
4765        uint32_t        resultLength,
4766        UBool allocateSKBuffer,
4767        UErrorCode *status)
4768{
4769    //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4770
4771    uint32_t i = 0; /* general purpose counter */
4772
4773    /* Stack allocated buffers for buffers we use */
4774    uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4775
4776    uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4777
4778    if(U_FAILURE(*status)) {
4779        return 0;
4780    }
4781
4782    if(primaries == NULL && allocateSKBuffer == TRUE) {
4783        primaries = *result = prim;
4784        resultLength = UCOL_PRIMARY_MAX_BUFFER;
4785    }
4786
4787    uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4788      caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4789
4790    uint32_t sortKeySize = 1; /* it is always \0 terminated */
4791
4792    UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4793    UChar *normSource = normBuffer;
4794    int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4795
4796    int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4797
4798    UColAttributeValue strength = coll->strength;
4799
4800    uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4801    uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4802    uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4803    UBool  compareIdent = (strength == UCOL_IDENTICAL);
4804    UBool  doCase = (coll->caseLevel == UCOL_ON);
4805    UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4806    UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4807    //UBool  qShifted = shifted && (compareQuad == 0);
4808    UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4809    /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4810
4811    uint32_t variableTopValue = coll->variableTopValue;
4812    // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4813    // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4814    uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4815    uint8_t UCOL_HIRAGANA_QUAD = 0;
4816    if(doHiragana) {
4817        UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4818        /* allocate one more space for hiragana, value for hiragana */
4819    }
4820    uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4821
4822    /* support for special features like caselevel and funky secondaries */
4823    uint8_t *frenchStartPtr = NULL;
4824    uint8_t *frenchEndPtr = NULL;
4825    uint32_t caseShift = 0;
4826
4827    sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4828
4829    /* If we need to normalize, we'll do it all at once at the beginning! */
4830    UNormalizationMode normMode;
4831    if(compareIdent) {
4832        normMode = UNORM_NFD;
4833    } else if(coll->normalizationMode != UCOL_OFF) {
4834        normMode = UNORM_FCD;
4835    } else {
4836        normMode = UNORM_NONE;
4837    }
4838
4839    if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4840        len = unorm_internalNormalize(normSource, normSourceLen,
4841                                      source, len,
4842                                      normMode, FALSE,
4843                                      status);
4844        if(*status == U_BUFFER_OVERFLOW_ERROR) {
4845            normSourceLen = len;
4846            normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4847            if(normSource == NULL) {
4848                *status = U_MEMORY_ALLOCATION_ERROR;
4849                return 0;
4850            }
4851            *status = U_ZERO_ERROR;
4852            len = unorm_internalNormalize(normSource, normSourceLen,
4853                                          source, len,
4854                                          normMode, FALSE,
4855                                          status);
4856        }
4857
4858        if(U_FAILURE(*status)) {
4859            return 0;
4860        }
4861        source = normSource;
4862    }
4863
4864    collIterate s;
4865    IInit_collIterate(coll, (UChar *)source, len, &s);
4866    if(source == normSource) {
4867        s.flags &= ~UCOL_ITER_NORM;
4868    }
4869
4870    if(resultLength == 0 || primaries == NULL) {
4871        int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4872        if(normSource != normBuffer) {
4873            uprv_free(normSource);
4874        }
4875        return keyLen;
4876    }
4877    uint8_t *primarySafeEnd = primaries + resultLength - 1;
4878    if(strength > UCOL_PRIMARY) {
4879        primarySafeEnd--;
4880    }
4881
4882    uint32_t minBufferSize = UCOL_MAX_BUFFER;
4883
4884    uint8_t *primStart = primaries;
4885    uint8_t *secStart = secondaries;
4886    uint8_t *terStart = tertiaries;
4887    uint8_t *caseStart = cases;
4888    uint8_t *quadStart = quads;
4889
4890    uint32_t order = 0;
4891
4892    uint8_t primary1 = 0;
4893    uint8_t primary2 = 0;
4894    uint8_t secondary = 0;
4895    uint8_t tertiary = 0;
4896    uint8_t caseSwitch = coll->caseSwitch;
4897    uint8_t tertiaryMask = coll->tertiaryMask;
4898    int8_t tertiaryAddition = coll->tertiaryAddition;
4899    uint8_t tertiaryTop = coll->tertiaryTop;
4900    uint8_t tertiaryBottom = coll->tertiaryBottom;
4901    uint8_t tertiaryCommon = coll->tertiaryCommon;
4902    uint8_t caseBits = 0;
4903
4904    UBool finished = FALSE;
4905    UBool wasShifted = FALSE;
4906    UBool notIsContinuation = FALSE;
4907
4908    uint32_t prevBuffSize = 0;
4909
4910    uint32_t count2 = 0, count3 = 0, count4 = 0;
4911    uint8_t leadPrimary = 0;
4912
4913    for(;;) {
4914        for(i=prevBuffSize; i<minBufferSize; ++i) {
4915
4916            order = ucol_IGetNextCE(coll, &s, status);
4917            if(order == UCOL_NO_MORE_CES) {
4918                finished = TRUE;
4919                break;
4920            }
4921
4922            if(order == 0) {
4923                continue;
4924            }
4925
4926            notIsContinuation = !isContinuation(order);
4927
4928            if(notIsContinuation) {
4929                tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4930            } else {
4931                tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4932            }
4933
4934            secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4935            primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4936            primary1 = (uint8_t)(order >> 8);
4937
4938            /*if(notIsContinuation && scriptOrder != NULL) {
4939            primary1 = scriptOrder[primary1];
4940            }*/
4941
4942            if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4943                || (!notIsContinuation && wasShifted))
4944                || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
4945            {
4946                /* and other ignorables should be removed if following a shifted code point */
4947                if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4948                    /* we should just completely ignore it */
4949                    continue;
4950                }
4951                if(compareQuad == 0) {
4952                    if(count4 > 0) {
4953                        while (count4 > UCOL_BOT_COUNT4) {
4954                            *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4955                            count4 -= UCOL_BOT_COUNT4;
4956                        }
4957                        *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4958                        count4 = 0;
4959                    }
4960                    /* We are dealing with a variable and we're treating them as shifted */
4961                    /* This is a shifted ignorable */
4962                    if(primary1 != 0) { /* we need to check this since we could be in continuation */
4963                        *quads++ = primary1;
4964                    }
4965                    if(primary2 != 0) {
4966                        *quads++ = primary2;
4967                    }
4968                }
4969                wasShifted = TRUE;
4970            } else {
4971                wasShifted = FALSE;
4972                /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4973                /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
4974                /* regular and simple sortkey calc */
4975                if(primary1 != UCOL_IGNORABLE) {
4976                    if(notIsContinuation) {
4977                        if(leadPrimary == primary1) {
4978                            *primaries++ = primary2;
4979                        } else {
4980                            if(leadPrimary != 0) {
4981                                *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4982                            }
4983                            if(primary2 == UCOL_IGNORABLE) {
4984                                /* one byter, not compressed */
4985                                *primaries++ = primary1;
4986                                leadPrimary = 0;
4987                            } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4988                                //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4989                                (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4990                                    /* not compressible */
4991                                    leadPrimary = 0;
4992                                    *primaries++ = primary1;
4993                                    if(primaries <= primarySafeEnd) {
4994                                        *primaries++ = primary2;
4995                                    }
4996                            } else { /* compress */
4997                                *primaries++ = leadPrimary = primary1;
4998                                if(primaries <= primarySafeEnd) {
4999                                    *primaries++ = primary2;
5000                                }
5001                            }
5002                        }
5003                    } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5004                        *primaries++ = primary1;
5005                        if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) {
5006                                *primaries++ = primary2; /* second part */
5007                        }
5008                    }
5009                }
5010
5011                if(secondary > compareSec) {
5012                    if(!isFrenchSec) {
5013                        /* This is compression code. */
5014                        if (secondary == UCOL_COMMON2 && notIsContinuation) {
5015                            ++count2;
5016                        } else {
5017                            if (count2 > 0) {
5018                                if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5019                                    while (count2 > UCOL_TOP_COUNT2) {
5020                                        *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5021                                        count2 -= (uint32_t)UCOL_TOP_COUNT2;
5022                                    }
5023                                    *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5024                                } else {
5025                                    while (count2 > UCOL_BOT_COUNT2) {
5026                                        *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5027                                        count2 -= (uint32_t)UCOL_BOT_COUNT2;
5028                                    }
5029                                    *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5030                                }
5031                                count2 = 0;
5032                            }
5033                            *secondaries++ = secondary;
5034                        }
5035                    } else {
5036                        *secondaries++ = secondary;
5037                        /* Do the special handling for French secondaries */
5038                        /* We need to get continuation elements and do intermediate restore */
5039                        /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
5040                        if(notIsContinuation) {
5041                            if (frenchStartPtr != NULL) {
5042                                /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
5043                                uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
5044                                frenchStartPtr = NULL;
5045                            }
5046                        } else {
5047                            if (frenchStartPtr == NULL) {
5048                                frenchStartPtr = secondaries - 2;
5049                            }
5050                            frenchEndPtr = secondaries-1;
5051                        }
5052                    }
5053                }
5054
5055                if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
5056                    // do the case level if we need to do it. We don't want to calculate
5057                    // case level for primary ignorables if we have only primary strength and case level
5058                    // otherwise we would break well formedness of CEs
5059                    doCaseShift(&cases, caseShift);
5060                    if(notIsContinuation) {
5061                        caseBits = (uint8_t)(tertiary & 0xC0);
5062
5063                        if(tertiary != 0) {
5064                            if(coll->caseFirst == UCOL_UPPER_FIRST) {
5065                                if((caseBits & 0xC0) == 0) {
5066                                    *(cases-1) |= 1 << (--caseShift);
5067                                } else {
5068                                    *(cases-1) |= 0 << (--caseShift);
5069                                    /* second bit */
5070                                    doCaseShift(&cases, caseShift);
5071                                    *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
5072                                }
5073                            } else {
5074                                if((caseBits & 0xC0) == 0) {
5075                                    *(cases-1) |= 0 << (--caseShift);
5076                                } else {
5077                                    *(cases-1) |= 1 << (--caseShift);
5078                                    /* second bit */
5079                                    doCaseShift(&cases, caseShift);
5080                                    *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
5081                                }
5082                            }
5083                        }
5084
5085                    }
5086                } else {
5087                    if(notIsContinuation) {
5088                        tertiary ^= caseSwitch;
5089                    }
5090                }
5091
5092                tertiary &= tertiaryMask;
5093                if(tertiary > compareTer) {
5094                    /* This is compression code. */
5095                    /* sequence size check is included in the if clause */
5096                    if (tertiary == tertiaryCommon && notIsContinuation) {
5097                        ++count3;
5098                    } else {
5099                        if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5100                            tertiary += tertiaryAddition;
5101                        } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5102                            tertiary -= tertiaryAddition;
5103                        }
5104                        if (count3 > 0) {
5105                            if ((tertiary > tertiaryCommon)) {
5106                                while (count3 > coll->tertiaryTopCount) {
5107                                    *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5108                                    count3 -= (uint32_t)coll->tertiaryTopCount;
5109                                }
5110                                *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5111                            } else {
5112                                while (count3 > coll->tertiaryBottomCount) {
5113                                    *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5114                                    count3 -= (uint32_t)coll->tertiaryBottomCount;
5115                                }
5116                                *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5117                            }
5118                            count3 = 0;
5119                        }
5120                        *tertiaries++ = tertiary;
5121                    }
5122                }
5123
5124                if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
5125                    if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5126                        if(count4>0) { // Close this part
5127                            while (count4 > UCOL_BOT_COUNT4) {
5128                                *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5129                                count4 -= UCOL_BOT_COUNT4;
5130                            }
5131                            *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5132                            count4 = 0;
5133                        }
5134                        *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
5135                    } else { // This wasn't Hiragana, so we can continue adding stuff
5136                        count4++;
5137                    }
5138                }
5139            }
5140
5141            if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5142                if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5143                    IInit_collIterate(coll, (UChar *)source, len, &s);
5144                    if(source == normSource) {
5145                        s.flags &= ~UCOL_ITER_NORM;
5146                    }
5147                    sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5148                    *status = U_BUFFER_OVERFLOW_ERROR;
5149                    finished = TRUE;
5150                    break;
5151                } else { /* It's much nicer if we can actually reallocate */
5152                    int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
5153                    primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5154                    if(U_SUCCESS(*status)) {
5155                        *result = primStart;
5156                        primarySafeEnd = primStart + resultLength - 1;
5157                        if(strength > UCOL_PRIMARY) {
5158                            primarySafeEnd--;
5159                        }
5160                    } else {
5161                        /* We ran out of memory!? We can't recover. */
5162                        sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5163                        finished = TRUE;
5164                        break;
5165                    }
5166                }
5167            }
5168        }
5169        if(finished) {
5170            break;
5171        } else {
5172            prevBuffSize = minBufferSize;
5173
5174            uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
5175            if (frenchStartPtr != NULL) {
5176                frenchStartOffset = frenchStartPtr - secStart;
5177                frenchEndOffset = frenchEndPtr - secStart;
5178            }
5179            secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5180            terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5181            caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
5182            quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
5183            if(U_FAILURE(*status)) {
5184                /* We ran out of memory!? We can't recover. */
5185                sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5186                break;
5187            }
5188            if (frenchStartPtr != NULL) {
5189                frenchStartPtr = secStart + frenchStartOffset;
5190                frenchEndPtr = secStart + frenchEndOffset;
5191            }
5192            minBufferSize *= 2;
5193        }
5194    }
5195
5196    /* Here, we are generally done with processing */
5197    /* bailing out would not be too productive */
5198
5199    if(U_SUCCESS(*status)) {
5200        sortKeySize += (primaries - primStart);
5201        /* we have done all the CE's, now let's put them together to form a key */
5202        if(compareSec == 0) {
5203            if (count2 > 0) {
5204                while (count2 > UCOL_BOT_COUNT2) {
5205                    *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5206                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
5207                }
5208                *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5209            }
5210            uint32_t secsize = secondaries-secStart;
5211            if(!isFrenchSec) { // Regular situation, we know the length of secondaries
5212                sortKeySize += secsize;
5213                if(sortKeySize <= resultLength) {
5214                    *(primaries++) = UCOL_LEVELTERMINATOR;
5215                    uprv_memcpy(primaries, secStart, secsize);
5216                    primaries += secsize;
5217                } else {
5218                    if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5219                        primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5220                        if(U_SUCCESS(*status)) {
5221                            *result = primStart;
5222                            *(primaries++) = UCOL_LEVELTERMINATOR;
5223                            uprv_memcpy(primaries, secStart, secsize);
5224                            primaries += secsize;
5225                        }
5226                        else {
5227                            /* We ran out of memory!? We can't recover. */
5228                            sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5229                            goto cleanup;
5230                        }
5231                    } else {
5232                        *status = U_BUFFER_OVERFLOW_ERROR;
5233                    }
5234                }
5235            } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5236                uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5237                sortKeySize += secsize;
5238                if(sortKeySize <= resultLength) { // if we managed to pack fine
5239                    primaries = newPrim; // update the primary pointer
5240                } else { // overflow, need to reallocate and redo
5241                    if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5242                        primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5243                        if(U_SUCCESS(*status)) {
5244                            primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5245                        }
5246                        else {
5247                            /* We ran out of memory!? We can't recover. */
5248                            sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5249                            goto cleanup;
5250                        }
5251                    } else {
5252                        *status = U_BUFFER_OVERFLOW_ERROR;
5253                    }
5254                }
5255            }
5256        }
5257
5258        if(doCase) {
5259            uint32_t casesize = cases - caseStart;
5260            sortKeySize += casesize;
5261            if(sortKeySize <= resultLength) {
5262                *(primaries++) = UCOL_LEVELTERMINATOR;
5263                uprv_memcpy(primaries, caseStart, casesize);
5264                primaries += casesize;
5265            } else {
5266                if(allocateSKBuffer == TRUE) {
5267                    primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5268                    if(U_SUCCESS(*status)) {
5269                        *result = primStart;
5270                        *(primaries++) = UCOL_LEVELTERMINATOR;
5271                        uprv_memcpy(primaries, caseStart, casesize);
5272                    }
5273                    else {
5274                        /* We ran out of memory!? We can't recover. */
5275                        sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5276                        goto cleanup;
5277                    }
5278                } else {
5279                    *status = U_BUFFER_OVERFLOW_ERROR;
5280                }
5281            }
5282        }
5283
5284        if(compareTer == 0) {
5285            if (count3 > 0) {
5286                if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
5287                    while (count3 >= coll->tertiaryTopCount) {
5288                        *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5289                        count3 -= (uint32_t)coll->tertiaryTopCount;
5290                    }
5291                    *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5292                } else {
5293                    while (count3 > coll->tertiaryBottomCount) {
5294                        *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5295                        count3 -= (uint32_t)coll->tertiaryBottomCount;
5296                    }
5297                    *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5298                }
5299            }
5300            uint32_t tersize = tertiaries - terStart;
5301            sortKeySize += tersize;
5302            if(sortKeySize <= resultLength) {
5303                *(primaries++) = UCOL_LEVELTERMINATOR;
5304                uprv_memcpy(primaries, terStart, tersize);
5305                primaries += tersize;
5306            } else {
5307                if(allocateSKBuffer == TRUE) {
5308                    primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5309                    if(U_SUCCESS(*status)) {
5310                        *result = primStart;
5311                        *(primaries++) = UCOL_LEVELTERMINATOR;
5312                        uprv_memcpy(primaries, terStart, tersize);
5313                    }
5314                    else {
5315                        /* We ran out of memory!? We can't recover. */
5316                        sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5317                        goto cleanup;
5318                    }
5319                } else {
5320                    *status = U_BUFFER_OVERFLOW_ERROR;
5321                }
5322            }
5323
5324            if(compareQuad == 0/*qShifted == TRUE*/) {
5325                if(count4 > 0) {
5326                    while (count4 > UCOL_BOT_COUNT4) {
5327                        *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5328                        count4 -= UCOL_BOT_COUNT4;
5329                    }
5330                    *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5331                }
5332                uint32_t quadsize = quads - quadStart;
5333                sortKeySize += quadsize;
5334                if(sortKeySize <= resultLength) {
5335                    *(primaries++) = UCOL_LEVELTERMINATOR;
5336                    uprv_memcpy(primaries, quadStart, quadsize);
5337                    primaries += quadsize;
5338                } else {
5339                    if(allocateSKBuffer == TRUE) {
5340                        primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5341                        if(U_SUCCESS(*status)) {
5342                            *result = primStart;
5343                            *(primaries++) = UCOL_LEVELTERMINATOR;
5344                            uprv_memcpy(primaries, quadStart, quadsize);
5345                        }
5346                        else {
5347                            /* We ran out of memory!? We can't recover. */
5348                            sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5349                            goto cleanup;
5350                        }
5351                    } else {
5352                        *status = U_BUFFER_OVERFLOW_ERROR;
5353                    }
5354                }
5355            }
5356
5357            if(compareIdent) {
5358                sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
5359                if(sortKeySize <= resultLength) {
5360                    *(primaries++) = UCOL_LEVELTERMINATOR;
5361                    primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
5362                } else {
5363                    if(allocateSKBuffer == TRUE) {
5364                        primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5365                        if(U_SUCCESS(*status)) {
5366                            *result = primStart;
5367                            *(primaries++) = UCOL_LEVELTERMINATOR;
5368                            u_writeIdenticalLevelRun(s.string, len, primaries);
5369                        }
5370                        else {
5371                            /* We ran out of memory!? We can't recover. */
5372                            sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5373                            goto cleanup;
5374                        }
5375                    } else {
5376                        *status = U_BUFFER_OVERFLOW_ERROR;
5377                    }
5378                }
5379            }
5380        }
5381        *(primaries++) = '\0';
5382    }
5383
5384    if(allocateSKBuffer == TRUE) {
5385        *result = (uint8_t*)uprv_malloc(sortKeySize);
5386        /* test for NULL */
5387        if (*result == NULL) {
5388            *status = U_MEMORY_ALLOCATION_ERROR;
5389            goto cleanup;
5390        }
5391        uprv_memcpy(*result, primStart, sortKeySize);
5392        if(primStart != prim) {
5393            uprv_free(primStart);
5394        }
5395    }
5396
5397cleanup:
5398    if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5399        /* NULL terminate for safety */
5400        **result = 0;
5401    }
5402    if(terStart != tert) {
5403        uprv_free(terStart);
5404        uprv_free(secStart);
5405        uprv_free(caseStart);
5406        uprv_free(quadStart);
5407    }
5408
5409    /* To avoid memory leak, free the offset buffer if necessary. */
5410    ucol_freeOffsetBuffer(&s);
5411
5412    if(normSource != normBuffer) {
5413        uprv_free(normSource);
5414    }
5415
5416    return sortKeySize;
5417}
5418
5419
5420U_CFUNC int32_t U_CALLCONV
5421ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
5422        const    UChar        *source,
5423        int32_t        sourceLength,
5424        uint8_t        **result,
5425        uint32_t        resultLength,
5426        UBool allocateSKBuffer,
5427        UErrorCode *status)
5428{
5429    U_ALIGN_CODE(16);
5430
5431    //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5432    uint32_t i = 0; /* general purpose counter */
5433
5434    /* Stack allocated buffers for buffers we use */
5435    uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5436
5437    uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5438
5439    if(U_FAILURE(*status)) {
5440        return 0;
5441    }
5442
5443    if(primaries == NULL && allocateSKBuffer == TRUE) {
5444        primaries = *result = prim;
5445        resultLength = UCOL_PRIMARY_MAX_BUFFER;
5446    }
5447
5448    uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5449
5450    uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5451
5452    UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5453    UChar *normSource = normBuffer;
5454    int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5455
5456    int32_t len =  sourceLength;
5457
5458    /* If we need to normalize, we'll do it all at once at the beginning! */
5459    if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5460        len = unorm_internalNormalize(normSource, normSourceLen,
5461                                      source, len,
5462                                      UNORM_FCD, FALSE,
5463                                      status);
5464        if(*status == U_BUFFER_OVERFLOW_ERROR) {
5465            normSourceLen = len;
5466            normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5467            if(normSource == NULL) {
5468                *status = U_MEMORY_ALLOCATION_ERROR;
5469                return 0;
5470            }
5471            *status = U_ZERO_ERROR;
5472            len = unorm_internalNormalize(normSource, normSourceLen,
5473                                          source, len,
5474                                          UNORM_FCD, FALSE,
5475                                          status);
5476            if(U_FAILURE(*status)) {
5477                /* Should never happen. */
5478                uprv_free(normSource);
5479                normSource = normBuffer;
5480            }
5481        }
5482
5483        if(U_FAILURE(*status)) {
5484            return 0;
5485        }
5486        source = normSource;
5487    }
5488
5489    collIterate s;
5490    IInit_collIterate(coll, (UChar *)source, len, &s);
5491    if(source == normSource) {
5492        s.flags &= ~UCOL_ITER_NORM;
5493    }
5494
5495    if(resultLength == 0 || primaries == NULL) {
5496        int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5497        if(normSource != normBuffer) {
5498            uprv_free(normSource);
5499        }
5500        return t;
5501    }
5502
5503    uint8_t *primarySafeEnd = primaries + resultLength - 2;
5504
5505    uint32_t minBufferSize = UCOL_MAX_BUFFER;
5506
5507    uint8_t *primStart = primaries;
5508    uint8_t *secStart = secondaries;
5509    uint8_t *terStart = tertiaries;
5510
5511    uint32_t order = 0;
5512
5513    uint8_t primary1 = 0;
5514    uint8_t primary2 = 0;
5515    uint8_t secondary = 0;
5516    uint8_t tertiary = 0;
5517    uint8_t caseSwitch = coll->caseSwitch;
5518    uint8_t tertiaryMask = coll->tertiaryMask;
5519    int8_t tertiaryAddition = coll->tertiaryAddition;
5520    uint8_t tertiaryTop = coll->tertiaryTop;
5521    uint8_t tertiaryBottom = coll->tertiaryBottom;
5522    uint8_t tertiaryCommon = coll->tertiaryCommon;
5523
5524    uint32_t prevBuffSize = 0;
5525
5526    UBool finished = FALSE;
5527    UBool notIsContinuation = FALSE;
5528
5529    uint32_t count2 = 0, count3 = 0;
5530    uint8_t leadPrimary = 0;
5531
5532    for(;;) {
5533        for(i=prevBuffSize; i<minBufferSize; ++i) {
5534
5535            order = ucol_IGetNextCE(coll, &s, status);
5536
5537            if(order == 0) {
5538                continue;
5539            }
5540
5541            if(order == UCOL_NO_MORE_CES) {
5542                finished = TRUE;
5543                break;
5544            }
5545
5546            notIsContinuation = !isContinuation(order);
5547
5548            if(notIsContinuation) {
5549                tertiary = (uint8_t)((order & tertiaryMask));
5550            } else {
5551                tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5552            }
5553            secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5554            primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5555            primary1 = (uint8_t)(order >> 8);
5556
5557            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5558            /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
5559            /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5560            /* regular and simple sortkey calc */
5561            if(primary1 != UCOL_IGNORABLE) {
5562                if(notIsContinuation) {
5563                    if(leadPrimary == primary1) {
5564                        *primaries++ = primary2;
5565                    } else {
5566                        if(leadPrimary != 0) {
5567                            *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5568                        }
5569                        if(primary2 == UCOL_IGNORABLE) {
5570                            /* one byter, not compressed */
5571                            *primaries++ = primary1;
5572                            leadPrimary = 0;
5573                        } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5574                            //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5575                            //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5576                            (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5577                                /* not compressible */
5578                                leadPrimary = 0;
5579                                *primaries++ = primary1;
5580                                *primaries++ = primary2;
5581                        } else { /* compress */
5582                            *primaries++ = leadPrimary = primary1;
5583                            *primaries++ = primary2;
5584                        }
5585                    }
5586                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5587                    *primaries++ = primary1;
5588                    if(primary2 != UCOL_IGNORABLE) {
5589                        *primaries++ = primary2; /* second part */
5590                    }
5591                }
5592            }
5593
5594            if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5595                /* This is compression code. */
5596                if (secondary == UCOL_COMMON2 && notIsContinuation) {
5597                    ++count2;
5598                } else {
5599                    if (count2 > 0) {
5600                        if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5601                            while (count2 > UCOL_TOP_COUNT2) {
5602                                *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5603                                count2 -= (uint32_t)UCOL_TOP_COUNT2;
5604                            }
5605                            *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5606                        } else {
5607                            while (count2 > UCOL_BOT_COUNT2) {
5608                                *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5609                                count2 -= (uint32_t)UCOL_BOT_COUNT2;
5610                            }
5611                            *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5612                        }
5613                        count2 = 0;
5614                    }
5615                    *secondaries++ = secondary;
5616                }
5617            }
5618
5619            if(notIsContinuation) {
5620                tertiary ^= caseSwitch;
5621            }
5622
5623            if(tertiary > 0) {
5624                /* This is compression code. */
5625                /* sequence size check is included in the if clause */
5626                if (tertiary == tertiaryCommon && notIsContinuation) {
5627                    ++count3;
5628                } else {
5629                    if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5630                        tertiary += tertiaryAddition;
5631                    } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5632                        tertiary -= tertiaryAddition;
5633                    }
5634                    if (count3 > 0) {
5635                        if ((tertiary > tertiaryCommon)) {
5636                            while (count3 > coll->tertiaryTopCount) {
5637                                *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5638                                count3 -= (uint32_t)coll->tertiaryTopCount;
5639                            }
5640                            *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5641                        } else {
5642                            while (count3 > coll->tertiaryBottomCount) {
5643                                *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5644                                count3 -= (uint32_t)coll->tertiaryBottomCount;
5645                            }
5646                            *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5647                        }
5648                        count3 = 0;
5649                    }
5650                    *tertiaries++ = tertiary;
5651                }
5652            }
5653
5654            if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5655                if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5656                    IInit_collIterate(coll, (UChar *)source, len, &s);
5657                    if(source == normSource) {
5658                        s.flags &= ~UCOL_ITER_NORM;
5659                    }
5660                    sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5661                    *status = U_BUFFER_OVERFLOW_ERROR;
5662                    finished = TRUE;
5663                    break;
5664                } else { /* It's much nicer if we can actually reallocate */
5665                    int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5666                    primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5667                    if(U_SUCCESS(*status)) {
5668                        *result = primStart;
5669                        primarySafeEnd = primStart + resultLength - 2;
5670                    } else {
5671                        /* We ran out of memory!? We can't recover. */
5672                        sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5673                        finished = TRUE;
5674                        break;
5675                    }
5676                }
5677            }
5678        }
5679        if(finished) {
5680            break;
5681        } else {
5682            prevBuffSize = minBufferSize;
5683            secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5684            terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5685            minBufferSize *= 2;
5686            if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5687                /* We ran out of memory!? We can't recover. */
5688                sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5689                break;
5690            }
5691        }
5692    }
5693
5694    if(U_SUCCESS(*status)) {
5695        sortKeySize += (primaries - primStart);
5696        /* we have done all the CE's, now let's put them together to form a key */
5697        if (count2 > 0) {
5698            while (count2 > UCOL_BOT_COUNT2) {
5699                *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5700                count2 -= (uint32_t)UCOL_BOT_COUNT2;
5701            }
5702            *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5703        }
5704        uint32_t secsize = secondaries-secStart;
5705        sortKeySize += secsize;
5706        if(sortKeySize <= resultLength) {
5707            *(primaries++) = UCOL_LEVELTERMINATOR;
5708            uprv_memcpy(primaries, secStart, secsize);
5709            primaries += secsize;
5710        } else {
5711            if(allocateSKBuffer == TRUE) {
5712                primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5713                if(U_SUCCESS(*status)) {
5714                    *(primaries++) = UCOL_LEVELTERMINATOR;
5715                    *result = primStart;
5716                    uprv_memcpy(primaries, secStart, secsize);
5717                }
5718                else {
5719                    /* We ran out of memory!? We can't recover. */
5720                    sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5721                    goto cleanup;
5722                }
5723            } else {
5724                *status = U_BUFFER_OVERFLOW_ERROR;
5725            }
5726        }
5727
5728        if (count3 > 0) {
5729            if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5730                while (count3 >= coll->tertiaryTopCount) {
5731                    *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5732                    count3 -= (uint32_t)coll->tertiaryTopCount;
5733                }
5734                *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5735            } else {
5736                while (count3 > coll->tertiaryBottomCount) {
5737                    *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5738                    count3 -= (uint32_t)coll->tertiaryBottomCount;
5739                }
5740                *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5741            }
5742        }
5743        uint32_t tersize = tertiaries - terStart;
5744        sortKeySize += tersize;
5745        if(sortKeySize <= resultLength) {
5746            *(primaries++) = UCOL_LEVELTERMINATOR;
5747            uprv_memcpy(primaries, terStart, tersize);
5748            primaries += tersize;
5749        } else {
5750            if(allocateSKBuffer == TRUE) {
5751                primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5752                if(U_SUCCESS(*status)) {
5753                    *result = primStart;
5754                    *(primaries++) = UCOL_LEVELTERMINATOR;
5755                    uprv_memcpy(primaries, terStart, tersize);
5756                }
5757                else {
5758                    /* We ran out of memory!? We can't recover. */
5759                    sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5760                    goto cleanup;
5761                }
5762            } else {
5763                *status = U_MEMORY_ALLOCATION_ERROR;
5764            }
5765        }
5766
5767        *(primaries++) = '\0';
5768    }
5769
5770    if(allocateSKBuffer == TRUE) {
5771        *result = (uint8_t*)uprv_malloc(sortKeySize);
5772        /* test for NULL */
5773        if (*result == NULL) {
5774            *status = U_MEMORY_ALLOCATION_ERROR;
5775            goto cleanup;
5776        }
5777        uprv_memcpy(*result, primStart, sortKeySize);
5778        if(primStart != prim) {
5779            uprv_free(primStart);
5780        }
5781    }
5782
5783cleanup:
5784    if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5785        /* NULL terminate for safety */
5786        **result = 0;
5787    }
5788    if(terStart != tert) {
5789        uprv_free(terStart);
5790        uprv_free(secStart);
5791    }
5792
5793    /* To avoid memory leak, free the offset buffer if necessary. */
5794    ucol_freeOffsetBuffer(&s);
5795
5796    if(normSource != normBuffer) {
5797        uprv_free(normSource);
5798    }
5799
5800    return sortKeySize;
5801}
5802
5803static inline
5804UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5805    UBool notIsContinuation = !isContinuation(CE);
5806    uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5807    if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5808        || (!notIsContinuation && *wasShifted))
5809        || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5810    {
5811        // The stuff below should probably be in the sortkey code... maybe not...
5812        if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5813            /* we should just completely ignore it */
5814            *wasShifted = TRUE;
5815            //continue;
5816        }
5817        //*wasShifted = TRUE;
5818        return TRUE;
5819    } else {
5820        *wasShifted = FALSE;
5821        return FALSE;
5822    }
5823}
5824static inline
5825void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5826    if(level < maxLevel) {
5827        dest[i++] = UCOL_LEVELTERMINATOR;
5828    } else {
5829        dest[i++] = 0;
5830    }
5831}
5832
/**
 * Level identifiers for partial sort key generation.
 * The values fit into three bits (0..7); UCOL_PSK_LIMIT is one past the last.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    UCOL_PSK_QUIN       = 5,  /** extra level, not used - but there are three bits to spend */
    UCOL_PSK_IDENTICAL  = 6,
    UCOL_PSK_NULL       = 7,  /** level for the end of the sort key; produces only zeros */
    UCOL_PSK_LIMIT            /** number of level identifiers (8) */
};
5845
/**
 * Layout of the collation processing state that is stored in state[1].
 * To read a field, shift the state right by its *_SHIFT value and then
 * AND the result with its *_MASK value.
 */
enum {
    /** Level identifier: one of the UCOL_PSK_* level values. Three bits. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x7,

    /**
     * Number of bytes of a primary or quaternary already written. Can only
     * be 0 or 1, since we get up to two bytes from primary or quaternary.
     * The same bit also denotes that the French secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x1,

    /** Was the last value shifted. Can be 0 or 1 (Boolean). */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x1,

    /**
     * How many French bytes have already been written. Two bits.
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x3,

    /** Bocsu bytes field. Two bits. */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK  = 0x3,

    /** Consumed CEs field. Nineteen bits, starting at bit 9. */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK  = 0x7FFFF
};
5871
// Macro calculating the number of expansion CEs available, i.e. the
// difference between the CEpos and toReturn members of its argument.
// The whole expansion is parenthesized so that the macro combines safely
// with neighbouring operators (for example !x, a - x or x * 2).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5874
5875
/** Main sortkey part procedure. On the first call,
 *  you should pass in a collator, an iterator, an empty state
 *  (state[0] == state[1] == 0), a buffer to hold the results,
 *  the number of bytes you need, and an error code pointer.
 *  Make sure your buffer is big enough to hold the requested
 *  number of sortkey bytes; the buffer size is not checked.
5882 *  The only meaningful status you can get back is
5883 *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5884 *  have been dealt a raw deal and that you probably won't
5885 *  be able to use partial sortkey generation for this
5886 *  particular combination of string and collator. This
5887 *  is highly unlikely, but you should still check the error code.
5888 *  Any other status means that you're not in a sane situation
5889 *  anymore. After the first call, preserve state values and
5890 *  use them on subsequent calls to obtain more bytes of a sortkey.
5891 *  Use until the number of bytes written is smaller than the requested
5892 *  number of bytes. Generated sortkey is not compatible with the
5893 *  one generated by ucol_getSortKey, as we don't do any compression.
5894 *  However, levels are still terminated by a 1 (one) and the sortkey
5895 *  is terminated by a 0 (zero). Identical level is the same as in the
5896 *  regular sortkey - internal bocu-1 implementation is used.
 *  For the curious, although you cannot do much about this, here is
 *  the structure of the state words.
5899 *  state[0] - iterator state. Depends on the iterator implementation,
5900 *             but allows the iterator to continue where it stopped in
5901 *             the last iteration.
5902 *  state[1] - collation processing state. Here is the distribution
5903 *             of the bits:
5904 *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5905 *             quaternary, quin (we don't use this one), identical and
5906 *             null (producing only zeroes - first one to terminate the
5907 *             sortkey and subsequent to fill the buffer).
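 *   3       - byte count. Set when only the first byte of a two-byte primary
 *             has been written; on the secondary level the same bit marks
 *             that French secondary processing is finished.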
5909 *   4       - was shifted. Whether the previous iteration finished in the
5910 *             shifted state.
5911 *   5, 6    - French continuation bytes written. See the comment in the enum
5912 *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
5913 *             the identical level.
 *   9..27   - CEs consumed (19 bits, see UCOL_PSK_CONSUMED_CES_MASK). Number of
 *             getCE or next32 operations performed since the last successful
 *             update of the iterator state.
5916 */
5917U_CAPI int32_t U_EXPORT2
5918ucol_nextSortKeyPart(const UCollator *coll,
5919                     UCharIterator *iter,
5920                     uint32_t state[2],
5921                     uint8_t *dest, int32_t count,
5922                     UErrorCode *status)
5923{
5924    /* error checking */
5925    if(status==NULL || U_FAILURE(*status)) {
5926        return 0;
5927    }
5928    UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5929    if( coll==NULL || iter==NULL ||
5930        state==NULL ||
5931        count<0 || (count>0 && dest==NULL)
5932    ) {
5933        *status=U_ILLEGAL_ARGUMENT_ERROR;
5934        UTRACE_EXIT_STATUS(status);
5935        return 0;
5936    }
5937
5938    UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5939                  coll, iter, state[0], state[1], dest, count);
5940
5941    if(count==0) {
5942        /* nothing to do */
5943        UTRACE_EXIT_VALUE(0);
5944        return 0;
5945    }
5946    /** Setting up situation according to the state we got from the previous iteration */
5947    // The state of the iterator from the previous invocation
5948    uint32_t iterState = state[0];
5949    // Has the last iteration ended in the shifted state
5950    UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5951    // What is the current level of the sortkey?
5952    int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5953    // Have we written only one byte from a two byte primary in the previous iteration?
5954    // Also on secondary level - have we finished with the French secondary?
5955    int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5956    // number of bytes in the continuation buffer for French
5957    int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5958    // Number of bytes already written from a bocsu sequence. Since
5959    // the longes bocsu sequence is 4 long, this can be up to 3.
5960    int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5961    // Number of elements that need to be consumed in this iteration because
5962    // the iterator returned UITER_NO_STATE at the end of the last iteration,
5963    // so we had to save the last valid state.
5964    int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5965
5966    /** values that depend on the collator attributes */
5967    // strength of the collator.
5968    int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5969    // maximal level of the partial sortkey. Need to take whether case level is done
5970    int32_t maxLevel = 0;
5971    if(strength < UCOL_TERTIARY) {
5972        if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5973            maxLevel = UCOL_PSK_CASE;
5974        } else {
5975            maxLevel = strength;
5976        }
5977    } else {
5978        if(strength == UCOL_TERTIARY) {
5979            maxLevel = UCOL_PSK_TERTIARY;
5980        } else if(strength == UCOL_QUATERNARY) {
5981            maxLevel = UCOL_PSK_QUATERNARY;
5982        } else { // identical
5983            maxLevel = UCOL_IDENTICAL;
5984        }
5985    }
5986    // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5987    uint8_t UCOL_HIRAGANA_QUAD =
5988      (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5989    // Boundary value that decides whether a CE is shifted or not
5990    uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5991    // Are we doing French collation?
5992    UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5993
5994    /** initializing the collation state */
5995    UBool notIsContinuation = FALSE;
5996    uint32_t CE = UCOL_NO_MORE_CES;
5997
5998    collIterate s;
5999    IInit_collIterate(coll, NULL, -1, &s);
6000    s.iterator = iter;
6001    s.flags |= UCOL_USE_ITERATOR;
6002    // This variable tells us whether we have produced some other levels in this iteration
6003    // before we moved to the identical level. In that case, we need to switch the
6004    // type of the iterator.
6005    UBool doingIdenticalFromStart = FALSE;
6006    // Normalizing iterator
6007    // The division for the array length may truncate the array size to
6008    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6009    // for all platforms anyway.
6010    UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6011    UNormIterator *normIter = NULL;
6012    // If the normalization is turned on for the collator and we are below identical level
6013    // we will use a FCD normalizing iterator
6014    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
6015        normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6016        s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
6017        s.flags &= ~UCOL_ITER_NORM;
6018        if(U_FAILURE(*status)) {
6019            UTRACE_EXIT_STATUS(*status);
6020            return 0;
6021        }
6022    } else if(level == UCOL_PSK_IDENTICAL) {
6023        // for identical level, we need a NFD iterator. We need to instantiate it here, since we
6024        // will be updating the state - and this cannot be done on an ordinary iterator.
6025        normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6026        s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6027        s.flags &= ~UCOL_ITER_NORM;
6028        if(U_FAILURE(*status)) {
6029            UTRACE_EXIT_STATUS(*status);
6030            return 0;
6031        }
6032        doingIdenticalFromStart = TRUE;
6033    }
6034
6035    // This is the tentative new state of the iterator. The problem
6036    // is that the iterator might return an undefined state, in
6037    // which case we should save the last valid state and increase
6038    // the iterator skip value.
6039    uint32_t newState = 0;
6040
6041    // First, we set the iterator to the last valid position
6042    // from the last iteration. This was saved in state[0].
6043    if(iterState == 0) {
6044        /* initial state */
6045        if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
6046            s.iterator->move(s.iterator, 0, UITER_LIMIT);
6047        } else {
6048            s.iterator->move(s.iterator, 0, UITER_START);
6049        }
6050    } else {
6051        /* reset to previous state */
6052        s.iterator->setState(s.iterator, iterState, status);
6053        if(U_FAILURE(*status)) {
6054            UTRACE_EXIT_STATUS(*status);
6055            return 0;
6056        }
6057    }
6058
6059
6060
6061    // This variable tells us whether we can attempt to update the state
6062    // of iterator. Situations where we don't want to update iterator state
6063    // are the existence of expansion CEs that are not yet processed, and
6064    // finishing the case level without enough space in the buffer to insert
6065    // a level terminator.
6066    UBool canUpdateState = TRUE;
6067
6068    // Consume all the CEs that were consumed at the end of the previous
6069    // iteration without updating the iterator state. On identical level,
6070    // consume the code points.
6071    int32_t counter = cces;
6072    if(level < UCOL_PSK_IDENTICAL) {
6073        while(counter-->0) {
6074            // If we're doing French and we are on the secondary level,
6075            // we go backwards.
6076            if(level == UCOL_PSK_SECONDARY && doingFrench) {
6077                CE = ucol_IGetPrevCE(coll, &s, status);
6078            } else {
6079                CE = ucol_IGetNextCE(coll, &s, status);
6080            }
6081            if(CE==UCOL_NO_MORE_CES) {
6082                /* should not happen */
6083                *status=U_INTERNAL_PROGRAM_ERROR;
6084                UTRACE_EXIT_STATUS(*status);
6085                return 0;
6086            }
6087            if(uprv_numAvailableExpCEs(s)) {
6088                canUpdateState = FALSE;
6089            }
6090        }
6091    } else {
6092        while(counter-->0) {
6093            uiter_next32(s.iterator);
6094        }
6095    }
6096
6097    // French secondary needs to know whether the iterator state of zero came from previous level OR
6098    // from a new invocation...
6099    UBool wasDoingPrimary = FALSE;
6100    // destination buffer byte counter. When this guy
6101    // gets to count, we're done with the iteration
6102    int32_t i = 0;
6103    // used to count the zero bytes written after we
6104    // have finished with the sort key
6105    int32_t j = 0;
6106
6107
6108    // Hm.... I think we're ready to plunge in. Basic story is as following:
6109    // we have a fall through case based on level. This is used for initial
6110    // positioning on iteration start. Every level processor contains a
6111    // for(;;) which will be broken when we exhaust all the CEs. Other
6112    // way to exit is a goto saveState, which happens when we have filled
6113    // out our buffer.
6114    switch(level) {
6115    case UCOL_PSK_PRIMARY:
6116        wasDoingPrimary = TRUE;
6117        for(;;) {
6118            if(i==count) {
6119                goto saveState;
6120            }
6121            // We should save the state only if we
6122            // are sure that we are done with the
6123            // previous iterator state
6124            if(canUpdateState && byteCountOrFrenchDone == 0) {
6125                newState = s.iterator->getState(s.iterator);
6126                if(newState != UITER_NO_STATE) {
6127                    iterState = newState;
6128                    cces = 0;
6129                }
6130            }
6131            CE = ucol_IGetNextCE(coll, &s, status);
6132            cces++;
6133            if(CE==UCOL_NO_MORE_CES) {
6134                // Add the level separator
6135                terminatePSKLevel(level, maxLevel, i, dest);
6136                byteCountOrFrenchDone=0;
6137                // Restart the iteration an move to the
6138                // second level
6139                s.iterator->move(s.iterator, 0, UITER_START);
6140                cces = 0;
6141                level = UCOL_PSK_SECONDARY;
6142                break;
6143            }
6144            if(!isShiftedCE(CE, LVT, &wasShifted)) {
6145                CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
6146                if(CE != 0) {
6147                    if(byteCountOrFrenchDone == 0) {
6148                        // get the second byte of primary
6149                        dest[i++]=(uint8_t)(CE >> 8);
6150                    } else {
6151                        byteCountOrFrenchDone = 0;
6152                    }
6153                    if((CE &=0xff)!=0) {
6154                        if(i==count) {
6155                            /* overflow */
6156                            byteCountOrFrenchDone = 1;
6157                            cces--;
6158                            goto saveState;
6159                        }
6160                        dest[i++]=(uint8_t)CE;
6161                    }
6162                }
6163            }
6164            if(uprv_numAvailableExpCEs(s)) {
6165                canUpdateState = FALSE;
6166            } else {
6167                canUpdateState = TRUE;
6168            }
6169        }
6170        /* fall through to next level */
6171    case UCOL_PSK_SECONDARY:
6172        if(strength >= UCOL_SECONDARY) {
6173            if(!doingFrench) {
6174                for(;;) {
6175                    if(i == count) {
6176                        goto saveState;
6177                    }
6178                    // We should save the state only if we
6179                    // are sure that we are done with the
6180                    // previous iterator state
6181                    if(canUpdateState) {
6182                        newState = s.iterator->getState(s.iterator);
6183                        if(newState != UITER_NO_STATE) {
6184                            iterState = newState;
6185                            cces = 0;
6186                        }
6187                    }
6188                    CE = ucol_IGetNextCE(coll, &s, status);
6189                    cces++;
6190                    if(CE==UCOL_NO_MORE_CES) {
6191                        // Add the level separator
6192                        terminatePSKLevel(level, maxLevel, i, dest);
6193                        byteCountOrFrenchDone = 0;
6194                        // Restart the iteration an move to the
6195                        // second level
6196                        s.iterator->move(s.iterator, 0, UITER_START);
6197                        cces = 0;
6198                        level = UCOL_PSK_CASE;
6199                        break;
6200                    }
6201                    if(!isShiftedCE(CE, LVT, &wasShifted)) {
6202                        CE >>= 8; /* get secondary */
6203                        if(CE != 0) {
6204                            dest[i++]=(uint8_t)CE;
6205                        }
6206                    }
6207                    if(uprv_numAvailableExpCEs(s)) {
6208                        canUpdateState = FALSE;
6209                    } else {
6210                        canUpdateState = TRUE;
6211                    }
6212                }
6213            } else { // French secondary processing
6214                uint8_t frenchBuff[UCOL_MAX_BUFFER];
6215                int32_t frenchIndex = 0;
6216                // Here we are going backwards.
6217                // If the iterator is at the beggining, it should be
6218                // moved to end.
6219                if(wasDoingPrimary) {
6220                    s.iterator->move(s.iterator, 0, UITER_LIMIT);
6221                    cces = 0;
6222                }
6223                for(;;) {
6224                    if(i == count) {
6225                        goto saveState;
6226                    }
6227                    if(canUpdateState) {
6228                        newState = s.iterator->getState(s.iterator);
6229                        if(newState != UITER_NO_STATE) {
6230                            iterState = newState;
6231                            cces = 0;
6232                        }
6233                    }
6234                    CE = ucol_IGetPrevCE(coll, &s, status);
6235                    cces++;
6236                    if(CE==UCOL_NO_MORE_CES) {
6237                        // Add the level separator
6238                        terminatePSKLevel(level, maxLevel, i, dest);
6239                        byteCountOrFrenchDone = 0;
6240                        // Restart the iteration an move to the next level
6241                        s.iterator->move(s.iterator, 0, UITER_START);
6242                        level = UCOL_PSK_CASE;
6243                        break;
6244                    }
6245                    if(isContinuation(CE)) { // if it's a continuation, we want to save it and
6246                        // reverse when we get a first non-continuation CE.
6247                        CE >>= 8;
6248                        frenchBuff[frenchIndex++] = (uint8_t)CE;
6249                    } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
6250                        CE >>= 8; /* get secondary */
6251                        if(!frenchIndex) {
6252                            if(CE != 0) {
6253                                dest[i++]=(uint8_t)CE;
6254                            }
6255                        } else {
6256                            frenchBuff[frenchIndex++] = (uint8_t)CE;
6257                            frenchIndex -= usedFrench;
6258                            usedFrench = 0;
6259                            while(i < count && frenchIndex) {
6260                                dest[i++] = frenchBuff[--frenchIndex];
6261                                usedFrench++;
6262                            }
6263                        }
6264                    }
6265                    if(uprv_numAvailableExpCEs(s)) {
6266                        canUpdateState = FALSE;
6267                    } else {
6268                        canUpdateState = TRUE;
6269                    }
6270                }
6271            }
6272        } else {
6273            level = UCOL_PSK_CASE;
6274        }
6275        /* fall through to next level */
6276    case UCOL_PSK_CASE:
6277        if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6278            uint32_t caseShift = UCOL_CASE_SHIFT_START;
6279            uint8_t caseByte = UCOL_CASE_BYTE_START;
6280            uint8_t caseBits = 0;
6281
6282            for(;;) {
6283                if(i == count) {
6284                    goto saveState;
6285                }
6286                // We should save the state only if we
6287                // are sure that we are done with the
6288                // previous iterator state
6289                if(canUpdateState) {
6290                    newState = s.iterator->getState(s.iterator);
6291                    if(newState != UITER_NO_STATE) {
6292                        iterState = newState;
6293                        cces = 0;
6294                    }
6295                }
6296                CE = ucol_IGetNextCE(coll, &s, status);
6297                cces++;
6298                if(CE==UCOL_NO_MORE_CES) {
6299                    // On the case level we might have an unfinished
6300                    // case byte. Add one if it's started.
6301                    if(caseShift != UCOL_CASE_SHIFT_START) {
6302                        dest[i++] = caseByte;
6303                    }
6304                    cces = 0;
6305                    // We have finished processing CEs on this level.
6306                    // However, we don't know if we have enough space
6307                    // to add a case level terminator.
6308                    if(i < count) {
6309                        // Add the level separator
6310                        terminatePSKLevel(level, maxLevel, i, dest);
6311                        // Restart the iteration and move to the
6312                        // next level
6313                        s.iterator->move(s.iterator, 0, UITER_START);
6314                        level = UCOL_PSK_TERTIARY;
6315                    } else {
6316                        canUpdateState = FALSE;
6317                    }
6318                    break;
6319                }
6320
6321                if(!isShiftedCE(CE, LVT, &wasShifted)) {
6322                    if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
6323                        // do the case level if we need to do it. We don't want to calculate
6324                        // case level for primary ignorables if we have only primary strength and case level
6325                        // otherwise we would break well formedness of CEs
6326                        CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6327                        caseBits = (uint8_t)(CE & 0xC0);
6328                        // this copies the case level logic from the
6329                        // sort key generation code
6330                        if(CE != 0) {
6331                            if(coll->caseFirst == UCOL_UPPER_FIRST) {
6332                                if((caseBits & 0xC0) == 0) {
6333                                    caseByte |= 1 << (--caseShift);
6334                                } else {
6335                                    caseByte |= 0 << (--caseShift);
6336                                    /* second bit */
6337                                    if(caseShift == 0) {
6338                                        dest[i++] = caseByte;
6339                                        caseShift = UCOL_CASE_SHIFT_START;
6340                                        caseByte = UCOL_CASE_BYTE_START;
6341                                    }
6342                                    caseByte |= ((caseBits>>6)&1) << (--caseShift);
6343                                }
6344                            } else {
6345                                if((caseBits & 0xC0) == 0) {
6346                                    caseByte |= 0 << (--caseShift);
6347                                } else {
6348                                    caseByte |= 1 << (--caseShift);
6349                                    /* second bit */
6350                                    if(caseShift == 0) {
6351                                        dest[i++] = caseByte;
6352                                        caseShift = UCOL_CASE_SHIFT_START;
6353                                        caseByte = UCOL_CASE_BYTE_START;
6354                                    }
6355                                    caseByte |= ((caseBits>>7)&1) << (--caseShift);
6356                                }
6357                            }
6358                        }
6359
6360                    }
6361                }
6362                // Not sure this is correct for the case level - revisit
6363                if(uprv_numAvailableExpCEs(s)) {
6364                    canUpdateState = FALSE;
6365                } else {
6366                    canUpdateState = TRUE;
6367                }
6368            }
6369        } else {
6370            level = UCOL_PSK_TERTIARY;
6371        }
6372        /* fall through to next level */
6373    case UCOL_PSK_TERTIARY:
6374        if(strength >= UCOL_TERTIARY) {
6375            for(;;) {
6376                if(i == count) {
6377                    goto saveState;
6378                }
6379                // We should save the state only if we
6380                // are sure that we are done with the
6381                // previous iterator state
6382                if(canUpdateState) {
6383                    newState = s.iterator->getState(s.iterator);
6384                    if(newState != UITER_NO_STATE) {
6385                        iterState = newState;
6386                        cces = 0;
6387                    }
6388                }
6389                CE = ucol_IGetNextCE(coll, &s, status);
6390                cces++;
6391                if(CE==UCOL_NO_MORE_CES) {
6392                    // Add the level separator
6393                    terminatePSKLevel(level, maxLevel, i, dest);
6394                    byteCountOrFrenchDone = 0;
6395                    // Restart the iteration an move to the
6396                    // second level
6397                    s.iterator->move(s.iterator, 0, UITER_START);
6398                    cces = 0;
6399                    level = UCOL_PSK_QUATERNARY;
6400                    break;
6401                }
6402                if(!isShiftedCE(CE, LVT, &wasShifted)) {
6403                    notIsContinuation = !isContinuation(CE);
6404
6405                    if(notIsContinuation) {
6406                        CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6407                        CE ^= coll->caseSwitch;
6408                        CE &= coll->tertiaryMask;
6409                    } else {
6410                        CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6411                    }
6412
6413                    if(CE != 0) {
6414                        dest[i++]=(uint8_t)CE;
6415                    }
6416                }
6417                if(uprv_numAvailableExpCEs(s)) {
6418                    canUpdateState = FALSE;
6419                } else {
6420                    canUpdateState = TRUE;
6421                }
6422            }
6423        } else {
6424            // if we're not doing tertiary
6425            // skip to the end
6426            level = UCOL_PSK_NULL;
6427        }
6428        /* fall through to next level */
6429    case UCOL_PSK_QUATERNARY:
6430        if(strength >= UCOL_QUATERNARY) {
6431            for(;;) {
6432                if(i == count) {
6433                    goto saveState;
6434                }
6435                // We should save the state only if we
6436                // are sure that we are done with the
6437                // previous iterator state
6438                if(canUpdateState) {
6439                    newState = s.iterator->getState(s.iterator);
6440                    if(newState != UITER_NO_STATE) {
6441                        iterState = newState;
6442                        cces = 0;
6443                    }
6444                }
6445                CE = ucol_IGetNextCE(coll, &s, status);
6446                cces++;
6447                if(CE==UCOL_NO_MORE_CES) {
6448                    // Add the level separator
6449                    terminatePSKLevel(level, maxLevel, i, dest);
6450                    //dest[i++] = UCOL_LEVELTERMINATOR;
6451                    byteCountOrFrenchDone = 0;
6452                    // Restart the iteration an move to the
6453                    // second level
6454                    s.iterator->move(s.iterator, 0, UITER_START);
6455                    cces = 0;
6456                    level = UCOL_PSK_QUIN;
6457                    break;
6458                }
6459                if(CE==0)
6460                    continue;
6461                if(isShiftedCE(CE, LVT, &wasShifted)) {
6462                    CE >>= 16; /* get primary */
6463                    if(CE != 0) {
6464                        if(byteCountOrFrenchDone == 0) {
6465                            dest[i++]=(uint8_t)(CE >> 8);
6466                        } else {
6467                            byteCountOrFrenchDone = 0;
6468                        }
6469                        if((CE &=0xff)!=0) {
6470                            if(i==count) {
6471                                /* overflow */
6472                                byteCountOrFrenchDone = 1;
6473                                goto saveState;
6474                            }
6475                            dest[i++]=(uint8_t)CE;
6476                        }
6477                    }
6478                } else {
6479                    notIsContinuation = !isContinuation(CE);
6480                    if(notIsContinuation) {
6481                        if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6482                            dest[i++] = UCOL_HIRAGANA_QUAD;
6483                        } else {
6484                            dest[i++] = 0xFF;
6485                        }
6486                    }
6487                }
6488                if(uprv_numAvailableExpCEs(s)) {
6489                    canUpdateState = FALSE;
6490                } else {
6491                    canUpdateState = TRUE;
6492                }
6493            }
6494        } else {
6495            // if we're not doing quaternary
6496            // skip to the end
6497            level = UCOL_PSK_NULL;
6498        }
6499        /* fall through to next level */
6500    case UCOL_PSK_QUIN:
6501        level = UCOL_PSK_IDENTICAL;
6502        /* fall through to next level */
6503    case UCOL_PSK_IDENTICAL:
6504        if(strength >= UCOL_IDENTICAL) {
6505            UChar32 first, second;
6506            int32_t bocsuBytesWritten = 0;
6507            // We always need to do identical on
6508            // the NFD form of the string.
6509            if(normIter == NULL) {
6510                // we arrived from the level below and
6511                // normalization was not turned on.
6512                // therefore, we need to make a fresh NFD iterator
6513                normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6514                s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6515            } else if(!doingIdenticalFromStart) {
6516                // there is an iterator, but we did some other levels.
6517                // therefore, we have a FCD iterator - need to make
6518                // a NFD one.
6519                // normIter being at the beginning does not guarantee
6520                // that the underlying iterator is at the beginning
6521                iter->move(iter, 0, UITER_START);
6522                s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6523            }
6524            // At this point we have a NFD iterator that is positioned
6525            // in the right place
6526            if(U_FAILURE(*status)) {
6527                UTRACE_EXIT_STATUS(*status);
6528                return 0;
6529            }
6530            first = uiter_previous32(s.iterator);
6531            // maybe we're at the start of the string
6532            if(first == U_SENTINEL) {
6533                first = 0;
6534            } else {
6535                uiter_next32(s.iterator);
6536            }
6537
6538            j = 0;
6539            for(;;) {
6540                if(i == count) {
6541                    if(j+1 < bocsuBytesWritten) {
6542                        bocsuBytesUsed = j+1;
6543                    }
6544                    goto saveState;
6545                }
6546
6547                // On identical level, we will always save
6548                // the state if we reach this point, since
6549                // we don't depend on getNextCE for content
6550                // all the content is in our buffer and we
6551                // already either stored the full buffer OR
6552                // otherwise we won't arrive here.
6553                newState = s.iterator->getState(s.iterator);
6554                if(newState != UITER_NO_STATE) {
6555                    iterState = newState;
6556                    cces = 0;
6557                }
6558
6559                uint8_t buff[4];
6560                second = uiter_next32(s.iterator);
6561                cces++;
6562
6563                // end condition for identical level
6564                if(second == U_SENTINEL) {
6565                    terminatePSKLevel(level, maxLevel, i, dest);
6566                    level = UCOL_PSK_NULL;
6567                    break;
6568                }
6569                bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6570                first = second;
6571
6572                j = 0;
6573                if(bocsuBytesUsed != 0) {
6574                    while(bocsuBytesUsed-->0) {
6575                        j++;
6576                    }
6577                }
6578
6579                while(i < count && j < bocsuBytesWritten) {
6580                    dest[i++] = buff[j++];
6581                }
6582            }
6583
6584        } else {
6585            level = UCOL_PSK_NULL;
6586        }
6587        /* fall through to next level */
6588    case UCOL_PSK_NULL:
6589        j = i;
6590        while(j<count) {
6591            dest[j++]=0;
6592        }
6593        break;
6594    default:
6595        *status = U_INTERNAL_PROGRAM_ERROR;
6596        UTRACE_EXIT_STATUS(*status);
6597        return 0;
6598    }
6599
6600saveState:
6601    // Now we need to return stuff. First we want to see whether we have
6602    // done everything for the current state of iterator.
6603    if(byteCountOrFrenchDone
6604        || canUpdateState == FALSE
6605        || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6606    {
6607        // Any of above mean that the previous transaction
6608        // wasn't finished and that we should store the
6609        // previous iterator state.
6610        state[0] = iterState;
6611    } else {
6612        // The transaction is complete. We will continue in the next iteration.
6613        state[0] = s.iterator->getState(s.iterator);
6614        cces = 0;
6615    }
6616    // Store the number of bocsu bytes written.
6617    if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6618        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6619    }
6620    state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6621
6622    // Next we put in the level of comparison
6623    state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6624
6625    // If we are doing French, we need to store whether we have just finished the French level
6626    if(level == UCOL_PSK_SECONDARY && doingFrench) {
6627        state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6628    } else {
6629        state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6630    }
6631
6632    // Was the latest CE shifted
6633    if(wasShifted) {
6634        state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6635    }
6636    // Check for cces overflow
6637    if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6638        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6639    }
6640    // Store cces
6641    state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6642
6643    // Check for French overflow
6644    if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6645        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6646    }
6647    // Store number of bytes written in the French secondary continuation sequence
6648    state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6649
6650
6651    // If we have used normalizing iterator, get rid of it
6652    if(normIter != NULL) {
6653        unorm_closeIter(normIter);
6654    }
6655
6656    /* To avoid memory leak, free the offset buffer if necessary. */
6657    ucol_freeOffsetBuffer(&s);
6658
6659    // Return number of meaningful sortkey bytes.
6660    UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6661                  dest,i, state[0], state[1]);
6662    UTRACE_EXIT_VALUE(i);
6663    return i;
6664}
6665
6666/**
6667 * Produce a bound for a given sortkey and a number of levels.
6668 */
6669U_CAPI int32_t U_EXPORT2
6670ucol_getBound(const uint8_t       *source,
6671        int32_t             sourceLength,
6672        UColBoundMode       boundType,
6673        uint32_t            noOfLevels,
6674        uint8_t             *result,
6675        int32_t             resultLength,
6676        UErrorCode          *status)
6677{
6678    // consistency checks
6679    if(status == NULL || U_FAILURE(*status)) {
6680        return 0;
6681    }
6682    if(source == NULL) {
6683        *status = U_ILLEGAL_ARGUMENT_ERROR;
6684        return 0;
6685    }
6686
6687    int32_t sourceIndex = 0;
6688    // Scan the string until we skip enough of the key OR reach the end of the key
6689    do {
6690        sourceIndex++;
6691        if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6692            noOfLevels--;
6693        }
6694    } while (noOfLevels > 0
6695        && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6696
6697    if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6698        && noOfLevels > 0) {
6699            *status = U_SORT_KEY_TOO_SHORT_WARNING;
6700    }
6701
6702
6703    // READ ME: this code assumes that the values for boundType
6704    // enum will not changes. They are set so that the enum value
6705    // corresponds to the number of extra bytes each bound type
6706    // needs.
6707    if(result != NULL && resultLength >= sourceIndex+boundType) {
6708        uprv_memcpy(result, source, sourceIndex);
6709        switch(boundType) {
6710            // Lower bound just gets terminated. No extra bytes
6711        case UCOL_BOUND_LOWER: // = 0
6712            break;
6713            // Upper bound needs one extra byte
6714        case UCOL_BOUND_UPPER: // = 1
6715            result[sourceIndex++] = 2;
6716            break;
6717            // Upper long bound needs two extra bytes
6718        case UCOL_BOUND_UPPER_LONG: // = 2
6719            result[sourceIndex++] = 0xFF;
6720            result[sourceIndex++] = 0xFF;
6721            break;
6722        default:
6723            *status = U_ILLEGAL_ARGUMENT_ERROR;
6724            return 0;
6725        }
6726        result[sourceIndex++] = 0;
6727
6728        return sourceIndex;
6729    } else {
6730        return sourceIndex+boundType+1;
6731    }
6732}
6733
6734/****************************************************************************/
6735/* Following are the functions that deal with the properties of a collator  */
6736/* there are new APIs and some compatibility APIs                           */
6737/****************************************************************************/
6738
6739static inline void
6740ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6741                    int32_t *primShift, int32_t *secShift, int32_t *terShift)
6742{
6743    uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6744    UBool reverseSecondary = FALSE;
6745    if(!isContinuation(CE)) {
6746        tertiary = (uint8_t)((CE & coll->tertiaryMask));
6747        tertiary ^= coll->caseSwitch;
6748        reverseSecondary = TRUE;
6749    } else {
6750        tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6751        tertiary &= UCOL_REMOVE_CASE;
6752        reverseSecondary = FALSE;
6753    }
6754
6755    secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6756    primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6757    primary1 = (uint8_t)(CE >> 8);
6758
6759    if(primary1 != 0) {
6760        coll->latinOneCEs[ch] |= (primary1 << *primShift);
6761        *primShift -= 8;
6762    }
6763    if(primary2 != 0) {
6764        if(*primShift < 0) {
6765            coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6766            coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6767            coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6768            return;
6769        }
6770        coll->latinOneCEs[ch] |= (primary2 << *primShift);
6771        *primShift -= 8;
6772    }
6773    if(secondary != 0) {
6774        if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6775            coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6776            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6777        } else { // normal case
6778            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6779        }
6780        *secShift -= 8;
6781    }
6782    if(tertiary != 0) {
6783        coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6784        *terShift -= 8;
6785    }
6786}
6787
6788static inline UBool
6789ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6790    uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6791    if(newTable == NULL) {
6792      *status = U_MEMORY_ALLOCATION_ERROR;
6793      coll->latinOneFailed = TRUE;
6794      return FALSE;
6795    }
6796    int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6797    uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6798    uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6799    uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6800    uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6801    coll->latinOneTableLen = size;
6802    uprv_free(coll->latinOneCEs);
6803    coll->latinOneCEs = newTable;
6804    return TRUE;
6805}
6806
/*
 * Builds (or rebuilds) the Latin-1 table coll->latinOneCEs.
 *
 * The table is allocated on first use with three sections of
 * UCOL_LATINONETABLELEN entries, zeroed, and then filled for every character
 * from 0 to UCOL_ENDOFLATINONERANGE. Entries for contractions are appended
 * after the character range, starting at UCOL_ENDOFLATINONERANGE+1; the table
 * is doubled when it fills up and trimmed to the used size at the end.
 *
 * Returns TRUE on success and FALSE on failure. All failures that go through
 * cleanup_after_failure set coll->latinOneFailed. The two early returns
 * (initial allocation failure, ucol_openElements failure) do not.
 */
static UBool
ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
    UBool result = TRUE;
    if(coll->latinOneCEs == NULL) {
        coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
        if(coll->latinOneCEs == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return FALSE;
        }
        coll->latinOneTableLen = UCOL_LATINONETABLELEN;
    }
    UChar ch = 0;
    // The iterator is re-targeted with ucol_setText() for expansions below.
    UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
    // ucol_openElements reports failure through status
    if (U_FAILURE(*status)) {
        return FALSE;
    }
    uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);

    // Bit positions for the next primary/secondary/tertiary byte of the
    // current slot; 24 means the top byte of the 32 bit entry.
    int32_t primShift = 24, secShift = 24, terShift = 24;
    uint32_t CE = 0;
    // Next free slot for contraction results, right after the character range.
    int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;

    // TODO: make safe if you get more than you wanted...
    for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
        primShift = 24; secShift = 24; terShift = 24;
        // Look up the CE: direct table below 0x100, otherwise the tailoring
        // trie with a fallback to the UCA trie.
        if(ch < 0x100) {
            CE = coll->latinOneMapping[ch];
        } else {
            CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            if(CE == UCOL_NOT_FOUND && coll->UCA) {
                CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
            }
        }
        if(CE < UCOL_NOT_FOUND) {
            // ordinary CE: store it directly
            ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
        } else {
            // special CE: dispatch on its tag
            switch (getCETag(CE)) {
            case EXPANSION_TAG:
            case DIGIT_TAG:
                // Let the element iterator produce the CEs for this character
                // and pack them into the slot until it runs out of room.
                ucol_setText(it, &ch, 1, status);
                while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                        // too many weights for one entry
                        coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        break;
                    }
                    ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                }
                break;
            case CONTRACTION_TAG:
                // here is the trick
                // F2 is contraction. We do something very similar to contractions
                // but have two indices, one in the real contraction table and the
                // other to where we stuffed things. This hopes that we don't have
                // many contractions (this should work for latin-1 tables).
                {
                    // Bits 12..23 of the CE must be free because the Latin-1
                    // table offset is stored there below.
                    if((CE & 0x00FFF000) != 0) {
                        *status = U_UNSUPPORTED_ERROR;
                        goto cleanup_after_failure;
                    }

                    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);

                    CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

                    // the character's own slot holds the tagged CE in all three sections
                    coll->latinOneCEs[ch] = CE;
                    coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
                    coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;

                    // We're going to jump into contraction table, pick the elements
                    // and use them
                    do {
                        CE = *(coll->contractionCEs +
                            (UCharOffset - coll->contractionIndex));
                        if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
                            uint32_t size;
                            uint32_t i;    /* general counter */
                            uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
                            size = getExpansionCount(CE);
                            //CE = *CEOffset++;
                            if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
                                for(i = 0; i<size; i++) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            } else { /* else, we do */
                                while(*CEOffset != 0) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            }
                            contractionOffset++;
                        } else if(CE < UCOL_NOT_FOUND) {
                            // ordinary CE for this contraction entry
                            ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
                        } else {
                            // any other special CE cannot be represented in the table
                            coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            contractionOffset++;
                        }
                        UCharOffset++;
                        // each contraction entry starts with fresh shifts
                        primShift = 24; secShift = 24; terShift = 24;
                        if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
                            if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
                                goto cleanup_after_failure;
                            }
                        }
                    } while(*UCharOffset != 0xFFFF); // 0xFFFF ends the list of contraction characters
                }
                break;;
            case SPEC_PROC_TAG:
                {
                    // 0xB7 is a precontext character defined in UCA5.1, a special
                    // handle is implemeted in order to save LatinOne table for
                    // most locales.
                    if (ch==0xb7) {
                        ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                    }
                    else {
                        // fails without touching *status
                        goto cleanup_after_failure;
                    }
                }
                break;
            default:
                // unsupported tag: fails without touching *status
                goto cleanup_after_failure;
            }
        }
    }
    // compact table
    if(contractionOffset < coll->latinOneTableLen) {
        if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
            goto cleanup_after_failure;
        }
    }
    ucol_closeElements(it);
    return result;

cleanup_after_failure:
    // NOTE: *status is only set by some of the paths that arrive here
    // (U_UNSUPPORTED_ERROR for the contraction check, the allocation error
    // from ucol_resizeLatinOneTable). The SPEC_PROC_TAG and default paths
    // leave it unchanged, so the caller must rely on the FALSE return value.
    coll->latinOneFailed = TRUE;
    ucol_closeElements(it);
    return FALSE;
}
6962
6963void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6964    if(U_SUCCESS(*status)) {
6965        if(coll->caseFirst == UCOL_UPPER_FIRST) {
6966            coll->caseSwitch = UCOL_CASE_SWITCH;
6967        } else {
6968            coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6969        }
6970
6971        if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6972            coll->tertiaryMask = UCOL_REMOVE_CASE;
6973            coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6974            coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6975            coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6976            coll->tertiaryBottom = UCOL_COMMON_BOT3;
6977        } else {
6978            coll->tertiaryMask = UCOL_KEEP_CASE;
6979            coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6980            if(coll->caseFirst == UCOL_UPPER_FIRST) {
6981                coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6982                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6983                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6984            } else {
6985                coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6986                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6987                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6988            }
6989        }
6990
6991        /* Set the compression values */
6992        uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
6993        coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6994        coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6995
6996        if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6997            && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6998        {
6999            coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
7000        } else {
7001            coll->sortKeyGen = ucol_calcSortKey;
7002        }
7003        if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
7004            && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
7005        {
7006            if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
7007                if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
7008                    //fprintf(stderr, "F");
7009                    coll->latinOneUse = TRUE;
7010                } else {
7011                    coll->latinOneUse = FALSE;
7012                }
7013                if(*status == U_UNSUPPORTED_ERROR) {
7014                    *status = U_ZERO_ERROR;
7015                }
7016            } else { // latin1Table exists and it doesn't need to be regenerated, just use it
7017                coll->latinOneUse = TRUE;
7018            }
7019        } else {
7020            coll->latinOneUse = FALSE;
7021        }
7022    }
7023}
7024
7025U_CAPI uint32_t  U_EXPORT2
7026ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
7027    if(U_FAILURE(*status) || coll == NULL) {
7028        return 0;
7029    }
7030    if(len == -1) {
7031        len = u_strlen(varTop);
7032    }
7033    if(len == 0) {
7034        *status = U_ILLEGAL_ARGUMENT_ERROR;
7035        return 0;
7036    }
7037
7038    collIterate s;
7039    IInit_collIterate(coll, varTop, len, &s);
7040
7041    uint32_t CE = ucol_IGetNextCE(coll, &s, status);
7042
7043    /* here we check if we have consumed all characters */
7044    /* you can put in either one character or a contraction */
7045    /* you shouldn't put more... */
7046    if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
7047        *status = U_CE_NOT_FOUND_ERROR;
7048        return 0;
7049    }
7050
7051    uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
7052
7053    if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
7054        *status = U_PRIMARY_TOO_LONG_ERROR;
7055        return 0;
7056    }
7057    if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
7058        coll->variableTopValueisDefault = FALSE;
7059        coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
7060    }
7061
7062    /* To avoid memory leak, free the offset buffer if necessary. */
7063    ucol_freeOffsetBuffer(&s);
7064
7065    return CE & UCOL_PRIMARYMASK;
7066}
7067
7068U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
7069    if(U_FAILURE(*status) || coll == NULL) {
7070        return 0;
7071    }
7072    return coll->variableTopValue<<16;
7073}
7074
7075U_CAPI void  U_EXPORT2
7076ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
7077    if(U_FAILURE(*status) || coll == NULL) {
7078        return;
7079    }
7080
7081    if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
7082        coll->variableTopValueisDefault = FALSE;
7083        coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
7084    }
7085}
7086/* Attribute setter API */
7087U_CAPI void  U_EXPORT2
7088ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
7089    if(U_FAILURE(*status) || coll == NULL) {
7090      return;
7091    }
7092    UColAttributeValue oldFrench = coll->frenchCollation;
7093    UColAttributeValue oldCaseFirst = coll->caseFirst;
7094    switch(attr) {
7095    case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
7096        if(value == UCOL_ON) {
7097            coll->numericCollation = UCOL_ON;
7098            coll->numericCollationisDefault = FALSE;
7099        } else if (value == UCOL_OFF) {
7100            coll->numericCollation = UCOL_OFF;
7101            coll->numericCollationisDefault = FALSE;
7102        } else if (value == UCOL_DEFAULT) {
7103            coll->numericCollationisDefault = TRUE;
7104            coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
7105        } else {
7106            *status = U_ILLEGAL_ARGUMENT_ERROR;
7107        }
7108        break;
7109    case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
7110        if(value == UCOL_ON) {
7111            coll->hiraganaQ = UCOL_ON;
7112            coll->hiraganaQisDefault = FALSE;
7113        } else if (value == UCOL_OFF) {
7114            coll->hiraganaQ = UCOL_OFF;
7115            coll->hiraganaQisDefault = FALSE;
7116        } else if (value == UCOL_DEFAULT) {
7117            coll->hiraganaQisDefault = TRUE;
7118            coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
7119        } else {
7120            *status = U_ILLEGAL_ARGUMENT_ERROR;
7121        }
7122        break;
7123    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7124        if(value == UCOL_ON) {
7125            coll->frenchCollation = UCOL_ON;
7126            coll->frenchCollationisDefault = FALSE;
7127        } else if (value == UCOL_OFF) {
7128            coll->frenchCollation = UCOL_OFF;
7129            coll->frenchCollationisDefault = FALSE;
7130        } else if (value == UCOL_DEFAULT) {
7131            coll->frenchCollationisDefault = TRUE;
7132            coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
7133        } else {
7134            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7135        }
7136        break;
7137    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7138        if(value == UCOL_SHIFTED) {
7139            coll->alternateHandling = UCOL_SHIFTED;
7140            coll->alternateHandlingisDefault = FALSE;
7141        } else if (value == UCOL_NON_IGNORABLE) {
7142            coll->alternateHandling = UCOL_NON_IGNORABLE;
7143            coll->alternateHandlingisDefault = FALSE;
7144        } else if (value == UCOL_DEFAULT) {
7145            coll->alternateHandlingisDefault = TRUE;
7146            coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
7147        } else {
7148            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7149        }
7150        break;
7151    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7152        if(value == UCOL_LOWER_FIRST) {
7153            coll->caseFirst = UCOL_LOWER_FIRST;
7154            coll->caseFirstisDefault = FALSE;
7155        } else if (value == UCOL_UPPER_FIRST) {
7156            coll->caseFirst = UCOL_UPPER_FIRST;
7157            coll->caseFirstisDefault = FALSE;
7158        } else if (value == UCOL_OFF) {
7159            coll->caseFirst = UCOL_OFF;
7160            coll->caseFirstisDefault = FALSE;
7161        } else if (value == UCOL_DEFAULT) {
7162            coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
7163            coll->caseFirstisDefault = TRUE;
7164        } else {
7165            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7166        }
7167        break;
7168    case UCOL_CASE_LEVEL: /* do we have an extra case level */
7169        if(value == UCOL_ON) {
7170            coll->caseLevel = UCOL_ON;
7171            coll->caseLevelisDefault = FALSE;
7172        } else if (value == UCOL_OFF) {
7173            coll->caseLevel = UCOL_OFF;
7174            coll->caseLevelisDefault = FALSE;
7175        } else if (value == UCOL_DEFAULT) {
7176            coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
7177            coll->caseLevelisDefault = TRUE;
7178        } else {
7179            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7180        }
7181        break;
7182    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7183        if(value == UCOL_ON) {
7184            coll->normalizationMode = UCOL_ON;
7185            coll->normalizationModeisDefault = FALSE;
7186        } else if (value == UCOL_OFF) {
7187            coll->normalizationMode = UCOL_OFF;
7188            coll->normalizationModeisDefault = FALSE;
7189        } else if (value == UCOL_DEFAULT) {
7190            coll->normalizationModeisDefault = TRUE;
7191            coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
7192        } else {
7193            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7194        }
7195        break;
7196    case UCOL_STRENGTH:         /* attribute for strength */
7197        if (value == UCOL_DEFAULT) {
7198            coll->strengthisDefault = TRUE;
7199            coll->strength = (UColAttributeValue)coll->options->strength;
7200        } else if (value <= UCOL_IDENTICAL) {
7201            coll->strengthisDefault = FALSE;
7202            coll->strength = value;
7203        } else {
7204            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7205        }
7206        break;
7207    case UCOL_ATTRIBUTE_COUNT:
7208    default:
7209        *status = U_ILLEGAL_ARGUMENT_ERROR;
7210        break;
7211    }
7212    if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
7213        coll->latinOneRegenTable = TRUE;
7214    } else {
7215        coll->latinOneRegenTable = FALSE;
7216    }
7217    ucol_updateInternalState(coll, status);
7218}
7219
7220U_CAPI UColAttributeValue  U_EXPORT2
7221ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
7222    if(U_FAILURE(*status) || coll == NULL) {
7223      return UCOL_DEFAULT;
7224    }
7225    switch(attr) {
7226    case UCOL_NUMERIC_COLLATION:
7227      return coll->numericCollation;
7228    case UCOL_HIRAGANA_QUATERNARY_MODE:
7229      return coll->hiraganaQ;
7230    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7231        return coll->frenchCollation;
7232    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7233        return coll->alternateHandling;
7234    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7235        return coll->caseFirst;
7236    case UCOL_CASE_LEVEL: /* do we have an extra case level */
7237        return coll->caseLevel;
7238    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7239        return coll->normalizationMode;
7240    case UCOL_STRENGTH:         /* attribute for strength */
7241        return coll->strength;
7242    case UCOL_ATTRIBUTE_COUNT:
7243    default:
7244        *status = U_ILLEGAL_ARGUMENT_ERROR;
7245        break;
7246    }
7247    return UCOL_DEFAULT;
7248}
7249
7250U_CAPI void U_EXPORT2
7251ucol_setStrength(    UCollator                *coll,
7252            UCollationStrength        strength)
7253{
7254    UErrorCode status = U_ZERO_ERROR;
7255    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
7256}
7257
7258U_CAPI UCollationStrength U_EXPORT2
7259ucol_getStrength(const UCollator *coll)
7260{
7261    UErrorCode status = U_ZERO_ERROR;
7262    return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
7263}
7264
7265/****************************************************************************/
7266/* Following are misc functions                                             */
7267/* there are new APIs and some compatibility APIs                           */
7268/****************************************************************************/
7269
7270U_CAPI void U_EXPORT2
7271ucol_getVersion(const UCollator* coll,
7272                UVersionInfo versionInfo)
7273{
7274    /* RunTime version  */
7275    uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7276    /* Builder version*/
7277    uint8_t bdVersion = coll->image->version[0];
7278
7279    /* Charset Version. Need to get the version from cnv files
7280     * makeconv should populate cnv files with version and
7281     * an api has to be provided in ucnv.h to obtain this version
7282     */
7283    uint8_t csVersion = 0;
7284
7285    /* combine the version info */
7286    uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
7287
7288    /* Tailoring rules */
7289    versionInfo[0] = (uint8_t)(cmbVersion>>8);
7290    versionInfo[1] = (uint8_t)cmbVersion;
7291    versionInfo[2] = coll->image->version[1];
7292    if(coll->UCA) {
7293        /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
7294        versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
7295    } else {
7296        versionInfo[3] = 0;
7297    }
7298}
7299
7300
7301/* This internal API checks whether a character is tailored or not */
7302U_CAPI UBool  U_EXPORT2
7303ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7304    if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
7305        return FALSE;
7306    }
7307
7308    uint32_t CE = UCOL_NOT_FOUND;
7309    const UChar *ContractionStart = NULL;
7310    if(u < 0x100) { /* latin-1 */
7311        CE = coll->latinOneMapping[u];
7312        if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
7313            return FALSE;
7314        }
7315    } else { /* regular */
7316        CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
7317    }
7318
7319    if(isContraction(CE)) {
7320        ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7321        CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
7322    }
7323
7324    return (UBool)(CE != UCOL_NOT_FOUND);
7325}
7326
7327
7328/****************************************************************************/
7329/* Following are the string compare functions                               */
7330/*                                                                          */
7331/****************************************************************************/
7332
7333
7334/*  ucol_checkIdent    internal function.  Does byte level string compare.   */
7335/*                     Used by strcoll if strength == identical and strings  */
7336/*                     are otherwise equal.  Moved out-of-line because this  */
7337/*                     is a rare case.                                       */
7338/*                                                                           */
7339/*                     Comparison must be done on NFD normalized strings.    */
7340/*                     FCD is not good enough.                               */
7341/*                                                                           */
7342/*      TODO:  make an incremental NFD Comparison function, which could      */
7343/*             be of general use                                             */
7344
7345static
7346UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
7347{
7348
7349  // TODO: When we have an UChar iterator, we need to access the whole string. One
7350  // useful modification would be a UChar iterator extract API, since reset next next...
7351  // is not optimal.
7352  // TODO: Handle long strings. Do the same in compareUsingSortKeys.
7353
7354  // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7355  // of same type, but that doesn't really mean that it will stay that way.
7356
7357    // The division for the array length may truncate the array size to
7358    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7359    // for all platforms anyway.
7360    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7361    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7362    //UChar sStackBuf[256], tStackBuf[256];
7363    //int32_t sBufSize = 256, tBufSize = 256;
7364    int32_t            comparison;
7365    int32_t          sLen        = 0;
7366    UChar            *sBuf       = NULL;
7367    int32_t          tLen        = 0;
7368    UChar            *tBuf       = NULL;
7369    UBool freeSBuf = FALSE, freeTBuf = FALSE;
7370
7371    if (sColl->flags & UCOL_USE_ITERATOR) {
7372        UNormIterator *sNIt = NULL, *tNIt = NULL;
7373        sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7374        tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7375        sColl->iterator->move(sColl->iterator, 0, UITER_START);
7376        tColl->iterator->move(tColl->iterator, 0, UITER_START);
7377        UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7378        UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7379        comparison = u_strCompareIter(sIt, tIt, TRUE);
7380        unorm_closeIter(sNIt);
7381        unorm_closeIter(tNIt);
7382    } else {
7383        sLen        = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
7384        sBuf = sColl->string;
7385        tLen        = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
7386        tBuf = tColl->string;
7387
7388        if (normalize) {
7389            *status = U_ZERO_ERROR;
7390            if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
7391                sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7392                                    sBuf, sLen,
7393                                    FALSE, 0,
7394                                    status);
7395                if(*status == U_BUFFER_OVERFLOW_ERROR) {
7396                    if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
7397                        &sColl->writableBuffer,
7398                        (int32_t *)&sColl->writableBufSize, sLen,
7399                        0)
7400                        )
7401                    {
7402                        *status = U_MEMORY_ALLOCATION_ERROR;
7403                        return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7404                    }
7405                    *status = U_ZERO_ERROR;
7406                    sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7407                                        sBuf, sLen,
7408                                        FALSE, 0,
7409                                        status);
7410                }
7411                if(freeSBuf) {
7412                    uprv_free(sBuf);
7413                    freeSBuf = FALSE;
7414                }
7415                sBuf = sColl->writableBuffer;
7416                if (sBuf != sColl->stackWritableBuffer) {
7417                    sColl->flags |= UCOL_ITER_ALLOCATED;
7418                }
7419            }
7420
7421            *status = U_ZERO_ERROR;
7422            if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
7423                tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7424                                    tBuf, tLen,
7425                                    FALSE, 0,
7426                                    status);
7427                if(*status == U_BUFFER_OVERFLOW_ERROR) {
7428                    if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
7429                        &tColl->writableBuffer,
7430                        (int32_t *)&tColl->writableBufSize, tLen,
7431                        0)
7432                        )
7433                    {
7434                        *status = U_MEMORY_ALLOCATION_ERROR;
7435                        return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7436                    }
7437                    *status = U_ZERO_ERROR;
7438                    tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7439                                        tBuf, tLen,
7440                                        FALSE, 0,
7441                                        status);
7442                }
7443                if(freeTBuf) {
7444                    uprv_free(tBuf);
7445                    freeTBuf = FALSE;
7446                }
7447                tBuf = tColl->writableBuffer;
7448                if (tBuf != tColl->stackWritableBuffer) {
7449                    tColl->flags |= UCOL_ITER_ALLOCATED;
7450                }
7451            }
7452        }
7453
7454        if (sLen == -1 && tLen == -1) {
7455            comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7456        } else {
7457            if (sLen == -1) {
7458                sLen = u_strlen(sBuf);
7459            }
7460            if (tLen == -1) {
7461                tLen = u_strlen(tBuf);
7462            }
7463            comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7464            if (comparison == 0) {
7465                comparison = sLen - tLen;
7466            }
7467        }
7468    }
7469
7470    if (comparison < 0) {
7471        return UCOL_LESS;
7472    } else if (comparison == 0) {
7473        return UCOL_EQUAL;
7474    } else /* comparison > 0 */ {
7475        return UCOL_GREATER;
7476    }
7477}
7478
7479/*  CEBuf - A struct and some inline functions to handle the saving    */
7480/*          of CEs in a buffer within ucol_strcoll                     */
7481
/* Number of CEs that fit in the array embedded in every ucol_CEBuf. */
#define UCOL_CEBUF_SIZE 512
/* Append-only buffer of 32-bit CEs. It starts out on localArray and is      */
/* moved to heap storage by ucol_CEBuf_Expand when it fills up.              */
typedef struct ucol_CEBuf {
    uint32_t    *buf;       /* start of the storage currently in use (localArray or a heap block) */
    uint32_t    *endp;      /* one past the last slot of that storage */
    uint32_t    *pos;       /* slot the next CE will be written to */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* embedded initial storage */
} ucol_CEBuf;
7489
7490
7491static
7492inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7493    (b)->buf = (b)->pos = (b)->localArray;
7494    (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7495}
7496
7497static
7498void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7499    uint32_t  oldSize;
7500    uint32_t  newSize;
7501    uint32_t  *newBuf;
7502
7503    ci->flags |= UCOL_ITER_ALLOCATED;
7504    oldSize = b->pos - b->buf;
7505    newSize = oldSize * 2;
7506    newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7507    if(newBuf == NULL) {
7508        *status = U_MEMORY_ALLOCATION_ERROR;
7509    }
7510    else {
7511        uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7512        if (b->buf != b->localArray) {
7513            uprv_free(b->buf);
7514        }
7515        b->buf = newBuf;
7516        b->endp = b->buf + newSize;
7517        b->pos  = b->buf + oldSize;
7518    }
7519}
7520
7521static
7522inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7523    if (b->pos == b->endp) {
7524        ucol_CEBuf_Expand(b, ci, status);
7525    }
7526    if (U_SUCCESS(*status)) {
7527        *(b)->pos++ = ce;
7528    }
7529}
7530
7531/* This is a trick string compare function that goes in and uses sortkeys to compare */
7532/* It is used when compare gets in trouble and needs to bail out                     */
7533static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7534                                                  collIterate *tColl,
7535                                                  UErrorCode *status)
7536{
7537    uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7538    uint8_t *sourceKeyP = sourceKey;
7539    uint8_t *targetKeyP = targetKey;
7540    int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7541    const UCollator *coll = sColl->coll;
7542    UChar *source = NULL;
7543    UChar *target = NULL;
7544    int32_t result = UCOL_EQUAL;
7545    UChar sStackBuf[256], tStackBuf[256];
7546    int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7547    int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7548
7549    // TODO: Handle long strings. Do the same in ucol_checkIdent.
7550    if(sColl->flags & UCOL_USE_ITERATOR) {
7551        sColl->iterator->move(sColl->iterator, 0, UITER_START);
7552        tColl->iterator->move(tColl->iterator, 0, UITER_START);
7553        source = sStackBuf;
7554        UChar *sBufp = source;
7555        target = tStackBuf;
7556        UChar *tBufp = target;
7557        while(sColl->iterator->hasNext(sColl->iterator)) {
7558            *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7559        }
7560        while(tColl->iterator->hasNext(tColl->iterator)) {
7561            *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7562        }
7563        sourceLength = sBufp - source;
7564        targetLength = tBufp - target;
7565    } else { // no iterators
7566        sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7567        targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7568        source = sColl->string;
7569        target = tColl->string;
7570    }
7571
7572
7573
7574    sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7575    if(sourceKeyLen > UCOL_MAX_BUFFER) {
7576        sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7577        if(sourceKeyP == NULL) {
7578            *status = U_MEMORY_ALLOCATION_ERROR;
7579            goto cleanup_and_do_compare;
7580        }
7581        sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7582    }
7583
7584    targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7585    if(targetKeyLen > UCOL_MAX_BUFFER) {
7586        targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7587        if(targetKeyP == NULL) {
7588            *status = U_MEMORY_ALLOCATION_ERROR;
7589            goto cleanup_and_do_compare;
7590        }
7591        targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7592    }
7593
7594    result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7595
7596cleanup_and_do_compare:
7597    if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7598        uprv_free(sourceKeyP);
7599    }
7600
7601    if(targetKeyP != NULL && targetKeyP != targetKey) {
7602        uprv_free(targetKeyP);
7603    }
7604
7605    if(result<0) {
7606        return UCOL_LESS;
7607    } else if(result>0) {
7608        return UCOL_GREATER;
7609    } else {
7610        return UCOL_EQUAL;
7611    }
7612}
7613
7614
7615static inline UCollationResult
7616ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7617//              const UCollator    *coll,
7618//              const UChar        *source,
7619//              int32_t            sourceLength,
7620//              const UChar        *target,
7621//              int32_t            targetLength,
7622              UErrorCode *status)
7623{
7624    U_ALIGN_CODE(16);
7625
7626    const UCollator *coll = sColl->coll;
7627
7628
7629    // setting up the collator parameters
7630    UColAttributeValue strength = coll->strength;
7631    UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7632
7633    UBool checkSecTer = initialCheckSecTer;
7634    UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7635    UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7636    UBool checkIdent = (strength == UCOL_IDENTICAL);
7637    UBool checkCase = (coll->caseLevel == UCOL_ON);
7638    UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7639    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7640    UBool qShifted = shifted && checkQuad;
7641    UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7642
7643    if(doHiragana && shifted) {
7644        return (ucol_compareUsingSortKeys(sColl, tColl, status));
7645    }
7646    uint8_t caseSwitch = coll->caseSwitch;
7647    uint8_t tertiaryMask = coll->tertiaryMask;
7648
7649    // This is the lowest primary value that will not be ignored if shifted
7650    uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7651
7652    UCollationResult result = UCOL_EQUAL;
7653    UCollationResult hirResult = UCOL_EQUAL;
7654
7655    // Preparing the CE buffers. They will be filled during the primary phase
7656    ucol_CEBuf   sCEs;
7657    ucol_CEBuf   tCEs;
7658    UCOL_INIT_CEBUF(&sCEs);
7659    UCOL_INIT_CEBUF(&tCEs);
7660
7661    uint32_t secS = 0, secT = 0;
7662    uint32_t sOrder=0, tOrder=0;
7663
7664    // Non shifted primary processing is quite simple
7665    if(!shifted) {
7666        for(;;) {
7667
7668            // We fetch CEs until we hit a non ignorable primary or end.
7669            do {
7670                // We get the next CE
7671                sOrder = ucol_IGetNextCE(coll, sColl, status);
7672                // Stuff it in the buffer
7673                UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7674                // And keep just the primary part.
7675                sOrder &= UCOL_PRIMARYMASK;
7676            } while(sOrder == 0);
7677
7678            // see the comments on the above block
7679            do {
7680                tOrder = ucol_IGetNextCE(coll, tColl, status);
7681                UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7682                tOrder &= UCOL_PRIMARYMASK;
7683            } while(tOrder == 0);
7684
7685            // if both primaries are the same
7686            if(sOrder == tOrder) {
7687                // and there are no more CEs, we advance to the next level
7688                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7689                    break;
7690                }
7691                if(doHiragana && hirResult == UCOL_EQUAL) {
7692                    if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7693                        hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7694                            ? UCOL_LESS:UCOL_GREATER;
7695                    }
7696                }
7697            } else {
7698                // if two primaries are different, we are done
7699                result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
7700                goto commonReturn;
7701            }
7702        } // no primary difference... do the rest from the buffers
7703    } else { // shifted - do a slightly more complicated processing :)
7704        for(;;) {
7705            UBool sInShifted = FALSE;
7706            UBool tInShifted = FALSE;
7707            // This version of code can be refactored. However, it seems easier to understand this way.
7708            // Source loop. Sam as the target loop.
7709            for(;;) {
7710                sOrder = ucol_IGetNextCE(coll, sColl, status);
7711                if(sOrder == UCOL_NO_MORE_CES) {
7712                    UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7713                    break;
7714                } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7715                    /* UCA amendment - ignore ignorables that follow shifted code points */
7716                    continue;
7717                } else if(isContinuation(sOrder)) {
7718                    if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7719                        if(sInShifted) {
7720                            sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7721                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7722                            continue;
7723                        } else {
7724                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7725                            break;
7726                        }
7727                    } else { /* Just lower level values */
7728                        if(sInShifted) {
7729                            continue;
7730                        } else {
7731                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7732                            continue;
7733                        }
7734                    }
7735                } else { /* regular */
7736                    if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7737                        UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7738                        break;
7739                    } else {
7740                        if((sOrder & UCOL_PRIMARYMASK) > 0) {
7741                            sInShifted = TRUE;
7742                            sOrder &= UCOL_PRIMARYMASK;
7743                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7744                            continue;
7745                        } else {
7746                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7747                            sInShifted = FALSE;
7748                            continue;
7749                        }
7750                    }
7751                }
7752            }
7753            sOrder &= UCOL_PRIMARYMASK;
7754            sInShifted = FALSE;
7755
7756            for(;;) {
7757                tOrder = ucol_IGetNextCE(coll, tColl, status);
7758                if(tOrder == UCOL_NO_MORE_CES) {
7759                    UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7760                    break;
7761                } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7762                    /* UCA amendment - ignore ignorables that follow shifted code points */
7763                    continue;
7764                } else if(isContinuation(tOrder)) {
7765                    if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7766                        if(tInShifted) {
7767                            tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7768                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7769                            continue;
7770                        } else {
7771                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7772                            break;
7773                        }
7774                    } else { /* Just lower level values */
7775                        if(tInShifted) {
7776                            continue;
7777                        } else {
7778                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7779                            continue;
7780                        }
7781                    }
7782                } else { /* regular */
7783                    if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7784                        UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7785                        break;
7786                    } else {
7787                        if((tOrder & UCOL_PRIMARYMASK) > 0) {
7788                            tInShifted = TRUE;
7789                            tOrder &= UCOL_PRIMARYMASK;
7790                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7791                            continue;
7792                        } else {
7793                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7794                            tInShifted = FALSE;
7795                            continue;
7796                        }
7797                    }
7798                }
7799            }
7800            tOrder &= UCOL_PRIMARYMASK;
7801            tInShifted = FALSE;
7802
7803            if(sOrder == tOrder) {
7804                /*
7805                if(doHiragana && hirResult == UCOL_EQUAL) {
7806                if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7807                hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7808                ? UCOL_LESS:UCOL_GREATER;
7809                }
7810                }
7811                */
7812                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7813                    break;
7814                } else {
7815                    sOrder = 0;
7816                    tOrder = 0;
7817                    continue;
7818                }
7819            } else {
7820                result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7821                goto commonReturn;
7822            }
7823        } /* no primary difference... do the rest from the buffers */
7824    }
7825
7826    /* now, we're gonna reexamine collected CEs */
7827    uint32_t    *sCE;
7828    uint32_t    *tCE;
7829
7830    /* This is the secondary level of comparison */
7831    if(checkSecTer) {
7832        if(!isFrenchSec) { /* normal */
7833            sCE = sCEs.buf;
7834            tCE = tCEs.buf;
7835            for(;;) {
7836                while (secS == 0) {
7837                    secS = *(sCE++) & UCOL_SECONDARYMASK;
7838                }
7839
7840                while(secT == 0) {
7841                    secT = *(tCE++) & UCOL_SECONDARYMASK;
7842                }
7843
7844                if(secS == secT) {
7845                    if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7846                        break;
7847                    } else {
7848                        secS = 0; secT = 0;
7849                        continue;
7850                    }
7851                } else {
7852                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7853                    goto commonReturn;
7854                }
7855            }
7856        } else { /* do the French */
7857            uint32_t *sCESave = NULL;
7858            uint32_t *tCESave = NULL;
7859            sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7860            tCE = tCEs.pos-2;
7861            for(;;) {
7862                while (secS == 0 && sCE >= sCEs.buf) {
7863                    if(sCESave == 0) {
7864                        secS = *(sCE--);
7865                        if(isContinuation(secS)) {
7866                            while(isContinuation(secS = *(sCE--)))
7867                                ;
7868                            /* after this, secS has the start of continuation, and sCEs points before that */
7869                            sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7870                            sCE+=2;  /* need to point to the first continuation CP */
7871                            /* However, now you can just continue doing stuff */
7872                        }
7873                    } else {
7874                        secS = *(sCE++);
7875                        if(!isContinuation(secS)) { /* This means we have finished with this cont */
7876                            sCE = sCESave;            /* reset the pointer to before continuation */
7877                            sCESave = 0;
7878                            continue;
7879                        }
7880                    }
7881                    secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7882                }
7883
7884                while(secT == 0 && tCE >= tCEs.buf) {
7885                    if(tCESave == 0) {
7886                        secT = *(tCE--);
7887                        if(isContinuation(secT)) {
7888                            while(isContinuation(secT = *(tCE--)))
7889                                ;
7890                            /* after this, secS has the start of continuation, and sCEs points before that */
7891                            tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7892                            tCE+=2;  /* need to point to the first continuation CP */
7893                            /* However, now you can just continue doing stuff */
7894                        }
7895                    } else {
7896                        secT = *(tCE++);
7897                        if(!isContinuation(secT)) { /* This means we have finished with this cont */
7898                            tCE = tCESave;          /* reset the pointer to before continuation */
7899                            tCESave = 0;
7900                            continue;
7901                        }
7902                    }
7903                    secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7904                }
7905
7906                if(secS == secT) {
7907                    if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7908                        break;
7909                    } else {
7910                        secS = 0; secT = 0;
7911                        continue;
7912                    }
7913                } else {
7914                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7915                    goto commonReturn;
7916                }
7917            }
7918        }
7919    }
7920
7921    /* doing the case bit */
7922    if(checkCase) {
7923        sCE = sCEs.buf;
7924        tCE = tCEs.buf;
7925        for(;;) {
7926            while((secS & UCOL_REMOVE_CASE) == 0) {
7927                if(!isContinuation(*sCE++)) {
7928                    secS =*(sCE-1);
7929                    if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7930                        // primary ignorables should not be considered on the case level when the strength is primary
7931                        // otherwise, the CEs stop being well-formed
7932                        secS &= UCOL_TERT_CASE_MASK;
7933                        secS ^= caseSwitch;
7934                    } else {
7935                        secS = 0;
7936                    }
7937                } else {
7938                    secS = 0;
7939                }
7940            }
7941
7942            while((secT & UCOL_REMOVE_CASE) == 0) {
7943                if(!isContinuation(*tCE++)) {
7944                    secT = *(tCE-1);
7945                    if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7946                        // primary ignorables should not be considered on the case level when the strength is primary
7947                        // otherwise, the CEs stop being well-formed
7948                        secT &= UCOL_TERT_CASE_MASK;
7949                        secT ^= caseSwitch;
7950                    } else {
7951                        secT = 0;
7952                    }
7953                } else {
7954                    secT = 0;
7955                }
7956            }
7957
7958            if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7959                result = UCOL_LESS;
7960                goto commonReturn;
7961            } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7962                result = UCOL_GREATER;
7963                goto commonReturn;
7964            }
7965
7966            if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7967                break;
7968            } else {
7969                secS = 0;
7970                secT = 0;
7971            }
7972        }
7973    }
7974
7975    /* Tertiary level */
7976    if(checkTertiary) {
7977        secS = 0;
7978        secT = 0;
7979        sCE = sCEs.buf;
7980        tCE = tCEs.buf;
7981        for(;;) {
7982            while((secS & UCOL_REMOVE_CASE) == 0) {
7983                secS = *(sCE++) & tertiaryMask;
7984                if(!isContinuation(secS)) {
7985                    secS ^= caseSwitch;
7986                } else {
7987                    secS &= UCOL_REMOVE_CASE;
7988                }
7989            }
7990
7991            while((secT & UCOL_REMOVE_CASE)  == 0) {
7992                secT = *(tCE++) & tertiaryMask;
7993                if(!isContinuation(secT)) {
7994                    secT ^= caseSwitch;
7995                } else {
7996                    secT &= UCOL_REMOVE_CASE;
7997                }
7998            }
7999
8000            if(secS == secT) {
8001                if((secS & UCOL_REMOVE_CASE) == 1) {
8002                    break;
8003                } else {
8004                    secS = 0; secT = 0;
8005                    continue;
8006                }
8007            } else {
8008                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8009                goto commonReturn;
8010            }
8011        }
8012    }
8013
8014
8015    if(qShifted /*checkQuad*/) {
8016        UBool sInShifted = TRUE;
8017        UBool tInShifted = TRUE;
8018        secS = 0;
8019        secT = 0;
8020        sCE = sCEs.buf;
8021        tCE = tCEs.buf;
8022        for(;;) {
8023            while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
8024                secS = *(sCE++);
8025                if(isContinuation(secS)) {
8026                    if(!sInShifted) {
8027                        continue;
8028                    }
8029                } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
8030                    secS = UCOL_PRIMARYMASK;
8031                    sInShifted = FALSE;
8032                } else {
8033                    sInShifted = TRUE;
8034                }
8035            }
8036            secS &= UCOL_PRIMARYMASK;
8037
8038
8039            while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
8040                secT = *(tCE++);
8041                if(isContinuation(secT)) {
8042                    if(!tInShifted) {
8043                        continue;
8044                    }
8045                } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
8046                    secT = UCOL_PRIMARYMASK;
8047                    tInShifted = FALSE;
8048                } else {
8049                    tInShifted = TRUE;
8050                }
8051            }
8052            secT &= UCOL_PRIMARYMASK;
8053
8054            if(secS == secT) {
8055                if(secS == UCOL_NO_MORE_CES_PRIMARY) {
8056                    break;
8057                } else {
8058                    secS = 0; secT = 0;
8059                    continue;
8060                }
8061            } else {
8062                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8063                goto commonReturn;
8064            }
8065        }
8066    } else if(doHiragana && hirResult != UCOL_EQUAL) {
8067        // If we're fine on quaternaries, we might be different
8068        // on Hiragana. This, however, might fail us in shifted.
8069        result = hirResult;
8070        goto commonReturn;
8071    }
8072
8073    /*  For IDENTICAL comparisons, we use a bitwise character comparison */
8074    /*  as a tiebreaker if all else is equal.                                */
8075    /*  Getting here  should be quite rare - strings are not identical -     */
8076    /*     that is checked first, but compared == through all other checks.  */
8077    if(checkIdent)
8078    {
8079        //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
8080        result = ucol_checkIdent(sColl, tColl, TRUE, status);
8081    }
8082
8083commonReturn:
8084    if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
8085        freeHeapWritableBuffer(sColl);
8086        freeHeapWritableBuffer(tColl);
8087
8088        if (sCEs.buf != sCEs.localArray ) {
8089            uprv_free(sCEs.buf);
8090        }
8091        if (tCEs.buf != tCEs.localArray ) {
8092            uprv_free(tCEs.buf);
8093        }
8094    }
8095
8096    return result;
8097}
8098
8099
8100static inline uint32_t
8101ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
8102                          uint32_t CE, const UChar *s, int32_t *index, int32_t len)
8103{
8104    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8105    int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8106    int32_t offset = 1;
8107    UChar schar = 0, tchar = 0;
8108
8109    for(;;) {
8110        if(len == -1) {
8111            if(s[*index] == 0) { // end of string
8112                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8113            } else {
8114                schar = s[*index];
8115            }
8116        } else {
8117            if(*index == len) {
8118                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8119            } else {
8120                schar = s[*index];
8121            }
8122        }
8123
8124        while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8125            offset++;
8126        }
8127
8128        if (schar == tchar) {
8129            (*index)++;
8130            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8131        }
8132        else
8133        {
8134            if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8135                return UCOL_BAIL_OUT_CE;
8136            }
8137            // skip completely ignorables
8138            uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8139            if(isZeroCE == 0) { // we have to ignore completely ignorables
8140                (*index)++;
8141                continue;
8142            }
8143
8144            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8145        }
8146    }
8147}
8148
8149
8150/**
8151 * This is a fast strcoll, geared towards text in Latin-1.
8152 * It supports contractions of size two, French secondaries
8153 * and case switching. You can use it with strengths primary
8154 * to tertiary. It does not support shifted and case level.
8155 * It relies on the table build by setupLatin1Table. If it
8156 * doesn't understand something, it will go to the regular
8157 * strcoll.
8158 */
8159static inline UCollationResult
8160ucol_strcollUseLatin1( const UCollator    *coll,
8161              const UChar        *source,
8162              int32_t            sLen,
8163              const UChar        *target,
8164              int32_t            tLen,
8165              UErrorCode *status)
8166{
8167    U_ALIGN_CODE(16);
8168    int32_t strength = coll->strength;
8169
8170    int32_t sIndex = 0, tIndex = 0;
8171    UChar sChar = 0, tChar = 0;
8172    uint32_t sOrder=0, tOrder=0;
8173
8174    UBool endOfSource = FALSE;
8175
8176    uint32_t *elements = coll->latinOneCEs;
8177
8178    UBool haveContractions = FALSE; // if we have contractions in our string
8179                                    // we cannot do French secondary
8180
8181    // Do the primary level
8182    for(;;) {
8183        while(sOrder==0) { // this loop skips primary ignorables
8184            // sOrder=getNextlatinOneCE(source);
8185            if(sLen==-1) {   // handling zero terminated strings
8186                sChar=source[sIndex++];
8187                if(sChar==0) {
8188                    endOfSource = TRUE;
8189                    break;
8190                }
8191            } else {        // handling strings with known length
8192                if(sIndex==sLen) {
8193                    endOfSource = TRUE;
8194                    break;
8195                }
8196                sChar=source[sIndex++];
8197            }
8198            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8199                //fprintf(stderr, "R");
8200                goto returnRegular;
8201                //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8202            }
8203            sOrder = elements[sChar];
8204            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8205                // specials can basically be either contractions or bail-out signs. If we get anything
8206                // else, we'll bail out anywasy
8207                if(getCETag(sOrder) == CONTRACTION_TAG) {
8208                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8209                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
8210                    // However, if there are contractions in the table, but we always use just one char,
8211                    // we might be able to do French. This should be checked out.
8212                }
8213                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8214                    //fprintf(stderr, "S");
8215                    goto returnRegular;
8216                    //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8217                }
8218            }
8219        }
8220
8221        while(tOrder==0) {  // this loop skips primary ignorables
8222            // tOrder=getNextlatinOneCE(target);
8223            if(tLen==-1) {    // handling zero terminated strings
8224                tChar=target[tIndex++];
8225                if(tChar==0) {
8226                    if(endOfSource) { // this is different than source loop,
8227                        // as we already know that source loop is done here,
8228                        // so we can either finish the primary loop if both
8229                        // strings are done or anounce the result if only
8230                        // target is done. Same below.
8231                        goto endOfPrimLoop;
8232                    } else {
8233                        return UCOL_GREATER;
8234                    }
8235                }
8236            } else {          // handling strings with known length
8237                if(tIndex==tLen) {
8238                    if(endOfSource) {
8239                        goto endOfPrimLoop;
8240                    } else {
8241                        return UCOL_GREATER;
8242                    }
8243                }
8244                tChar=target[tIndex++];
8245            }
8246            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8247                //fprintf(stderr, "R");
8248                goto returnRegular;
8249                //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8250            }
8251            tOrder = elements[tChar];
8252            if(tOrder >= UCOL_NOT_FOUND) {
8253                // Handling specials, see the comments for source
8254                if(getCETag(tOrder) == CONTRACTION_TAG) {
8255                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8256                    haveContractions = TRUE;
8257                }
8258                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8259                    //fprintf(stderr, "S");
8260                    goto returnRegular;
8261                    //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8262                }
8263            }
8264        }
8265        if(endOfSource) { // source is finished, but target is not, say the result.
8266            return UCOL_LESS;
8267        }
8268
8269        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8270            sOrder = 0; tOrder = 0;
8271            continue;
8272        } else {
8273            // compare current top bytes
8274            if(((sOrder^tOrder)&0xFF000000)!=0) {
8275                // top bytes differ, return difference
8276                if(sOrder < tOrder) {
8277                    return UCOL_LESS;
8278                } else if(sOrder > tOrder) {
8279                    return UCOL_GREATER;
8280                }
8281                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8282                // since we must return enum value
8283            }
8284
8285            // top bytes match, continue with following bytes
8286            sOrder<<=8;
8287            tOrder<<=8;
8288        }
8289    }
8290
8291endOfPrimLoop:
8292    // after primary loop, we definitely know the sizes of strings,
8293    // so we set it and use simpler loop for secondaries and tertiaries
8294    sLen = sIndex; tLen = tIndex;
8295    if(strength >= UCOL_SECONDARY) {
8296        // adjust the table beggining
8297        elements += coll->latinOneTableLen;
8298        endOfSource = FALSE;
8299
8300        if(coll->frenchCollation == UCOL_OFF) { // non French
8301            // This loop is a simplified copy of primary loop
8302            // at this point we know that whole strings are latin-1, so we don't
8303            // check for that. We also know that we only have contractions as
8304            // specials.
8305            sIndex = 0; tIndex = 0;
8306            for(;;) {
8307                while(sOrder==0) {
8308                    if(sIndex==sLen) {
8309                        endOfSource = TRUE;
8310                        break;
8311                    }
8312                    sChar=source[sIndex++];
8313                    sOrder = elements[sChar];
8314                    if(sOrder > UCOL_NOT_FOUND) {
8315                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8316                    }
8317                }
8318
8319                while(tOrder==0) {
8320                    if(tIndex==tLen) {
8321                        if(endOfSource) {
8322                            goto endOfSecLoop;
8323                        } else {
8324                            return UCOL_GREATER;
8325                        }
8326                    }
8327                    tChar=target[tIndex++];
8328                    tOrder = elements[tChar];
8329                    if(tOrder > UCOL_NOT_FOUND) {
8330                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8331                    }
8332                }
8333                if(endOfSource) {
8334                    return UCOL_LESS;
8335                }
8336
8337                if(sOrder == tOrder) {
8338                    sOrder = 0; tOrder = 0;
8339                    continue;
8340                } else {
8341                    // see primary loop for comments on this
8342                    if(((sOrder^tOrder)&0xFF000000)!=0) {
8343                        if(sOrder < tOrder) {
8344                            return UCOL_LESS;
8345                        } else if(sOrder > tOrder) {
8346                            return UCOL_GREATER;
8347                        }
8348                    }
8349                    sOrder<<=8;
8350                    tOrder<<=8;
8351                }
8352            }
8353        } else { // French
8354            if(haveContractions) { // if we have contractions, we have to bail out
8355                // since we don't really know how to handle them here
8356                goto returnRegular;
8357                //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8358            }
8359            // For French, we go backwards
8360            sIndex = sLen; tIndex = tLen;
8361            for(;;) {
8362                while(sOrder==0) {
8363                    if(sIndex==0) {
8364                        endOfSource = TRUE;
8365                        break;
8366                    }
8367                    sChar=source[--sIndex];
8368                    sOrder = elements[sChar];
8369                    // don't even look for contractions
8370                }
8371
8372                while(tOrder==0) {
8373                    if(tIndex==0) {
8374                        if(endOfSource) {
8375                            goto endOfSecLoop;
8376                        } else {
8377                            return UCOL_GREATER;
8378                        }
8379                    }
8380                    tChar=target[--tIndex];
8381                    tOrder = elements[tChar];
8382                    // don't even look for contractions
8383                }
8384                if(endOfSource) {
8385                    return UCOL_LESS;
8386                }
8387
8388                if(sOrder == tOrder) {
8389                    sOrder = 0; tOrder = 0;
8390                    continue;
8391                } else {
8392                    // see the primary loop for comments
8393                    if(((sOrder^tOrder)&0xFF000000)!=0) {
8394                        if(sOrder < tOrder) {
8395                            return UCOL_LESS;
8396                        } else if(sOrder > tOrder) {
8397                            return UCOL_GREATER;
8398                        }
8399                    }
8400                    sOrder<<=8;
8401                    tOrder<<=8;
8402                }
8403            }
8404        }
8405    }
8406
8407endOfSecLoop:
8408    if(strength >= UCOL_TERTIARY) {
8409        // tertiary loop is the same as secondary (except no French)
8410        elements += coll->latinOneTableLen;
8411        sIndex = 0; tIndex = 0;
8412        endOfSource = FALSE;
8413        for(;;) {
8414            while(sOrder==0) {
8415                if(sIndex==sLen) {
8416                    endOfSource = TRUE;
8417                    break;
8418                }
8419                sChar=source[sIndex++];
8420                sOrder = elements[sChar];
8421                if(sOrder > UCOL_NOT_FOUND) {
8422                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8423                }
8424            }
8425            while(tOrder==0) {
8426                if(tIndex==tLen) {
8427                    if(endOfSource) {
8428                        return UCOL_EQUAL; // if both strings are at the end, they are equal
8429                    } else {
8430                        return UCOL_GREATER;
8431                    }
8432                }
8433                tChar=target[tIndex++];
8434                tOrder = elements[tChar];
8435                if(tOrder > UCOL_NOT_FOUND) {
8436                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8437                }
8438            }
8439            if(endOfSource) {
8440                return UCOL_LESS;
8441            }
8442            if(sOrder == tOrder) {
8443                sOrder = 0; tOrder = 0;
8444                continue;
8445            } else {
8446                if(((sOrder^tOrder)&0xff000000)!=0) {
8447                    if(sOrder < tOrder) {
8448                        return UCOL_LESS;
8449                    } else if(sOrder > tOrder) {
8450                        return UCOL_GREATER;
8451                    }
8452                }
8453                sOrder<<=8;
8454                tOrder<<=8;
8455            }
8456        }
8457    }
8458    return UCOL_EQUAL;
8459
8460returnRegular:
8461    // Preparing the context objects for iterating over strings
8462    collIterate sColl, tColl;
8463
8464    IInit_collIterate(coll, source, sLen, &sColl);
8465    IInit_collIterate(coll, target, tLen, &tColl);
8466    return ucol_strcollRegular(&sColl, &tColl, status);
8467}
8468
8469
8470U_CAPI UCollationResult U_EXPORT2
8471ucol_strcollIter( const UCollator    *coll,
8472                 UCharIterator *sIter,
8473                 UCharIterator *tIter,
8474                 UErrorCode         *status)
8475{
8476    if(!status || U_FAILURE(*status)) {
8477        return UCOL_EQUAL;
8478    }
8479
8480    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8481    UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8482
8483    if (sIter == tIter) {
8484        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8485        return UCOL_EQUAL;
8486    }
8487    if(sIter == NULL || tIter == NULL || coll == NULL) {
8488        *status = U_ILLEGAL_ARGUMENT_ERROR;
8489        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8490        return UCOL_EQUAL;
8491    }
8492
8493    UCollationResult result = UCOL_EQUAL;
8494
8495    // Preparing the context objects for iterating over strings
8496    collIterate sColl, tColl;
8497    // The division for the array length may truncate the array size to
8498    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8499    // for all platforms anyway.
8500    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8501    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8502    UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8503
8504    IInit_collIterate(coll, NULL, -1, &sColl);
8505    sColl.iterator = sIter;
8506    sColl.flags |= UCOL_USE_ITERATOR;
8507    IInit_collIterate(coll, NULL, -1, &tColl);
8508    tColl.flags |= UCOL_USE_ITERATOR;
8509    tColl.iterator = tIter;
8510
8511    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8512        sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8513        sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8514        sColl.flags &= ~UCOL_ITER_NORM;
8515
8516        tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8517        tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8518        tColl.flags &= ~UCOL_ITER_NORM;
8519    }
8520
8521    UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8522
8523    while((sChar = sColl.iterator->next(sColl.iterator)) ==
8524        (tChar = tColl.iterator->next(tColl.iterator))) {
8525            if(sChar == U_SENTINEL) {
8526                result = UCOL_EQUAL;
8527                goto end_compare;
8528            }
8529    }
8530
8531    if(sChar == U_SENTINEL) {
8532        tChar = tColl.iterator->previous(tColl.iterator);
8533    }
8534
8535    if(tChar == U_SENTINEL) {
8536        sChar = sColl.iterator->previous(sColl.iterator);
8537    }
8538
8539    sChar = sColl.iterator->previous(sColl.iterator);
8540    tChar = tColl.iterator->previous(tColl.iterator);
8541
8542    if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8543    {
8544        // We are stopped in the middle of a contraction.
8545        // Scan backwards through the == part of the string looking for the start of the contraction.
8546        //   It doesn't matter which string we scan, since they are the same in this region.
8547        do
8548        {
8549            sChar = sColl.iterator->previous(sColl.iterator);
8550            tChar = tColl.iterator->previous(tColl.iterator);
8551        }
8552        while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8553    }
8554
8555
8556    if(U_SUCCESS(*status)) {
8557        result = ucol_strcollRegular(&sColl, &tColl, status);
8558    }
8559
8560end_compare:
8561    if(sNormIter || tNormIter) {
8562        unorm_closeIter(sNormIter);
8563        unorm_closeIter(tNormIter);
8564    }
8565
8566    UTRACE_EXIT_VALUE_STATUS(result, *status)
8567    return result;
8568}
8569
8570
8571/*                                                                      */
8572/* ucol_strcoll     Main public API string comparison function          */
8573/*                                                                      */
8574U_CAPI UCollationResult U_EXPORT2
8575ucol_strcoll( const UCollator    *coll,
8576              const UChar        *source,
8577              int32_t            sourceLength,
8578              const UChar        *target,
8579              int32_t            targetLength)
8580{
8581    U_ALIGN_CODE(16);
8582
8583    UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8584    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8585        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8586        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8587        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8588    }
8589
8590    if(source == NULL || target == NULL) {
8591        // do not crash, but return. Should have
8592        // status argument to return error.
8593        UTRACE_EXIT_VALUE(UCOL_EQUAL);
8594        return UCOL_EQUAL;
8595    }
8596
8597    /* Quick check if source and target are same strings. */
8598    /* They should either both be NULL terminated or the explicit length should be set on both. */
8599    if (source==target && sourceLength==targetLength) {
8600        UTRACE_EXIT_VALUE(UCOL_EQUAL);
8601        return UCOL_EQUAL;
8602    }
8603
8604    /* Scan the strings.  Find:                                                             */
8605    /*    The length of any leading portion that is equal                                   */
8606    /*    Whether they are exactly equal.  (in which case we just return)                   */
8607    const UChar    *pSrc    = source;
8608    const UChar    *pTarg   = target;
8609    int32_t        equalLength;
8610
8611    if (sourceLength == -1 && targetLength == -1) {
8612        // Both strings are null terminated.
8613        //    Scan through any leading equal portion.
8614        while (*pSrc == *pTarg && *pSrc != 0) {
8615            pSrc++;
8616            pTarg++;
8617        }
8618        if (*pSrc == 0 && *pTarg == 0) {
8619            UTRACE_EXIT_VALUE(UCOL_EQUAL);
8620            return UCOL_EQUAL;
8621        }
8622        equalLength = pSrc - source;
8623    }
8624    else
8625    {
8626        // One or both strings has an explicit length.
8627        const UChar    *pSrcEnd = source + sourceLength;
8628        const UChar    *pTargEnd = target + targetLength;
8629
8630        // Scan while the strings are bitwise ==, or until one is exhausted.
8631        for (;;) {
8632            if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8633                break;
8634            }
8635            if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8636                break;
8637            }
8638            if (*pSrc != *pTarg) {
8639                break;
8640            }
8641            pSrc++;
8642            pTarg++;
8643        }
8644        equalLength = pSrc - source;
8645
8646        // If we made it all the way through both strings, we are done.  They are ==
8647        if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8648            (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
8649        {
8650            UTRACE_EXIT_VALUE(UCOL_EQUAL);
8651            return UCOL_EQUAL;
8652        }
8653    }
8654    if (equalLength > 0) {
8655        /* There is an identical portion at the beginning of the two strings.        */
8656        /*   If the identical portion ends within a contraction or a comibining      */
8657        /*   character sequence, back up to the start of that sequence.              */
8658
8659        // These values should already be set by the code above.
8660        //pSrc  = source + equalLength;        /* point to the first differing chars   */
8661        //pTarg = target + equalLength;
8662        if (pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8663            pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8664        {
8665            // We are stopped in the middle of a contraction.
8666            // Scan backwards through the == part of the string looking for the start of the contraction.
8667            //   It doesn't matter which string we scan, since they are the same in this region.
8668            do
8669            {
8670                equalLength--;
8671                pSrc--;
8672            }
8673            while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8674        }
8675
8676        source += equalLength;
8677        target += equalLength;
8678        if (sourceLength > 0) {
8679            sourceLength -= equalLength;
8680        }
8681        if (targetLength > 0) {
8682            targetLength -= equalLength;
8683        }
8684    }
8685
8686    UErrorCode status = U_ZERO_ERROR;
8687    UCollationResult returnVal;
8688    if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8689        collIterate sColl, tColl;
8690        // Preparing the context objects for iterating over strings
8691        IInit_collIterate(coll, source, sourceLength, &sColl);
8692        IInit_collIterate(coll, target, targetLength, &tColl);
8693        returnVal = ucol_strcollRegular(&sColl, &tColl, &status);
8694    } else {
8695        returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8696    }
8697    UTRACE_EXIT_VALUE(returnVal);
8698    return returnVal;
8699}
8700
8701/* convenience function for comparing strings */
8702U_CAPI UBool U_EXPORT2
8703ucol_greater(    const    UCollator        *coll,
8704        const    UChar            *source,
8705        int32_t            sourceLength,
8706        const    UChar            *target,
8707        int32_t            targetLength)
8708{
8709    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8710        == UCOL_GREATER);
8711}
8712
8713/* convenience function for comparing strings */
8714U_CAPI UBool U_EXPORT2
8715ucol_greaterOrEqual(    const    UCollator    *coll,
8716            const    UChar        *source,
8717            int32_t        sourceLength,
8718            const    UChar        *target,
8719            int32_t        targetLength)
8720{
8721    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8722        != UCOL_LESS);
8723}
8724
8725/* convenience function for comparing strings */
8726U_CAPI UBool U_EXPORT2
8727ucol_equal(        const    UCollator        *coll,
8728            const    UChar            *source,
8729            int32_t            sourceLength,
8730            const    UChar            *target,
8731            int32_t            targetLength)
8732{
8733    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8734        == UCOL_EQUAL);
8735}
8736
8737U_CAPI void U_EXPORT2
8738ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8739    if(coll && coll->UCA) {
8740        uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8741    }
8742}
8743
8744#endif /* #if !UCONFIG_NO_COLLATION */
8745