1/*
2*******************************************************************************
3*   Copyright (C) 1996-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ucol.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11* Modification history
12* Date        Name      Comments
13* 1996-1999   various members of ICU team maintained C API for collation framework
14* 02/16/2001  synwee    Added internal method getPrevSpecialCE
15* 03/01/2001  synwee    Added maxexpansion functionality.
16* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_COLLATION
22
23#include "unicode/bytestream.h"
24#include "unicode/coleitr.h"
25#include "unicode/unorm.h"
26#include "unicode/udata.h"
27#include "unicode/ustring.h"
28
29#include "ucol_imp.h"
30#include "bocsu.h"
31
32#include "normalizer2impl.h"
33#include "unorm_it.h"
34#include "umutex.h"
35#include "cmemory.h"
36#include "ucln_in.h"
37#include "cstring.h"
38#include "utracimp.h"
39#include "putilimp.h"
40#include "uassert.h"
41
42#ifdef UCOL_DEBUG
43#include <stdio.h>
44#endif
45
46U_NAMESPACE_USE
47
48#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
49
50#define LAST_BYTE_MASK_           0xFF
51#define SECOND_LAST_BYTE_SHIFT_   8
52
53#define ZERO_CC_LIMIT_            0xC0
54
55// this is static pointer to the normalizer fcdTrieIndex
56// it is always the same between calls to u_cleanup
57// and therefore writing to it is not synchronized.
58// It is cleaned in ucol_cleanup
59static const uint16_t *fcdTrieIndex=NULL;
60// Code points at fcdHighStart and above have a zero FCD value.
61static UChar32 fcdHighStart = 0;
62
// These are values from the UCA that are required for
// implicit generation and for suppressing sort key compression.
// They should regularly be in the UCA, but if one
// is running without the UCA, it could be a problem.
67static const int32_t maxRegularPrimary  = 0x7A;
68static const int32_t minImplicitPrimary = 0xE0;
69static const int32_t maxImplicitPrimary = 0xE4;
70
71U_CDECL_BEGIN
/*
 * Cleanup hook for this file: forgets the cached FCD trie index so that
 * initializeFCD() fetches it again the next time it is needed.
 * Registered from initializeFCD() through ucln_i18n_registerCleanup().
 * Always returns TRUE.
 */
static UBool U_CALLCONV
ucol_cleanup(void)
{
    fcdTrieIndex = NULL;
    return TRUE;
}
78
79static int32_t U_CALLCONV
80_getFoldingOffset(uint32_t data) {
81    return (int32_t)(data&0xFFFFFF);
82}
83
84U_CDECL_END
85
86// init FCD data
87static inline
88UBool initializeFCD(UErrorCode *status) {
89    if (fcdTrieIndex != NULL) {
90        return TRUE;
91    } else {
92        // The result is constant, until the library is reloaded.
93        fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
94        ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
95        return U_SUCCESS(*status);
96    }
97}
98
99static
100inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
101                              int32_t sourceLen, collIterate *s,
102                              UErrorCode *status)
103{
104    (s)->string = (s)->pos = sourceString;
105    (s)->origFlags = 0;
106    (s)->flags = 0;
107    if (sourceLen >= 0) {
108        s->flags |= UCOL_ITER_HASLEN;
109        (s)->endp = (UChar *)sourceString+sourceLen;
110    }
111    else {
112        /* change to enable easier checking for end of string for fcdpositon */
113        (s)->endp = NULL;
114    }
115    (s)->extendCEs = NULL;
116    (s)->extendCEsSize = 0;
117    (s)->CEpos = (s)->toReturn = (s)->CEs;
118    (s)->offsetBuffer = NULL;
119    (s)->offsetBufferSize = 0;
120    (s)->offsetReturn = (s)->offsetStore = NULL;
121    (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
122    (s)->coll = (collator);
123    (s)->nfd = Normalizer2Factory::getNFDInstance(*status);
124    (s)->fcdPosition = 0;
125    if(collator->normalizationMode == UCOL_ON) {
126        (s)->flags |= UCOL_ITER_NORM;
127    }
128    if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
129        (s)->flags |= UCOL_HIRAGANA_Q;
130    }
131    (s)->iterator = NULL;
132    //(s)->iteratorIndex = 0;
133}
134
/*
 * Exported wrapper around the file-local inline IInit_collIterate();
 * all arguments are forwarded unchanged.
 */
U_CAPI void  U_EXPORT2
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
                             int32_t sourceLen, collIterate *s,
                             UErrorCode *status) {
    /* Out-of-line version for use from other files. */
    IInit_collIterate(collator, sourceString, sourceLen, s, status);
}
142
143U_CAPI collIterate * U_EXPORT2
144uprv_new_collIterate(UErrorCode *status) {
145    if(U_FAILURE(*status)) {
146        return NULL;
147    }
148    collIterate *s = new collIterate;
149    if(s == NULL) {
150        *status = U_MEMORY_ALLOCATION_ERROR;
151        return NULL;
152    }
153    return s;
154}
155
/*
 * Destroys a collIterate with operator delete
 * (the counterpart of uprv_new_collIterate()).
 */
U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate *s) {
    delete s;
}
160
161U_CAPI UBool U_EXPORT2
162uprv_collIterateAtEnd(collIterate *s) {
163    return s == NULL || s->pos == s->endp;
164}
165
166/**
167* Backup the state of the collIterate struct data
168* @param data collIterate to backup
169* @param backup storage
170*/
171static
172inline void backupState(const collIterate *data, collIterateState *backup)
173{
174    backup->fcdPosition = data->fcdPosition;
175    backup->flags       = data->flags;
176    backup->origFlags   = data->origFlags;
177    backup->pos         = data->pos;
178    backup->bufferaddress = data->writableBuffer.getBuffer();
179    backup->buffersize    = data->writableBuffer.length();
180    backup->iteratorMove = 0;
181    backup->iteratorIndex = 0;
182    if(data->iterator != NULL) {
183        //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
184        backup->iteratorIndex = data->iterator->getState(data->iterator);
185        // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
186        if(backup->iteratorIndex == UITER_NO_STATE) {
187            while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
188                backup->iteratorMove++;
189                data->iterator->move(data->iterator, -1, UITER_CURRENT);
190            }
191            data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
192        }
193    }
194}
195
196/**
197* Loads the state into the collIterate struct data
198* @param data collIterate to backup
199* @param backup storage
200* @param forwards boolean to indicate if forwards iteration is used,
201*        false indicates backwards iteration
202*/
203static
204inline void loadState(collIterate *data, const collIterateState *backup,
205                      UBool        forwards)
206{
207    UErrorCode status = U_ZERO_ERROR;
208    data->flags       = backup->flags;
209    data->origFlags   = backup->origFlags;
210    if(data->iterator != NULL) {
211        //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
212        data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
213        if(backup->iteratorMove != 0) {
214            data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
215        }
216    }
217    data->pos         = backup->pos;
218
219    if ((data->flags & UCOL_ITER_INNORMBUF) &&
220        data->writableBuffer.getBuffer() != backup->bufferaddress) {
221        /*
222        this is when a new buffer has been reallocated and we'll have to
223        calculate the new position.
224        note the new buffer has to contain the contents of the old buffer.
225        */
226        if (forwards) {
227            data->pos = data->writableBuffer.getTerminatedBuffer() +
228                                         (data->pos - backup->bufferaddress);
229        }
230        else {
231            /* backwards direction */
232            int32_t temp = backup->buffersize -
233                                  (int32_t)(data->pos - backup->bufferaddress);
234            data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
235        }
236    }
237    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
238        /*
239        this is alittle tricky.
240        if we are initially not in the normalization buffer, even if we
241        normalize in the later stage, the data in the buffer will be
242        ignored, since we skip back up to the data string.
243        however if we are already in the normalization buffer, any
244        further normalization will pull data into the normalization
245        buffer and modify the fcdPosition.
246        since we are keeping the data in the buffer for use, the
247        fcdPosition can not be reverted back.
248        arrgghh....
249        */
250        data->fcdPosition = backup->fcdPosition;
251    }
252}
253
254static UBool
255reallocCEs(collIterate *data, int32_t newCapacity) {
256    uint32_t *oldCEs = data->extendCEs;
257    if(oldCEs == NULL) {
258        oldCEs = data->CEs;
259    }
260    int32_t length = data->CEpos - oldCEs;
261    uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
262    if(newCEs == NULL) {
263        return FALSE;
264    }
265    uprv_memcpy(newCEs, oldCEs, length * 4);
266    uprv_free(data->extendCEs);
267    data->extendCEs = newCEs;
268    data->extendCEsSize = newCapacity;
269    data->CEpos = newCEs + length;
270    return TRUE;
271}
272
273static UBool
274increaseCEsCapacity(collIterate *data) {
275    int32_t oldCapacity;
276    if(data->extendCEs != NULL) {
277        oldCapacity = data->extendCEsSize;
278    } else {
279        oldCapacity = LENGTHOF(data->CEs);
280    }
281    return reallocCEs(data, 2 * oldCapacity);
282}
283
284static UBool
285ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
286    int32_t oldCapacity;
287    if(data->extendCEs != NULL) {
288        oldCapacity = data->extendCEsSize;
289    } else {
290        oldCapacity = LENGTHOF(data->CEs);
291    }
292    if(minCapacity <= oldCapacity) {
293        return TRUE;
294    }
295    oldCapacity *= 2;
296    return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
297}
298
299void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
300    if(U_FAILURE(errorCode)) {
301        return;
302    }
303    int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
304    if(length >= offsetBufferSize) {
305        int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
306        int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4));
307        if(newBuffer == NULL) {
308            errorCode = U_MEMORY_ALLOCATION_ERROR;
309            return;
310        }
311        if(length > 0) {
312            uprv_memcpy(newBuffer, offsetBuffer, length * 4);
313        }
314        uprv_free(offsetBuffer);
315        offsetBuffer = newBuffer;
316        offsetStore = offsetBuffer + length;
317        offsetBufferSize = newCapacity;
318    }
319    *offsetStore++ = offset;
320}
321
322/*
323* collIter_eos()
324*     Checks for a collIterate being positioned at the end of
325*     its source string.
326*
327*/
328static
329inline UBool collIter_eos(collIterate *s) {
330    if(s->flags & UCOL_USE_ITERATOR) {
331      return !(s->iterator->hasNext(s->iterator));
332    }
333    if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
334        // Null terminated string, but not at null, so not at end.
335        //   Whether in main or normalization buffer doesn't matter.
336        return FALSE;
337    }
338
339    // String with length.  Can't be in normalization buffer, which is always
340    //  null termintated.
341    if (s->flags & UCOL_ITER_HASLEN) {
342        return (s->pos == s->endp);
343    }
344
345    // We are at a null termination, could be either normalization buffer or main string.
346    if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
347        // At null at end of main string.
348        return TRUE;
349    }
350
351    // At null at end of normalization buffer.  Need to check whether there there are
352    //   any characters left in the main buffer.
353    if(s->origFlags & UCOL_USE_ITERATOR) {
354      return !(s->iterator->hasNext(s->iterator));
355    } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
356        // Null terminated main string.  fcdPosition is the 'return' position into main buf.
357        return (*s->fcdPosition == 0);
358    }
359    else {
360        // Main string with an end pointer.
361        return s->fcdPosition == s->endp;
362    }
363}
364
365/*
366* collIter_bos()
367*     Checks for a collIterate being positioned at the start of
368*     its source string.
369*
370*/
371static
372inline UBool collIter_bos(collIterate *source) {
373  // if we're going backwards, we need to know whether there is more in the
374  // iterator, even if we are in the side buffer
375  if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
376    return !source->iterator->hasPrevious(source->iterator);
377  }
378  if (source->pos <= source->string ||
379      ((source->flags & UCOL_ITER_INNORMBUF) &&
380      *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
381    return TRUE;
382  }
383  return FALSE;
384}
385
386/*static
387inline UBool collIter_SimpleBos(collIterate *source) {
388  // if we're going backwards, we need to know whether there is more in the
389  // iterator, even if we are in the side buffer
390  if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
391    return !source->iterator->hasPrevious(source->iterator);
392  }
393  if (source->pos == source->string) {
394    return TRUE;
395  }
396  return FALSE;
397}*/
398    //return (data->pos == data->string) ||
399
400
401/****************************************************************************/
402/* Following are the open/close functions                                   */
403/*                                                                          */
404/****************************************************************************/
405
406static UCollator*
407ucol_initFromBinary(const uint8_t *bin, int32_t length,
408                const UCollator *base,
409                UCollator *fillIn,
410                UErrorCode *status)
411{
412    UCollator *result = fillIn;
413    if(U_FAILURE(*status)) {
414        return NULL;
415    }
416    /*
417    if(base == NULL) {
418        // we don't support null base yet
419        *status = U_ILLEGAL_ARGUMENT_ERROR;
420        return NULL;
421    }
422    */
423    // We need these and we could be running without UCA
424    uprv_uca_initImplicitConstants(status);
425    UCATableHeader *colData = (UCATableHeader *)bin;
426    // do we want version check here? We're trying to figure out whether collators are compatible
427    if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
428        uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
429        colData->version[0] != UCOL_BUILDER_VERSION)
430    {
431        *status = U_COLLATOR_VERSION_MISMATCH;
432        return NULL;
433    }
434    else {
435        if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
436            result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
437            if(U_FAILURE(*status)){
438                return NULL;
439            }
440            result->hasRealData = TRUE;
441        }
442        else {
443            if(base) {
444                result = ucol_initCollator(base->image, result, base, status);
445                ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
446                if(U_FAILURE(*status)){
447                    return NULL;
448                }
449                result->hasRealData = FALSE;
450            }
451            else {
452                *status = U_USELESS_COLLATOR_ERROR;
453                return NULL;
454            }
455        }
456        result->freeImageOnClose = FALSE;
457    }
458    result->actualLocale = NULL;
459    result->validLocale = NULL;
460    result->requestedLocale = NULL;
461    result->rules = NULL;
462    result->rulesLength = 0;
463    result->freeRulesOnClose = FALSE;
464    result->ucaRules = NULL;
465    return result;
466}
467
/*
 * Opens a collator from a binary image.
 * Forwards to ucol_initFromBinary() with a NULL fill-in collator, so the
 * arguments and the error behavior are those of ucol_initFromBinary().
 */
U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t *bin, int32_t length,
                const UCollator *base,
                UErrorCode *status)
{
    return ucol_initFromBinary(bin, length, base, NULL, status);
}
475
476U_CAPI int32_t U_EXPORT2
477ucol_cloneBinary(const UCollator *coll,
478                 uint8_t *buffer, int32_t capacity,
479                 UErrorCode *status)
480{
481    int32_t length = 0;
482    if(U_FAILURE(*status)) {
483        return length;
484    }
485    if(capacity < 0) {
486        *status = U_ILLEGAL_ARGUMENT_ERROR;
487        return length;
488    }
489    if(coll->hasRealData == TRUE) {
490        length = coll->image->size;
491        if(length <= capacity) {
492            uprv_memcpy(buffer, coll->image, length);
493        } else {
494            *status = U_BUFFER_OVERFLOW_ERROR;
495        }
496    } else {
497        length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
498        if(length <= capacity) {
499            /* build the UCATableHeader with minimal entries */
500            /* do not copy the header from the UCA file because its values are wrong! */
501            /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
502
503            /* reset everything */
504            uprv_memset(buffer, 0, length);
505
506            /* set the tailoring-specific values */
507            UCATableHeader *myData = (UCATableHeader *)buffer;
508            myData->size = length;
509
510            /* offset for the options, the only part of the data that is present after the header */
511            myData->options = sizeof(UCATableHeader);
512
513            /* need to always set the expansion value for an upper bound of the options */
514            myData->expansion = myData->options + sizeof(UColOptionSet);
515
516            myData->magic = UCOL_HEADER_MAGIC;
517            myData->isBigEndian = U_IS_BIG_ENDIAN;
518            myData->charSetFamily = U_CHARSET_FAMILY;
519
520            /* copy UCA's version; genrb will override all but the builder version with tailoring data */
521            uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
522
523            uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
524            uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
525            uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
526            myData->jamoSpecial = coll->image->jamoSpecial;
527
528            /* copy the collator options */
529            uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
530        } else {
531            *status = U_BUFFER_OVERFLOW_ERROR;
532        }
533    }
534    return length;
535}
536
537U_CAPI UCollator* U_EXPORT2
538ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
539{
540    UCollator * localCollator;
541    int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
542    char *stackBufferChars = (char *)stackBuffer;
543    int32_t imageSize = 0;
544    int32_t rulesSize = 0;
545    int32_t rulesPadding = 0;
546    int32_t defaultReorderCodesSize = 0;
547    int32_t reorderCodesSize = 0;
548    uint8_t *image;
549    UChar *rules;
550    int32_t* defaultReorderCodes;
551    int32_t* reorderCodes;
552    uint8_t* leadBytePermutationTable;
553    UBool colAllocated = FALSE;
554    UBool imageAllocated = FALSE;
555
556    if (status == NULL || U_FAILURE(*status)){
557        return 0;
558    }
559    if ((stackBuffer && !pBufferSize) || !coll){
560       *status = U_ILLEGAL_ARGUMENT_ERROR;
561        return 0;
562    }
563
564    if (coll->rules && coll->freeRulesOnClose) {
565        rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
566        rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
567        bufferSizeNeeded += rulesSize + rulesPadding;
568    }
569    // no padding for alignment needed from here since the next two are 4 byte quantities
570    if (coll->defaultReorderCodes) {
571        defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
572        bufferSizeNeeded += defaultReorderCodesSize;
573    }
574    if (coll->reorderCodes) {
575        reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
576        bufferSizeNeeded += reorderCodesSize;
577    }
578    if (coll->leadBytePermutationTable) {
579        bufferSizeNeeded += 256 * sizeof(uint8_t);
580    }
581
582    if (stackBuffer && *pBufferSize <= 0) { /* 'preflighting' request - set needed size into *pBufferSize */
583        *pBufferSize =  bufferSizeNeeded;
584        return 0;
585    }
586
587    /* Pointers on 64-bit platforms need to be aligned
588     * on a 64-bit boundry in memory.
589     */
590    if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
591        int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
592        if (*pBufferSize > offsetUp) {
593            *pBufferSize -= offsetUp;
594            stackBufferChars += offsetUp;
595        }
596        else {
597            /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
598            *pBufferSize = 1;
599        }
600    }
601    stackBuffer = (void *)stackBufferChars;
602
603    if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
604        /* allocate one here...*/
605        stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
606        // Null pointer check.
607        if (stackBufferChars == NULL) {
608            *status = U_MEMORY_ALLOCATION_ERROR;
609            return NULL;
610        }
611        colAllocated = TRUE;
612        if (U_SUCCESS(*status)) {
613            *status = U_SAFECLONE_ALLOCATED_WARNING;
614        }
615    }
616    localCollator = (UCollator *)stackBufferChars;
617    rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
618    defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
619    reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
620    leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
621
622    {
623        UErrorCode tempStatus = U_ZERO_ERROR;
624        imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
625    }
626    if (coll->freeImageOnClose) {
627        image = (uint8_t *)uprv_malloc(imageSize);
628        // Null pointer check
629        if (image == NULL) {
630            *status = U_MEMORY_ALLOCATION_ERROR;
631            return NULL;
632        }
633        ucol_cloneBinary(coll, image, imageSize, status);
634        imageAllocated = TRUE;
635    }
636    else {
637        image = (uint8_t *)coll->image;
638    }
639    localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
640    if (U_FAILURE(*status)) {
641        return NULL;
642    }
643
644    if (coll->rules) {
645        if (coll->freeRulesOnClose) {
646            localCollator->rules = u_strcpy(rules, coll->rules);
647            //bufferEnd += rulesSize;
648        }
649        else {
650            localCollator->rules = coll->rules;
651        }
652        localCollator->freeRulesOnClose = FALSE;
653        localCollator->rulesLength = coll->rulesLength;
654    }
655
656    // collator reordering
657    if (coll->defaultReorderCodes) {
658        localCollator->defaultReorderCodes =
659            (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
660        localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
661        localCollator->freeDefaultReorderCodesOnClose = FALSE;
662    }
663    if (coll->reorderCodes) {
664        localCollator->reorderCodes =
665            (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
666        localCollator->reorderCodesLength = coll->reorderCodesLength;
667        localCollator->freeReorderCodesOnClose = FALSE;
668    }
669    if (coll->leadBytePermutationTable) {
670        localCollator->leadBytePermutationTable =
671            (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
672        localCollator->freeLeadBytePermutationTableOnClose = FALSE;
673    }
674
675    int32_t i;
676    for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
677        ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
678    }
679    // zero copies of pointers
680    localCollator->actualLocale = NULL;
681    localCollator->validLocale = NULL;
682    localCollator->requestedLocale = NULL;
683    localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
684    localCollator->freeOnClose = colAllocated;
685    localCollator->freeImageOnClose = imageAllocated;
686    return localCollator;
687}
688
689U_CAPI void U_EXPORT2
690ucol_close(UCollator *coll)
691{
692    UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
693    UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
694    if(coll != NULL) {
695        // these are always owned by each UCollator struct,
696        // so we always free them
697        if(coll->validLocale != NULL) {
698            uprv_free(coll->validLocale);
699        }
700        if(coll->actualLocale != NULL) {
701            uprv_free(coll->actualLocale);
702        }
703        if(coll->requestedLocale != NULL) {
704            uprv_free(coll->requestedLocale);
705        }
706        if(coll->latinOneCEs != NULL) {
707            uprv_free(coll->latinOneCEs);
708        }
709        if(coll->options != NULL && coll->freeOptionsOnClose) {
710            uprv_free(coll->options);
711        }
712        if(coll->rules != NULL && coll->freeRulesOnClose) {
713            uprv_free((UChar *)coll->rules);
714        }
715        if(coll->image != NULL && coll->freeImageOnClose) {
716            uprv_free((UCATableHeader *)coll->image);
717        }
718
719        if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
720            uprv_free(coll->leadBytePermutationTable);
721        }
722        if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
723            uprv_free(coll->defaultReorderCodes);
724        }
725        if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
726            uprv_free(coll->reorderCodes);
727        }
728
729        /* Here, it would be advisable to close: */
730        /* - UData for UCA (unless we stuff it in the root resb */
731        /* Again, do we need additional housekeeping... HMMM! */
732        UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
733        if(coll->freeOnClose){
734            /* for safeClone, if freeOnClose is FALSE,
735            don't free the other instance data */
736            uprv_free(coll);
737        }
738    }
739    UTRACE_EXIT();
740}
741
742/* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
743/* you should be able to get the binary chunk to write out...  Doesn't look very full now */
744U_CFUNC uint8_t* U_EXPORT2
745ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
746{
747    uint8_t *result = NULL;
748    if(U_FAILURE(*status)) {
749        return NULL;
750    }
751    if(coll->hasRealData == TRUE) {
752        *length = coll->image->size;
753        result = (uint8_t *)uprv_malloc(*length);
754        /* test for NULL */
755        if (result == NULL) {
756            *status = U_MEMORY_ALLOCATION_ERROR;
757            return NULL;
758        }
759        uprv_memcpy(result, coll->image, *length);
760    } else {
761        *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
762        result = (uint8_t *)uprv_malloc(*length);
763        /* test for NULL */
764        if (result == NULL) {
765            *status = U_MEMORY_ALLOCATION_ERROR;
766            return NULL;
767        }
768
769        /* build the UCATableHeader with minimal entries */
770        /* do not copy the header from the UCA file because its values are wrong! */
771        /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
772
773        /* reset everything */
774        uprv_memset(result, 0, *length);
775
776        /* set the tailoring-specific values */
777        UCATableHeader *myData = (UCATableHeader *)result;
778        myData->size = *length;
779
780        /* offset for the options, the only part of the data that is present after the header */
781        myData->options = sizeof(UCATableHeader);
782
783        /* need to always set the expansion value for an upper bound of the options */
784        myData->expansion = myData->options + sizeof(UColOptionSet);
785
786        myData->magic = UCOL_HEADER_MAGIC;
787        myData->isBigEndian = U_IS_BIG_ENDIAN;
788        myData->charSetFamily = U_CHARSET_FAMILY;
789
790        /* copy UCA's version; genrb will override all but the builder version with tailoring data */
791        uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
792
793        uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
794        uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
795        uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
796        myData->jamoSpecial = coll->image->jamoSpecial;
797
798        /* copy the collator options */
799        uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
800    }
801    return result;
802}
803
804void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
805    if(U_FAILURE(*status)) {
806        return;
807    }
808    result->caseFirst = (UColAttributeValue)opts->caseFirst;
809    result->caseLevel = (UColAttributeValue)opts->caseLevel;
810    result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
811    result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
812    if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
813        return;
814    }
815    result->strength = (UColAttributeValue)opts->strength;
816    result->variableTopValue = opts->variableTopValue;
817    result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
818    result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
819    result->numericCollation = (UColAttributeValue)opts->numericCollation;
820    result->caseFirstisDefault = TRUE;
821    result->caseLevelisDefault = TRUE;
822    result->frenchCollationisDefault = TRUE;
823    result->normalizationModeisDefault = TRUE;
824    result->strengthisDefault = TRUE;
825    result->variableTopValueisDefault = TRUE;
826    result->alternateHandlingisDefault = TRUE;
827    result->hiraganaQisDefault = TRUE;
828    result->numericCollationisDefault = TRUE;
829
830    ucol_updateInternalState(result, status);
831
832    result->options = opts;
833}
834
835
836/**
837* Approximate determination if a character is at a contraction end.
838* Guaranteed to be TRUE if a character is at the end of a contraction,
839* otherwise it is not deterministic.
840* @param c character to be determined
841* @param coll collator
842*/
843static
844inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
845    if (c < coll->minContrEndCP) {
846        return FALSE;
847    }
848
849    int32_t  hash = c;
850    uint8_t  htbyte;
851    if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
852        if (U16_IS_TRAIL(c)) {
853            return TRUE;
854        }
855        hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
856    }
857    htbyte = coll->contrEndCP[hash>>3];
858    return (((htbyte >> (hash & 7)) & 1) == 1);
859}
860
861
862
863/*
864*   i_getCombiningClass()
865*        A fast, at least partly inline version of u_getCombiningClass()
866*        This is a candidate for further optimization.  Used heavily
867*        in contraction processing.
868*/
869static
870inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
871    uint8_t sCC = 0;
872    if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
873        sCC = u_getCombiningClass(c);
874    }
875    return sCC;
876}
877
878UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
879    UChar c;
880    UCollator *result = fillIn;
881    if(U_FAILURE(*status) || image == NULL) {
882        return NULL;
883    }
884
885    if(result == NULL) {
886        result = (UCollator *)uprv_malloc(sizeof(UCollator));
887        if(result == NULL) {
888            *status = U_MEMORY_ALLOCATION_ERROR;
889            return result;
890        }
891        result->freeOnClose = TRUE;
892    } else {
893        result->freeOnClose = FALSE;
894    }
895
896    result->image = image;
897    result->mapping.getFoldingOffset = _getFoldingOffset;
898    const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
899    utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
900    if(U_FAILURE(*status)) {
901        if(result->freeOnClose == TRUE) {
902            uprv_free(result);
903            result = NULL;
904        }
905        return result;
906    }
907
908    result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
909    result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
910    result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
911    result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
912    result->rules = NULL;
913    result->rulesLength = 0;
914    result->freeRulesOnClose = FALSE;
915    result->defaultReorderCodes = NULL;
916    result->defaultReorderCodesLength = 0;
917    result->freeDefaultReorderCodesOnClose = FALSE;
918    result->reorderCodes = NULL;
919    result->reorderCodesLength = 0;
920    result->freeReorderCodesOnClose = FALSE;
921    result->leadBytePermutationTable = NULL;
922    result->freeLeadBytePermutationTableOnClose = FALSE;
923
924    /* get the version info from UCATableHeader and populate the Collator struct*/
925    result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
926    result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
927    result->dataVersion[2] = 0;
928    result->dataVersion[3] = 0;
929
930    result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
931    result->minUnsafeCP = 0;
932    for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
933        if (ucol_unsafeCP(c, result)) break;
934    }
935    result->minUnsafeCP = c;
936
937    result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
938    result->minContrEndCP = 0;
939    for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
940        if (ucol_contractionEndCP(c, result)) break;
941    }
942    result->minContrEndCP = c;
943
944    /* max expansion tables */
945    result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
946                                         result->image->endExpansionCE);
947    result->lastEndExpansionCE = result->endExpansionCE +
948                                 result->image->endExpansionCECount - 1;
949    result->expansionCESize = (uint8_t*)result->image +
950                                               result->image->expansionCESize;
951
952
953    //result->errorCode = *status;
954
955    result->latinOneCEs = NULL;
956
957    result->latinOneRegenTable = FALSE;
958    result->latinOneFailed = FALSE;
959    result->UCA = UCA;
960
961    /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
962    result->ucaRules = NULL;
963    result->actualLocale = NULL;
964    result->validLocale = NULL;
965    result->requestedLocale = NULL;
966    result->hasRealData = FALSE; // real data lives in .dat file...
967    result->freeImageOnClose = FALSE;
968
969    /* set attributes */
970    ucol_setOptionsFromHeader(
971        result,
972        (UColOptionSet*)((uint8_t*)result->image+result->image->options),
973        status);
974    result->freeOptionsOnClose = FALSE;
975
976    return result;
977}
978
979/* new Mark's code */
980
981/**
982 * For generation of Implicit CEs
983 * @author Davis
984 *
985 * Cleaned up so that changes can be made more easily.
986 * Old values:
987# First Implicit: E26A792D
988# Last Implicit: E3DC70C0
989# First CJK: E0030300
990# Last CJK: E0A9DD00
991# First CJK_A: E0A9DF00
992# Last CJK_A: E0DE3100
993 */
/* Following is a port of Mark's code for the new treatment of implicits.
 * It is positioned here because ucol_initUCA needs to initialize the
 * variables below according to the data in the fractional UCA.
 */
998
999/**
1000 * Function used to:
1001 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
 * b) bump any non-CJK characters by 0x110000 (NON_CJK_OFFSET).
1003 * The relevant blocks are:
1004 * A:    4E00..9FFF; CJK Unified Ideographs
1005 *       F900..FAFF; CJK Compatibility Ideographs
1006 * B:    3400..4DBF; CJK Unified Ideographs Extension A
1007 *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
1008 * As long as
1009 *   no new B characters are allocated between 4E00 and FAFF, and
1010 *   no new A characters are outside of this range,
1011 * (very high probability) this simple code will work.
1012 * The reordered blocks are:
1013 * Block1 is CJK
1014 * Block2 is CJK_COMPAT_USED
1015 * Block3 is CJK_A
1016 * (all contiguous)
1017 * Any other CJK gets its normal code point
 * Any non-CJK gets +0x110000 (NON_CJK_OFFSET)
1019 * When we reorder Block1, we make sure that it is at the very start,
1020 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
1023 */
1024
1025// CONSTANTS
1026static const UChar32
1027    NON_CJK_OFFSET = 0x110000,
1028    UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
1029
/**
 * Layout parameters for implicit weights.
 * All of them are zero until initImplicitConstants() has computed them.
 */
static int32_t final3Multiplier = 0;   // step between used final bytes, 3-byte form
static int32_t final4Multiplier = 0;   // step between used final bytes, 4-byte form
static int32_t final3Count = 0;        // number of final-byte values, 3-byte form
static int32_t final4Count = 0;        // number of final-byte values, 4-byte form
static int32_t medialCount = 0;        // number of values of a middle byte
static int32_t min3Primary = 0;        // first lead byte of the 3-byte form
static int32_t min4Primary = 0;        // first lead byte of the 4-byte form
static int32_t max4Primary = 0;        // last lead byte of the 4-byte form
static int32_t minTrail = 0;           // smallest trail byte
static int32_t maxTrail = 0;           // largest trail byte
static int32_t max3Trail = 0;          // largest final byte, 3-byte form
static int32_t max4Trail = 0;          // largest final byte, 4-byte form
static int32_t min4Boundary = 0;       // first raw value that uses the 4-byte form
1047
1048static const UChar32
1049    // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
1050    // 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
1051    CJK_BASE = 0x4E00,
1052    CJK_LIMIT = 0x9FCB+1,
1053    // Unified CJK ideographs in the compatibility ideographs block.
1054    CJK_COMPAT_USED_BASE = 0xFA0E,
1055    CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1056    // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1057    // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1058    CJK_A_BASE = 0x3400,
1059    CJK_A_LIMIT = 0x4DB5+1,
1060    // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1061    // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1062    CJK_B_BASE = 0x20000,
1063    CJK_B_LIMIT = 0x2A6D6+1,
1064    // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1065    // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1066    CJK_C_BASE = 0x2A700,
1067    CJK_C_LIMIT = 0x2B734+1,
1068    // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1069    // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1070    CJK_D_BASE = 0x2B740,
1071    CJK_D_LIMIT = 0x2B81D+1;
1072    // when adding to this list, look for all occurrences (in project)
1073    // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
1074
1075static UChar32 swapCJK(UChar32 i) {
1076    if (i < CJK_A_BASE) {
1077        // non-CJK
1078    } else if (i < CJK_A_LIMIT) {
1079        // Extension A has lower code points than the original Unihan+compat
1080        // but sorts higher.
1081        return i - CJK_A_BASE
1082                + (CJK_LIMIT - CJK_BASE)
1083                + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1084    } else if (i < CJK_BASE) {
1085        // non-CJK
1086    } else if (i < CJK_LIMIT) {
1087        return i - CJK_BASE;
1088    } else if (i < CJK_COMPAT_USED_BASE) {
1089        // non-CJK
1090    } else if (i < CJK_COMPAT_USED_LIMIT) {
1091        return i - CJK_COMPAT_USED_BASE
1092                + (CJK_LIMIT - CJK_BASE);
1093    } else if (i < CJK_B_BASE) {
1094        // non-CJK
1095    } else if (i < CJK_B_LIMIT) {
1096        return i; // non-BMP-CJK
1097    } else if (i < CJK_C_BASE) {
1098        // non-CJK
1099    } else if (i < CJK_C_LIMIT) {
1100        return i; // non-BMP-CJK
1101    } else if (i < CJK_D_BASE) {
1102        // non-CJK
1103    } else if (i < CJK_D_LIMIT) {
1104        return i; // non-BMP-CJK
1105    }
1106    return i + NON_CJK_OFFSET; // non-CJK
1107}
1108
1109U_CAPI UChar32 U_EXPORT2
1110uprv_uca_getRawFromCodePoint(UChar32 i) {
1111    return swapCJK(i)+1;
1112}
1113
1114U_CAPI UChar32 U_EXPORT2
1115uprv_uca_getCodePointFromRaw(UChar32 i) {
1116    i--;
1117    UChar32 result = 0;
1118    if(i >= NON_CJK_OFFSET) {
1119        result = i - NON_CJK_OFFSET;
1120    } else if(i >= CJK_B_BASE) {
1121        result = i;
1122    } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1123        if(i < CJK_LIMIT - CJK_BASE) {
1124            result = i + CJK_BASE;
1125        } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1126            result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1127        } else {
1128            result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1129        }
1130    } else {
1131        result = -1;
1132    }
1133    return result;
1134}
1135
1136// GET IMPLICIT PRIMARY WEIGHTS
1137// Return value is left justified primary key
1138U_CAPI uint32_t U_EXPORT2
1139uprv_uca_getImplicitFromRaw(UChar32 cp) {
1140    /*
1141    if (cp < 0 || cp > UCOL_MAX_INPUT) {
1142        throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1143    }
1144    */
1145    int32_t last0 = cp - min4Boundary;
1146    if (last0 < 0) {
1147        int32_t last1 = cp / final3Count;
1148        last0 = cp % final3Count;
1149
1150        int32_t last2 = last1 / medialCount;
1151        last1 %= medialCount;
1152
1153        last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1154        last1 = minTrail + last1; // offset
1155        last2 = min3Primary + last2; // offset
1156        /*
1157        if (last2 >= min4Primary) {
1158            throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1159        }
1160        */
1161        return (last2 << 24) + (last1 << 16) + (last0 << 8);
1162    } else {
1163        int32_t last1 = last0 / final4Count;
1164        last0 %= final4Count;
1165
1166        int32_t last2 = last1 / medialCount;
1167        last1 %= medialCount;
1168
1169        int32_t last3 = last2 / medialCount;
1170        last2 %= medialCount;
1171
1172        last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1173        last1 = minTrail + last1; // offset
1174        last2 = minTrail + last2; // offset
1175        last3 = min4Primary + last3; // offset
1176        /*
1177        if (last3 > max4Primary) {
1178            throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1179        }
1180        */
1181        return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1182    }
1183}
1184
1185static uint32_t U_EXPORT2
1186uprv_uca_getImplicitPrimary(UChar32 cp) {
1187   //fprintf(stdout, "Incoming: %04x\n", cp);
1188    //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1189
1190    cp = swapCJK(cp);
1191    cp++;
1192    // we now have a range of numbers from 0 to 21FFFF.
1193
1194    //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1195    //fprintf(stdout, "CJK swapped: %04x\n", cp);
1196
1197    return uprv_uca_getImplicitFromRaw(cp);
1198}
1199
1200/**
1201 * Converts implicit CE into raw integer ("code point")
1202 * @param implicit
1203 * @return -1 if illegal format
1204 */
1205U_CAPI UChar32 U_EXPORT2
1206uprv_uca_getRawFromImplicit(uint32_t implicit) {
1207    UChar32 result;
1208    UChar32 b3 = implicit & 0xFF;
1209    UChar32 b2 = (implicit >> 8) & 0xFF;
1210    UChar32 b1 = (implicit >> 16) & 0xFF;
1211    UChar32 b0 = (implicit >> 24) & 0xFF;
1212
1213    // simple parameter checks
1214    if (b0 < min3Primary || b0 > max4Primary
1215        || b1 < minTrail || b1 > maxTrail)
1216        return -1;
1217    // normal offsets
1218    b1 -= minTrail;
1219
1220    // take care of the final values, and compose
1221    if (b0 < min4Primary) {
1222        if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1223            return -1;
1224        b2 -= minTrail;
1225        UChar32 remainder = b2 % final3Multiplier;
1226        if (remainder != 0)
1227            return -1;
1228        b0 -= min3Primary;
1229        b2 /= final3Multiplier;
1230        result = ((b0 * medialCount) + b1) * final3Count + b2;
1231    } else {
1232        if (b2 < minTrail || b2 > maxTrail
1233            || b3 < minTrail || b3 > max4Trail)
1234            return -1;
1235        b2 -= minTrail;
1236        b3 -= minTrail;
1237        UChar32 remainder = b3 % final4Multiplier;
1238        if (remainder != 0)
1239            return -1;
1240        b3 /= final4Multiplier;
1241        b0 -= min4Primary;
1242        result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1243    }
1244    // final check
1245    if (result < 0 || result > UCOL_MAX_INPUT)
1246        return -1;
1247    return result;
1248}
1249
1250
/* Integer division a/b rounded up, computed as 1 + (a-1)/b. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int below = (a - 1) / b;
    return below + 1;
}
1254
1255/* this function is either called from initUCA or from genUCA before
1256 * doing canonical closure for the UCA.
1257 */
1258
1259/**
1260 * Set up to generate implicits.
1261 * Maintenance Note:  this function may end up being called more than once, due
1262 *                    to threading races during initialization.  Make sure that
1263 *                    none of the Constants is ever transiently assigned an
1264 *                    incorrect value.
1265 * @param minPrimary
1266 * @param maxPrimary
1267 * @param minTrail final byte
1268 * @param maxTrail final byte
1269 * @param gap3 the gap we leave for tailoring for 3-byte forms
1270 * @param gap4 the gap we leave for tailoring for 4-byte forms
1271 */
1272static void initImplicitConstants(int minPrimary, int maxPrimary,
1273                                    int minTrailIn, int maxTrailIn,
1274                                    int gap3, int primaries3count,
1275                                    UErrorCode *status) {
1276    // some simple parameter checks
1277    if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1278        || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1279        || (primaries3count < 1))
1280    {
1281        *status = U_ILLEGAL_ARGUMENT_ERROR;
1282        return;
1283    };
1284
1285    minTrail = minTrailIn;
1286    maxTrail = maxTrailIn;
1287
1288    min3Primary = minPrimary;
1289    max4Primary = maxPrimary;
1290    // compute constants for use later.
1291    // number of values we can use in trailing bytes
1292    // leave room for empty values between AND above, e.g. if gap = 2
1293    // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1294    // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1295    // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1296    final3Multiplier = gap3 + 1;
1297    final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1298    max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1299
1300    // medials can use full range
1301    medialCount = (maxTrail - minTrail + 1);
1302    // find out how many values fit in each form
1303    int32_t threeByteCount = medialCount * final3Count;
1304    // now determine where the 3/4 boundary is.
1305    // we use 3 bytes below the boundary, and 4 above
1306    int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1307    int32_t primaries4count = primariesAvailable - primaries3count;
1308
1309
1310    int32_t min3ByteCoverage = primaries3count * threeByteCount;
1311    min4Primary = minPrimary + primaries3count;
1312    min4Boundary = min3ByteCoverage;
1313    // Now expand out the multiplier for the 4 bytes, and redo.
1314
1315    int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1316    int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1317    int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1318    int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1319    if (gap4 < 1) {
1320        *status = U_ILLEGAL_ARGUMENT_ERROR;
1321        return;
1322    }
1323    final4Multiplier = gap4 + 1;
1324    final4Count = neededPerFinalByte;
1325    max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1326}
1327
1328    /**
1329     * Supply parameters for generating implicit CEs
1330     */
1331U_CAPI void U_EXPORT2
1332uprv_uca_initImplicitConstants(UErrorCode *status) {
1333    // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1334    //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1335    initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1336}
1337
1338
1339/*    collIterNormalize     Incremental Normalization happens here.                       */
1340/*                          pick up the range of chars identifed by FCD,                  */
1341/*                          normalize it into the collIterate's writable buffer,          */
1342/*                          switch the collIterate's state to use the writable buffer.    */
1343/*                                                                                        */
1344static
1345void collIterNormalize(collIterate *collationSource)
1346{
1347    UErrorCode  status = U_ZERO_ERROR;
1348    const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1349    const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1350
1351    collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1352                                    collationSource->writableBuffer,
1353                                    status);
1354    if (U_FAILURE(status)) {
1355#ifdef UCOL_DEBUG
1356        fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
1357#endif
1358        return;
1359    }
1360
1361    collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
1362    collationSource->origFlags  = collationSource->flags;
1363    collationSource->flags     |= UCOL_ITER_INNORMBUF;
1364    collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1365}
1366
1367
1368// This function takes the iterator and extracts normalized stuff up to the next boundary
1369// It is similar in the end results to the collIterNormalize, but for the cases when we
1370// use an iterator
1371/*static
1372inline void normalizeIterator(collIterate *collationSource) {
1373  UErrorCode status = U_ZERO_ERROR;
1374  UBool wasNormalized = FALSE;
1375  //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1376  uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1377  int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1378    (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1379  if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1380    // reallocate and terminate
1381    if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1382                               &collationSource->writableBuffer,
1383                               (int32_t *)&collationSource->writableBufSize, normLen + 1,
1384                               0)
1385    ) {
1386    #ifdef UCOL_DEBUG
1387        fprintf(stderr, "normalizeIterator(), out of memory\n");
1388    #endif
1389        return;
1390    }
1391    status = U_ZERO_ERROR;
1392    //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1393    collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1394    normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1395    (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1396  }
1397  // Terminate the buffer - we already checked that it is big enough
1398  collationSource->writableBuffer[normLen] = 0;
1399  if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1400      collationSource->flags |= UCOL_ITER_ALLOCATED;
1401  }
1402  collationSource->pos        = collationSource->writableBuffer;
1403  collationSource->origFlags  = collationSource->flags;
1404  collationSource->flags     |= UCOL_ITER_INNORMBUF;
1405  collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1406}*/
1407
1408
1409/* Incremental FCD check and normalize                                                    */
1410/*   Called from getNextCE when normalization state is suspect.                           */
1411/*   When entering, the state is known to be this:                                        */
1412/*      o   We are working in the main buffer of the collIterate, not the side            */
1413/*          writable buffer.  When in the side buffer, normalization mode is always off,  */
1414/*          so we won't get here.                                                         */
1415/*      o   The leading combining class from the current character is 0 or                */
1416/*          the trailing combining class of the previous char was zero.                   */
1417/*          True because the previous call to this function will have always exited       */
1418/*          that way, and we get called for every char where cc might be non-zero.        */
1419static
1420inline UBool collIterFCD(collIterate *collationSource) {
1421    const UChar *srcP, *endP;
1422    uint8_t     leadingCC;
1423    uint8_t     prevTrailingCC = 0;
1424    uint16_t    fcd;
1425    UBool       needNormalize = FALSE;
1426
1427    srcP = collationSource->pos-1;
1428
1429    if (collationSource->flags & UCOL_ITER_HASLEN) {
1430        endP = collationSource->endp;
1431    } else {
1432        endP = NULL;
1433    }
1434
1435    // Get the trailing combining class of the current character.  If it's zero,
1436    //   we are OK.
1437    /* trie access */
1438    fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1439    if (fcd != 0) {
1440        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1441
1442        if (prevTrailingCC != 0) {
1443            // The current char has a non-zero trailing CC.  Scan forward until we find
1444            //   a char with a leading cc of zero.
1445            while (endP == NULL || srcP != endP)
1446            {
1447                const UChar *savedSrcP = srcP;
1448
1449                /* trie access */
1450                fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP);
1451                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1452                if (leadingCC == 0) {
1453                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
1454                                           //   back up over it.  (Could be surrogate pair!)
1455                    break;
1456                }
1457
1458                if (leadingCC < prevTrailingCC) {
1459                    needNormalize = TRUE;
1460                }
1461
1462                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1463            }
1464        }
1465    }
1466
1467    collationSource->fcdPosition = (UChar *)srcP;
1468
1469    return needNormalize;
1470}
1471
1472/****************************************************************************/
1473/* Following are the CE retrieval functions                                 */
1474/*                                                                          */
1475/****************************************************************************/
1476
// Forward declarations only; the bodies are not part of this section
// (presumably defined further down in this file).
static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1479
/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element. */
/* If it is not successful or it encounters a more difficult situation,*/
/* some more sophisticated and slower functions are invoked.           */
1484static
1485inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1486    uint32_t order = 0;
1487    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
1488        order = *(collationSource->toReturn++);                         /* if so, return them */
1489        if(collationSource->CEpos == collationSource->toReturn) {
1490            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1491        }
1492        return order;
1493    }
1494
1495    UChar ch = 0;
1496    collationSource->offsetReturn = NULL;
1497
1498    do {
1499        for (;;)                           /* Loop handles case when incremental normalize switches   */
1500        {                                  /*   to or from the side buffer / original string, and we  */
1501            /*   need to start again to get the next character.        */
1502
1503            if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1504            {
1505                // The source string is null terminated and we're not working from the side buffer,
1506                //   and we're not normalizing.  This is the fast path.
1507                //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1508                ch = *collationSource->pos++;
1509                if (ch != 0) {
1510                    break;
1511                }
1512                else {
1513                    return UCOL_NO_MORE_CES;
1514                }
1515            }
1516
1517            if (collationSource->flags & UCOL_ITER_HASLEN) {
1518                // Normal path for strings when length is specified.
1519                //   (We can't be in side buffer because it is always null terminated.)
1520                if (collationSource->pos >= collationSource->endp) {
1521                    // Ran off of the end of the main source string.  We're done.
1522                    return UCOL_NO_MORE_CES;
1523                }
1524                ch = *collationSource->pos++;
1525            }
1526            else if(collationSource->flags & UCOL_USE_ITERATOR) {
1527                UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1528                if(iterCh == U_SENTINEL) {
1529                    return UCOL_NO_MORE_CES;
1530                }
1531                ch = (UChar)iterCh;
1532            }
1533            else
1534            {
1535                // Null terminated string.
1536                ch = *collationSource->pos++;
1537                if (ch == 0) {
1538                    // Ran off end of buffer.
1539                    if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1540                        // Ran off end of main string. backing up one character.
1541                        collationSource->pos--;
1542                        return UCOL_NO_MORE_CES;
1543                    }
1544                    else
1545                    {
1546                        // Hit null in the normalize side buffer.
1547                        // Usually this means the end of the normalized data,
1548                        // except for one odd case: a null followed by combining chars,
1549                        //   which is the case if we are at the start of the buffer.
1550                        if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
1551                            break;
1552                        }
1553
1554                        //  Null marked end of side buffer.
1555                        //   Revert to the main string and
1556                        //   loop back to top to try again to get a character.
1557                        collationSource->pos   = collationSource->fcdPosition;
1558                        collationSource->flags = collationSource->origFlags;
1559                        continue;
1560                    }
1561                }
1562            }
1563
1564            if(collationSource->flags&UCOL_HIRAGANA_Q) {
1565                /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1566                 * based on whether the previous codepoint was Hiragana or Katakana.
1567                 */
1568                if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1569                        ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1570                    collationSource->flags |= UCOL_WAS_HIRAGANA;
1571                } else {
1572                    collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1573                }
1574            }
1575
1576            // We've got a character.  See if there's any fcd and/or normalization stuff to do.
1577            //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1578            if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1579                break;
1580            }
1581
1582            if (collationSource->fcdPosition >= collationSource->pos) {
1583                // An earlier FCD check has already covered the current character.
1584                // We can go ahead and process this char.
1585                break;
1586            }
1587
1588            if (ch < ZERO_CC_LIMIT_ ) {
1589                // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
1590                break;
1591            }
1592
1593            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1594                // We need to peek at the next character in order to tell if we are FCD
1595                if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1596                    // We are at the last char of source string.
1597                    //  It is always OK for FCD check.
1598                    break;
1599                }
1600
1601                // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
1602                if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1603                    break;
1604                }
1605            }
1606
1607
1608            // Need a more complete FCD check and possible normalization.
1609            if (collIterFCD(collationSource)) {
1610                collIterNormalize(collationSource);
1611            }
1612            if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1613                //  No normalization was needed.  Go ahead and process the char we already had.
1614                break;
1615            }
1616
1617            // Some normalization happened.  Next loop iteration will pick up a char
1618            //   from the normalization buffer.
1619
1620        }   // end for (;;)
1621
1622
1623        if (ch <= 0xFF) {
1624            /*  For latin-1 characters we never need to fall back to the UCA table        */
1625            /*    because all of the UCA data is replicated in the latinOneMapping array  */
1626            order = coll->latinOneMapping[ch];
1627            if (order > UCOL_NOT_FOUND) {
1628                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1629            }
1630        }
1631        else
1632        {
1633            // Always use UCA for Han, Hangul
1634            // (Han extension A is before main Han block)
1635            // **** Han compatibility chars ?? ****
1636            if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1637                (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1638                if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1639                    // between the two target ranges; do normal lookup
1640                    // **** this range is YI, Modifier tone letters, ****
1641                    // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
1642                    // **** Latin-D might be tailored, so we need to ****
1643                    // **** do the normal lookup for these guys.     ****
1644                    order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1645                } else {
1646                    // in one of the target ranges; use UCA
1647                    order = UCOL_NOT_FOUND;
1648                }
1649            } else {
1650                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1651            }
1652
1653            if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
1654                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
1655            }
1656
1657            if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
1658                /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1659                order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1660
1661                if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1662                    order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1663                }
1664            }
1665        }
1666    } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
1667
1668    if(order == UCOL_NOT_FOUND) {
1669        order = getImplicit(ch, collationSource);
1670    }
1671    return order; /* return the CE */
1672}
1673
1674/* ucol_getNextCE, out-of-line version for use from other files.   */
1675U_CAPI uint32_t  U_EXPORT2
1676ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1677    return ucol_IGetNextCE(coll, collationSource, status);
1678}
1679
1680
1681/**
1682* Incremental previous normalization happens here. Pick up the range of chars
1683* identifed by FCD, normalize it into the collIterate's writable buffer,
1684* switch the collIterate's state to use the writable buffer.
1685* @param data collation iterator data
1686*/
1687static
1688void collPrevIterNormalize(collIterate *data)
1689{
1690    UErrorCode status  = U_ZERO_ERROR;
1691    const UChar *pEnd   = data->pos;  /* End normalize + 1 */
1692    const UChar *pStart;
1693
1694    /* Start normalize */
1695    if (data->fcdPosition == NULL) {
1696        pStart = data->string;
1697    }
1698    else {
1699        pStart = data->fcdPosition + 1;
1700    }
1701
1702    int32_t normLen =
1703        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
1704                             data->writableBuffer,
1705                             status).
1706        length();
1707    if(U_FAILURE(status)) {
1708        return;
1709    }
1710    /*
1711    this puts the null termination infront of the normalized string instead
1712    of the end
1713    */
1714    data->writableBuffer.insert(0, (UChar)0);
1715
1716    /*
1717     * The usual case at this point is that we've got a base
1718     * character followed by marks that were normalized. If
1719     * fcdPosition is NULL, that means that we backed up to
1720     * the beginning of the string and there's no base character.
1721     *
1722     * Forward processing will usually normalize when it sees
1723     * the first mark, so that mark will get it's natural offset
1724     * and the rest will get the offset of the character following
1725     * the marks. The base character will also get its natural offset.
1726     *
1727     * We write the offset of the base character, if there is one,
1728     * followed by the offset of the first mark and then the offsets
1729     * of the rest of the marks.
1730     */
1731    int32_t firstMarkOffset = 0;
1732    int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
1733    int32_t trailCount      = normLen - 1;
1734
1735    if (data->fcdPosition != NULL) {
1736        int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1737        UChar   baseChar   = *data->fcdPosition;
1738
1739        firstMarkOffset = baseOffset + 1;
1740
1741        /*
1742         * If the base character is the start of a contraction, forward processing
1743         * will normalize the marks while checking for the contraction, which means
1744         * that the offset of the first mark will the same as the other marks.
1745         *
1746         * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1747         */
1748        if (baseChar >= 0x100) {
1749            uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1750
1751            if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1752                baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1753            }
1754
1755            if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1756                firstMarkOffset = trailOffset;
1757            }
1758        }
1759
1760        data->appendOffset(baseOffset, status);
1761    }
1762
1763    data->appendOffset(firstMarkOffset, status);
1764
1765    for (int32_t i = 0; i < trailCount; i += 1) {
1766        data->appendOffset(trailOffset, status);
1767    }
1768
1769    data->offsetRepeatValue = trailOffset;
1770
1771    data->offsetReturn = data->offsetStore - 1;
1772    if (data->offsetReturn == data->offsetBuffer) {
1773        data->offsetStore = data->offsetBuffer;
1774    }
1775
1776    data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1777    data->origFlags  = data->flags;
1778    data->flags     |= UCOL_ITER_INNORMBUF;
1779    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1780}
1781
1782
1783/**
1784* Incremental FCD check for previous iteration and normalize. Called from
1785* getPrevCE when normalization state is suspect.
1786* When entering, the state is known to be this:
1787* o  We are working in the main buffer of the collIterate, not the side
1788*    writable buffer. When in the side buffer, normalization mode is always
1789*    off, so we won't get here.
1790* o  The leading combining class from the current character is 0 or the
1791*    trailing combining class of the previous char was zero.
1792*    True because the previous call to this function will have always exited
1793*    that way, and we get called for every char where cc might be non-zero.
1794* @param data collation iterate struct
1795* @return normalization status, TRUE for normalization to be done, FALSE
1796*         otherwise
1797*/
1798static
1799inline UBool collPrevIterFCD(collIterate *data)
1800{
1801    const UChar *src, *start;
1802    uint8_t     leadingCC;
1803    uint8_t     trailingCC = 0;
1804    uint16_t    fcd;
1805    UBool       result = FALSE;
1806
1807    start = data->string;
1808    src = data->pos + 1;
1809
1810    /* Get the trailing combining class of the current character. */
1811    fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1812
1813    leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1814
1815    if (leadingCC != 0) {
1816        /*
1817        The current char has a non-zero leading combining class.
1818        Scan backward until we find a char with a trailing cc of zero.
1819        */
1820        for (;;)
1821        {
1822            if (start == src) {
1823                data->fcdPosition = NULL;
1824                return result;
1825            }
1826
1827            fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src);
1828
1829            trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1830
1831            if (trailingCC == 0) {
1832                break;
1833            }
1834
1835            if (leadingCC < trailingCC) {
1836                result = TRUE;
1837            }
1838
1839            leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1840        }
1841    }
1842
1843    data->fcdPosition = (UChar *)src;
1844
1845    return result;
1846}
1847
1848/** gets a code unit from the string at a given offset
1849 *  Handles both normal and iterative cases.
1850 *  No error checking - caller beware!
1851 */
1852static inline
1853UChar peekCodeUnit(collIterate *source, int32_t offset) {
1854    if(source->pos != NULL) {
1855        return *(source->pos + offset);
1856    } else if(source->iterator != NULL) {
1857        UChar32 c;
1858        if(offset != 0) {
1859            source->iterator->move(source->iterator, offset, UITER_CURRENT);
1860            c = source->iterator->next(source->iterator);
1861            source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1862        } else {
1863            c = source->iterator->current(source->iterator);
1864        }
1865        return c >= 0 ? (UChar)c : 0xfffd;  // If the caller works properly, we should never see c<0.
1866    } else {
1867        return 0xfffd;
1868    }
1869}
1870
1871// Code point version. Treats the offset as a _code point_ delta.
1872// We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
1873// We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
1874static inline
1875UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1876    UChar32 c;
1877    if(source->pos != NULL) {
1878        const UChar *p = source->pos;
1879        if(offset >= 0) {
1880            // Skip forward over (offset-1) code points.
1881            while(--offset >= 0) {
1882                if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1883                    ++p;
1884                }
1885            }
1886            // Read the code point there.
1887            c = *p++;
1888            UChar trail;
1889            if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1890                c = U16_GET_SUPPLEMENTARY(c, trail);
1891            }
1892        } else /* offset<0 */ {
1893            // Skip backward over (offset-1) code points.
1894            while(++offset < 0) {
1895                if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1896                    --p;
1897                }
1898            }
1899            // Read the code point before that.
1900            c = *--p;
1901            UChar lead;
1902            if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1903                c = U16_GET_SUPPLEMENTARY(lead, c);
1904            }
1905        }
1906    } else if(source->iterator != NULL) {
1907        if(offset >= 0) {
1908            // Skip forward over (offset-1) code points.
1909            int32_t fwd = offset;
1910            while(fwd-- > 0) {
1911                uiter_next32(source->iterator);
1912            }
1913            // Read the code point there.
1914            c = uiter_current32(source->iterator);
1915            // Return to the starting point, skipping backward over (offset-1) code points.
1916            while(offset-- > 0) {
1917                uiter_previous32(source->iterator);
1918            }
1919        } else /* offset<0 */ {
1920            // Read backward, reading offset code points, remember only the last-read one.
1921            int32_t back = offset;
1922            do {
1923                c = uiter_previous32(source->iterator);
1924            } while(++back < 0);
1925            // Return to the starting position, skipping forward over offset code points.
1926            do {
1927                uiter_next32(source->iterator);
1928            } while(++offset < 0);
1929        }
1930    } else {
1931        c = U_SENTINEL;
1932    }
1933    return c;
1934}
1935
1936/**
1937* Determines if we are at the start of the data string in the backwards
1938* collation iterator
1939* @param data collation iterator
1940* @return TRUE if we are at the start
1941*/
1942static
1943inline UBool isAtStartPrevIterate(collIterate *data) {
1944    if(data->pos == NULL && data->iterator != NULL) {
1945        return !data->iterator->hasPrevious(data->iterator);
1946    }
1947    //return (collIter_bos(data)) ||
1948    return (data->pos == data->string) ||
1949              ((data->flags & UCOL_ITER_INNORMBUF) &&
1950              *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1951}
1952
1953static
1954inline void goBackOne(collIterate *data) {
1955# if 0
1956    // somehow, it looks like we need to keep iterator synced up
1957    // at all times, as above.
1958    if(data->pos) {
1959        data->pos--;
1960    }
1961    if(data->iterator) {
1962        data->iterator->previous(data->iterator);
1963    }
1964#endif
1965    if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1966        data->iterator->previous(data->iterator);
1967    }
1968    if(data->pos) {
1969        data->pos --;
1970    }
1971}
1972
1973/**
* Inline function that gets the previous collation element (CE) for
* backward iteration.
* It first steps the offset bookkeeping (offsetReturn / offsetRepeatCount)
* back by one. Then, if the CE buffer still holds elements, i.e. toReturn is
* beyond the start of extendCEs (when set) or of CEs, it decrements toReturn
* and returns the CE found there.
* Otherwise it reads the previous code unit from the main string, from the
* UCharIterator, or from the normalization side buffer, performs the backward
* FCD check / normalization when UCOL_ITER_NORM is set, and maps the code
* unit to a CE; special CEs are resolved by ucol_prv_getSpecialPrevCE.
* @param coll collator data
* @param data collation iterator struct
* @param status error status
* @return the CE, or UCOL_NO_MORE_CES when the start of the text is reached
*/
1984static
1985inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1986                               UErrorCode *status)
1987{
1988    uint32_t result = (uint32_t)UCOL_NULLORDER;
1989
    /*
    Step the offset bookkeeping back: use up a pending repeat first, otherwise
    move offsetReturn back one slot; once it is at the start of offsetBuffer,
    reset both offsetReturn and offsetStore.
    */
1990    if (data->offsetReturn != NULL) {
1991        if (data->offsetRepeatCount > 0) {
1992                data->offsetRepeatCount -= 1;
1993        } else {
1994            if (data->offsetReturn == data->offsetBuffer) {
1995                data->offsetReturn = NULL;
1996                data->offsetStore  = data->offsetBuffer;
1997            } else {
1998                data->offsetReturn -= 1;
1999            }
2000        }
2001    }
2002
    /* CEs are still buffered: hand back the one before toReturn. */
2003    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
2004            (!data->extendCEs && data->toReturn > data->CEs))
2005    {
2006        data->toReturn -= 1;
2007        result = *(data->toReturn);
        /* The buffer start was reached: CEpos is reset to it as well. */
2008        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
2009            data->CEpos = data->toReturn;
2010        }
2011    }
2012    else {
2013        UChar ch = 0;
2014
        /* Repeated while the result is UCOL_IGNORABLE for a ch in the Hangul range (see the loop condition). */
2015        do {
2016            /*
2017            Loop handles case when incremental normalize switches to or from the
2018            side buffer / original string, and we need to start again to get the
2019            next character.
2020            */
2021            for (;;) {
2022                if (data->flags & UCOL_ITER_HASLEN) {
2023                    /*
2024                    Normal path for strings when length is specified.
2025                    Not in side buffer because it is always null terminated.
2026                    */
2027                    if (data->pos <= data->string) {
2028                        /* End of the main source string */
2029                        return UCOL_NO_MORE_CES;
2030                    }
2031                    data->pos --;
2032                    ch = *data->pos;
2033                }
2034                // we are using an iterator to go back. Pray for us!
2035                else if (data->flags & UCOL_USE_ITERATOR) {
2036                  UChar32 iterCh = data->iterator->previous(data->iterator);
2037                  if(iterCh == U_SENTINEL) {
2038                    return UCOL_NO_MORE_CES;
2039                  } else {
2040                    ch = (UChar)iterCh;
2041                  }
2042                }
2043                else {
2044                    data->pos --;
2045                    ch = *data->pos;
2046                    /* we are in the side buffer. */
2047                    if (ch == 0) {
2048                        /*
2049                        At the start of the normalize side buffer.
2050                        Go back to string.
2051                        Because pointer points to the last accessed character,
2052                        hence we have to increment it by one here.
2053                        */
2054                        data->flags = data->origFlags;
2055                        data->offsetRepeatValue = 0;
2056
2057                         if (data->fcdPosition == NULL) {
2058                            data->pos = data->string;
2059                            return UCOL_NO_MORE_CES;
2060                        }
2061                        else {
2062                            data->pos   = data->fcdPosition + 1;
2063                        }
2064
2065                       continue;
2066                    }
2067                }
2068
                /* Track whether the code unit just read lies in U+3040..U+309F. */
2069                if(data->flags&UCOL_HIRAGANA_Q) {
2070                  if(ch>=0x3040 && ch<=0x309f) {
2071                    data->flags |= UCOL_WAS_HIRAGANA;
2072                  } else {
2073                    data->flags &= ~UCOL_WAS_HIRAGANA;
2074                  }
2075                }
2076
2077                /*
2078                * got a character to determine if there's fcd and/or normalization
2079                * stuff to do.
2080                * if the current character is not fcd.
2081                * if current character is at the start of the string
2082                * Trailing combining class == 0.
2083                * Note if pos is in the writablebuffer, norm is always 0
2084                */
2085                if (ch < ZERO_CC_LIMIT_ ||
2086                  // this should propel us out of the loop in the iterator case
2087                    (data->flags & UCOL_ITER_NORM) == 0 ||
2088                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
2089                    || data->string == data->pos) {
2090                    break;
2091                }
2092
2093                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2094                    /* if next character is FCD */
2095                    if (data->pos == data->string) {
2096                        /* First char of string is always OK for FCD check */
2097                        break;
2098                    }
2099
2100                    /* Not first char of string, do the FCD fast test */
2101                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2102                        break;
2103                    }
2104                }
2105
2106                /* Need a more complete FCD check and possible normalization. */
2107                if (collPrevIterFCD(data)) {
2108                    collPrevIterNormalize(data);
2109                }
2110
2111                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2112                    /*  No normalization. Go ahead and process the char. */
2113                    break;
2114                }
2115
2116                /*
2117                Some normalization happened.
2118                Next loop picks up a char from the normalization buffer.
2119                */
2120            }
2121
2122            /* attempt to handle contractions, after removal of the backwards
2123            contraction
2124            */
2125            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2126                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2127            } else {
                /* Code units up to 0xFF are looked up in latinOneMapping, everything else in the mapping trie. */
2128                if (ch <= 0xFF) {
2129                    result = coll->latinOneMapping[ch];
2130                }
2131                else {
2132                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
2133                    // **** [FA0E..FA2F] ?? ****
2134                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
2135                        (ch >= 0x3400 && ch <= 0xD7AF)) {
2136                        if (ch > 0x9FFF && ch < 0xAC00) {
2137                            // between the two target ranges; do normal lookup
2138                            // **** this range is YI, Modifier tone letters, ****
2139                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
2140                            // **** Latin-D might be tailored, so we need to ****
2141                            // **** do the normal lookup for these guys.     ****
2142                             result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2143                        } else {
2144                            result = UCOL_NOT_FOUND;
2145                        }
2146                    } else {
2147                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2148                    }
2149                }
                /* Values above UCOL_NOT_FOUND are special CEs and need further resolution. */
2150                if (result > UCOL_NOT_FOUND) {
2151                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2152                }
2153                if (result == UCOL_NOT_FOUND) { // Not found in master list
2154                    if (!isAtStartPrevIterate(data) &&
2155                        ucol_contractionEndCP(ch, data->coll))
2156                    {
2157                        result = UCOL_CONTRACTION;
2158                    } else {
                        /* Fall back to the UCA mapping, if a UCA collator is attached. */
2159                        if(coll->UCA) {
2160                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
2161                        }
2162                    }
2163
2164                    if (result > UCOL_NOT_FOUND) {
2165                        if(coll->UCA) {
2166                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2167                        }
2168                    }
2169                }
2170            }
2171        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
2172
        /* Still no CE: let getPrevImplicit produce the result. */
2173        if(result == UCOL_NOT_FOUND) {
2174            result = getPrevImplicit(ch, data);
2175        }
2176    }
2177
2178    return result;
2179}
2180
2181
2182/*   ucol_getPrevCE, out-of-line version for use from other files.  */
2183U_CFUNC uint32_t  U_EXPORT2
2184ucol_getPrevCE(const UCollator *coll, collIterate *data,
2185                        UErrorCode *status) {
2186    return ucol_IGetPrevCE(coll, data, status);
2187}
2188
2189
2190/* this should be connected to special Jamo handling */
2191U_CFUNC uint32_t  U_EXPORT2
2192ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2193    collIterate colIt;
2194    IInit_collIterate(coll, &u, 1, &colIt, status);
2195    if(U_FAILURE(*status)) {
2196        return 0;
2197    }
2198    return ucol_IGetNextCE(coll, &colIt, status);
2199}
2200
2201/**
2202* Inserts the argument character into the end of the buffer pushing back the
2203* null terminator.
2204* @param data collIterate struct data
2205* @param ch character to be appended
2206* @return the position of the new addition
2207*/
2208static
2209inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2210{
2211    int32_t oldLength = data->writableBuffer.length();
2212    return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2213}
2214
2215/**
2216* Inserts the argument string into the end of the buffer pushing back the
2217* null terminator.
2218* @param data collIterate struct data
2219* @param string to be appended
2220* @param length of the string to be appended
2221* @return the position of the new addition
2222*/
2223static
2224inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
2225{
2226    int32_t oldLength = data->writableBuffer.length();
2227    return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
2228}
2229
2230/**
2231* Special normalization function for contraction in the forwards iterator.
2232* This normalization sequence will place the current character at source->pos
2233* and its following normalized sequence into the buffer.
2234* The fcd position, pos will be changed.
2235* pos will now point to positions in the buffer.
2236* Flags will be changed accordingly.
2237* @param data collation iterator data
2238*/
2239static
2240inline void normalizeNextContraction(collIterate *data)
2241{
2242    int32_t     strsize;
2243    UErrorCode  status     = U_ZERO_ERROR;
2244    /* because the pointer points to the next character */
2245    const UChar *pStart    = data->pos - 1;
2246    const UChar *pEnd;
2247
2248    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2249        data->writableBuffer.setTo(*(pStart - 1));
2250        strsize               = 1;
2251    }
2252    else {
2253        strsize = data->writableBuffer.length();
2254    }
2255
2256    pEnd = data->fcdPosition;
2257
2258    data->writableBuffer.append(
2259        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
2260    if(U_FAILURE(status)) {
2261        return;
2262    }
2263
2264    data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
2265    data->origFlags  = data->flags;
2266    data->flags     |= UCOL_ITER_INNORMBUF;
2267    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2268}
2269
2270/**
2271* Contraction character management function that returns the next character
2272* for the forwards iterator.
2273* Does nothing if the next character is in buffer and not the first character
2274* in it.
2275* Else it checks next character in data string to see if it is normalizable.
2276* If it is not, the character is simply copied into the buffer, else
2277* the whole normalized substring is copied into the buffer, including the
2278* current character.
2279* @param data collation element iterator data
2280* @return next character
2281*/
2282static
2283inline UChar getNextNormalizedChar(collIterate *data)
2284{
2285    UChar  nextch;
2286    UChar  ch;
2287    // Here we need to add the iterator code. One problem is the way
2288    // end of string is handled. If we just return next char, it could
2289    // be the sentinel. Most of the cases already check for this, but we
2290    // need to be sure.
2291    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2292         /* if no normalization and not in buffer. */
2293      if(data->flags & UCOL_USE_ITERATOR) {
2294         return (UChar)data->iterator->next(data->iterator);
2295      } else {
2296         return *(data->pos ++);
2297      }
2298    }
2299
2300    //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2301      //normalizeIterator(data);
2302    //}
2303
2304    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2305    if ((innormbuf && *data->pos != 0) ||
2306        (data->fcdPosition != NULL && !innormbuf &&
2307        data->pos < data->fcdPosition)) {
2308        /*
2309        if next character is in normalized buffer, no further normalization
2310        is required
2311        */
2312        return *(data->pos ++);
2313    }
2314
2315    if (data->flags & UCOL_ITER_HASLEN) {
2316        /* in data string */
2317        if (data->pos + 1 == data->endp) {
2318            return *(data->pos ++);
2319        }
2320    }
2321    else {
2322        if (innormbuf) {
2323          // inside the normalization buffer, but at the end
2324          // (since we encountered zero). This means, in the
2325          // case we're using char iterator, that we need to
2326          // do another round of normalization.
2327          //if(data->origFlags & UCOL_USE_ITERATOR) {
2328            // we need to restore original flags,
2329            // otherwise, we'll lose them
2330            //data->flags = data->origFlags;
2331            //normalizeIterator(data);
2332            //return *(data->pos++);
2333          //} else {
2334            /*
2335            in writable buffer, at this point fcdPosition can not be
2336            pointing to the end of the data string. see contracting tag.
2337            */
2338          if(data->fcdPosition) {
2339            if (*(data->fcdPosition + 1) == 0 ||
2340                data->fcdPosition + 1 == data->endp) {
2341                /* at the end of the string, dump it into the normalizer */
2342                data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2343                // Check if data->pos received a null pointer
2344                if (data->pos == NULL) {
2345                    return (UChar)-1; // Return to indicate error.
2346                }
2347                return *(data->fcdPosition ++);
2348            }
2349            data->pos = data->fcdPosition;
2350          } else if(data->origFlags & UCOL_USE_ITERATOR) {
2351            // if we are here, we're using a normalizing iterator.
2352            // we should just continue further.
2353            data->flags = data->origFlags;
2354            data->pos = NULL;
2355            return (UChar)data->iterator->next(data->iterator);
2356          }
2357          //}
2358        }
2359        else {
2360            if (*(data->pos + 1) == 0) {
2361                return *(data->pos ++);
2362            }
2363        }
2364    }
2365
2366    ch = *data->pos ++;
2367    nextch = *data->pos;
2368
2369    /*
2370    * if the current character is not fcd.
2371    * Trailing combining class == 0.
2372    */
2373    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2374        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2375         ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2376            /*
2377            Need a more complete FCD check and possible normalization.
2378            normalize substring will be appended to buffer
2379            */
2380        if (collIterFCD(data)) {
2381            normalizeNextContraction(data);
2382            return *(data->pos ++);
2383        }
2384        else if (innormbuf) {
2385            /* fcdposition shifted even when there's no normalization, if we
2386            don't input the rest into this, we'll get the wrong position when
2387            we reach the end of the writableBuffer */
2388            int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2389            data->pos = insertBufferEnd(data, data->pos - 1, length);
2390            // Check if data->pos received a null pointer
2391            if (data->pos == NULL) {
2392                return (UChar)-1; // Return to indicate error.
2393            }
2394            return *(data->pos ++);
2395        }
2396    }
2397
2398    if (innormbuf) {
2399        /*
2400        no normalization is to be done hence only one character will be
2401        appended to the buffer.
2402        */
2403        data->pos = insertBufferEnd(data, ch) + 1;
2404        // Check if data->pos received a null pointer
2405        if (data->pos == NULL) {
2406            return (UChar)-1; // Return to indicate error.
2407        }
2408    }
2409
2410    /* points back to the pos in string */
2411    return ch;
2412}
2413
2414
2415
2416/**
2417* Function to copy the buffer into writableBuffer and sets the fcd position to
2418* the correct position
2419* @param source data string source
2420* @param buffer character buffer
2421*/
2422static
2423inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
2424{
2425    /* okay confusing part here. to ensure that the skipped characters are
2426    considered later, we need to place it in the appropriate position in the
2427    normalization buffer and reassign the pos pointer. simple case if pos
2428    reside in string, simply copy to normalization buffer and
2429    fcdposition = pos, pos = start of normalization buffer. if pos in
2430    normalization buffer, we'll insert the copy infront of pos and point pos
2431    to the start of the normalization buffer. why am i doing these copies?
2432    well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2433    not require any changes, which be really painful. */
2434    if (source->flags & UCOL_ITER_INNORMBUF) {
2435        int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
2436        source->writableBuffer.replace(0, replaceLength, buffer);
2437    }
2438    else {
2439        source->fcdPosition  = source->pos;
2440        source->origFlags    = source->flags;
2441        source->flags       |= UCOL_ITER_INNORMBUF;
2442        source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2443        source->writableBuffer = buffer;
2444    }
2445
2446    source->pos = source->writableBuffer.getTerminatedBuffer();
2447}
2448
2449/**
2450* Function to get the discontiguos collation element within the source.
2451* Note this function will set the position to the appropriate places.
2452* @param coll current collator used
2453* @param source data string source
2454* @param constart index to the start character in the contraction table
2455* @return discontiguos collation element offset
2456*/
2457static
2458uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2459                                const UChar *constart)
2460{
2461    /* source->pos currently points to the second combining character after
2462       the start character */
2463          const UChar *temppos      = source->pos;
2464          UnicodeString buffer;
2465    const UChar   *tempconstart = constart;
2466          uint8_t  tempflags    = source->flags;
2467          UBool    multicontraction = FALSE;
2468          collIterateState discState;
2469
2470          backupState(source, &discState);
2471
2472    buffer.setTo(peekCodePoint(source, -1));
2473    for (;;) {
2474        UChar    *UCharOffset;
2475        UChar     schar,
2476                  tchar;
2477        uint32_t  result;
2478
2479        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2480            || (peekCodeUnit(source, 0) == 0  &&
2481            //|| (*source->pos == 0  &&
2482                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2483                 source->fcdPosition == NULL ||
2484                 source->fcdPosition == source->endp ||
2485                 *(source->fcdPosition) == 0 ||
2486                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2487                 /* end of string in null terminated string or stopped by a
2488                 null character, note fcd does not always point to a base
2489                 character after the discontiguos change */
2490                 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
2491                 //u_getCombiningClass(*(source->pos)) == 0) {
2492            //constart = (UChar *)coll->image + getContractOffset(CE);
2493            if (multicontraction) {
2494                source->pos    = temppos - 1;
2495                setDiscontiguosAttribute(source, buffer);
2496                return *(coll->contractionCEs +
2497                                    (tempconstart - coll->contractionIndex));
2498            }
2499            constart = tempconstart;
2500            break;
2501        }
2502
2503        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2504        schar = getNextNormalizedChar(source);
2505
2506        while (schar > (tchar = *UCharOffset)) {
2507            UCharOffset++;
2508        }
2509
2510        if (schar != tchar) {
2511            /* not the correct codepoint. we stuff the current codepoint into
2512            the discontiguos buffer and try the next character */
2513            buffer.append(schar);
2514            continue;
2515        }
2516        else {
2517            if (u_getCombiningClass(schar) ==
2518                u_getCombiningClass(peekCodePoint(source, -2))) {
2519                buffer.append(schar);
2520                continue;
2521            }
2522            result = *(coll->contractionCEs +
2523                                      (UCharOffset - coll->contractionIndex));
2524        }
2525
2526        if (result == UCOL_NOT_FOUND) {
2527          break;
2528        } else if (isContraction(result)) {
2529            /* this is a multi-contraction*/
2530            tempconstart = (UChar *)coll->image + getContractOffset(result);
2531            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2532                != UCOL_NOT_FOUND) {
2533                multicontraction = TRUE;
2534                temppos       = source->pos + 1;
2535            }
2536        } else {
2537            setDiscontiguosAttribute(source, buffer);
2538            return result;
2539        }
2540    }
2541
2542    /* no problems simply reverting just like that,
2543    if we are in string before getting into this function, points back to
2544    string hence no problem.
2545    if we are in normalization buffer before getting into this function,
2546    since we'll never use another normalization within this function, we
2547    know that fcdposition points to a base character. the normalization buffer
2548    never change, hence this revert works. */
2549    loadState(source, &discState, TRUE);
2550    goBackOne(source);
2551
2552    //source->pos   = temppos - 1;
2553    source->flags = tempflags;
2554    return *(coll->contractionCEs + (constart - coll->contractionIndex));
2555}
2556
2557/* now uses Mark's getImplicitPrimary code */
2558static
2559inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2560    uint32_t r = uprv_uca_getImplicitPrimary(cp);
2561    *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2562    collationSource->offsetRepeatCount += 1;
2563    return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2564}
2565
2566/**
2567* Inserts the argument character into the front of the buffer replacing the
2568* front null terminator.
2569* @param data collation element iterator data
2570* @param ch character to be appended
2571*/
2572static
2573inline void insertBufferFront(collIterate *data, UChar ch)
2574{
2575    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
2576}
2577
2578/**
2579* Special normalization function for contraction in the previous iterator.
2580* This normalization sequence will place the current character at source->pos
2581* and its following normalized sequence into the buffer.
2582* The fcd position, pos will be changed.
2583* pos will now point to positions in the buffer.
2584* Flags will be changed accordingly.
2585* @param data collation iterator data
2586*/
2587static
2588inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2589{
2590    const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
2591    const UChar *pStart;
2592
2593    UnicodeString endOfBuffer;
2594    if (data->flags & UCOL_ITER_HASLEN) {
2595        /*
2596        normalization buffer not used yet, we'll pull down the next
2597        character into the end of the buffer
2598        */
2599        endOfBuffer.setTo(*pEnd);
2600    }
2601    else {
2602        endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
2603    }
2604
2605    if (data->fcdPosition == NULL) {
2606        pStart = data->string;
2607    }
2608    else {
2609        pStart = data->fcdPosition + 1;
2610    }
2611    int32_t normLen =
2612        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
2613                             data->writableBuffer,
2614                             *status).
2615        length();
2616    if(U_FAILURE(*status)) {
2617        return;
2618    }
2619    /*
2620    this puts the null termination infront of the normalized string instead
2621    of the end
2622    */
2623    data->pos =
2624        data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
2625        1 + normLen;
2626    data->origFlags  = data->flags;
2627    data->flags     |= UCOL_ITER_INNORMBUF;
2628    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2629}
2630
2631/**
2632* Contraction character management function that returns the previous character
2633* for the backwards iterator.
2634* Does nothing if the previous character is in buffer and not the first
2635* character in it.
2636* Else it checks previous character in data string to see if it is
2637* normalizable.
2638* If it is not, the character is simply copied into the buffer, else
2639* the whole normalized substring is copied into the buffer, including the
2640* current character.
2641* @param data collation element iterator data
2642* @return previous character
2643*/
2644static
2645inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2646{
2647    UChar  prevch;
2648    UChar  ch;
2649    const UChar *start;
2650    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2651    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2652        (innormbuf && *(data->pos - 1) != 0)) {
2653        /*
2654        if no normalization.
2655        if previous character is in normalized buffer, no further normalization
2656        is required
2657        */
2658      if(data->flags & UCOL_USE_ITERATOR) {
2659        data->iterator->move(data->iterator, -1, UITER_CURRENT);
2660        return (UChar)data->iterator->next(data->iterator);
2661      } else {
2662        return *(data->pos - 1);
2663      }
2664    }
2665
2666    start = data->pos;
2667    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2668        /* in data string */
2669        if ((start - 1) == data->string) {
2670            return *(start - 1);
2671        }
2672        start --;
2673        ch     = *start;
2674        prevch = *(start - 1);
2675    }
2676    else {
2677        /*
2678        in writable buffer, at this point fcdPosition can not be NULL.
2679        see contracting tag.
2680        */
2681        if (data->fcdPosition == data->string) {
2682            /* at the start of the string, just dump it into the normalizer */
2683            insertBufferFront(data, *(data->fcdPosition));
2684            data->fcdPosition = NULL;
2685            return *(data->pos - 1);
2686        }
2687        start  = data->fcdPosition;
2688        ch     = *start;
2689        prevch = *(start - 1);
2690    }
2691    /*
2692    * if the current character is not fcd.
2693    * Trailing combining class == 0.
2694    */
2695    if (data->fcdPosition > start &&
2696       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2697    {
2698        /*
2699        Need a more complete FCD check and possible normalization.
2700        normalize substring will be appended to buffer
2701        */
2702        const UChar *backuppos = data->pos;
2703        data->pos = start;
2704        if (collPrevIterFCD(data)) {
2705            normalizePrevContraction(data, status);
2706            return *(data->pos - 1);
2707        }
2708        data->pos = backuppos;
2709        data->fcdPosition ++;
2710    }
2711
2712    if (innormbuf) {
2713    /*
2714    no normalization is to be done hence only one character will be
2715    appended to the buffer.
2716    */
2717        insertBufferFront(data, ch);
2718        data->fcdPosition --;
2719    }
2720
2721    return ch;
2722}
2723
2724/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2725/* It is called by getNextCE */
2726
/* Limit on the digit index used while building a numeric-collation key in
 * the DIGIT_TAG case of ucol_prv_getSpecialCE; the temporary buffer there is
 * sized UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3.
 * The following should be even. */
#define UCOL_MAX_DIGITS_FOR_NUMBER 254
2729
2730uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2731    collIterateState entryState;
2732    backupState(source, &entryState);
2733    UChar32 cp = ch;
2734
2735    for (;;) {
2736        // This loop will repeat only in the case of contractions, and only when a contraction
2737        //   is found and the first CE resulting from that contraction is itself a special
2738        //   (an expansion, for example.)  All other special CE types are fully handled the
2739        //   first time through, and the loop exits.
2740
2741        const uint32_t *CEOffset = NULL;
2742        switch(getCETag(CE)) {
2743        case NOT_FOUND_TAG:
2744            /* This one is not found, and we'll let somebody else bother about it... no more games */
2745            return CE;
2746        case SPEC_PROC_TAG:
2747            {
2748                // Special processing is getting a CE that is preceded by a certain prefix
2749                // Currently this is only needed for optimizing Japanese length and iteration marks.
2750                // When we encouter a special processing tag, we go backwards and try to see if
2751                // we have a match.
2752                // Contraction tables are used - so the whole process is not unlike contraction.
2753                // prefix data is stored backwards in the table.
2754                const UChar *UCharOffset;
2755                UChar schar, tchar;
2756                collIterateState prefixState;
2757                backupState(source, &prefixState);
2758                loadState(source, &entryState, TRUE);
2759                goBackOne(source); // We want to look at the point where we entered - actually one
2760                // before that...
2761
2762                for(;;) {
2763                    // This loop will run once per source string character, for as long as we
2764                    //  are matching a potential contraction sequence
2765
2766                    // First we position ourselves at the begining of contraction sequence
2767                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2768                    if (collIter_bos(source)) {
2769                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2770                        break;
2771                    }
2772                    schar = getPrevNormalizedChar(source, status);
2773                    goBackOne(source);
2774
2775                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2776                        UCharOffset++;
2777                    }
2778
2779                    if (schar == tchar) {
2780                        // Found the source string char in the table.
2781                        //  Pick up the corresponding CE from the table.
2782                        CE = *(coll->contractionCEs +
2783                            (UCharOffset - coll->contractionIndex));
2784                    }
2785                    else
2786                    {
2787                        // Source string char was not in the table.
2788                        //   We have not found the prefix.
2789                        CE = *(coll->contractionCEs +
2790                            (ContractionStart - coll->contractionIndex));
2791                    }
2792
2793                    if(!isPrefix(CE)) {
2794                        // The source string char was in the contraction table, and the corresponding
2795                        //   CE is not a prefix CE.  We found the prefix, break
2796                        //   out of loop, this CE will end up being returned.  This is the normal
2797                        //   way out of prefix handling when the source actually contained
2798                        //   the prefix.
2799                        break;
2800                    }
2801                }
2802                if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2803                    loadState(source, &prefixState, TRUE);
2804                    if(source->origFlags & UCOL_USE_ITERATOR) {
2805                        source->flags = source->origFlags;
2806                    }
2807                } else { // prefix search was a failure, we have to backup all the way to the start
2808                    loadState(source, &entryState, TRUE);
2809                }
2810                break;
2811            }
2812        case CONTRACTION_TAG:
2813            {
2814                /* This should handle contractions */
2815                collIterateState state;
2816                backupState(source, &state);
2817                uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2818                const UChar *UCharOffset;
2819                UChar schar, tchar;
2820
2821                for (;;) {
2822                    /* This loop will run once per source string character, for as long as we     */
2823                    /*  are matching a potential contraction sequence                  */
2824
2825                    /* First we position ourselves at the begining of contraction sequence */
2826                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2827
2828                    if (collIter_eos(source)) {
2829                        // Ran off the end of the source string.
2830                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2831                        // So we'll pick whatever we have at the point...
2832                        if (CE == UCOL_NOT_FOUND) {
2833                            // back up the source over all the chars we scanned going into this contraction.
2834                            CE = firstCE;
2835                            loadState(source, &state, TRUE);
2836                            if(source->origFlags & UCOL_USE_ITERATOR) {
2837                                source->flags = source->origFlags;
2838                            }
2839                        }
2840                        break;
2841                    }
2842
2843                    uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2844                    uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2845
2846                    schar = getNextNormalizedChar(source);
2847                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2848                        UCharOffset++;
2849                    }
2850
2851                    if (schar == tchar) {
2852                        // Found the source string char in the contraction table.
2853                        //  Pick up the corresponding CE from the table.
2854                        CE = *(coll->contractionCEs +
2855                            (UCharOffset - coll->contractionIndex));
2856                    }
2857                    else
2858                    {
2859                        // Source string char was not in contraction table.
2860                        //   Unless we have a discontiguous contraction, we have finished
2861                        //   with this contraction.
2862                        // in order to do the proper detection, we
2863                        // need to see if we're dealing with a supplementary
2864                        /* We test whether the next two char are surrogate pairs.
2865                        * This test is done if the iterator is not NULL.
2866                        * If there is no surrogate pair, the iterator
2867                        * goes back one if needed. */
2868                        UChar32 miss = schar;
2869                        if (source->iterator) {
2870                            UChar32 surrNextChar; /* the next char in the iteration to test */
2871                            int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2872                            if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2873                                prevPos = source->iterator->index;
2874                                surrNextChar = getNextNormalizedChar(source);
2875                                if (U16_IS_TRAIL(surrNextChar)) {
2876                                    miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2877                                } else if (prevPos < source->iterator->index){
2878                                    goBackOne(source);
2879                                }
2880                            }
2881                        } else if (U16_IS_LEAD(schar)) {
2882                            miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2883                        }
2884
2885                        uint8_t sCC;
2886                        if (miss < 0x300 ||
2887                            maxCC == 0 ||
2888                            (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2889                            sCC>maxCC ||
2890                            (allSame != 0 && sCC == maxCC) ||
2891                            collIter_eos(source))
2892                        {
2893                            //  Contraction can not be discontiguous.
2894                            goBackOne(source);  // back up the source string by one,
2895                            //  because  the character we just looked at was
2896                            //  not part of the contraction.   */
2897                            if(U_IS_SUPPLEMENTARY(miss)) {
2898                                goBackOne(source);
2899                            }
2900                            CE = *(coll->contractionCEs +
2901                                (ContractionStart - coll->contractionIndex));
2902                        } else {
2903                            //
2904                            // Contraction is possibly discontiguous.
2905                            //   Scan more of source string looking for a match
2906                            //
2907                            UChar tempchar;
2908                            /* find the next character if schar is not a base character
2909                            and we are not yet at the end of the string */
2910                            tempchar = getNextNormalizedChar(source);
2911                            // probably need another supplementary thingie here
2912                            goBackOne(source);
2913                            if (i_getCombiningClass(tempchar, coll) == 0) {
2914                                goBackOne(source);
2915                                if(U_IS_SUPPLEMENTARY(miss)) {
2916                                    goBackOne(source);
2917                                }
2918                                /* Spit out the last char of the string, wasn't tasty enough */
2919                                CE = *(coll->contractionCEs +
2920                                    (ContractionStart - coll->contractionIndex));
2921                            } else {
2922                                CE = getDiscontiguous(coll, source, ContractionStart);
2923                            }
2924                        }
2925                    } // else after if(schar == tchar)
2926
2927                    if(CE == UCOL_NOT_FOUND) {
2928                        /* The Source string did not match the contraction that we were checking.  */
2929                        /*  Back up the source position to undo the effects of having partially    */
2930                        /*   scanned through what ultimately proved to not be a contraction.       */
2931                        loadState(source, &state, TRUE);
2932                        CE = firstCE;
2933                        break;
2934                    }
2935
2936                    if(!isContraction(CE)) {
2937                        // The source string char was in the contraction table, and the corresponding
2938                        //   CE is not a contraction CE.  We completed the contraction, break
2939                        //   out of loop, this CE will end up being returned.  This is the normal
2940                        //   way out of contraction handling when the source actually contained
2941                        //   the contraction.
2942                        break;
2943                    }
2944
2945
2946                    // The source string char was in the contraction table, and the corresponding
2947                    //   CE is IS  a contraction CE.  We will continue looping to check the source
2948                    //   string for the remaining chars in the contraction.
2949                    uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2950                    if(tempCE != UCOL_NOT_FOUND) {
2951                        // We have scanned a a section of source string for which there is a
2952                        //  CE from the contraction table.  Remember the CE and scan position, so
2953                        //  that we can return to this point if further scanning fails to
2954                        //  match a longer contraction sequence.
2955                        firstCE = tempCE;
2956
2957                        goBackOne(source);
2958                        backupState(source, &state);
2959                        getNextNormalizedChar(source);
2960
2961                        // Another way to do this is:
2962                        //collIterateState tempState;
2963                        //backupState(source, &tempState);
2964                        //goBackOne(source);
2965                        //backupState(source, &state);
2966                        //loadState(source, &tempState, TRUE);
2967
2968                        // The problem is that for incomplete contractions we have to remember the previous
2969                        // position. Before, the only thing I needed to do was state.pos--;
2970                        // After iterator introduction and especially after introduction of normalizing
2971                        // iterators, it became much more difficult to decrease the saved state.
2972                        // I'm not yet sure which of the two methods above is faster.
2973                    }
2974                } // for(;;)
2975                break;
2976            } // case CONTRACTION_TAG:
2977        case LONG_PRIMARY_TAG:
2978            {
2979                *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2980                CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2981                source->offsetRepeatCount += 1;
2982                return CE;
2983            }
2984        case EXPANSION_TAG:
2985            {
2986                /* This should handle expansion. */
2987                /* NOTE: we can encounter both continuations and expansions in an expansion! */
2988                /* I have to decide where continuations are going to be dealt with */
2989                uint32_t size;
2990                uint32_t i;    /* general counter */
2991
2992                CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2993                size = getExpansionCount(CE);
2994                CE = *CEOffset++;
2995              //source->offsetRepeatCount = -1;
2996
2997                if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2998                    for(i = 1; i<size; i++) {
2999                        *(source->CEpos++) = *CEOffset++;
3000                        source->offsetRepeatCount += 1;
3001                    }
3002                } else { /* else, we do */
3003                    while(*CEOffset != 0) {
3004                        *(source->CEpos++) = *CEOffset++;
3005                        source->offsetRepeatCount += 1;
3006                    }
3007                }
3008
3009                return CE;
3010            }
3011        case DIGIT_TAG:
3012            {
3013                /*
3014                We do a check to see if we want to collate digits as numbers; if so we generate
3015                a custom collation key. Otherwise we pull out the value stored in the expansion table.
3016                */
3017                //uint32_t size;
3018                uint32_t i;    /* general counter */
3019
3020                if (source->coll->numericCollation == UCOL_ON){
3021                    collIterateState digitState = {0,0,0,0,0,0,0,0,0};
3022                    UChar32 char32 = 0;
3023                    int32_t digVal = 0;
3024
3025                    uint32_t digIndx = 0;
3026                    uint32_t endIndex = 0;
3027                    uint32_t trailingZeroIndex = 0;
3028
3029                    uint8_t collateVal = 0;
3030
3031                    UBool nonZeroValReached = FALSE;
3032
3033                    uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
3034                    /*
3035                         We parse the source string until we hit a char that's NOT a digit.
3036                        Use this u_charDigitValue. This might be slow because we have to
3037                        handle surrogates...
3038                    */
3039            /*
3040                    if (U16_IS_LEAD(ch)){
3041                      if (!collIter_eos(source)) {
3042                        backupState(source, &digitState);
3043                        UChar trail = getNextNormalizedChar(source);
3044                        if(U16_IS_TRAIL(trail)) {
3045                          char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3046                        } else {
3047                          loadState(source, &digitState, TRUE);
3048                          char32 = ch;
3049                        }
3050                      } else {
3051                        char32 = ch;
3052                      }
3053                    } else {
3054                      char32 = ch;
3055                    }
3056                    digVal = u_charDigitValue(char32);
3057            */
3058                    digVal = u_charDigitValue(cp); // if we have arrived here, we have
3059                    // already processed possible supplementaries that trigered the digit tag -
3060                    // all supplementaries are marked in the UCA.
3061                    /*
3062                        We  pad a zero in front of the first element anyways. This takes
3063                        care of the (probably) most common case where people are sorting things followed
3064                        by a single digit
3065                    */
3066                    digIndx++;
3067                    for(;;){
3068                        // Make sure we have enough space. No longer needed;
3069                        // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3070                        // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3071                        // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3072
3073                        // Skipping over leading zeroes.
3074                        if (digVal != 0) {
3075                            nonZeroValReached = TRUE;
3076                        }
3077                        if (nonZeroValReached) {
3078                            /*
3079                            We parse the digit string into base 100 numbers (this fits into a byte).
3080                            We only add to the buffer in twos, thus if we are parsing an odd character,
3081                            that serves as the 'tens' digit while the if we are parsing an even one, that
3082                            is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3083                            a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3084                            overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3085                            than all the other bytes.
3086                            */
3087
3088                            if (digIndx % 2 == 1){
3089                                collateVal += (uint8_t)digVal;
3090
3091                                // We don't enter the low-order-digit case unless we've already seen
3092                                // the high order, or for the first digit, which is always non-zero.
3093                                if (collateVal != 0)
3094                                    trailingZeroIndex = 0;
3095
3096                                numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3097                                collateVal = 0;
3098                            }
3099                            else{
3100                                // We drop the collation value into the buffer so if we need to do
3101                                // a "front patch" we don't have to check to see if we're hitting the
3102                                // last element.
3103                                collateVal = (uint8_t)(digVal * 10);
3104
3105                                // Check for trailing zeroes.
3106                                if (collateVal == 0)
3107                                {
3108                                    if (!trailingZeroIndex)
3109                                        trailingZeroIndex = (digIndx/2) + 2;
3110                                }
3111                                else
3112                                    trailingZeroIndex = 0;
3113
3114                                numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3115                            }
3116                            digIndx++;
3117                        }
3118
3119                        // Get next character.
3120                        if (!collIter_eos(source)){
3121                            ch = getNextNormalizedChar(source);
3122                            if (U16_IS_LEAD(ch)){
3123                                if (!collIter_eos(source)) {
3124                                    backupState(source, &digitState);
3125                                    UChar trail = getNextNormalizedChar(source);
3126                                    if(U16_IS_TRAIL(trail)) {
3127                                        char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3128                                    } else {
3129                                        loadState(source, &digitState, TRUE);
3130                                        char32 = ch;
3131                                    }
3132                                }
3133                            } else {
3134                                char32 = ch;
3135                            }
3136
3137                            if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3138                                // Resetting position to point to the next unprocessed char. We
3139                                // overshot it when doing our test/set for numbers.
3140                                if (char32 > 0xFFFF) { // For surrogates.
3141                                    loadState(source, &digitState, TRUE);
3142                                    //goBackOne(source);
3143                                }
3144                                goBackOne(source);
3145                                break;
3146                            }
3147                        } else {
3148                            break;
3149                        }
3150                    }
3151
3152                    if (nonZeroValReached == FALSE){
3153                        digIndx = 2;
3154                        numTempBuf[2] = 6;
3155                    }
3156
3157                    endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3158                    if (digIndx % 2 != 0){
3159                        /*
3160                        We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3161                        we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3162                        Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3163                        single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3164                        */
3165
3166                        for(i = 2; i < endIndex; i++){
3167                            numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3168                                (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3169                        }
3170                        --digIndx;
3171                    }
3172
3173                    // Subtract one off of the last byte.
3174                    numTempBuf[endIndex-1] -= 1;
3175
3176                    /*
3177                    We want to skip over the first two slots in the buffer. The first slot
3178                    is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3179                    sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3180                    */
3181                    numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3182                    numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3183
3184                    // Now transfer the collation key to our collIterate struct.
3185                    // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3186                    //size = ((endIndex+1) & ~1)/2;
3187                    CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3188                        (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3189                        UCOL_BYTE_COMMON; // Tertiary weight.
3190                    i = 2; // Reset the index into the buffer.
3191                    while(i < endIndex)
3192                    {
3193                        uint32_t primWeight = numTempBuf[i++] << 8;
3194                        if ( i < endIndex)
3195                            primWeight |= numTempBuf[i++];
3196                        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3197                    }
3198
3199                } else {
3200                    // no numeric mode, we'll just switch to whatever we stashed and continue
3201                    CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3202                    CE = *CEOffset++;
3203                    break;
3204                }
3205                return CE;
3206            }
3207            /* various implicits optimization */
3208        case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3209            /* UCA is filled with these. Tailorings are NOT_FOUND */
3210            return getImplicit(cp, source);
3211        case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3212            // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3213            return getImplicit(cp, source);
3214        case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3215            {
3216                static const uint32_t
3217                    SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3218                //const uint32_t LCount = 19;
3219                static const uint32_t VCount = 21;
3220                static const uint32_t TCount = 28;
3221                //const uint32_t NCount = VCount * TCount;   // 588
3222                //const uint32_t SCount = LCount * NCount;   // 11172
3223                uint32_t L = ch - SBase;
3224
3225                // divide into pieces
3226
3227                uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3228                L /= TCount;
3229                uint32_t V = L % VCount;
3230                L /= VCount;
3231
3232                // offset them
3233
3234                L += LBase;
3235                V += VBase;
3236                T += TBase;
3237
3238                // return the first CE, but first put the rest into the expansion buffer
3239                if (!source->coll->image->jamoSpecial) { // FAST PATH
3240
3241                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3242                    if (T != TBase) {
3243                        *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3244                    }
3245
3246                    return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3247
3248                } else { // Jamo is Special
3249                    // Since Hanguls pass the FCD check, it is
3250                    // guaranteed that we won't be in
3251                    // the normalization buffer if something like this happens
3252
3253                    // However, if we are using a uchar iterator and normalization
3254                    // is ON, the Hangul that lead us here is going to be in that
3255                    // normalization buffer. Here we want to restore the uchar
3256                    // iterator state and pull out of the normalization buffer
3257                    if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3258                        source->flags = source->origFlags; // restore the iterator
3259                        source->pos = NULL;
3260                    }
3261
3262                    // Move Jamos into normalization buffer
3263                    UChar *buffer = source->writableBuffer.getBuffer(4);
3264                    int32_t bufferLength;
3265                    buffer[0] = (UChar)L;
3266                    buffer[1] = (UChar)V;
3267                    if (T != TBase) {
3268                        buffer[2] = (UChar)T;
3269                        bufferLength = 3;
3270                    } else {
3271                        bufferLength = 2;
3272                    }
3273                    source->writableBuffer.releaseBuffer(bufferLength);
3274
3275                    // Indicate where to continue in main input string after exhausting the writableBuffer
3276                    source->fcdPosition       = source->pos;
3277
3278                    source->pos   = source->writableBuffer.getTerminatedBuffer();
3279                    source->origFlags   = source->flags;
3280                    source->flags       |= UCOL_ITER_INNORMBUF;
3281                    source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3282
3283                    return(UCOL_IGNORABLE);
3284                }
3285            }
3286        case SURROGATE_TAG:
3287            /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3288            /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3289            /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3290            /* we treat it like an unassigned code point. */
3291            {
3292                UChar trail;
3293                collIterateState state;
3294                backupState(source, &state);
3295                if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3296                    // we chould have stepped one char forward and it might have turned that it
3297                    // was not a trail surrogate. In that case, we have to backup.
3298                    loadState(source, &state, TRUE);
3299                    return UCOL_NOT_FOUND;
3300                } else {
3301                    /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3302                    CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3303                    if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3304                        // We need to backup
3305                        loadState(source, &state, TRUE);
3306                        return CE;
3307                    }
3308                    // calculate the supplementary code point value, if surrogate was not tailored
3309                    cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3310                }
3311            }
3312            break;
3313        case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3314            UChar nextChar;
3315            if( source->flags & UCOL_USE_ITERATOR) {
3316                if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3317                    cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3318                    source->iterator->next(source->iterator);
3319                    return getImplicit(cp, source);
3320                }
3321            } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3322                      U_IS_TRAIL((nextChar=*source->pos))) {
3323                cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3324                source->pos++;
3325                return getImplicit(cp, source);
3326            }
3327            return UCOL_NOT_FOUND;
3328        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3329            return UCOL_NOT_FOUND; /* broken surrogate sequence */
3330        case CHARSET_TAG:
3331            /* not yet implemented */
3332            /* probably after 1.8 */
3333            return UCOL_NOT_FOUND;
3334        default:
3335            *status = U_INTERNAL_PROGRAM_ERROR;
3336            CE=0;
3337            break;
3338    }
3339    if (CE <= UCOL_NOT_FOUND) break;
3340  }
3341  return CE;
3342}
3343
3344
3345/* now uses Mark's getImplicitPrimary code */
3346static
3347inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3348    uint32_t r = uprv_uca_getImplicitPrimary(cp);
3349
3350    *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3351    collationSource->toReturn = collationSource->CEpos;
3352
3353    // **** doesn't work if using iterator ****
3354    if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3355        collationSource->offsetRepeatCount = 1;
3356    } else {
3357        int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3358
3359        UErrorCode errorCode = U_ZERO_ERROR;
3360        collationSource->appendOffset(firstOffset, errorCode);
3361        collationSource->appendOffset(firstOffset + 1, errorCode);
3362
3363        collationSource->offsetReturn = collationSource->offsetStore - 1;
3364        *(collationSource->offsetBuffer) = firstOffset;
3365        if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3366            collationSource->offsetStore = collationSource->offsetBuffer;
3367        }
3368    }
3369
3370    return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3371}
3372
/**
 * This function handles the special CEs like contractions, expansions,
 * surrogates, Thai.
 * It is called by getPrevCE.
 */
3378uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3379                          collIterate *source,
3380                          UErrorCode *status)
3381{
3382    const uint32_t *CEOffset    = NULL;
3383          UChar    *UCharOffset = NULL;
3384          UChar    schar;
3385    const UChar    *constart    = NULL;
3386          uint32_t size;
3387          UChar    buffer[UCOL_MAX_BUFFER];
3388          uint32_t *endCEBuffer;
3389          UChar   *strbuffer;
3390          int32_t noChars = 0;
3391          int32_t CECount = 0;
3392
3393    for(;;)
3394    {
3395        /* the only ces that loops are thai and contractions */
3396        switch (getCETag(CE))
3397        {
3398        case NOT_FOUND_TAG:  /* this tag always returns */
3399            return CE;
3400
3401        case SPEC_PROC_TAG:
3402            {
3403                // Special processing is getting a CE that is preceded by a certain prefix
3404                // Currently this is only needed for optimizing Japanese length and iteration marks.
3405                // When we encouter a special processing tag, we go backwards and try to see if
3406                // we have a match.
3407                // Contraction tables are used - so the whole process is not unlike contraction.
3408                // prefix data is stored backwards in the table.
3409                const UChar *UCharOffset;
3410                UChar schar, tchar;
3411                collIterateState prefixState;
3412                backupState(source, &prefixState);
3413                for(;;) {
3414                    // This loop will run once per source string character, for as long as we
3415                    //  are matching a potential contraction sequence
3416
3417                    // First we position ourselves at the begining of contraction sequence
3418                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3419
3420                    if (collIter_bos(source)) {
3421                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3422                        break;
3423                    }
3424                    schar = getPrevNormalizedChar(source, status);
3425                    goBackOne(source);
3426
3427                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3428                        UCharOffset++;
3429                    }
3430
3431                    if (schar == tchar) {
3432                        // Found the source string char in the table.
3433                        //  Pick up the corresponding CE from the table.
3434                        CE = *(coll->contractionCEs +
3435                            (UCharOffset - coll->contractionIndex));
3436                    }
3437                    else
3438                    {
3439                        // if there is a completely ignorable code point in the middle of
3440                        // a prefix, we need to act as if it's not there
3441                        // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3442                        // lone surrogates cannot be set to zero as it would break other processing
3443                        uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3444                        // it's easy for BMP code points
3445                        if(isZeroCE == 0) {
3446                            continue;
3447                        } else if(U16_IS_SURROGATE(schar)) {
3448                            // for supplementary code points, we have to check the next one
3449                            // situations where we are going to ignore
3450                            // 1. beginning of the string: schar is a lone surrogate
3451                            // 2. schar is a lone surrogate
3452                            // 3. schar is a trail surrogate in a valid surrogate sequence
3453                            //    that is explicitly set to zero.
3454                            if (!collIter_bos(source)) {
3455                                UChar lead;
3456                                if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3457                                    isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3458                                    if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3459                                        uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3460                                        if(finalCE == 0) {
3461                                            // this is a real, assigned completely ignorable code point
3462                                            goBackOne(source);
3463                                            continue;
3464                                        }
3465                                    }
3466                                } else {
3467                                    // lone surrogate, treat like unassigned
3468                                    return UCOL_NOT_FOUND;
3469                                }
3470                            } else {
3471                                // lone surrogate at the beggining, treat like unassigned
3472                                return UCOL_NOT_FOUND;
3473                            }
3474                        }
3475                        // Source string char was not in the table.
3476                        //   We have not found the prefix.
3477                        CE = *(coll->contractionCEs +
3478                            (ContractionStart - coll->contractionIndex));
3479                    }
3480
3481                    if(!isPrefix(CE)) {
3482                        // The source string char was in the contraction table, and the corresponding
3483                        //   CE is not a prefix CE.  We found the prefix, break
3484                        //   out of loop, this CE will end up being returned.  This is the normal
3485                        //   way out of prefix handling when the source actually contained
3486                        //   the prefix.
3487                        break;
3488                    }
3489                }
3490                loadState(source, &prefixState, TRUE);
3491                break;
3492            }
3493
3494        case CONTRACTION_TAG: {
3495            /* to ensure that the backwards and forwards iteration matches, we
3496            take the current region of most possible match and pass it through
3497            the forward iteration. this will ensure that the obstinate problem of
3498            overlapping contractions will not occur.
3499            */
3500            schar = peekCodeUnit(source, 0);
3501            constart = (UChar *)coll->image + getContractOffset(CE);
3502            if (isAtStartPrevIterate(source)
3503                /* commented away contraction end checks after adding the checks
3504                in getPrevCE  */) {
3505                    /* start of string or this is not the end of any contraction */
3506                    CE = *(coll->contractionCEs +
3507                        (constart - coll->contractionIndex));
3508                    break;
3509            }
3510            strbuffer = buffer;
3511            UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3512            *(UCharOffset --) = 0;
3513            noChars = 0;
3514            // have to swap thai characters
3515            while (ucol_unsafeCP(schar, coll)) {
3516                *(UCharOffset) = schar;
3517                noChars++;
3518                UCharOffset --;
3519                schar = getPrevNormalizedChar(source, status);
3520                goBackOne(source);
3521                // TODO: when we exhaust the contraction buffer,
3522                // it needs to get reallocated. The problem is
3523                // that the size depends on the string which is
3524                // not iterated over. However, since we're travelling
3525                // backwards, we already had to set the iterator at
3526                // the end - so we might as well know where we are?
3527                if (UCharOffset + 1 == buffer) {
3528                    /* we have exhausted the buffer */
3529                    int32_t newsize = 0;
3530                    if(source->pos) { // actually dealing with a position
3531                        newsize = (int32_t)(source->pos - source->string + 1);
3532                    } else { // iterator
3533                        newsize = 4 * UCOL_MAX_BUFFER;
3534                    }
3535                    strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3536                        (newsize + UCOL_MAX_BUFFER));
3537                    /* test for NULL */
3538                    if (strbuffer == NULL) {
3539                        *status = U_MEMORY_ALLOCATION_ERROR;
3540                        return UCOL_NO_MORE_CES;
3541                    }
3542                    UCharOffset = strbuffer + newsize;
3543                    uprv_memcpy(UCharOffset, buffer,
3544                        UCOL_MAX_BUFFER * sizeof(UChar));
3545                    UCharOffset --;
3546                }
3547                if ((source->pos && (source->pos == source->string ||
3548                    ((source->flags & UCOL_ITER_INNORMBUF) &&
3549                    *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3550                    || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3551                        break;
3552                }
3553            }
3554            /* adds the initial base character to the string */
3555            *(UCharOffset) = schar;
3556            noChars++;
3557
3558            int32_t offsetBias;
3559
3560            // **** doesn't work if using iterator ****
3561            if (source->flags & UCOL_ITER_INNORMBUF) {
3562                offsetBias = -1;
3563            } else {
3564                offsetBias = (int32_t)(source->pos - source->string);
3565            }
3566
3567            /* a new collIterate is used to simplify things, since using the current
3568            collIterate will mean that the forward and backwards iteration will
3569            share and change the same buffers. we don't want to get into that. */
3570            collIterate temp;
3571            int32_t rawOffset;
3572
3573            IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3574            if(U_FAILURE(*status)) {
3575                return UCOL_NULLORDER;
3576            }
3577            temp.flags &= ~UCOL_ITER_NORM;
3578            temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3579
3580            rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
3581            CE = ucol_IGetNextCE(coll, &temp, status);
3582
3583            if (source->extendCEs) {
3584                endCEBuffer = source->extendCEs + source->extendCEsSize;
3585                CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
3586            } else {
3587                endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3588                CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
3589            }
3590
3591            while (CE != UCOL_NO_MORE_CES) {
3592                *(source->CEpos ++) = CE;
3593
3594                if (offsetBias >= 0) {
3595                    source->appendOffset(rawOffset + offsetBias, *status);
3596                }
3597
3598                CECount++;
3599                if (source->CEpos == endCEBuffer) {
3600                    /* ran out of CE space, reallocate to new buffer.
3601                    If reallocation fails, reset pointers and bail out,
3602                    there's no guarantee of the right character position after
3603                    this bail*/
3604                    if (!increaseCEsCapacity(source)) {
3605                        *status = U_MEMORY_ALLOCATION_ERROR;
3606                        break;
3607                    }
3608
3609                    endCEBuffer = source->extendCEs + source->extendCEsSize;
3610                }
3611
3612                if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3613                    rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3614                } else {
3615                    rawOffset = (int32_t)(temp.pos - temp.string);
3616                }
3617
3618                CE = ucol_IGetNextCE(coll, &temp, status);
3619            }
3620
3621            if (strbuffer != buffer) {
3622                uprv_free(strbuffer);
3623            }
3624            if (U_FAILURE(*status)) {
3625                return (uint32_t)UCOL_NULLORDER;
3626            }
3627
3628            if (source->offsetRepeatValue != 0) {
3629                if (CECount > noChars) {
3630                    source->offsetRepeatCount += temp.offsetRepeatCount;
3631                } else {
3632                    // **** does this really skip the right offsets? ****
3633                    source->offsetReturn -= (noChars - CECount);
3634                }
3635            }
3636
3637            if (offsetBias >= 0) {
3638                source->offsetReturn = source->offsetStore - 1;
3639                if (source->offsetReturn == source->offsetBuffer) {
3640                    source->offsetStore = source->offsetBuffer;
3641                }
3642            }
3643
3644            source->toReturn = source->CEpos - 1;
3645            if (source->toReturn == source->CEs) {
3646                source->CEpos = source->CEs;
3647            }
3648
3649            return *(source->toReturn);
3650        }
3651        case LONG_PRIMARY_TAG:
3652            {
3653                *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3654                *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3655                source->toReturn = source->CEpos - 1;
3656
3657                if (source->flags & UCOL_ITER_INNORMBUF) {
3658                    source->offsetRepeatCount = 1;
3659                } else {
3660                    int32_t firstOffset = (int32_t)(source->pos - source->string);
3661
3662                    source->appendOffset(firstOffset, *status);
3663                    source->appendOffset(firstOffset + 1, *status);
3664
3665                    source->offsetReturn = source->offsetStore - 1;
3666                    *(source->offsetBuffer) = firstOffset;
3667                    if (source->offsetReturn == source->offsetBuffer) {
3668                        source->offsetStore = source->offsetBuffer;
3669                    }
3670                }
3671
3672
3673                return *(source->toReturn);
3674            }
3675
3676        case EXPANSION_TAG: /* this tag always returns */
3677            {
3678            /*
3679            This should handle expansion.
3680            NOTE: we can encounter both continuations and expansions in an expansion!
3681            I have to decide where continuations are going to be dealt with
3682            */
3683            int32_t firstOffset = (int32_t)(source->pos - source->string);
3684
3685            // **** doesn't work if using iterator ****
3686            if (source->offsetReturn != NULL) {
3687                if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3688                    source->offsetStore = source->offsetBuffer;
3689                }else {
3690                  firstOffset = -1;
3691                }
3692            }
3693
3694            /* find the offset to expansion table */
3695            CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3696            size     = getExpansionCount(CE);
3697            if (size != 0) {
3698                /*
3699                if there are less than 16 elements in expansion, we don't terminate
3700                */
3701                uint32_t count;
3702
3703                for (count = 0; count < size; count++) {
3704                    *(source->CEpos ++) = *CEOffset++;
3705
3706                    if (firstOffset >= 0) {
3707                        source->appendOffset(firstOffset + 1, *status);
3708                    }
3709                }
3710            } else {
3711                /* else, we do */
3712                while (*CEOffset != 0) {
3713                    *(source->CEpos ++) = *CEOffset ++;
3714
3715                    if (firstOffset >= 0) {
3716                        source->appendOffset(firstOffset + 1, *status);
3717                    }
3718                }
3719            }
3720
3721            if (firstOffset >= 0) {
3722                source->offsetReturn = source->offsetStore - 1;
3723                *(source->offsetBuffer) = firstOffset;
3724                if (source->offsetReturn == source->offsetBuffer) {
3725                    source->offsetStore = source->offsetBuffer;
3726                }
3727            } else {
3728                source->offsetRepeatCount += size - 1;
3729            }
3730
3731            source->toReturn = source->CEpos - 1;
3732            // in case of one element expansion, we
3733            // want to immediately return CEpos
3734            if(source->toReturn == source->CEs) {
3735                source->CEpos = source->CEs;
3736            }
3737
3738            return *(source->toReturn);
3739            }
3740
3741        case DIGIT_TAG:
3742            {
3743                /*
3744                We do a check to see if we want to collate digits as numbers; if so we generate
3745                a custom collation key. Otherwise we pull out the value stored in the expansion table.
3746                */
3747                uint32_t i;    /* general counter */
3748
3749                if (source->coll->numericCollation == UCOL_ON){
3750                    uint32_t digIndx = 0;
3751                    uint32_t endIndex = 0;
3752                    uint32_t leadingZeroIndex = 0;
3753                    uint32_t trailingZeroCount = 0;
3754
3755                    uint8_t collateVal = 0;
3756
3757                    UBool nonZeroValReached = FALSE;
3758
3759                    uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3760                    /*
3761                    We parse the source string until we hit a char that's NOT a digit.
3762                    Use this u_charDigitValue. This might be slow because we have to
3763                    handle surrogates...
3764                    */
3765                    /*
3766                    We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3767                    with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3768                    element we process when going backward. To determine how long that chunk might be, we may need to make
3769                    two passes through the loop that collects digits - one to see how long the string is (and how much is
3770                    leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3771                    more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3772                    element chunk after resetting the state to the initialState at the right side of the digit string.
3773                    */
3774                    uint32_t ceLimit = 0;
3775                    UChar initial_ch = ch;
3776                    collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3777                    backupState(source, &initialState);
3778
3779                    for(;;) {
3780                        collIterateState state = {0,0,0,0,0,0,0,0,0};
3781                        UChar32 char32 = 0;
3782                        int32_t digVal = 0;
3783
3784                        if (U16_IS_TRAIL (ch)) {
3785                            if (!collIter_bos(source)){
3786                                UChar lead = getPrevNormalizedChar(source, status);
3787                                if(U16_IS_LEAD(lead)) {
3788                                    char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3789                                    goBackOne(source);
3790                                } else {
3791                                    char32 = ch;
3792                                }
3793                            } else {
3794                                char32 = ch;
3795                            }
3796                        } else {
3797                            char32 = ch;
3798                        }
3799                        digVal = u_charDigitValue(char32);
3800
3801                        for(;;) {
3802                            // Make sure we have enough space. No longer needed;
3803                            // at this point the largest value of digIndx when we need to save data in numTempBuf
3804                            // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3805                            // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3806
3807                            // Skip over trailing zeroes, and keep a count of them.
3808                            if (digVal != 0)
3809                                nonZeroValReached = TRUE;
3810
3811                            if (nonZeroValReached) {
3812                                /*
                                We parse the digit string into base 100 numbers (each fits into a byte).
                                We only add to the buffer in twos: when we are parsing an odd character,
                                it serves as the 'tens' digit, while when we are parsing an even one, it
                                is the 'ones' digit. We dump the parsed base 100 value (collateVal) into
                                a buffer. We multiply each collateVal by 2 (to give us room) and add 6 (to avoid
                                overlapping magic CE byte values). From the last byte we subtract 1 to ensure it is less
                                than all the other bytes.

                                Since we're doing this in reverse, we want to put the first digit encountered into the
                                ones place and the second digit encountered into the tens place.
3823                                */
3824
3825                                if ((digIndx + trailingZeroCount) % 2 == 1) {
3826                                    // High-order digit case (tens place)
3827                                    collateVal += (uint8_t)(digVal * 10);
3828
3829                                    // We cannot set leadingZeroIndex unless it has been set for the
3830                                    // low-order digit. Therefore, all we can do for the high-order
3831                                    // digit is turn it off, never on.
3832                                    // The only time we will have a high digit without a low is for
3833                                    // the very first non-zero digit, so no zero check is necessary.
3834                                    if (collateVal != 0)
3835                                        leadingZeroIndex = 0;
3836
3837                                    // The first pass through, digIndx may exceed the limit, but in that case
3838                                    // we no longer care about numTempBuf contents since they will be discarded
3839                                    if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3840                                        numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3841                                    }
3842                                    collateVal = 0;
3843                                } else {
3844                                    // Low-order digit case (ones place)
3845                                    collateVal = (uint8_t)digVal;
3846
3847                                    // Check for leading zeroes.
3848                                    if (collateVal == 0) {
3849                                        if (!leadingZeroIndex)
3850                                            leadingZeroIndex = (digIndx/2) + 2;
3851                                    } else
3852                                        leadingZeroIndex = 0;
3853
3854                                    // No need to write to buffer; the case of a last odd digit
3855                                    // is handled below.
3856                                }
3857                                ++digIndx;
3858                            } else
3859                                ++trailingZeroCount;
3860
3861                            if (!collIter_bos(source)) {
3862                                ch = getPrevNormalizedChar(source, status);
3863                                //goBackOne(source);
3864                                if (U16_IS_TRAIL(ch)) {
3865                                    backupState(source, &state);
3866                                    if (!collIter_bos(source)) {
3867                                        goBackOne(source);
3868                                        UChar lead = getPrevNormalizedChar(source, status);
3869
3870                                        if(U16_IS_LEAD(lead)) {
3871                                            char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3872                                        } else {
3873                                            loadState(source, &state, FALSE);
3874                                            char32 = ch;
3875                                        }
3876                                    }
3877                                } else
3878                                    char32 = ch;
3879
3880                                if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3881                                    if (char32 > 0xFFFF) {// For surrogates.
3882                                        loadState(source, &state, FALSE);
3883                                    }
3884                                    // Don't need to "reverse" the goBackOne call,
3885                                    // as this points to the next position to process..
3886                                    //if (char32 > 0xFFFF) // For surrogates.
3887                                    //getNextNormalizedChar(source);
3888                                    break;
3889                                }
3890
3891                                goBackOne(source);
3892                            }else
3893                                break;
3894                        }
3895
3896                        if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3897                            // our collation element is not too big, go ahead and finish with it
3898                            break;
3899                        }
3900                        // our digit string is too long for a collation element;
3901                        // set the limit for it, reset the state and begin again
3902                        ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3903                        if ( ceLimit == 0 ) {
3904                            ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3905                        }
3906                        ch = initial_ch;
3907                        loadState(source, &initialState, FALSE);
3908                        digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3909                        collateVal = 0;
3910                        nonZeroValReached = FALSE;
3911                    }
3912
3913                    if (! nonZeroValReached) {
3914                        digIndx = 2;
3915                        trailingZeroCount = 0;
3916                        numTempBuf[2] = 6;
3917                    }
3918
3919                    if ((digIndx + trailingZeroCount) % 2 != 0) {
3920                        numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3921                        digIndx += 1;       // The implicit leading zero
3922                    }
3923                    if (trailingZeroCount % 2 != 0) {
3924                        // We had to consume one trailing zero for the low digit
3925                        // of the least significant byte
3926                        digIndx += 1;       // The trailing zero not in the exponent
3927                        trailingZeroCount -= 1;
3928                    }
3929
3930                    endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3931
3932                    // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3933                    numTempBuf[2] -= 1;
3934
3935                    /*
3936                    We want to skip over the first two slots in the buffer. The first slot
3937                    is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3938                    sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3939                    The exponent must be adjusted by the number of leading zeroes, and the number of
3940                    trailing zeroes.
3941                    */
3942                    numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3943                    uint32_t exponent = (digIndx+trailingZeroCount)/2;
3944                    if (leadingZeroIndex)
3945                        exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3946                    numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3947
3948                    // Now transfer the collation key to our collIterate struct.
3949                    // The total size for our collation key is half of endIndex, rounded up.
3950                    int32_t size = (endIndex+1)/2;
3951                    if(!ensureCEsCapacity(source, size)) {
3952                        return UCOL_NULLORDER;
3953                    }
3954                    *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3955                        (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3956                        UCOL_BYTE_COMMON; // Tertiary weight.
3957                    i = endIndex - 1; // Reset the index into the buffer.
3958                    while(i >= 2) {
3959                        uint32_t primWeight = numTempBuf[i--] << 8;
3960                        if ( i >= 2)
3961                            primWeight |= numTempBuf[i--];
3962                        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3963                    }
3964
3965                    source->toReturn = source->CEpos -1;
3966                    return *(source->toReturn);
3967                } else {
3968                    CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3969                    CE = *(CEOffset++);
3970                    break;
3971                }
3972            }
3973
3974        case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3975            {
3976                static const uint32_t
3977                    SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3978                //const uint32_t LCount = 19;
3979                static const uint32_t VCount = 21;
3980                static const uint32_t TCount = 28;
3981                //const uint32_t NCount = VCount * TCount;   /* 588 */
3982                //const uint32_t SCount = LCount * NCount;   /* 11172 */
3983
3984                uint32_t L = ch - SBase;
3985                /*
3986                divide into pieces.
3987                we do it in this order since some compilers can do % and / in one
3988                operation
3989                */
3990                uint32_t T = L % TCount;
3991                L /= TCount;
3992                uint32_t V = L % VCount;
3993                L /= VCount;
3994
3995                /* offset them */
3996                L += LBase;
3997                V += VBase;
3998                T += TBase;
3999
4000                int32_t firstOffset = (int32_t)(source->pos - source->string);
4001                source->appendOffset(firstOffset, *status);
4002
4003                /*
4004                 * return the first CE, but first put the rest into the expansion buffer
4005                 */
4006                if (!source->coll->image->jamoSpecial) {
4007                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
4008                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
4009                    source->appendOffset(firstOffset + 1, *status);
4010
4011                    if (T != TBase) {
4012                        *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
4013                        source->appendOffset(firstOffset + 1, *status);
4014                    }
4015
4016                    source->toReturn = source->CEpos - 1;
4017
4018                    source->offsetReturn = source->offsetStore - 1;
4019                    if (source->offsetReturn == source->offsetBuffer) {
4020                        source->offsetStore = source->offsetBuffer;
4021                    }
4022
4023                    return *(source->toReturn);
4024                } else {
4025                    // Since Hanguls pass the FCD check, it is
4026                    // guaranteed that we won't be in
4027                    // the normalization buffer if something like this happens
4028
4029                    // Move Jamos into normalization buffer
4030                    UChar *tempbuffer = source->writableBuffer.getBuffer(5);
4031                    int32_t tempbufferLength, jamoOffset;
4032                    tempbuffer[0] = 0;
4033                    tempbuffer[1] = (UChar)L;
4034                    tempbuffer[2] = (UChar)V;
4035                    if (T != TBase) {
4036                        tempbuffer[3] = (UChar)T;
4037                        tempbufferLength = 4;
4038                    } else {
4039                        tempbufferLength = 3;
4040                    }
4041                    source->writableBuffer.releaseBuffer(tempbufferLength);
4042
4043                    // Indicate where to continue in main input string after exhausting the writableBuffer
4044                    if (source->pos  == source->string) {
4045                        jamoOffset = 0;
4046                        source->fcdPosition = NULL;
4047                    } else {
4048                        jamoOffset = source->pos - source->string;
4049                        source->fcdPosition       = source->pos-1;
4050                    }
4051
4052                    // Append offsets for the additional chars
4053                    // (not the 0, and not the L whose offsets match the original Hangul)
4054                    int32_t jamoRemaining = tempbufferLength - 2;
4055                    jamoOffset++; // appended offsets should match end of original Hangul
4056                    while (jamoRemaining-- > 0) {
4057                        source->appendOffset(jamoOffset, *status);
4058                    }
4059
4060                    source->offsetRepeatValue = jamoOffset;
4061
4062                    source->offsetReturn = source->offsetStore - 1;
4063                    if (source->offsetReturn == source->offsetBuffer) {
4064                        source->offsetStore = source->offsetBuffer;
4065                    }
4066
4067                    source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
4068                    source->origFlags         = source->flags;
4069                    source->flags            |= UCOL_ITER_INNORMBUF;
4070                    source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4071
4072                    return(UCOL_IGNORABLE);
4073                }
4074            }
4075
4076        case IMPLICIT_TAG:        /* everything that is not defined otherwise */
4077            return getPrevImplicit(ch, source);
4078
4079            // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4080        case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4081            return getPrevImplicit(ch, source);
4082
4083        case SURROGATE_TAG:  /* This is a surrogate pair */
4084            /* essentially an engaged lead surrogate. */
4085            /* if you have encountered it here, it means that a */
4086            /* broken sequence was encountered and this is an error */
4087            return UCOL_NOT_FOUND;
4088
4089        case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
4090            return UCOL_NOT_FOUND; /* broken surrogate sequence */
4091
4092        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4093            {
4094                UChar32 cp = 0;
4095                UChar  prevChar;
4096                const UChar *prev;
4097                if (isAtStartPrevIterate(source)) {
4098                    /* we are at the start of the string, wrong place to be at */
4099                    return UCOL_NOT_FOUND;
4100                }
4101                if (source->pos != source->writableBuffer.getBuffer()) {
4102                    prev     = source->pos - 1;
4103                } else {
4104                    prev     = source->fcdPosition;
4105                }
4106                prevChar = *prev;
4107
4108                /* Handles Han and Supplementary characters here.*/
4109                if (U16_IS_LEAD(prevChar)) {
4110                    cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4111                    source->pos = prev;
4112                } else {
4113                    return UCOL_NOT_FOUND; /* like unassigned */
4114                }
4115
4116                return getPrevImplicit(cp, source);
4117            }
4118
4119            /* UCA is filled with these. Tailorings are NOT_FOUND */
4120            /* not yet implemented */
4121        case CHARSET_TAG:  /* this tag always returns */
4122            /* probably after 1.8 */
4123            return UCOL_NOT_FOUND;
4124
4125        default:           /* this tag always returns */
4126            *status = U_INTERNAL_PROGRAM_ERROR;
4127            CE=0;
4128            break;
4129        }
4130
4131        if (CE <= UCOL_NOT_FOUND) {
4132            break;
4133        }
4134    }
4135
4136    return CE;
4137}
4138
/* This used to be a function (kept below, commented out); it is now the macro that follows it.      */
/* It is used to reverse parts of a buffer. We need this operation when doing continuation           */
/* secondaries in French.                                                                             */
4142/*
4143void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4144  uint8_t temp;
4145  while(start<end) {
4146    temp = *start;
4147    *start++ = *end;
4148    *end-- = temp;
4149  }
4150}
4151*/
4152
/*
 * Reverses, in place, the elements lying between the pointers start and end
 * (both ends inclusive). TYPE is the element type used for the swap temporary.
 *
 * start and end must be modifiable pointer lvalues: the macro walks them
 * towards each other and leaves them where they met or crossed.
 * The expansion is a brace-enclosed compound statement.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    TYPE swapTmp; \
    for (; (start) < (end); ++(start), --(end)) { \
        swapTmp = *(start); \
        *(start) = *(end); \
        *(end) = swapTmp; \
    } \
}
4161
4162/****************************************************************************/
4163/* Following are the sortkey generation functions                           */
4164/*                                                                          */
4165/****************************************************************************/
4166
4167/**
4168 * Merge two sort keys.
4169 * This is useful, for example, to combine sort keys from first and last names
4170 * to sort such pairs.
4171 * Merged sort keys consider on each collation level the first part first entirely,
4172 * then the second one.
4173 * It is possible to merge multiple sort keys by consecutively merging
4174 * another one with the intermediate result.
4175 *
4176 * The length of the merge result is the sum of the lengths of the input sort keys
4177 * minus 1.
4178 *
4179 * @param src1 the first sort key
4180 * @param src1Length the length of the first sort key, including the zero byte at the end;
4181 *        can be -1 if the function is to find the length
4182 * @param src2 the second sort key
4183 * @param src2Length the length of the second sort key, including the zero byte at the end;
4184 *        can be -1 if the function is to find the length
4185 * @param dest the buffer where the merged sort key is written,
4186 *        can be NULL if destCapacity==0
4187 * @param destCapacity the number of bytes in the dest buffer
4188 * @return the length of the merged sort key, src1Length+src2Length-1;
4189 *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4190 *         in which cases the contents of dest is undefined
4191 *
4192 * @draft
4193 */
4194U_CAPI int32_t U_EXPORT2
4195ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4196                   const uint8_t *src2, int32_t src2Length,
4197                   uint8_t *dest, int32_t destCapacity) {
4198    int32_t destLength;
4199    uint8_t b;
4200
4201    /* check arguments */
4202    if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4203        src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4204        destCapacity<0 || (destCapacity>0 && dest==NULL)
4205    ) {
4206        /* error, attempt to write a zero byte and return 0 */
4207        if(dest!=NULL && destCapacity>0) {
4208            *dest=0;
4209        }
4210        return 0;
4211    }
4212
4213    /* check lengths and capacity */
4214    if(src1Length<0) {
4215        src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4216    }
4217    if(src2Length<0) {
4218        src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4219    }
4220
4221    destLength=src1Length+src2Length-1;
4222    if(destLength>destCapacity) {
4223        /* the merged sort key does not fit into the destination */
4224        return destLength;
4225    }
4226
4227    /* merge the sort keys with the same number of levels */
4228    while(*src1!=0 && *src2!=0) { /* while both have another level */
4229        /* copy level from src1 not including 00 or 01 */
4230        while((b=*src1)>=2) {
4231            ++src1;
4232            *dest++=b;
4233        }
4234
4235        /* add a 02 merge separator */
4236        *dest++=2;
4237
4238        /* copy level from src2 not including 00 or 01 */
4239        while((b=*src2)>=2) {
4240            ++src2;
4241            *dest++=b;
4242        }
4243
4244        /* if both sort keys have another level, then add a 01 level separator and continue */
4245        if(*src1==1 && *src2==1) {
4246            ++src1;
4247            ++src2;
4248            *dest++=1;
4249        }
4250    }
4251
4252    /*
4253     * here, at least one sort key is finished now, but the other one
4254     * might have some contents left from containing more levels;
4255     * that contents is just appended to the result
4256     */
4257    if(*src1!=0) {
4258        /* src1 is not finished, therefore *src2==0, and src1 is appended */
4259        src2=src1;
4260    }
4261    /* append src2, "the other, unfinished sort key" */
4262    uprv_strcpy((char *)dest, (const char *)src2);
4263
4264    /* trust that neither sort key contained illegally embedded zero bytes */
4265    return destLength;
4266}
4267
4268U_NAMESPACE_BEGIN
4269
4270class SortKeyByteSink : public ByteSink {
4271public:
4272    static const uint32_t FILL_ORIGINAL_BUFFER = 1;
4273    static const uint32_t DONT_GROW = 2;
4274    SortKeyByteSink(char *dest, int32_t destCapacity, uint32_t flags=0)
4275            : ownedBuffer_(NULL), buffer_(dest), capacity_(destCapacity),
4276              appended_(0),
4277              fill_(flags & FILL_ORIGINAL_BUFFER),
4278              grow_((flags & DONT_GROW) == 0) {
4279        if (buffer_ == NULL || capacity_ < 0) {
4280            buffer_ = reinterpret_cast<char *>(&lastResortByte_);
4281            capacity_ = 0;
4282        }
4283    }
4284    virtual ~SortKeyByteSink() { uprv_free(ownedBuffer_); }
4285
4286    virtual void Append(const char *bytes, int32_t n);
4287    void Append(const uint8_t *bytes, int32_t n) { Append(reinterpret_cast<const char *>(bytes), n); }
4288    void Append(uint8_t b) {
4289        if (appended_ < capacity_) {
4290            buffer_[appended_++] = (char)b;
4291        } else {
4292            Append(&b, 1);
4293        }
4294    }
4295    void Append(uint8_t b1, uint8_t b2) {
4296        int32_t a2 = appended_ + 2;
4297        if (a2 <= capacity_) {
4298            buffer_[appended_] = (char)b1;
4299            buffer_[appended_ + 1] = (char)b2;
4300            appended_ = a2;
4301        } else {
4302            char bytes[2] = { (char)b1, (char)b2 };
4303            Append(bytes, 2);
4304        }
4305    }
4306    void Append(const SortKeyByteSink &other) { Append(other.buffer_, other.appended_); }
4307    virtual char *GetAppendBuffer(int32_t min_capacity,
4308                                  int32_t desired_capacity_hint,
4309                                  char *scratch, int32_t scratch_capacity,
4310                                  int32_t *result_capacity);
4311    int32_t NumberOfBytesAppended() const { return appended_; }
4312    uint8_t &LastByte() {
4313        if (buffer_ != NULL && appended_ > 0) {
4314            return reinterpret_cast<uint8_t *>(buffer_)[appended_ - 1];
4315        } else {
4316            return lastResortByte_;
4317        }
4318    }
4319    uint8_t *GetLastFewBytes(int32_t n) {
4320        if (buffer_ != NULL && appended_ >= n) {
4321            return reinterpret_cast<uint8_t *>(buffer_) + appended_ - n;
4322        } else {
4323            return NULL;
4324        }
4325    }
4326    char *GetBuffer() { return buffer_; }
4327    uint8_t *GetUnsignedBuffer() { return reinterpret_cast<uint8_t *>(buffer_); }
    // Hands a heap buffer holding the appended bytes to the caller and reports
    // its capacity through orphanedCapacity. Defined out of line.
    uint8_t *OrphanUnsignedBuffer(int32_t &orphanedCapacity);
4329    UBool IsOk() const { return buffer_ != NULL; }  // otherwise out-of-memory
4330
4331private:
    // Copying is disabled: both members are declared private and left unimplemented.
    SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
    SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
4334
    // Tries to move the first `length` bytes into a larger heap buffer with room
    // for about appendCapacity more; returns FALSE if growing is disabled or
    // allocation fails. Defined out of line.
    UBool Resize(int32_t appendCapacity, int32_t length);
4336    void SetNotOk() {
4337        buffer_ = NULL;
4338        capacity_ = 0;
4339    }
4340
    static uint8_t lastResortByte_;  // last-resort return value from LastByte()

    char *ownedBuffer_;  // heap buffer installed by Resize(); freed by the destructor
    char *buffer_;       // current write target; NULL after SetNotOk() (out-of-memory)
    int32_t capacity_;   // size of buffer_ in bytes; 0 after SetNotOk()
    int32_t appended_;   // number of bytes passed to Append() so far
    UBool fill_;         // if set, an overflowing Append() first fills the remaining space in buffer_
    UBool grow_;         // if not set, Resize() always fails
4349};
4350
// Storage for the static fallback byte that LastByte() returns when the sink
// has no buffer or nothing has been appended.
uint8_t SortKeyByteSink::lastResortByte_ = 0;
4352
4353void
4354SortKeyByteSink::Append(const char *bytes, int32_t n) {
4355    if (n <= 0) {
4356        return;
4357    }
4358    int32_t length = appended_;
4359    appended_ += n;
4360    if ((buffer_ + length) == bytes) {
4361        return;  // the caller used GetAppendBuffer() and wrote the bytes already
4362    }
4363    if (buffer_ == NULL) {
4364        return;  // allocation failed before already
4365    }
4366    int32_t available = capacity_ - length;
4367    if (bytes == NULL) {
4368        // assume that the caller failed to allocate memory
4369        if (fill_) {
4370            if (n > available) {
4371                n = available;
4372            }
4373            uprv_memset(buffer_, 0, n);
4374        }
4375        SetNotOk();  // propagate the out-of-memory error
4376        return;
4377    }
4378    if (n > available) {
4379        if (fill_ && available > 0) {
4380            // Fill the original buffer completely.
4381            uprv_memcpy(buffer_ + length, bytes, available);
4382            bytes += available;
4383            length += available;
4384            n -= available;
4385            available = 0;
4386        }
4387        fill_ = FALSE;
4388        if (!Resize(n, length)) {
4389            SetNotOk();
4390            return;
4391        }
4392    }
4393    uprv_memcpy(buffer_ + length, bytes, n);
4394}
4395
4396char *
4397SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
4398                                 int32_t desired_capacity_hint,
4399                                 char *scratch,
4400                                 int32_t scratch_capacity,
4401                                 int32_t *result_capacity) {
4402    if (min_capacity < 1 || scratch_capacity < min_capacity) {
4403        *result_capacity = 0;
4404        return NULL;
4405    }
4406    int32_t available = capacity_ - appended_;
4407    if (available >= min_capacity) {
4408        *result_capacity = available;
4409        return buffer_ + appended_;
4410    } else if (Resize(desired_capacity_hint, appended_)) {
4411        *result_capacity = capacity_ - appended_;
4412        return buffer_ + appended_;
4413    } else {
4414        *result_capacity = scratch_capacity;
4415        return scratch;
4416    }
4417}
4418
4419UBool
4420SortKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
4421    if (!grow_) {
4422        return FALSE;
4423    }
4424    int32_t newCapacity = 2 * capacity_;
4425    int32_t altCapacity = length + 2 * appendCapacity;
4426    if (newCapacity < altCapacity) {
4427        newCapacity = altCapacity;
4428    }
4429    if (newCapacity < 1024) {
4430        newCapacity = 1024;
4431    }
4432    char *newBuffer = (char *)uprv_malloc(newCapacity);
4433    if (newBuffer == NULL) {
4434        return FALSE;
4435    }
4436    uprv_memcpy(newBuffer, buffer_, length);
4437    uprv_free(ownedBuffer_);
4438    ownedBuffer_ = buffer_ = newBuffer;
4439    capacity_ = newCapacity;
4440    return TRUE;
4441}
4442
4443uint8_t *
4444SortKeyByteSink::OrphanUnsignedBuffer(int32_t &orphanedCapacity) {
4445    if (buffer_ == NULL || appended_ == 0) {
4446        orphanedCapacity = 0;
4447        return NULL;
4448    }
4449    if (ownedBuffer_ != NULL) {
4450        // orphan & forget the ownedBuffer_
4451        uint8_t *returnBuffer = reinterpret_cast<uint8_t *>(ownedBuffer_);
4452        ownedBuffer_ = buffer_ = NULL;
4453        orphanedCapacity = capacity_;
4454        capacity_ = appended_ = 0;
4455        return returnBuffer;
4456    }
4457    // clone the buffer_
4458    uint8_t *newBuffer = (uint8_t *)uprv_malloc(appended_);
4459    if (newBuffer == NULL) {
4460        orphanedCapacity = 0;
4461        return NULL;
4462    }
4463    uprv_memcpy(newBuffer, buffer_, appended_);
4464    orphanedCapacity = appended_;
4465    return newBuffer;
4466}
4467
4468U_NAMESPACE_END
4469
4470/* sortkey API */
4471U_CAPI int32_t U_EXPORT2
4472ucol_getSortKey(const    UCollator    *coll,
4473        const    UChar        *source,
4474        int32_t        sourceLength,
4475        uint8_t        *result,
4476        int32_t        resultLength)
4477{
4478    UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4479    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4480        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4481            ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4482    }
4483
4484    UErrorCode status = U_ZERO_ERROR;
4485    int32_t keySize   = 0;
4486
4487    if(source != NULL) {
4488        // source == NULL is actually an error situation, but we would need to
4489        // have an error code to return it. Until we introduce a new
4490        // API, it stays like this
4491
4492        /* this uses the function pointer that is set in updateinternalstate */
4493        /* currently, there are two funcs: */
4494        /*ucol_calcSortKey(...);*/
4495        /*ucol_calcSortKeySimpleTertiary(...);*/
4496
4497        SortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength,
4498                             SortKeyByteSink::FILL_ORIGINAL_BUFFER | SortKeyByteSink::DONT_GROW);
4499        coll->sortKeyGen(coll, source, sourceLength, sink, &status);
4500        keySize = sink.NumberOfBytesAppended();
4501    }
4502    UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4503    UTRACE_EXIT_STATUS(status);
4504    return keySize;
4505}
4506
4507/* this function is called by the C++ API for sortkey generation */
4508U_CFUNC int32_t
4509ucol_getSortKeyWithAllocation(const UCollator *coll,
4510                              const UChar *source, int32_t sourceLength,
4511                              uint8_t *&result, int32_t &resultCapacity,
4512                              UErrorCode *pErrorCode) {
4513    SortKeyByteSink sink(reinterpret_cast<char *>(result), resultCapacity);
4514    coll->sortKeyGen(coll, source, sourceLength, sink, pErrorCode);
4515    int32_t resultLen = sink.NumberOfBytesAppended();
4516    if (U_SUCCESS(*pErrorCode)) {
4517        if (!sink.IsOk()) {
4518            *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
4519        } else if (result != sink.GetUnsignedBuffer()) {
4520            result = sink.OrphanUnsignedBuffer(resultCapacity);
4521        }
4522    }
4523    return resultLen;
4524}
4525
4526// Is this primary weight compressible?
4527// Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4528// TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4529static inline UBool
4530isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4531    return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
4532}
4533
4534static
4535inline void doCaseShift(SortKeyByteSink &cases, uint32_t &caseShift) {
4536    if (caseShift  == 0) {
4537        cases.Append(UCOL_CASE_BYTE_START);
4538        caseShift = UCOL_CASE_SHIFT_START;
4539    }
4540}
4541
4542// Packs the secondary buffer when processing French locale.
4543static void
4544packFrench(uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
4545    secondaries += secsize;  // We read the secondary-level bytes back to front.
4546    uint8_t secondary;
4547    int32_t count2 = 0;
4548    int32_t i = 0;
4549    // we use i here since the key size already accounts for terminators, so we'll discard the increment
4550    for(i = 0; i<secsize; i++) {
4551        secondary = *(secondaries-i-1);
4552        /* This is compression code. */
4553        if (secondary == UCOL_COMMON2) {
4554            ++count2;
4555        } else {
4556            if (count2 > 0) {
4557                if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4558                    while (count2 > UCOL_TOP_COUNT2) {
4559                        result.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4560                        count2 -= (uint32_t)UCOL_TOP_COUNT2;
4561                    }
4562                    result.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4563                } else {
4564                    while (count2 > UCOL_BOT_COUNT2) {
4565                        result.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4566                        count2 -= (uint32_t)UCOL_BOT_COUNT2;
4567                    }
4568                    result.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4569                }
4570                count2 = 0;
4571            }
4572            result.Append(secondary);
4573        }
4574    }
4575    if (count2 > 0) {
4576        while (count2 > UCOL_BOT_COUNT2) {
4577            result.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4578            count2 -= (uint32_t)UCOL_BOT_COUNT2;
4579        }
4580        result.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4581    }
4582}
4583
4584#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4585
4586/* This is the sortkey work horse function */
/*
 * Builds the complete sort key for source and appends it to result.
 *
 * Primary weight bytes are appended to result directly while the collation
 * elements are iterated. Secondary, tertiary, case and quaternary bytes are
 * collected in separate sinks that start out on stack buffers and are appended
 * to result afterwards, each preceded by UCOL_LEVELTERMINATOR. The key ends
 * with a 0 byte.
 *
 * coll         - collator whose attributes select the levels and options
 * source       - input text
 * sourceLength - length of source, or -1 if it is NUL-terminated
 * result       - sink that receives the key
 * status       - in/out error code; nothing is done if it already indicates
 *                failure, and the levels are only assembled if it still
 *                indicates success after iteration
 */
U_CFUNC void U_CALLCONV
ucol_calcSortKey(const    UCollator    *coll,
        const    UChar        *source,
        int32_t        sourceLength,
        SortKeyByteSink &result,
        UErrorCode *status)
{
    if(U_FAILURE(*status)) {
        return;
    }

    /* Stack allocated buffers for buffers we use */
    char second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
    char caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];

    // The primary level goes straight into the result; the other levels are buffered.
    SortKeyByteSink &primaries = result;
    SortKeyByteSink secondaries(second, LENGTHOF(second));
    SortKeyByteSink tertiaries(tert, LENGTHOF(tert));
    SortKeyByteSink cases(caseB, LENGTHOF(caseB));
    SortKeyByteSink quads(quad, LENGTHOF(quad));

    UnicodeString normSource;

    int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);

    UColAttributeValue strength = coll->strength;

    // compareXxx is 0 when that level is part of the key and 0xFF when it is not.
    // A uint8_t weight is never greater than 0xFF, so "weight > compareXxx"
    // is false for every weight when the level is switched off.
    uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
    uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
    uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
    UBool  compareIdent = (strength == UCOL_IDENTICAL);
    UBool  doCase = (coll->caseLevel == UCOL_ON);
    UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
    UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
    //UBool  qShifted = shifted && (compareQuad == 0);
    UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);

    uint32_t variableTopValue = coll->variableTopValue;
    // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
    // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
    uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
    uint8_t UCOL_HIRAGANA_QUAD = 0;
    if(doHiragana) {
        UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
        /* allocate one more space for hiragana, value for hiragana */
    }
    uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);

    /* support for special features like caselevel and funky secondaries */
    int32_t lastSecondaryLength = 0;
    uint32_t caseShift = 0;

    /* If we need to normalize, we'll do it all at once at the beginning! */
    // Identical strength uses the NFD normalizer, normalization mode the FCD one.
    const Normalizer2 *norm2;
    if(compareIdent) {
        norm2 = Normalizer2Factory::getNFDInstance(*status);
    } else if(coll->normalizationMode != UCOL_OFF) {
        norm2 = Normalizer2Factory::getFCDInstance(*status);
    } else {
        norm2 = NULL;
    }
    if(norm2 != NULL) {
        normSource.setTo(FALSE, source, len);
        int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
        if(qcYesLength != len) {
            // Only the part after the quick-check-yes prefix is normalized;
            // from here on source/len refer to the normalized copy.
            UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
            normSource.truncate(qcYesLength);
            norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
            source = normSource.getBuffer();
            len = normSource.length();
        }
    }
    collIterate s;
    IInit_collIterate(coll, source, len, &s, status);
    if(U_FAILURE(*status)) {
        return;
    }
    s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.

    uint32_t order = 0;

    uint8_t primary1 = 0;
    uint8_t primary2 = 0;
    uint8_t secondary = 0;
    uint8_t tertiary = 0;
    uint8_t caseSwitch = coll->caseSwitch;
    uint8_t tertiaryMask = coll->tertiaryMask;
    int8_t tertiaryAddition = coll->tertiaryAddition;
    uint8_t tertiaryTop = coll->tertiaryTop;
    uint8_t tertiaryBottom = coll->tertiaryBottom;
    uint8_t tertiaryCommon = coll->tertiaryCommon;
    uint8_t caseBits = 0;

    UBool wasShifted = FALSE;
    UBool notIsContinuation = FALSE;

    // Pending run lengths of "common" weights on the secondary, tertiary and
    // quaternary levels; they are flushed as compressed bytes.
    uint32_t count2 = 0, count3 = 0, count4 = 0;
    // Lead byte of the current run of compressed primaries; 0 when there is none.
    uint8_t leadPrimary = 0;

    for(;;) {
        order = ucol_IGetNextCE(coll, &s, status);
        if(order == UCOL_NO_MORE_CES) {
            break;
        }

        if(order == 0) {
            continue;
        }

        notIsContinuation = !isContinuation(order);

        if(notIsContinuation) {
            tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
        } else {
            tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
        }

        // Split the CE into its bytes. Note that order is shifted in place:
        // after these lines it holds only the upper 16 bits of the CE.
        secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
        primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
        primary1 = (uint8_t)(order >> 8);

        // Compressibility is decided on the original lead byte, before the
        // permutation table is applied.
        uint8_t originalPrimary1 = primary1;
        if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
            primary1 = coll->leadBytePermutationTable[primary1];
        }

        if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
                        || (!notIsContinuation && wasShifted)))
            || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
        {
            /* and other ignorables should be removed if following a shifted code point */
            if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
                /* we should just completely ignore it */
                continue;
            }
            if(compareQuad == 0) {
                // Flush the pending run of common quaternaries first.
                if(count4 > 0) {
                    while (count4 > UCOL_BOT_COUNT4) {
                        quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4));
                        count4 -= UCOL_BOT_COUNT4;
                    }
                    quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1)));
                    count4 = 0;
                }
                /* We are dealing with a variable and we're treating them as shifted */
                /* This is a shifted ignorable */
                if(primary1 != 0) { /* we need to check this since we could be in continuation */
                    quads.Append(primary1);
                }
                if(primary2 != 0) {
                    quads.Append(primary2);
                }
            }
            wasShifted = TRUE;
        } else {
            wasShifted = FALSE;
            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
            /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
            /* be zero with non zero primary1. */
            /* regular and simple sortkey calc */
            if(primary1 != UCOL_IGNORABLE) {
                if(notIsContinuation) {
                    if(leadPrimary == primary1) {
                        // Same lead byte as the previous primary: write only the second byte.
                        primaries.Append(primary2);
                    } else {
                        if(leadPrimary != 0) {
                            // Close the compressed run; the marker depends on whether
                            // the new lead byte sorts above or below the old one.
                            primaries.Append((uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN));
                        }
                        if(primary2 == UCOL_IGNORABLE) {
                            /* one byter, not compressed */
                            primaries.Append(primary1);
                            leadPrimary = 0;
                        } else if(isCompressible(coll, originalPrimary1)) {
                            /* compress */
                            primaries.Append(leadPrimary = primary1, primary2);
                        } else {
                            leadPrimary = 0;
                            primaries.Append(primary1, primary2);
                        }
                    }
                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
                    if(primary2 == UCOL_IGNORABLE) {
                        primaries.Append(primary1);
                    } else {
                        primaries.Append(primary1, primary2);
                    }
                }
            }

            if(secondary > compareSec) {
                if(!isFrenchSec) {
                    /* This is compression code. */
                    if (secondary == UCOL_COMMON2 && notIsContinuation) {
                        ++count2;
                    } else {
                        if (count2 > 0) {
                            if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
                                while (count2 > UCOL_TOP_COUNT2) {
                                    secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
                                    count2 -= (uint32_t)UCOL_TOP_COUNT2;
                                }
                                secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
                            } else {
                                while (count2 > UCOL_BOT_COUNT2) {
                                    secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
                                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
                                }
                                secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
                            }
                            count2 = 0;
                        }
                        secondaries.Append(secondary);
                    }
                } else {
                    /* Do the special handling for French secondaries */
                    /* We need to get continuation elements and do intermediate restore */
                    /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
                    // French secondaries are stored uncompressed here; compression
                    // and the overall reversal are done by packFrench() at the end.
                    if(notIsContinuation) {
                        if (lastSecondaryLength > 1) {
                            uint8_t *frenchStartPtr = secondaries.GetLastFewBytes(lastSecondaryLength);
                            if (frenchStartPtr != NULL) {
                                /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
                                uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
                                uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
                            }
                        }
                        lastSecondaryLength = 1;
                    } else {
                        ++lastSecondaryLength;
                    }
                    secondaries.Append(secondary);
                }
            }

            if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
                // do the case level if we need to do it. We don't want to calculate
                // case level for primary ignorables if we have only primary strength and case level
                // otherwise we would break well formedness of CEs
                // Case information is packed bit by bit into the last byte of cases;
                // doCaseShift() starts a new byte whenever caseShift has run down to 0.
                doCaseShift(cases, caseShift);
                if(notIsContinuation) {
                    caseBits = (uint8_t)(tertiary & 0xC0);

                    if(tertiary != 0) {
                        if(coll->caseFirst == UCOL_UPPER_FIRST) {
                            if((caseBits & 0xC0) == 0) {
                                cases.LastByte() |= 1 << (--caseShift);
                            } else {
                                cases.LastByte() |= 0 << (--caseShift);
                                /* second bit */
                                doCaseShift(cases, caseShift);
                                cases.LastByte() |= ((caseBits>>6)&1) << (--caseShift);
                            }
                        } else {
                            if((caseBits & 0xC0) == 0) {
                                cases.LastByte() |= 0 << (--caseShift);
                            } else {
                                cases.LastByte() |= 1 << (--caseShift);
                                /* second bit */
                                doCaseShift(cases, caseShift);
                                cases.LastByte() |= ((caseBits>>7)&1) << (--caseShift);
                            }
                        }
                    }
                }
            } else {
                if(notIsContinuation) {
                    tertiary ^= caseSwitch;
                }
            }

            tertiary &= tertiaryMask;
            if(tertiary > compareTer) {
                /* This is compression code. */
                /* sequence size check is included in the if clause */
                if (tertiary == tertiaryCommon && notIsContinuation) {
                    ++count3;
                } else {
                    if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
                        tertiary += tertiaryAddition;
                    } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
                        tertiary -= tertiaryAddition;
                    }
                    if (count3 > 0) {
                        if ((tertiary > tertiaryCommon)) {
                            while (count3 > coll->tertiaryTopCount) {
                                tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
                                count3 -= (uint32_t)coll->tertiaryTopCount;
                            }
                            tertiaries.Append((uint8_t)(tertiaryTop - (count3-1)));
                        } else {
                            while (count3 > coll->tertiaryBottomCount) {
                                tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
                                count3 -= (uint32_t)coll->tertiaryBottomCount;
                            }
                            tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
                        }
                        count3 = 0;
                    }
                    tertiaries.Append(tertiary);
                }
            }

            if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
                if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
                    if(count4>0) { // Close this part
                        while (count4 > UCOL_BOT_COUNT4) {
                            quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4));
                            count4 -= UCOL_BOT_COUNT4;
                        }
                        quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1)));
                        count4 = 0;
                    }
                    quads.Append(UCOL_HIRAGANA_QUAD); // Add the Hiragana
                } else { // This wasn't Hiragana, so we can continue adding stuff
                    count4++;
                }
            }
        }
    }

    /* Here, we are generally done with processing */
    /* bailing out would not be too productive */

    if(U_SUCCESS(*status)) {
        /* we have done all the CE's, now let's put them together to form a key */
        if(compareSec == 0) {
            // Flush the trailing run of common secondaries.
            if (count2 > 0) {
                while (count2 > UCOL_BOT_COUNT2) {
                    secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
                }
                secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
            }
            result.Append(UCOL_LEVELTERMINATOR);
            // A secondaries sink that is not OK is appended as is, even for
            // French, so packFrench() is never handed a NULL buffer.
            if(!isFrenchSec || !secondaries.IsOk()) {
                result.Append(secondaries);
            } else {
                // If there are any unresolved continuation secondaries,
                // reverse them here so that we can reverse the whole secondary thing.
                if (lastSecondaryLength > 1) {
                    uint8_t *frenchStartPtr = secondaries.GetLastFewBytes(lastSecondaryLength);
                    if (frenchStartPtr != NULL) {
                        /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
                        uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
                        uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
                    }
                }
                packFrench(secondaries.GetUnsignedBuffer(), secondaries.NumberOfBytesAppended(), result);
            }
        }

        if(doCase) {
            result.Append(UCOL_LEVELTERMINATOR);
            result.Append(cases);
        }

        if(compareTer == 0) {
            // Flush the trailing run of common tertiaries.
            if (count3 > 0) {
                if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
                    while (count3 >= coll->tertiaryTopCount) {
                        tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
                        count3 -= (uint32_t)coll->tertiaryTopCount;
                    }
                    tertiaries.Append((uint8_t)(tertiaryTop - count3));
                } else {
                    while (count3 > coll->tertiaryBottomCount) {
                        tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
                        count3 -= (uint32_t)coll->tertiaryBottomCount;
                    }
                    tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
                }
            }
            result.Append(UCOL_LEVELTERMINATOR);
            result.Append(tertiaries);

            // The quaternary and identical levels are only written inside the
            // tertiary block.
            if(compareQuad == 0/*qShifted == TRUE*/) {
                if(count4 > 0) {
                    while (count4 > UCOL_BOT_COUNT4) {
                        quads.Append((uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4));
                        count4 -= UCOL_BOT_COUNT4;
                    }
                    quads.Append((uint8_t)(UCOL_COMMON_BOT4 + (count4-1)));
                }
                result.Append(UCOL_LEVELTERMINATOR);
                result.Append(quads);
            }

            if(compareIdent) {
                result.Append(UCOL_LEVELTERMINATOR);
                u_writeIdenticalLevelRun(s.string, len, result);
            }
        }
        // Terminating 0 byte of the sort key.
        result.Append(0);
    }

    /* To avoid memory leak, free the offset buffer if necessary. */
    ucol_freeOffsetBuffer(&s);
}
4984
4985
4986U_CFUNC void U_CALLCONV
4987ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
4988        const    UChar        *source,
4989        int32_t        sourceLength,
4990        SortKeyByteSink &result,
4991        UErrorCode *status)
4992{
4993    U_ALIGN_CODE(16);
4994
4995    if(U_FAILURE(*status)) {
4996        return;
4997    }
4998
4999    /* Stack allocated buffers for buffers we use */
5000    char second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5001
5002    SortKeyByteSink &primaries = result;
5003    SortKeyByteSink secondaries(second, LENGTHOF(second));
5004    SortKeyByteSink tertiaries(tert, LENGTHOF(tert));
5005
5006    UnicodeString normSource;
5007
5008    int32_t len =  sourceLength;
5009
5010    /* If we need to normalize, we'll do it all at once at the beginning! */
5011    if(coll->normalizationMode != UCOL_OFF) {
5012        normSource.setTo(len < 0, source, len);
5013        const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
5014        int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
5015        if(qcYesLength != normSource.length()) {
5016            UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
5017            normSource.truncate(qcYesLength);
5018            norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
5019            source = normSource.getBuffer();
5020            len = normSource.length();
5021        }
5022    }
5023    collIterate s;
5024    IInit_collIterate(coll, (UChar *)source, len, &s, status);
5025    if(U_FAILURE(*status)) {
5026        return;
5027    }
5028    s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
5029
5030    uint32_t order = 0;
5031
5032    uint8_t primary1 = 0;
5033    uint8_t primary2 = 0;
5034    uint8_t secondary = 0;
5035    uint8_t tertiary = 0;
5036    uint8_t caseSwitch = coll->caseSwitch;
5037    uint8_t tertiaryMask = coll->tertiaryMask;
5038    int8_t tertiaryAddition = coll->tertiaryAddition;
5039    uint8_t tertiaryTop = coll->tertiaryTop;
5040    uint8_t tertiaryBottom = coll->tertiaryBottom;
5041    uint8_t tertiaryCommon = coll->tertiaryCommon;
5042
5043    UBool notIsContinuation = FALSE;
5044
5045    uint32_t count2 = 0, count3 = 0;
5046    uint8_t leadPrimary = 0;
5047
5048    for(;;) {
5049        order = ucol_IGetNextCE(coll, &s, status);
5050
5051        if(order == 0) {
5052            continue;
5053        }
5054
5055        if(order == UCOL_NO_MORE_CES) {
5056            break;
5057        }
5058
5059        notIsContinuation = !isContinuation(order);
5060
5061        if(notIsContinuation) {
5062            tertiary = (uint8_t)((order & tertiaryMask));
5063        } else {
5064            tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5065        }
5066
5067        secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5068        primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5069        primary1 = (uint8_t)(order >> 8);
5070
5071        uint8_t originalPrimary1 = primary1;
5072        if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5073            primary1 = coll->leadBytePermutationTable[primary1];
5074        }
5075
5076        /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5077        /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
5078        /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5079        /* regular and simple sortkey calc */
5080        if(primary1 != UCOL_IGNORABLE) {
5081            if(notIsContinuation) {
5082                if(leadPrimary == primary1) {
5083                    primaries.Append(primary2);
5084                } else {
5085                    if(leadPrimary != 0) {
5086                        primaries.Append((uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN));
5087                    }
5088                    if(primary2 == UCOL_IGNORABLE) {
5089                        /* one byter, not compressed */
5090                        primaries.Append(primary1);
5091                        leadPrimary = 0;
5092                    } else if(isCompressible(coll, originalPrimary1)) {
5093                        /* compress */
5094                        primaries.Append(leadPrimary = primary1, primary2);
5095                    } else {
5096                        leadPrimary = 0;
5097                        primaries.Append(primary1, primary2);
5098                    }
5099                }
5100            } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5101                if(primary2 == UCOL_IGNORABLE) {
5102                    primaries.Append(primary1);
5103                } else {
5104                    primaries.Append(primary1, primary2);
5105                }
5106            }
5107        }
5108
5109        if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5110            /* This is compression code. */
5111            if (secondary == UCOL_COMMON2 && notIsContinuation) {
5112                ++count2;
5113            } else {
5114                if (count2 > 0) {
5115                    if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5116                        while (count2 > UCOL_TOP_COUNT2) {
5117                            secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
5118                            count2 -= (uint32_t)UCOL_TOP_COUNT2;
5119                        }
5120                        secondaries.Append((uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
5121                    } else {
5122                        while (count2 > UCOL_BOT_COUNT2) {
5123                            secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
5124                            count2 -= (uint32_t)UCOL_BOT_COUNT2;
5125                        }
5126                        secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
5127                    }
5128                    count2 = 0;
5129                }
5130                secondaries.Append(secondary);
5131            }
5132        }
5133
5134        if(notIsContinuation) {
5135            tertiary ^= caseSwitch;
5136        }
5137
5138        if(tertiary > 0) {
5139            /* This is compression code. */
5140            /* sequence size check is included in the if clause */
5141            if (tertiary == tertiaryCommon && notIsContinuation) {
5142                ++count3;
5143            } else {
5144                if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5145                    tertiary += tertiaryAddition;
5146                } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5147                    tertiary -= tertiaryAddition;
5148                }
5149                if (count3 > 0) {
5150                    if ((tertiary > tertiaryCommon)) {
5151                        while (count3 > coll->tertiaryTopCount) {
5152                            tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
5153                            count3 -= (uint32_t)coll->tertiaryTopCount;
5154                        }
5155                        tertiaries.Append((uint8_t)(tertiaryTop - (count3-1)));
5156                    } else {
5157                        while (count3 > coll->tertiaryBottomCount) {
5158                            tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
5159                            count3 -= (uint32_t)coll->tertiaryBottomCount;
5160                        }
5161                        tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
5162                    }
5163                    count3 = 0;
5164                }
5165                tertiaries.Append(tertiary);
5166            }
5167        }
5168    }
5169
5170    if(U_SUCCESS(*status)) {
5171        /* we have done all the CE's, now let's put them together to form a key */
5172        if (count2 > 0) {
5173            while (count2 > UCOL_BOT_COUNT2) {
5174                secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
5175                count2 -= (uint32_t)UCOL_BOT_COUNT2;
5176            }
5177            secondaries.Append((uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
5178        }
5179        result.Append(UCOL_LEVELTERMINATOR);
5180        result.Append(secondaries);
5181
5182        if (count3 > 0) {
5183            if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5184                while (count3 >= coll->tertiaryTopCount) {
5185                    tertiaries.Append((uint8_t)(tertiaryTop - coll->tertiaryTopCount));
5186                    count3 -= (uint32_t)coll->tertiaryTopCount;
5187                }
5188                tertiaries.Append((uint8_t)(tertiaryTop - count3));
5189            } else {
5190                while (count3 > coll->tertiaryBottomCount) {
5191                    tertiaries.Append((uint8_t)(tertiaryBottom + coll->tertiaryBottomCount));
5192                    count3 -= (uint32_t)coll->tertiaryBottomCount;
5193                }
5194                tertiaries.Append((uint8_t)(tertiaryBottom + (count3-1)));
5195            }
5196        }
5197        result.Append(UCOL_LEVELTERMINATOR);
5198        result.Append(tertiaries);
5199
5200        result.Append(0);
5201    }
5202
5203    /* To avoid memory leak, free the offset buffer if necessary. */
5204    ucol_freeOffsetBuffer(&s);
5205
5206    if (U_SUCCESS(*status) && !result.IsOk()) {
5207        *status = U_BUFFER_OVERFLOW_ERROR;
5208    }
5209}
5210
5211static inline
5212UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5213    UBool notIsContinuation = !isContinuation(CE);
5214    uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5215    if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5216               || (!notIsContinuation && *wasShifted)))
5217        || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5218    {
5219        // The stuff below should probably be in the sortkey code... maybe not...
5220        if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5221            /* we should just completely ignore it */
5222            *wasShifted = TRUE;
5223            //continue;
5224        }
5225        //*wasShifted = TRUE;
5226        return TRUE;
5227    } else {
5228        *wasShifted = FALSE;
5229        return FALSE;
5230    }
5231}
5232static inline
5233void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5234    if(level < maxLevel) {
5235        dest[i++] = UCOL_LEVELTERMINATOR;
5236    } else {
5237        dest[i++] = 0;
5238    }
5239}
5240
/**
 * Level identifiers for partial sort key generation.
 * The identifiers 0..7 fit into the three-bit level field of the
 * collation state word.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** Level for the end of the sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** One more than the last level identifier */
    UCOL_PSK_LIMIT
};
5253
/**
 * Layout of the collation state that is stored in the state[1] field.
 * To read a field, shift the state word right by its *_SHIFT value and
 * AND the result with the matching *_MASK value.
 */
enum {
    /* Level identifier: stores one of the UCOL_PSK_* level values, three bits. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x7,

    /*
     * Number of bytes of a primary or quaternary already written. Can be
     * only 0 or 1, since we get up to two bytes from primary or quaternary.
     * The same bit is also used to denote that the French secondary level
     * is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x1,

    /* Was the last value shifted? Can be 0 or 1 (Boolean). */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x1,

    /*
     * How many French bytes have we already written (two-bit field).
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x3,

    /* Number of bytes already written from a bocsu sequence (identical level). */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK  = 0x3,

    /* Number of CEs consumed since the iterator state was last saved. */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK  = 0x7FFFF
};
5279
// Macro calculating the number of expansion CEs available in `s`, i.e. the
// difference between its CEpos and toReturn members. `s` is evaluated twice,
// so it must not have side effects. The whole expansion is parenthesized so
// that the result can be combined safely with neighbouring operators
// (for example !uprv_numAvailableExpCEs(s) or a comparison).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5282
5283
5284/** main sortkey part procedure. On the first call,
5285 *  you should pass in a collator, an iterator, empty state
5286 *  state[0] == state[1] == 0, a buffer to hold results
5287 *  number of bytes you need and an error code pointer.
5288 *  Make sure your buffer is big enough to hold the wanted
5289 *  number of sortkey bytes. I don't check.
5290 *  The only meaningful status you can get back is
5291 *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5292 *  have been dealt a raw deal and that you probably won't
5293 *  be able to use partial sortkey generation for this
5294 *  particular combination of string and collator. This
5295 *  is highly unlikely, but you should still check the error code.
5296 *  Any other status means that you're not in a sane situation
5297 *  anymore. After the first call, preserve state values and
5298 *  use them on subsequent calls to obtain more bytes of a sortkey.
5299 *  Use until the number of bytes written is smaller than the requested
5300 *  number of bytes. Generated sortkey is not compatible with the
5301 *  one generated by ucol_getSortKey, as we don't do any compression.
5302 *  However, levels are still terminated by a 1 (one) and the sortkey
5303 *  is terminated by a 0 (zero). Identical level is the same as in the
5304 *  regular sortkey - internal bocu-1 implementation is used.
 *  For the curious, although you cannot do much about this, here is
5306 *  the structure of state words.
5307 *  state[0] - iterator state. Depends on the iterator implementation,
5308 *             but allows the iterator to continue where it stopped in
5309 *             the last iteration.
5310 *  state[1] - collation processing state. Here is the distribution
5311 *             of the bits:
5312 *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5313 *             quaternary, quin (we don't use this one), identical and
5314 *             null (producing only zeroes - first one to terminate the
5315 *             sortkey and subsequent to fill the buffer).
5316 *   3       - byte count. Number of bytes written on the primary level.
5317 *   4       - was shifted. Whether the previous iteration finished in the
5318 *             shifted state.
5319 *   5, 6    - French continuation bytes written. See the comment in the enum
5320 *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
5321 *             the identical level.
 *   9..31   - CEs consumed. Number of getCE or next32 operations performed
 *             since the last successful update of the iterator state.
5324 */
5325U_CAPI int32_t U_EXPORT2
5326ucol_nextSortKeyPart(const UCollator *coll,
5327                     UCharIterator *iter,
5328                     uint32_t state[2],
5329                     uint8_t *dest, int32_t count,
5330                     UErrorCode *status)
5331{
5332    /* error checking */
5333    if(status==NULL || U_FAILURE(*status)) {
5334        return 0;
5335    }
5336    UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5337    if( coll==NULL || iter==NULL ||
5338        state==NULL ||
5339        count<0 || (count>0 && dest==NULL)
5340    ) {
5341        *status=U_ILLEGAL_ARGUMENT_ERROR;
5342        UTRACE_EXIT_STATUS(status);
5343        return 0;
5344    }
5345
5346    UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5347                  coll, iter, state[0], state[1], dest, count);
5348
5349    if(count==0) {
5350        /* nothing to do */
5351        UTRACE_EXIT_VALUE(0);
5352        return 0;
5353    }
5354    /** Setting up situation according to the state we got from the previous iteration */
5355    // The state of the iterator from the previous invocation
5356    uint32_t iterState = state[0];
5357    // Has the last iteration ended in the shifted state
5358    UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5359    // What is the current level of the sortkey?
5360    int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5361    // Have we written only one byte from a two byte primary in the previous iteration?
5362    // Also on secondary level - have we finished with the French secondary?
5363    int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5364    // number of bytes in the continuation buffer for French
5365    int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5366    // Number of bytes already written from a bocsu sequence. Since
5367    // the longes bocsu sequence is 4 long, this can be up to 3.
5368    int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5369    // Number of elements that need to be consumed in this iteration because
5370    // the iterator returned UITER_NO_STATE at the end of the last iteration,
5371    // so we had to save the last valid state.
5372    int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5373
5374    /** values that depend on the collator attributes */
5375    // strength of the collator.
5376    int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5377    // maximal level of the partial sortkey. Need to take whether case level is done
5378    int32_t maxLevel = 0;
5379    if(strength < UCOL_TERTIARY) {
5380        if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5381            maxLevel = UCOL_PSK_CASE;
5382        } else {
5383            maxLevel = strength;
5384        }
5385    } else {
5386        if(strength == UCOL_TERTIARY) {
5387            maxLevel = UCOL_PSK_TERTIARY;
5388        } else if(strength == UCOL_QUATERNARY) {
5389            maxLevel = UCOL_PSK_QUATERNARY;
5390        } else { // identical
5391            maxLevel = UCOL_IDENTICAL;
5392        }
5393    }
5394    // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5395    uint8_t UCOL_HIRAGANA_QUAD =
5396      (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5397    // Boundary value that decides whether a CE is shifted or not
5398    uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5399    // Are we doing French collation?
5400    UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5401
5402    /** initializing the collation state */
5403    UBool notIsContinuation = FALSE;
5404    uint32_t CE = UCOL_NO_MORE_CES;
5405
5406    collIterate s;
5407    IInit_collIterate(coll, NULL, -1, &s, status);
5408    if(U_FAILURE(*status)) {
5409        UTRACE_EXIT_STATUS(*status);
5410        return 0;
5411    }
5412    s.iterator = iter;
5413    s.flags |= UCOL_USE_ITERATOR;
5414    // This variable tells us whether we have produced some other levels in this iteration
5415    // before we moved to the identical level. In that case, we need to switch the
5416    // type of the iterator.
5417    UBool doingIdenticalFromStart = FALSE;
5418    // Normalizing iterator
5419    // The division for the array length may truncate the array size to
5420    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5421    // for all platforms anyway.
5422    UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5423    UNormIterator *normIter = NULL;
5424    // If the normalization is turned on for the collator and we are below identical level
5425    // we will use a FCD normalizing iterator
5426    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5427        normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5428        s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5429        s.flags &= ~UCOL_ITER_NORM;
5430        if(U_FAILURE(*status)) {
5431            UTRACE_EXIT_STATUS(*status);
5432            return 0;
5433        }
5434    } else if(level == UCOL_PSK_IDENTICAL) {
5435        // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5436        // will be updating the state - and this cannot be done on an ordinary iterator.
5437        normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5438        s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5439        s.flags &= ~UCOL_ITER_NORM;
5440        if(U_FAILURE(*status)) {
5441            UTRACE_EXIT_STATUS(*status);
5442            return 0;
5443        }
5444        doingIdenticalFromStart = TRUE;
5445    }
5446
5447    // This is the tentative new state of the iterator. The problem
5448    // is that the iterator might return an undefined state, in
5449    // which case we should save the last valid state and increase
5450    // the iterator skip value.
5451    uint32_t newState = 0;
5452
5453    // First, we set the iterator to the last valid position
5454    // from the last iteration. This was saved in state[0].
5455    if(iterState == 0) {
5456        /* initial state */
5457        if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5458            s.iterator->move(s.iterator, 0, UITER_LIMIT);
5459        } else {
5460            s.iterator->move(s.iterator, 0, UITER_START);
5461        }
5462    } else {
5463        /* reset to previous state */
5464        s.iterator->setState(s.iterator, iterState, status);
5465        if(U_FAILURE(*status)) {
5466            UTRACE_EXIT_STATUS(*status);
5467            return 0;
5468        }
5469    }
5470
5471
5472
5473    // This variable tells us whether we can attempt to update the state
5474    // of iterator. Situations where we don't want to update iterator state
5475    // are the existence of expansion CEs that are not yet processed, and
5476    // finishing the case level without enough space in the buffer to insert
5477    // a level terminator.
5478    UBool canUpdateState = TRUE;
5479
5480    // Consume all the CEs that were consumed at the end of the previous
5481    // iteration without updating the iterator state. On identical level,
5482    // consume the code points.
5483    int32_t counter = cces;
5484    if(level < UCOL_PSK_IDENTICAL) {
5485        while(counter-->0) {
5486            // If we're doing French and we are on the secondary level,
5487            // we go backwards.
5488            if(level == UCOL_PSK_SECONDARY && doingFrench) {
5489                CE = ucol_IGetPrevCE(coll, &s, status);
5490            } else {
5491                CE = ucol_IGetNextCE(coll, &s, status);
5492            }
5493            if(CE==UCOL_NO_MORE_CES) {
5494                /* should not happen */
5495                *status=U_INTERNAL_PROGRAM_ERROR;
5496                UTRACE_EXIT_STATUS(*status);
5497                return 0;
5498            }
5499            if(uprv_numAvailableExpCEs(s)) {
5500                canUpdateState = FALSE;
5501            }
5502        }
5503    } else {
5504        while(counter-->0) {
5505            uiter_next32(s.iterator);
5506        }
5507    }
5508
5509    // French secondary needs to know whether the iterator state of zero came from previous level OR
5510    // from a new invocation...
5511    UBool wasDoingPrimary = FALSE;
5512    // destination buffer byte counter. When this guy
5513    // gets to count, we're done with the iteration
5514    int32_t i = 0;
5515    // used to count the zero bytes written after we
5516    // have finished with the sort key
5517    int32_t j = 0;
5518
5519
5520    // Hm.... I think we're ready to plunge in. Basic story is as following:
5521    // we have a fall through case based on level. This is used for initial
5522    // positioning on iteration start. Every level processor contains a
5523    // for(;;) which will be broken when we exhaust all the CEs. Other
5524    // way to exit is a goto saveState, which happens when we have filled
5525    // out our buffer.
5526    switch(level) {
5527    case UCOL_PSK_PRIMARY:
5528        wasDoingPrimary = TRUE;
5529        for(;;) {
5530            if(i==count) {
5531                goto saveState;
5532            }
5533            // We should save the state only if we
5534            // are sure that we are done with the
5535            // previous iterator state
5536            if(canUpdateState && byteCountOrFrenchDone == 0) {
5537                newState = s.iterator->getState(s.iterator);
5538                if(newState != UITER_NO_STATE) {
5539                    iterState = newState;
5540                    cces = 0;
5541                }
5542            }
5543            CE = ucol_IGetNextCE(coll, &s, status);
5544            cces++;
5545            if(CE==UCOL_NO_MORE_CES) {
5546                // Add the level separator
5547                terminatePSKLevel(level, maxLevel, i, dest);
5548                byteCountOrFrenchDone=0;
5549                // Restart the iteration an move to the
5550                // second level
5551                s.iterator->move(s.iterator, 0, UITER_START);
5552                cces = 0;
5553                level = UCOL_PSK_SECONDARY;
5554                break;
5555            }
5556            if(!isContinuation(CE)){
5557                if(coll->leadBytePermutationTable != NULL){
5558                    CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
5559                }
5560            }
5561            if(!isShiftedCE(CE, LVT, &wasShifted)) {
5562                CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5563                if(CE != 0) {
5564                    if(byteCountOrFrenchDone == 0) {
5565                        // get the second byte of primary
5566                        dest[i++]=(uint8_t)(CE >> 8);
5567                    } else {
5568                        byteCountOrFrenchDone = 0;
5569                    }
5570                    if((CE &=0xff)!=0) {
5571                        if(i==count) {
5572                            /* overflow */
5573                            byteCountOrFrenchDone = 1;
5574                            cces--;
5575                            goto saveState;
5576                        }
5577                        dest[i++]=(uint8_t)CE;
5578                    }
5579                }
5580            }
5581            if(uprv_numAvailableExpCEs(s)) {
5582                canUpdateState = FALSE;
5583            } else {
5584                canUpdateState = TRUE;
5585            }
5586        }
5587        /* fall through to next level */
5588    case UCOL_PSK_SECONDARY:
5589        if(strength >= UCOL_SECONDARY) {
5590            if(!doingFrench) {
5591                for(;;) {
5592                    if(i == count) {
5593                        goto saveState;
5594                    }
5595                    // We should save the state only if we
5596                    // are sure that we are done with the
5597                    // previous iterator state
5598                    if(canUpdateState) {
5599                        newState = s.iterator->getState(s.iterator);
5600                        if(newState != UITER_NO_STATE) {
5601                            iterState = newState;
5602                            cces = 0;
5603                        }
5604                    }
5605                    CE = ucol_IGetNextCE(coll, &s, status);
5606                    cces++;
5607                    if(CE==UCOL_NO_MORE_CES) {
5608                        // Add the level separator
5609                        terminatePSKLevel(level, maxLevel, i, dest);
5610                        byteCountOrFrenchDone = 0;
5611                        // Restart the iteration an move to the
5612                        // second level
5613                        s.iterator->move(s.iterator, 0, UITER_START);
5614                        cces = 0;
5615                        level = UCOL_PSK_CASE;
5616                        break;
5617                    }
5618                    if(!isShiftedCE(CE, LVT, &wasShifted)) {
5619                        CE >>= 8; /* get secondary */
5620                        if(CE != 0) {
5621                            dest[i++]=(uint8_t)CE;
5622                        }
5623                    }
5624                    if(uprv_numAvailableExpCEs(s)) {
5625                        canUpdateState = FALSE;
5626                    } else {
5627                        canUpdateState = TRUE;
5628                    }
5629                }
5630            } else { // French secondary processing
5631                uint8_t frenchBuff[UCOL_MAX_BUFFER];
5632                int32_t frenchIndex = 0;
5633                // Here we are going backwards.
5634                // If the iterator is at the beggining, it should be
5635                // moved to end.
5636                if(wasDoingPrimary) {
5637                    s.iterator->move(s.iterator, 0, UITER_LIMIT);
5638                    cces = 0;
5639                }
5640                for(;;) {
5641                    if(i == count) {
5642                        goto saveState;
5643                    }
5644                    if(canUpdateState) {
5645                        newState = s.iterator->getState(s.iterator);
5646                        if(newState != UITER_NO_STATE) {
5647                            iterState = newState;
5648                            cces = 0;
5649                        }
5650                    }
5651                    CE = ucol_IGetPrevCE(coll, &s, status);
5652                    cces++;
5653                    if(CE==UCOL_NO_MORE_CES) {
5654                        // Add the level separator
5655                        terminatePSKLevel(level, maxLevel, i, dest);
5656                        byteCountOrFrenchDone = 0;
5657                        // Restart the iteration an move to the next level
5658                        s.iterator->move(s.iterator, 0, UITER_START);
5659                        level = UCOL_PSK_CASE;
5660                        break;
5661                    }
5662                    if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5663                        // reverse when we get a first non-continuation CE.
5664                        CE >>= 8;
5665                        frenchBuff[frenchIndex++] = (uint8_t)CE;
5666                    } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5667                        CE >>= 8; /* get secondary */
5668                        if(!frenchIndex) {
5669                            if(CE != 0) {
5670                                dest[i++]=(uint8_t)CE;
5671                            }
5672                        } else {
5673                            frenchBuff[frenchIndex++] = (uint8_t)CE;
5674                            frenchIndex -= usedFrench;
5675                            usedFrench = 0;
5676                            while(i < count && frenchIndex) {
5677                                dest[i++] = frenchBuff[--frenchIndex];
5678                                usedFrench++;
5679                            }
5680                        }
5681                    }
5682                    if(uprv_numAvailableExpCEs(s)) {
5683                        canUpdateState = FALSE;
5684                    } else {
5685                        canUpdateState = TRUE;
5686                    }
5687                }
5688            }
5689        } else {
5690            level = UCOL_PSK_CASE;
5691        }
5692        /* fall through to next level */
5693    case UCOL_PSK_CASE:
5694        if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5695            uint32_t caseShift = UCOL_CASE_SHIFT_START;
5696            uint8_t caseByte = UCOL_CASE_BYTE_START;
5697            uint8_t caseBits = 0;
5698
5699            for(;;) {
5700                U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
5701                if(i == count) {
5702                    goto saveState;
5703                }
5704                // We should save the state only if we
5705                // are sure that we are done with the
5706                // previous iterator state
5707                if(canUpdateState) {
5708                    newState = s.iterator->getState(s.iterator);
5709                    if(newState != UITER_NO_STATE) {
5710                        iterState = newState;
5711                        cces = 0;
5712                    }
5713                }
5714                CE = ucol_IGetNextCE(coll, &s, status);
5715                cces++;
5716                if(CE==UCOL_NO_MORE_CES) {
5717                    // On the case level we might have an unfinished
5718                    // case byte. Add one if it's started.
5719                    if(caseShift != UCOL_CASE_SHIFT_START) {
5720                        dest[i++] = caseByte;
5721                    }
5722                    cces = 0;
5723                    // We have finished processing CEs on this level.
5724                    // However, we don't know if we have enough space
5725                    // to add a case level terminator.
5726                    if(i < count) {
5727                        // Add the level separator
5728                        terminatePSKLevel(level, maxLevel, i, dest);
5729                        // Restart the iteration and move to the
5730                        // next level
5731                        s.iterator->move(s.iterator, 0, UITER_START);
5732                        level = UCOL_PSK_TERTIARY;
5733                    } else {
5734                        canUpdateState = FALSE;
5735                    }
5736                    break;
5737                }
5738
5739                if(!isShiftedCE(CE, LVT, &wasShifted)) {
5740                    if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5741                        // do the case level if we need to do it. We don't want to calculate
5742                        // case level for primary ignorables if we have only primary strength and case level
5743                        // otherwise we would break well formedness of CEs
5744                        CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5745                        caseBits = (uint8_t)(CE & 0xC0);
5746                        // this copies the case level logic from the
5747                        // sort key generation code
5748                        if(CE != 0) {
5749                            if (caseShift == 0) {
5750                                dest[i++] = caseByte;
5751                                caseShift = UCOL_CASE_SHIFT_START;
5752                                caseByte = UCOL_CASE_BYTE_START;
5753                            }
5754                            if(coll->caseFirst == UCOL_UPPER_FIRST) {
5755                                if((caseBits & 0xC0) == 0) {
5756                                    caseByte |= 1 << (--caseShift);
5757                                } else {
5758                                    caseByte |= 0 << (--caseShift);
5759                                    /* second bit */
5760                                    if(caseShift == 0) {
5761                                        dest[i++] = caseByte;
5762                                        caseShift = UCOL_CASE_SHIFT_START;
5763                                        caseByte = UCOL_CASE_BYTE_START;
5764                                    }
5765                                    caseByte |= ((caseBits>>6)&1) << (--caseShift);
5766                                }
5767                            } else {
5768                                if((caseBits & 0xC0) == 0) {
5769                                    caseByte |= 0 << (--caseShift);
5770                                } else {
5771                                    caseByte |= 1 << (--caseShift);
5772                                    /* second bit */
5773                                    if(caseShift == 0) {
5774                                        dest[i++] = caseByte;
5775                                        caseShift = UCOL_CASE_SHIFT_START;
5776                                        caseByte = UCOL_CASE_BYTE_START;
5777                                    }
5778                                    caseByte |= ((caseBits>>7)&1) << (--caseShift);
5779                                }
5780                            }
5781                        }
5782
5783                    }
5784                }
5785                // Not sure this is correct for the case level - revisit
5786                if(uprv_numAvailableExpCEs(s)) {
5787                    canUpdateState = FALSE;
5788                } else {
5789                    canUpdateState = TRUE;
5790                }
5791            }
5792        } else {
5793            level = UCOL_PSK_TERTIARY;
5794        }
5795        /* fall through to next level */
5796    case UCOL_PSK_TERTIARY:
5797        if(strength >= UCOL_TERTIARY) {
5798            for(;;) {
5799                if(i == count) {
5800                    goto saveState;
5801                }
5802                // We should save the state only if we
5803                // are sure that we are done with the
5804                // previous iterator state
5805                if(canUpdateState) {
5806                    newState = s.iterator->getState(s.iterator);
5807                    if(newState != UITER_NO_STATE) {
5808                        iterState = newState;
5809                        cces = 0;
5810                    }
5811                }
5812                CE = ucol_IGetNextCE(coll, &s, status);
5813                cces++;
5814                if(CE==UCOL_NO_MORE_CES) {
5815                    // Add the level separator
5816                    terminatePSKLevel(level, maxLevel, i, dest);
5817                    byteCountOrFrenchDone = 0;
5818                    // Restart the iteration an move to the
5819                    // second level
5820                    s.iterator->move(s.iterator, 0, UITER_START);
5821                    cces = 0;
5822                    level = UCOL_PSK_QUATERNARY;
5823                    break;
5824                }
5825                if(!isShiftedCE(CE, LVT, &wasShifted)) {
5826                    notIsContinuation = !isContinuation(CE);
5827
5828                    if(notIsContinuation) {
5829                        CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5830                        CE ^= coll->caseSwitch;
5831                        CE &= coll->tertiaryMask;
5832                    } else {
5833                        CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
5834                    }
5835
5836                    if(CE != 0) {
5837                        dest[i++]=(uint8_t)CE;
5838                    }
5839                }
5840                if(uprv_numAvailableExpCEs(s)) {
5841                    canUpdateState = FALSE;
5842                } else {
5843                    canUpdateState = TRUE;
5844                }
5845            }
5846        } else {
5847            // if we're not doing tertiary
5848            // skip to the end
5849            level = UCOL_PSK_NULL;
5850        }
5851        /* fall through to next level */
5852    case UCOL_PSK_QUATERNARY:
5853        if(strength >= UCOL_QUATERNARY) {
5854            for(;;) {
5855                if(i == count) {
5856                    goto saveState;
5857                }
5858                // We should save the state only if we
5859                // are sure that we are done with the
5860                // previous iterator state
5861                if(canUpdateState) {
5862                    newState = s.iterator->getState(s.iterator);
5863                    if(newState != UITER_NO_STATE) {
5864                        iterState = newState;
5865                        cces = 0;
5866                    }
5867                }
5868                CE = ucol_IGetNextCE(coll, &s, status);
5869                cces++;
5870                if(CE==UCOL_NO_MORE_CES) {
5871                    // Add the level separator
5872                    terminatePSKLevel(level, maxLevel, i, dest);
5873                    //dest[i++] = UCOL_LEVELTERMINATOR;
5874                    byteCountOrFrenchDone = 0;
5875                    // Restart the iteration an move to the
5876                    // second level
5877                    s.iterator->move(s.iterator, 0, UITER_START);
5878                    cces = 0;
5879                    level = UCOL_PSK_QUIN;
5880                    break;
5881                }
5882                if(CE==0)
5883                    continue;
5884                if(isShiftedCE(CE, LVT, &wasShifted)) {
5885                    CE >>= 16; /* get primary */
5886                    if(CE != 0) {
5887                        if(byteCountOrFrenchDone == 0) {
5888                            dest[i++]=(uint8_t)(CE >> 8);
5889                        } else {
5890                            byteCountOrFrenchDone = 0;
5891                        }
5892                        if((CE &=0xff)!=0) {
5893                            if(i==count) {
5894                                /* overflow */
5895                                byteCountOrFrenchDone = 1;
5896                                goto saveState;
5897                            }
5898                            dest[i++]=(uint8_t)CE;
5899                        }
5900                    }
5901                } else {
5902                    notIsContinuation = !isContinuation(CE);
5903                    if(notIsContinuation) {
5904                        if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5905                            dest[i++] = UCOL_HIRAGANA_QUAD;
5906                        } else {
5907                            dest[i++] = 0xFF;
5908                        }
5909                    }
5910                }
5911                if(uprv_numAvailableExpCEs(s)) {
5912                    canUpdateState = FALSE;
5913                } else {
5914                    canUpdateState = TRUE;
5915                }
5916            }
5917        } else {
5918            // if we're not doing quaternary
5919            // skip to the end
5920            level = UCOL_PSK_NULL;
5921        }
5922        /* fall through to next level */
5923    case UCOL_PSK_QUIN:
5924        level = UCOL_PSK_IDENTICAL;
5925        /* fall through to next level */
5926    case UCOL_PSK_IDENTICAL:
5927        if(strength >= UCOL_IDENTICAL) {
5928            UChar32 first, second;
5929            int32_t bocsuBytesWritten = 0;
5930            // We always need to do identical on
5931            // the NFD form of the string.
5932            if(normIter == NULL) {
5933                // we arrived from the level below and
5934                // normalization was not turned on.
5935                // therefore, we need to make a fresh NFD iterator
5936                normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5937                s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5938            } else if(!doingIdenticalFromStart) {
5939                // there is an iterator, but we did some other levels.
5940                // therefore, we have a FCD iterator - need to make
5941                // a NFD one.
5942                // normIter being at the beginning does not guarantee
5943                // that the underlying iterator is at the beginning
5944                iter->move(iter, 0, UITER_START);
5945                s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5946            }
5947            // At this point we have a NFD iterator that is positioned
5948            // in the right place
5949            if(U_FAILURE(*status)) {
5950                UTRACE_EXIT_STATUS(*status);
5951                return 0;
5952            }
5953            first = uiter_previous32(s.iterator);
5954            // maybe we're at the start of the string
5955            if(first == U_SENTINEL) {
5956                first = 0;
5957            } else {
5958                uiter_next32(s.iterator);
5959            }
5960
5961            j = 0;
5962            for(;;) {
5963                if(i == count) {
5964                    if(j+1 < bocsuBytesWritten) {
5965                        bocsuBytesUsed = j+1;
5966                    }
5967                    goto saveState;
5968                }
5969
5970                // On identical level, we will always save
5971                // the state if we reach this point, since
5972                // we don't depend on getNextCE for content
5973                // all the content is in our buffer and we
5974                // already either stored the full buffer OR
5975                // otherwise we won't arrive here.
5976                newState = s.iterator->getState(s.iterator);
5977                if(newState != UITER_NO_STATE) {
5978                    iterState = newState;
5979                    cces = 0;
5980                }
5981
5982                uint8_t buff[4];
5983                second = uiter_next32(s.iterator);
5984                cces++;
5985
5986                // end condition for identical level
5987                if(second == U_SENTINEL) {
5988                    terminatePSKLevel(level, maxLevel, i, dest);
5989                    level = UCOL_PSK_NULL;
5990                    break;
5991                }
5992                bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
5993                first = second;
5994
5995                j = 0;
5996                if(bocsuBytesUsed != 0) {
5997                    while(bocsuBytesUsed-->0) {
5998                        j++;
5999                    }
6000                }
6001
6002                while(i < count && j < bocsuBytesWritten) {
6003                    dest[i++] = buff[j++];
6004                }
6005            }
6006
6007        } else {
6008            level = UCOL_PSK_NULL;
6009        }
6010        /* fall through to next level */
6011    case UCOL_PSK_NULL:
6012        j = i;
6013        while(j<count) {
6014            dest[j++]=0;
6015        }
6016        break;
6017    default:
6018        *status = U_INTERNAL_PROGRAM_ERROR;
6019        UTRACE_EXIT_STATUS(*status);
6020        return 0;
6021    }
6022
6023saveState:
6024    // Now we need to return stuff. First we want to see whether we have
6025    // done everything for the current state of iterator.
6026    if(byteCountOrFrenchDone
6027        || canUpdateState == FALSE
6028        || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6029    {
6030        // Any of above mean that the previous transaction
6031        // wasn't finished and that we should store the
6032        // previous iterator state.
6033        state[0] = iterState;
6034    } else {
6035        // The transaction is complete. We will continue in the next iteration.
6036        state[0] = s.iterator->getState(s.iterator);
6037        cces = 0;
6038    }
6039    // Store the number of bocsu bytes written.
6040    if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6041        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6042    }
6043    state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6044
6045    // Next we put in the level of comparison
6046    state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6047
6048    // If we are doing French, we need to store whether we have just finished the French level
6049    if(level == UCOL_PSK_SECONDARY && doingFrench) {
6050        state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6051    } else {
6052        state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6053    }
6054
6055    // Was the latest CE shifted
6056    if(wasShifted) {
6057        state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6058    }
6059    // Check for cces overflow
6060    if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6061        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6062    }
6063    // Store cces
6064    state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6065
6066    // Check for French overflow
6067    if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6068        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6069    }
6070    // Store number of bytes written in the French secondary continuation sequence
6071    state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6072
6073
6074    // If we have used normalizing iterator, get rid of it
6075    if(normIter != NULL) {
6076        unorm_closeIter(normIter);
6077    }
6078
6079    /* To avoid memory leak, free the offset buffer if necessary. */
6080    ucol_freeOffsetBuffer(&s);
6081
6082    // Return number of meaningful sortkey bytes.
6083    UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6084                  dest,i, state[0], state[1]);
6085    UTRACE_EXIT_VALUE(i);
6086    return i;
6087}
6088
6089/**
6090 * Produce a bound for a given sortkey and a number of levels.
6091 */
6092U_CAPI int32_t U_EXPORT2
6093ucol_getBound(const uint8_t       *source,
6094        int32_t             sourceLength,
6095        UColBoundMode       boundType,
6096        uint32_t            noOfLevels,
6097        uint8_t             *result,
6098        int32_t             resultLength,
6099        UErrorCode          *status)
6100{
6101    // consistency checks
6102    if(status == NULL || U_FAILURE(*status)) {
6103        return 0;
6104    }
6105    if(source == NULL) {
6106        *status = U_ILLEGAL_ARGUMENT_ERROR;
6107        return 0;
6108    }
6109
6110    int32_t sourceIndex = 0;
6111    // Scan the string until we skip enough of the key OR reach the end of the key
6112    do {
6113        sourceIndex++;
6114        if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6115            noOfLevels--;
6116        }
6117    } while (noOfLevels > 0
6118        && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6119
6120    if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6121        && noOfLevels > 0) {
6122            *status = U_SORT_KEY_TOO_SHORT_WARNING;
6123    }
6124
6125
6126    // READ ME: this code assumes that the values for boundType
6127    // enum will not changes. They are set so that the enum value
6128    // corresponds to the number of extra bytes each bound type
6129    // needs.
6130    if(result != NULL && resultLength >= sourceIndex+boundType) {
6131        uprv_memcpy(result, source, sourceIndex);
6132        switch(boundType) {
6133            // Lower bound just gets terminated. No extra bytes
6134        case UCOL_BOUND_LOWER: // = 0
6135            break;
6136            // Upper bound needs one extra byte
6137        case UCOL_BOUND_UPPER: // = 1
6138            result[sourceIndex++] = 2;
6139            break;
6140            // Upper long bound needs two extra bytes
6141        case UCOL_BOUND_UPPER_LONG: // = 2
6142            result[sourceIndex++] = 0xFF;
6143            result[sourceIndex++] = 0xFF;
6144            break;
6145        default:
6146            *status = U_ILLEGAL_ARGUMENT_ERROR;
6147            return 0;
6148        }
6149        result[sourceIndex++] = 0;
6150
6151        return sourceIndex;
6152    } else {
6153        return sourceIndex+boundType+1;
6154    }
6155}
6156
6157/****************************************************************************/
6158/* Following are the functions that deal with the properties of a collator  */
6159/* there are new APIs and some compatibility APIs                           */
6160/****************************************************************************/
6161
6162static inline void
6163ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6164                    int32_t *primShift, int32_t *secShift, int32_t *terShift)
6165{
6166    uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6167    UBool reverseSecondary = FALSE;
6168    UBool continuation = isContinuation(CE);
6169    if(!continuation) {
6170        tertiary = (uint8_t)((CE & coll->tertiaryMask));
6171        tertiary ^= coll->caseSwitch;
6172        reverseSecondary = TRUE;
6173    } else {
6174        tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6175        tertiary &= UCOL_REMOVE_CASE;
6176        reverseSecondary = FALSE;
6177    }
6178
6179    secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6180    primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6181    primary1 = (uint8_t)(CE >> 8);
6182
6183    if(primary1 != 0) {
6184        if (coll->leadBytePermutationTable != NULL && !continuation) {
6185            primary1 = coll->leadBytePermutationTable[primary1];
6186        }
6187
6188        coll->latinOneCEs[ch] |= (primary1 << *primShift);
6189        *primShift -= 8;
6190    }
6191    if(primary2 != 0) {
6192        if(*primShift < 0) {
6193            coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6194            coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6195            coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6196            return;
6197        }
6198        coll->latinOneCEs[ch] |= (primary2 << *primShift);
6199        *primShift -= 8;
6200    }
6201    if(secondary != 0) {
6202        if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6203            coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6204            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6205        } else { // normal case
6206            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6207        }
6208        *secShift -= 8;
6209    }
6210    if(tertiary != 0) {
6211        coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6212        *terShift -= 8;
6213    }
6214}
6215
6216static inline UBool
6217ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6218    uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6219    if(newTable == NULL) {
6220      *status = U_MEMORY_ALLOCATION_ERROR;
6221      coll->latinOneFailed = TRUE;
6222      return FALSE;
6223    }
6224    int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6225    uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6226    uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6227    uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6228    uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6229    coll->latinOneTableLen = size;
6230    uprv_free(coll->latinOneCEs);
6231    coll->latinOneCEs = newTable;
6232    return TRUE;
6233}
6234
6235static UBool
6236ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6237    UBool result = TRUE;
6238    if(coll->latinOneCEs == NULL) {
6239        coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6240        if(coll->latinOneCEs == NULL) {
6241            *status = U_MEMORY_ALLOCATION_ERROR;
6242            return FALSE;
6243        }
6244        coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6245    }
6246    UChar ch = 0;
6247    UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6248    // Check for null pointer
6249    if (U_FAILURE(*status)) {
6250        return FALSE;
6251    }
6252    uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6253
6254    int32_t primShift = 24, secShift = 24, terShift = 24;
6255    uint32_t CE = 0;
6256    int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6257
6258    // TODO: make safe if you get more than you wanted...
6259    for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6260        primShift = 24; secShift = 24; terShift = 24;
6261        if(ch < 0x100) {
6262            CE = coll->latinOneMapping[ch];
6263        } else {
6264            CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6265            if(CE == UCOL_NOT_FOUND && coll->UCA) {
6266                CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6267            }
6268        }
6269        if(CE < UCOL_NOT_FOUND) {
6270            ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6271        } else {
6272            switch (getCETag(CE)) {
6273            case EXPANSION_TAG:
6274            case DIGIT_TAG:
6275                ucol_setText(it, &ch, 1, status);
6276                while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6277                    if(primShift < 0 || secShift < 0 || terShift < 0) {
6278                        coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6279                        coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6280                        coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6281                        break;
6282                    }
6283                    ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6284                }
6285                break;
6286            case CONTRACTION_TAG:
6287                // here is the trick
6288                // F2 is contraction. We do something very similar to contractions
6289                // but have two indices, one in the real contraction table and the
6290                // other to where we stuffed things. This hopes that we don't have
6291                // many contractions (this should work for latin-1 tables).
6292                {
6293                    if((CE & 0x00FFF000) != 0) {
6294                        *status = U_UNSUPPORTED_ERROR;
6295                        goto cleanup_after_failure;
6296                    }
6297
6298                    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6299
6300                    CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6301
6302                    coll->latinOneCEs[ch] = CE;
6303                    coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6304                    coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6305
6306                    // We're going to jump into contraction table, pick the elements
6307                    // and use them
6308                    do {
6309                        CE = *(coll->contractionCEs +
6310                            (UCharOffset - coll->contractionIndex));
6311                        if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6312                            uint32_t size;
6313                            uint32_t i;    /* general counter */
6314                            uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6315                            size = getExpansionCount(CE);
6316                            //CE = *CEOffset++;
6317                            if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6318                                for(i = 0; i<size; i++) {
6319                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
6320                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6321                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6322                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6323                                        break;
6324                                    }
6325                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6326                                }
6327                            } else { /* else, we do */
6328                                while(*CEOffset != 0) {
6329                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
6330                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6331                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6332                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6333                                        break;
6334                                    }
6335                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6336                                }
6337                            }
6338                            contractionOffset++;
6339                        } else if(CE < UCOL_NOT_FOUND) {
6340                            ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6341                        } else {
6342                            coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6343                            coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6344                            coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6345                            contractionOffset++;
6346                        }
6347                        UCharOffset++;
6348                        primShift = 24; secShift = 24; terShift = 24;
6349                        if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6350                            if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6351                                goto cleanup_after_failure;
6352                            }
6353                        }
6354                    } while(*UCharOffset != 0xFFFF);
6355                }
6356                break;;
6357            case SPEC_PROC_TAG:
6358                {
6359                    // 0xB7 is a precontext character defined in UCA5.1, a special
6360                    // handle is implemeted in order to save LatinOne table for
6361                    // most locales.
6362                    if (ch==0xb7) {
6363                        ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6364                    }
6365                    else {
6366                        goto cleanup_after_failure;
6367                    }
6368                }
6369                break;
6370            default:
6371                goto cleanup_after_failure;
6372            }
6373        }
6374    }
6375    // compact table
6376    if(contractionOffset < coll->latinOneTableLen) {
6377        if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6378            goto cleanup_after_failure;
6379        }
6380    }
6381    ucol_closeElements(it);
6382    return result;
6383
6384cleanup_after_failure:
6385    // status should already be set before arriving here.
6386    coll->latinOneFailed = TRUE;
6387    ucol_closeElements(it);
6388    return FALSE;
6389}
6390
6391void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6392    if(U_SUCCESS(*status)) {
6393        if(coll->caseFirst == UCOL_UPPER_FIRST) {
6394            coll->caseSwitch = UCOL_CASE_SWITCH;
6395        } else {
6396            coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6397        }
6398
6399        if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6400            coll->tertiaryMask = UCOL_REMOVE_CASE;
6401            coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6402            coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6403            coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6404            coll->tertiaryBottom = UCOL_COMMON_BOT3;
6405        } else {
6406            coll->tertiaryMask = UCOL_KEEP_CASE;
6407            coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6408            if(coll->caseFirst == UCOL_UPPER_FIRST) {
6409                coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6410                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6411                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6412            } else {
6413                coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6414                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6415                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6416            }
6417        }
6418
6419        /* Set the compression values */
6420        uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
6421        coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6422        coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6423
6424        if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6425            && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6426        {
6427            coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6428        } else {
6429            coll->sortKeyGen = ucol_calcSortKey;
6430        }
6431        if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6432            && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
6433        {
6434            if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6435                if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6436                    //fprintf(stderr, "F");
6437                    coll->latinOneUse = TRUE;
6438                } else {
6439                    coll->latinOneUse = FALSE;
6440                }
6441                if(*status == U_UNSUPPORTED_ERROR) {
6442                    *status = U_ZERO_ERROR;
6443                }
6444            } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6445                coll->latinOneUse = TRUE;
6446            }
6447        } else {
6448            coll->latinOneUse = FALSE;
6449        }
6450    }
6451}
6452
6453U_CAPI uint32_t  U_EXPORT2
6454ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6455    if(U_FAILURE(*status) || coll == NULL) {
6456        return 0;
6457    }
6458    if(len == -1) {
6459        len = u_strlen(varTop);
6460    }
6461    if(len == 0) {
6462        *status = U_ILLEGAL_ARGUMENT_ERROR;
6463        return 0;
6464    }
6465
6466    collIterate s;
6467    IInit_collIterate(coll, varTop, len, &s, status);
6468    if(U_FAILURE(*status)) {
6469        return 0;
6470    }
6471
6472    uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6473
6474    /* here we check if we have consumed all characters */
6475    /* you can put in either one character or a contraction */
6476    /* you shouldn't put more... */
6477    if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6478        *status = U_CE_NOT_FOUND_ERROR;
6479        return 0;
6480    }
6481
6482    uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6483
6484    if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6485        *status = U_PRIMARY_TOO_LONG_ERROR;
6486        return 0;
6487    }
6488    if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6489        coll->variableTopValueisDefault = FALSE;
6490        coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6491    }
6492
6493    /* To avoid memory leak, free the offset buffer if necessary. */
6494    ucol_freeOffsetBuffer(&s);
6495
6496    return CE & UCOL_PRIMARYMASK;
6497}
6498
6499U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6500    if(U_FAILURE(*status) || coll == NULL) {
6501        return 0;
6502    }
6503    return coll->variableTopValue<<16;
6504}
6505
6506U_CAPI void  U_EXPORT2
6507ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6508    if(U_FAILURE(*status) || coll == NULL) {
6509        return;
6510    }
6511
6512    if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6513        coll->variableTopValueisDefault = FALSE;
6514        coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6515    }
6516}
6517/* Attribute setter API */
6518U_CAPI void  U_EXPORT2
6519ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6520    if(U_FAILURE(*status) || coll == NULL) {
6521      return;
6522    }
6523
6524    UColAttributeValue oldFrench = coll->frenchCollation;
6525    UColAttributeValue oldCaseFirst = coll->caseFirst;
6526    switch(attr) {
6527    case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6528        if(value == UCOL_ON) {
6529            coll->numericCollation = UCOL_ON;
6530            coll->numericCollationisDefault = FALSE;
6531        } else if (value == UCOL_OFF) {
6532            coll->numericCollation = UCOL_OFF;
6533            coll->numericCollationisDefault = FALSE;
6534        } else if (value == UCOL_DEFAULT) {
6535            coll->numericCollationisDefault = TRUE;
6536            coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6537        } else {
6538            *status = U_ILLEGAL_ARGUMENT_ERROR;
6539        }
6540        break;
6541    case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6542        if(value == UCOL_ON) {
6543            coll->hiraganaQ = UCOL_ON;
6544            coll->hiraganaQisDefault = FALSE;
6545        } else if (value == UCOL_OFF) {
6546            coll->hiraganaQ = UCOL_OFF;
6547            coll->hiraganaQisDefault = FALSE;
6548        } else if (value == UCOL_DEFAULT) {
6549            coll->hiraganaQisDefault = TRUE;
6550            coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6551        } else {
6552            *status = U_ILLEGAL_ARGUMENT_ERROR;
6553        }
6554        break;
6555    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6556        if(value == UCOL_ON) {
6557            coll->frenchCollation = UCOL_ON;
6558            coll->frenchCollationisDefault = FALSE;
6559        } else if (value == UCOL_OFF) {
6560            coll->frenchCollation = UCOL_OFF;
6561            coll->frenchCollationisDefault = FALSE;
6562        } else if (value == UCOL_DEFAULT) {
6563            coll->frenchCollationisDefault = TRUE;
6564            coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6565        } else {
6566            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6567        }
6568        break;
6569    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6570        if(value == UCOL_SHIFTED) {
6571            coll->alternateHandling = UCOL_SHIFTED;
6572            coll->alternateHandlingisDefault = FALSE;
6573        } else if (value == UCOL_NON_IGNORABLE) {
6574            coll->alternateHandling = UCOL_NON_IGNORABLE;
6575            coll->alternateHandlingisDefault = FALSE;
6576        } else if (value == UCOL_DEFAULT) {
6577            coll->alternateHandlingisDefault = TRUE;
6578            coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6579        } else {
6580            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6581        }
6582        break;
6583    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6584        if(value == UCOL_LOWER_FIRST) {
6585            coll->caseFirst = UCOL_LOWER_FIRST;
6586            coll->caseFirstisDefault = FALSE;
6587        } else if (value == UCOL_UPPER_FIRST) {
6588            coll->caseFirst = UCOL_UPPER_FIRST;
6589            coll->caseFirstisDefault = FALSE;
6590        } else if (value == UCOL_OFF) {
6591            coll->caseFirst = UCOL_OFF;
6592            coll->caseFirstisDefault = FALSE;
6593        } else if (value == UCOL_DEFAULT) {
6594            coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6595            coll->caseFirstisDefault = TRUE;
6596        } else {
6597            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6598        }
6599        break;
6600    case UCOL_CASE_LEVEL: /* do we have an extra case level */
6601        if(value == UCOL_ON) {
6602            coll->caseLevel = UCOL_ON;
6603            coll->caseLevelisDefault = FALSE;
6604        } else if (value == UCOL_OFF) {
6605            coll->caseLevel = UCOL_OFF;
6606            coll->caseLevelisDefault = FALSE;
6607        } else if (value == UCOL_DEFAULT) {
6608            coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6609            coll->caseLevelisDefault = TRUE;
6610        } else {
6611            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6612        }
6613        break;
6614    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6615        if(value == UCOL_ON) {
6616            coll->normalizationMode = UCOL_ON;
6617            coll->normalizationModeisDefault = FALSE;
6618            initializeFCD(status);
6619        } else if (value == UCOL_OFF) {
6620            coll->normalizationMode = UCOL_OFF;
6621            coll->normalizationModeisDefault = FALSE;
6622        } else if (value == UCOL_DEFAULT) {
6623            coll->normalizationModeisDefault = TRUE;
6624            coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6625            if(coll->normalizationMode == UCOL_ON) {
6626                initializeFCD(status);
6627            }
6628        } else {
6629            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6630        }
6631        break;
6632    case UCOL_STRENGTH:         /* attribute for strength */
6633        if (value == UCOL_DEFAULT) {
6634            coll->strengthisDefault = TRUE;
6635            coll->strength = (UColAttributeValue)coll->options->strength;
6636        } else if (value <= UCOL_IDENTICAL) {
6637            coll->strengthisDefault = FALSE;
6638            coll->strength = value;
6639        } else {
6640            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6641        }
6642        break;
6643    case UCOL_ATTRIBUTE_COUNT:
6644    default:
6645        *status = U_ILLEGAL_ARGUMENT_ERROR;
6646        break;
6647    }
6648    if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6649        coll->latinOneRegenTable = TRUE;
6650    } else {
6651        coll->latinOneRegenTable = FALSE;
6652    }
6653    ucol_updateInternalState(coll, status);
6654}
6655
6656U_CAPI UColAttributeValue  U_EXPORT2
6657ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6658    if(U_FAILURE(*status) || coll == NULL) {
6659      return UCOL_DEFAULT;
6660    }
6661    switch(attr) {
6662    case UCOL_NUMERIC_COLLATION:
6663      return coll->numericCollation;
6664    case UCOL_HIRAGANA_QUATERNARY_MODE:
6665      return coll->hiraganaQ;
6666    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6667        return coll->frenchCollation;
6668    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6669        return coll->alternateHandling;
6670    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6671        return coll->caseFirst;
6672    case UCOL_CASE_LEVEL: /* do we have an extra case level */
6673        return coll->caseLevel;
6674    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6675        return coll->normalizationMode;
6676    case UCOL_STRENGTH:         /* attribute for strength */
6677        return coll->strength;
6678    case UCOL_ATTRIBUTE_COUNT:
6679    default:
6680        *status = U_ILLEGAL_ARGUMENT_ERROR;
6681        break;
6682    }
6683    return UCOL_DEFAULT;
6684}
6685
6686U_CAPI void U_EXPORT2
6687ucol_setStrength(    UCollator                *coll,
6688            UCollationStrength        strength)
6689{
6690    UErrorCode status = U_ZERO_ERROR;
6691    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6692}
6693
6694U_CAPI UCollationStrength U_EXPORT2
6695ucol_getStrength(const UCollator *coll)
6696{
6697    UErrorCode status = U_ZERO_ERROR;
6698    return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6699}
6700
6701U_DRAFT int32_t U_EXPORT2
6702ucol_getReorderCodes(const UCollator *coll,
6703                    int32_t *dest,
6704                    int32_t destCapacity,
6705                    UErrorCode *status) {
6706    if (U_FAILURE(*status)) {
6707        return 0;
6708    }
6709
6710    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6711        *status = U_ILLEGAL_ARGUMENT_ERROR;
6712        return 0;
6713    }
6714
6715#ifdef UCOL_DEBUG
6716    printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
6717    printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
6718#endif
6719
6720    if (coll->reorderCodesLength > destCapacity) {
6721        *status = U_BUFFER_OVERFLOW_ERROR;
6722        return coll->reorderCodesLength;
6723    }
6724    for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
6725        dest[i] = coll->reorderCodes[i];
6726    }
6727    return coll->reorderCodesLength;
6728}
6729
6730U_DRAFT void U_EXPORT2
6731ucol_setReorderCodes(UCollator* coll,
6732                    const int32_t* reorderCodes,
6733                    int32_t reorderCodesLength,
6734                    UErrorCode *status) {
6735    if (U_FAILURE(*status)) {
6736        return;
6737    }
6738
6739    if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
6740        *status = U_ILLEGAL_ARGUMENT_ERROR;
6741        return;
6742    }
6743
6744    if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
6745        uprv_free(coll->reorderCodes);
6746    }
6747    coll->reorderCodes = NULL;
6748    coll->reorderCodesLength = 0;
6749    if (reorderCodesLength == 0) {
6750        if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
6751            uprv_free(coll->leadBytePermutationTable);
6752        }
6753        coll->leadBytePermutationTable = NULL;
6754        return;
6755    }
6756    coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
6757    if (coll->reorderCodes == NULL) {
6758        *status = U_MEMORY_ALLOCATION_ERROR;
6759        return;
6760    }
6761    coll->freeReorderCodesOnClose = TRUE;
6762    for (int32_t i = 0; i < reorderCodesLength; i++) {
6763        coll->reorderCodes[i] = reorderCodes[i];
6764    }
6765    coll->reorderCodesLength = reorderCodesLength;
6766    ucol_buildPermutationTable(coll, status);
6767}
6768
6769U_DRAFT int32_t U_EXPORT2
6770ucol_getEquivalentReorderCodes(int32_t reorderCode,
6771                    int32_t* dest,
6772                    int32_t destCapacity,
6773                    UErrorCode *pErrorCode) {
6774    bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
6775    uint16_t leadBytes[256];
6776    int leadBytesCount;
6777    int leadByteIndex;
6778    int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
6779    int reorderCodesForLeadByteCount;
6780    int reorderCodeIndex;
6781
6782    int32_t equivalentCodesCount = 0;
6783    int setIndex;
6784
6785    if (U_FAILURE(*pErrorCode)) {
6786        return 0;
6787    }
6788
6789    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6790        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
6791        return 0;
6792    }
6793
6794    uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
6795
6796    const UCollator* uca = ucol_initUCA(pErrorCode);
6797    if (U_FAILURE(*pErrorCode)) {
6798	return 0;
6799    }
6800    leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
6801    for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
6802        reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
6803            uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
6804        for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
6805            equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
6806        }
6807    }
6808
6809    for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6810        if (equivalentCodesSet[setIndex] == true) {
6811            equivalentCodesCount++;
6812        }
6813    }
6814
6815    if (destCapacity == 0) {
6816        return equivalentCodesCount;
6817    }
6818
6819    equivalentCodesCount = 0;
6820    for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6821        if (equivalentCodesSet[setIndex] == true) {
6822            dest[equivalentCodesCount++] = setIndex;
6823            if (equivalentCodesCount >= destCapacity) {
6824                break;
6825            }
6826        }
6827    }
6828    return equivalentCodesCount;
6829}
6830
6831
6832/****************************************************************************/
6833/* Following are misc functions                                             */
6834/* there are new APIs and some compatibility APIs                           */
6835/****************************************************************************/
6836
6837U_CAPI void U_EXPORT2
6838ucol_getVersion(const UCollator* coll,
6839                UVersionInfo versionInfo)
6840{
6841    /* RunTime version  */
6842    uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6843    /* Builder version*/
6844    uint8_t bdVersion = coll->image->version[0];
6845
6846    /* Charset Version. Need to get the version from cnv files
6847     * makeconv should populate cnv files with version and
6848     * an api has to be provided in ucnv.h to obtain this version
6849     */
6850    uint8_t csVersion = 0;
6851
6852    /* combine the version info */
6853    uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6854
6855    /* Tailoring rules */
6856    versionInfo[0] = (uint8_t)(cmbVersion>>8);
6857    versionInfo[1] = (uint8_t)cmbVersion;
6858    versionInfo[2] = coll->image->version[1];
6859    if(coll->UCA) {
6860        /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
6861        versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
6862    } else {
6863        versionInfo[3] = 0;
6864    }
6865}
6866
6867
6868/* This internal API checks whether a character is tailored or not */
6869U_CAPI UBool  U_EXPORT2
6870ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6871    if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6872        return FALSE;
6873    }
6874
6875    uint32_t CE = UCOL_NOT_FOUND;
6876    const UChar *ContractionStart = NULL;
6877    if(u < 0x100) { /* latin-1 */
6878        CE = coll->latinOneMapping[u];
6879        if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6880            return FALSE;
6881        }
6882    } else { /* regular */
6883        CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6884    }
6885
6886    if(isContraction(CE)) {
6887        ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6888        CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6889    }
6890
6891    return (UBool)(CE != UCOL_NOT_FOUND);
6892}
6893
6894
6895/****************************************************************************/
6896/* Following are the string compare functions                               */
6897/*                                                                          */
6898/****************************************************************************/
6899
6900
6901/*  ucol_checkIdent    internal function.  Does byte level string compare.   */
6902/*                     Used by strcoll if strength == identical and strings  */
6903/*                     are otherwise equal.                                  */
6904/*                                                                           */
6905/*                     Comparison must be done on NFD normalized strings.    */
6906/*                     FCD is not good enough.                               */
6907
6908static
6909UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6910{
6911    // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6912    // of same type, but that doesn't really mean that it will stay that way.
6913    int32_t            comparison;
6914
6915    if (sColl->flags & UCOL_USE_ITERATOR) {
6916        // The division for the array length may truncate the array size to
6917        // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6918        // for all platforms anyway.
6919        UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6920        UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6921        UNormIterator *sNIt = NULL, *tNIt = NULL;
6922        sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
6923        tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
6924        sColl->iterator->move(sColl->iterator, 0, UITER_START);
6925        tColl->iterator->move(tColl->iterator, 0, UITER_START);
6926        UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
6927        UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
6928        comparison = u_strCompareIter(sIt, tIt, TRUE);
6929        unorm_closeIter(sNIt);
6930        unorm_closeIter(tNIt);
6931    } else {
6932        int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
6933        const UChar *sBuf = sColl->string;
6934        int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
6935        const UChar *tBuf = tColl->string;
6936
6937        if (normalize) {
6938            *status = U_ZERO_ERROR;
6939            // Note: We could use Normalizer::compare() or similar, but for short strings
6940            // which may not be in FCD it might be faster to just NFD them.
6941            // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
6942            // NFD'ing immediately might be faster for long strings,
6943            // but string comparison is usually done on relatively short strings.
6944            sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
6945                                  sColl->writableBuffer,
6946                                  *status);
6947            tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
6948                                  tColl->writableBuffer,
6949                                  *status);
6950            if(U_FAILURE(*status)) {
6951                return UCOL_LESS;
6952            }
6953            comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
6954        } else {
6955            comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
6956        }
6957    }
6958
6959    if (comparison < 0) {
6960        return UCOL_LESS;
6961    } else if (comparison == 0) {
6962        return UCOL_EQUAL;
6963    } else /* comparison > 0 */ {
6964        return UCOL_GREATER;
6965    }
6966}
6967
/*  CEBuf - A struct and some inline functions to handle the saving    */
/*          of CEs in a buffer within ucol_strcoll                     */
/*                                                                     */
/*          The buffer starts out using the embedded localArray and    */
/*          is moved to a larger heap block by ucol_CEBuf_Expand()     */
/*          once it is full.                                           */

/* Number of CEs that fit in the embedded array before the heap is needed. */
#define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
    uint32_t    *buf;   /* start of the current storage (localArray or a heap block) */
    uint32_t    *endp;  /* one past the last usable slot of the current storage */
    uint32_t    *pos;   /* next slot to be written by UCOL_CEBUF_PUT */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* initial in-struct storage */
} ucol_CEBuf;
6978
6979
6980static
6981inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
6982    (b)->buf = (b)->pos = (b)->localArray;
6983    (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
6984}
6985
6986static
6987void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
6988    uint32_t  oldSize;
6989    uint32_t  newSize;
6990    uint32_t  *newBuf;
6991
6992    ci->flags |= UCOL_ITER_ALLOCATED;
6993    oldSize = (uint32_t)(b->pos - b->buf);
6994    newSize = oldSize * 2;
6995    newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
6996    if(newBuf == NULL) {
6997        *status = U_MEMORY_ALLOCATION_ERROR;
6998    }
6999    else {
7000        uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7001        if (b->buf != b->localArray) {
7002            uprv_free(b->buf);
7003        }
7004        b->buf = newBuf;
7005        b->endp = b->buf + newSize;
7006        b->pos  = b->buf + oldSize;
7007    }
7008}
7009
7010static
7011inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7012    if (b->pos == b->endp) {
7013        ucol_CEBuf_Expand(b, ci, status);
7014    }
7015    if (U_SUCCESS(*status)) {
7016        *(b)->pos++ = ce;
7017    }
7018}
7019
7020/* This is a trick string compare function that goes in and uses sortkeys to compare */
7021/* It is used when compare gets in trouble and needs to bail out                     */
7022static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7023                                                  collIterate *tColl,
7024                                                  UErrorCode *status)
7025{
7026    uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7027    uint8_t *sourceKeyP = sourceKey;
7028    uint8_t *targetKeyP = targetKey;
7029    int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7030    const UCollator *coll = sColl->coll;
7031    const UChar *source = NULL;
7032    const UChar *target = NULL;
7033    int32_t result = UCOL_EQUAL;
7034    UnicodeString sourceString, targetString;
7035    int32_t sourceLength;
7036    int32_t targetLength;
7037
7038    if(sColl->flags & UCOL_USE_ITERATOR) {
7039        sColl->iterator->move(sColl->iterator, 0, UITER_START);
7040        tColl->iterator->move(tColl->iterator, 0, UITER_START);
7041        UChar32 c;
7042        while((c=sColl->iterator->next(sColl->iterator))>=0) {
7043            sourceString.append((UChar)c);
7044        }
7045        while((c=tColl->iterator->next(tColl->iterator))>=0) {
7046            targetString.append((UChar)c);
7047        }
7048        source = sourceString.getBuffer();
7049        sourceLength = sourceString.length();
7050        target = targetString.getBuffer();
7051        targetLength = targetString.length();
7052    } else { // no iterators
7053        sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
7054        targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
7055        source = sColl->string;
7056        target = tColl->string;
7057    }
7058
7059
7060
7061    sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7062    if(sourceKeyLen > UCOL_MAX_BUFFER) {
7063        sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7064        if(sourceKeyP == NULL) {
7065            *status = U_MEMORY_ALLOCATION_ERROR;
7066            goto cleanup_and_do_compare;
7067        }
7068        sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7069    }
7070
7071    targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7072    if(targetKeyLen > UCOL_MAX_BUFFER) {
7073        targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7074        if(targetKeyP == NULL) {
7075            *status = U_MEMORY_ALLOCATION_ERROR;
7076            goto cleanup_and_do_compare;
7077        }
7078        targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7079    }
7080
7081    result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7082
7083cleanup_and_do_compare:
7084    if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7085        uprv_free(sourceKeyP);
7086    }
7087
7088    if(targetKeyP != NULL && targetKeyP != targetKey) {
7089        uprv_free(targetKeyP);
7090    }
7091
7092    if(result<0) {
7093        return UCOL_LESS;
7094    } else if(result>0) {
7095        return UCOL_GREATER;
7096    } else {
7097        return UCOL_EQUAL;
7098    }
7099}
7100
7101
7102static UCollationResult
7103ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
7104{
7105    U_ALIGN_CODE(16);
7106
7107    const UCollator *coll = sColl->coll;
7108
7109
7110    // setting up the collator parameters
7111    UColAttributeValue strength = coll->strength;
7112    UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7113
7114    UBool checkSecTer = initialCheckSecTer;
7115    UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7116    UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7117    UBool checkIdent = (strength == UCOL_IDENTICAL);
7118    UBool checkCase = (coll->caseLevel == UCOL_ON);
7119    UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7120    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7121    UBool qShifted = shifted && checkQuad;
7122    UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7123
7124    if(doHiragana && shifted) {
7125        return (ucol_compareUsingSortKeys(sColl, tColl, status));
7126    }
7127    uint8_t caseSwitch = coll->caseSwitch;
7128    uint8_t tertiaryMask = coll->tertiaryMask;
7129
7130    // This is the lowest primary value that will not be ignored if shifted
7131    uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7132
7133    UCollationResult result = UCOL_EQUAL;
7134    UCollationResult hirResult = UCOL_EQUAL;
7135
7136    // Preparing the CE buffers. They will be filled during the primary phase
7137    ucol_CEBuf   sCEs;
7138    ucol_CEBuf   tCEs;
7139    UCOL_INIT_CEBUF(&sCEs);
7140    UCOL_INIT_CEBUF(&tCEs);
7141
7142    uint32_t secS = 0, secT = 0;
7143    uint32_t sOrder=0, tOrder=0;
7144
7145    // Non shifted primary processing is quite simple
7146    if(!shifted) {
7147        for(;;) {
7148
7149            // We fetch CEs until we hit a non ignorable primary or end.
7150            do {
7151                // We get the next CE
7152                sOrder = ucol_IGetNextCE(coll, sColl, status);
7153                // Stuff it in the buffer
7154                UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7155                // And keep just the primary part.
7156                sOrder &= UCOL_PRIMARYMASK;
7157            } while(sOrder == 0);
7158
7159            // see the comments on the above block
7160            do {
7161                tOrder = ucol_IGetNextCE(coll, tColl, status);
7162                UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7163                tOrder &= UCOL_PRIMARYMASK;
7164            } while(tOrder == 0);
7165
7166            // if both primaries are the same
7167            if(sOrder == tOrder) {
7168                // and there are no more CEs, we advance to the next level
7169                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7170                    break;
7171                }
7172                if(doHiragana && hirResult == UCOL_EQUAL) {
7173                    if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7174                        hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7175                            ? UCOL_LESS:UCOL_GREATER;
7176                    }
7177                }
7178            } else {
7179                // only need to check one for continuation
7180                // if one is then the other must be or the preceding CE would be a prefix of the other
7181                if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
7182                    sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7183                    tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7184                }
7185                // if two primaries are different, we are done
7186                result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
7187                goto commonReturn;
7188            }
7189        } // no primary difference... do the rest from the buffers
7190    } else { // shifted - do a slightly more complicated processing :)
7191        for(;;) {
7192            UBool sInShifted = FALSE;
7193            UBool tInShifted = FALSE;
7194            // This version of code can be refactored. However, it seems easier to understand this way.
            // Source loop. Same as the target loop.
7196            for(;;) {
7197                sOrder = ucol_IGetNextCE(coll, sColl, status);
7198                if(sOrder == UCOL_NO_MORE_CES) {
7199                    UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7200                    break;
7201                } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7202                    /* UCA amendment - ignore ignorables that follow shifted code points */
7203                    continue;
7204                } else if(isContinuation(sOrder)) {
7205                    if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7206                        if(sInShifted) {
7207                            sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7208                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7209                            continue;
7210                        } else {
7211                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7212                            break;
7213                        }
7214                    } else { /* Just lower level values */
7215                        if(sInShifted) {
7216                            continue;
7217                        } else {
7218                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7219                            continue;
7220                        }
7221                    }
7222                } else { /* regular */
7223                    if(coll->leadBytePermutationTable != NULL){
7224                        sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7225                    }
7226                    if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7227                        UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7228                        break;
7229                    } else {
7230                        if((sOrder & UCOL_PRIMARYMASK) > 0) {
7231                            sInShifted = TRUE;
7232                            sOrder &= UCOL_PRIMARYMASK;
7233                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7234                            continue;
7235                        } else {
7236                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7237                            sInShifted = FALSE;
7238                            continue;
7239                        }
7240                    }
7241                }
7242            }
7243            sOrder &= UCOL_PRIMARYMASK;
7244            sInShifted = FALSE;
7245
7246            for(;;) {
7247                tOrder = ucol_IGetNextCE(coll, tColl, status);
7248                if(tOrder == UCOL_NO_MORE_CES) {
7249                    UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7250                    break;
7251                } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7252                    /* UCA amendment - ignore ignorables that follow shifted code points */
7253                    continue;
7254                } else if(isContinuation(tOrder)) {
7255                    if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7256                        if(tInShifted) {
7257                            tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7258                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7259                            continue;
7260                        } else {
7261                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7262                            break;
7263                        }
7264                    } else { /* Just lower level values */
7265                        if(tInShifted) {
7266                            continue;
7267                        } else {
7268                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7269                            continue;
7270                        }
7271                    }
7272                } else { /* regular */
7273                    if(coll->leadBytePermutationTable != NULL){
7274                        tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7275                    }
7276                    if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7277                        UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7278                        break;
7279                    } else {
7280                        if((tOrder & UCOL_PRIMARYMASK) > 0) {
7281                            tInShifted = TRUE;
7282                            tOrder &= UCOL_PRIMARYMASK;
7283                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7284                            continue;
7285                        } else {
7286                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7287                            tInShifted = FALSE;
7288                            continue;
7289                        }
7290                    }
7291                }
7292            }
7293            tOrder &= UCOL_PRIMARYMASK;
7294            tInShifted = FALSE;
7295
7296            if(sOrder == tOrder) {
7297                /*
7298                if(doHiragana && hirResult == UCOL_EQUAL) {
7299                if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7300                hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7301                ? UCOL_LESS:UCOL_GREATER;
7302                }
7303                }
7304                */
7305                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7306                    break;
7307                } else {
7308                    sOrder = 0;
7309                    tOrder = 0;
7310                    continue;
7311                }
7312            } else {
7313                result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7314                goto commonReturn;
7315            }
7316        } /* no primary difference... do the rest from the buffers */
7317    }
7318
7319    /* now, we're gonna reexamine collected CEs */
7320    uint32_t    *sCE;
7321    uint32_t    *tCE;
7322
7323    /* This is the secondary level of comparison */
7324    if(checkSecTer) {
7325        if(!isFrenchSec) { /* normal */
7326            sCE = sCEs.buf;
7327            tCE = tCEs.buf;
7328            for(;;) {
7329                while (secS == 0) {
7330                    secS = *(sCE++) & UCOL_SECONDARYMASK;
7331                }
7332
7333                while(secT == 0) {
7334                    secT = *(tCE++) & UCOL_SECONDARYMASK;
7335                }
7336
7337                if(secS == secT) {
7338                    if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7339                        break;
7340                    } else {
7341                        secS = 0; secT = 0;
7342                        continue;
7343                    }
7344                } else {
7345                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7346                    goto commonReturn;
7347                }
7348            }
7349        } else { /* do the French */
7350            uint32_t *sCESave = NULL;
7351            uint32_t *tCESave = NULL;
7352            sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7353            tCE = tCEs.pos-2;
7354            for(;;) {
7355                while (secS == 0 && sCE >= sCEs.buf) {
7356                    if(sCESave == NULL) {
7357                        secS = *(sCE--);
7358                        if(isContinuation(secS)) {
7359                            while(isContinuation(secS = *(sCE--)))
7360                                ;
7361                            /* after this, secS has the start of continuation, and sCEs points before that */
7362                            sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7363                            sCE+=2;  /* need to point to the first continuation CP */
7364                            /* However, now you can just continue doing stuff */
7365                        }
7366                    } else {
7367                        secS = *(sCE++);
7368                        if(!isContinuation(secS)) { /* This means we have finished with this cont */
7369                            sCE = sCESave;            /* reset the pointer to before continuation */
7370                            sCESave = NULL;
7371                            secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
7372                            continue;
7373                        }
7374                    }
7375                    secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7376                }
7377
7378                while(secT == 0 && tCE >= tCEs.buf) {
7379                    if(tCESave == NULL) {
7380                        secT = *(tCE--);
7381                        if(isContinuation(secT)) {
7382                            while(isContinuation(secT = *(tCE--)))
7383                                ;
7384                            /* after this, secS has the start of continuation, and sCEs points before that */
7385                            tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7386                            tCE+=2;  /* need to point to the first continuation CP */
7387                            /* However, now you can just continue doing stuff */
7388                        }
7389                    } else {
7390                        secT = *(tCE++);
7391                        if(!isContinuation(secT)) { /* This means we have finished with this cont */
7392                            tCE = tCESave;          /* reset the pointer to before continuation */
7393                            tCESave = NULL;
7394                            secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
7395                            continue;
7396                        }
7397                    }
7398                    secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7399                }
7400
7401                if(secS == secT) {
7402                    if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7403                        break;
7404                    } else {
7405                        secS = 0; secT = 0;
7406                        continue;
7407                    }
7408                } else {
7409                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7410                    goto commonReturn;
7411                }
7412            }
7413        }
7414    }
7415
7416    /* doing the case bit */
7417    if(checkCase) {
7418        sCE = sCEs.buf;
7419        tCE = tCEs.buf;
7420        for(;;) {
7421            while((secS & UCOL_REMOVE_CASE) == 0) {
7422                if(!isContinuation(*sCE++)) {
7423                    secS =*(sCE-1);
7424                    if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7425                        // primary ignorables should not be considered on the case level when the strength is primary
7426                        // otherwise, the CEs stop being well-formed
7427                        secS &= UCOL_TERT_CASE_MASK;
7428                        secS ^= caseSwitch;
7429                    } else {
7430                        secS = 0;
7431                    }
7432                } else {
7433                    secS = 0;
7434                }
7435            }
7436
7437            while((secT & UCOL_REMOVE_CASE) == 0) {
7438                if(!isContinuation(*tCE++)) {
7439                    secT = *(tCE-1);
7440                    if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7441                        // primary ignorables should not be considered on the case level when the strength is primary
7442                        // otherwise, the CEs stop being well-formed
7443                        secT &= UCOL_TERT_CASE_MASK;
7444                        secT ^= caseSwitch;
7445                    } else {
7446                        secT = 0;
7447                    }
7448                } else {
7449                    secT = 0;
7450                }
7451            }
7452
7453            if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7454                result = UCOL_LESS;
7455                goto commonReturn;
7456            } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7457                result = UCOL_GREATER;
7458                goto commonReturn;
7459            }
7460
7461            if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7462                break;
7463            } else {
7464                secS = 0;
7465                secT = 0;
7466            }
7467        }
7468    }
7469
7470    /* Tertiary level */
7471    if(checkTertiary) {
7472        secS = 0;
7473        secT = 0;
7474        sCE = sCEs.buf;
7475        tCE = tCEs.buf;
7476        for(;;) {
7477            while((secS & UCOL_REMOVE_CASE) == 0) {
7478                secS = *(sCE++) & tertiaryMask;
7479                if(!isContinuation(secS)) {
7480                    secS ^= caseSwitch;
7481                } else {
7482                    secS &= UCOL_REMOVE_CASE;
7483                }
7484            }
7485
7486            while((secT & UCOL_REMOVE_CASE)  == 0) {
7487                secT = *(tCE++) & tertiaryMask;
7488                if(!isContinuation(secT)) {
7489                    secT ^= caseSwitch;
7490                } else {
7491                    secT &= UCOL_REMOVE_CASE;
7492                }
7493            }
7494
7495            if(secS == secT) {
7496                if((secS & UCOL_REMOVE_CASE) == 1) {
7497                    break;
7498                } else {
7499                    secS = 0; secT = 0;
7500                    continue;
7501                }
7502            } else {
7503                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7504                goto commonReturn;
7505            }
7506        }
7507    }
7508
7509
7510    if(qShifted /*checkQuad*/) {
7511        UBool sInShifted = TRUE;
7512        UBool tInShifted = TRUE;
7513        secS = 0;
7514        secT = 0;
7515        sCE = sCEs.buf;
7516        tCE = tCEs.buf;
7517        for(;;) {
7518            while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
7519                secS = *(sCE++);
7520                if(isContinuation(secS)) {
7521                    if(!sInShifted) {
7522                        continue;
7523                    }
7524                } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7525                    secS = UCOL_PRIMARYMASK;
7526                    sInShifted = FALSE;
7527                } else {
7528                    sInShifted = TRUE;
7529                }
7530            }
7531            secS &= UCOL_PRIMARYMASK;
7532
7533
7534            while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
7535                secT = *(tCE++);
7536                if(isContinuation(secT)) {
7537                    if(!tInShifted) {
7538                        continue;
7539                    }
7540                } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7541                    secT = UCOL_PRIMARYMASK;
7542                    tInShifted = FALSE;
7543                } else {
7544                    tInShifted = TRUE;
7545                }
7546            }
7547            secT &= UCOL_PRIMARYMASK;
7548
7549            if(secS == secT) {
7550                if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7551                    break;
7552                } else {
7553                    secS = 0; secT = 0;
7554                    continue;
7555                }
7556            } else {
7557                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7558                goto commonReturn;
7559            }
7560        }
7561    } else if(doHiragana && hirResult != UCOL_EQUAL) {
7562        // If we're fine on quaternaries, we might be different
7563        // on Hiragana. This, however, might fail us in shifted.
7564        result = hirResult;
7565        goto commonReturn;
7566    }
7567
7568    /*  For IDENTICAL comparisons, we use a bitwise character comparison */
7569    /*  as a tiebreaker if all else is equal.                                */
7570    /*  Getting here  should be quite rare - strings are not identical -     */
7571    /*     that is checked first, but compared == through all other checks.  */
7572    if(checkIdent)
7573    {
7574        //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7575        result = ucol_checkIdent(sColl, tColl, TRUE, status);
7576    }
7577
7578commonReturn:
7579    if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7580        if (sCEs.buf != sCEs.localArray ) {
7581            uprv_free(sCEs.buf);
7582        }
7583        if (tCEs.buf != tCEs.localArray ) {
7584            uprv_free(tCEs.buf);
7585        }
7586    }
7587
7588    return result;
7589}
7590
7591static UCollationResult
7592ucol_strcollRegular(const UCollator *coll,
7593                    const UChar *source, int32_t sourceLength,
7594                    const UChar *target, int32_t targetLength,
7595                    UErrorCode *status) {
7596    collIterate sColl, tColl;
7597    // Preparing the context objects for iterating over strings
7598    IInit_collIterate(coll, source, sourceLength, &sColl, status);
7599    IInit_collIterate(coll, target, targetLength, &tColl, status);
7600    if(U_FAILURE(*status)) {
7601        return UCOL_LESS;
7602    }
7603    return ucol_strcollRegular(&sColl, &tColl, status);
7604}
7605
7606static inline uint32_t
7607ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7608                          uint32_t CE, const UChar *s, int32_t *index, int32_t len)
7609{
7610    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7611    int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7612    int32_t offset = 1;
7613    UChar schar = 0, tchar = 0;
7614
7615    for(;;) {
7616        if(len == -1) {
7617            if(s[*index] == 0) { // end of string
7618                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7619            } else {
7620                schar = s[*index];
7621            }
7622        } else {
7623            if(*index == len) {
7624                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7625            } else {
7626                schar = s[*index];
7627            }
7628        }
7629
7630        while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7631            offset++;
7632        }
7633
7634        if (schar == tchar) {
7635            (*index)++;
7636            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7637        }
7638        else
7639        {
7640            if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7641                return UCOL_BAIL_OUT_CE;
7642            }
7643            // skip completely ignorables
7644            uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7645            if(isZeroCE == 0) { // we have to ignore completely ignorables
7646                (*index)++;
7647                continue;
7648            }
7649
7650            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7651        }
7652    }
7653}
7654
7655
7656/**
7657 * This is a fast strcoll, geared towards text in Latin-1.
7658 * It supports contractions of size two, French secondaries
7659 * and case switching. You can use it with strengths primary
7660 * to tertiary. It does not support shifted and case level.
7661 * It relies on the table build by setupLatin1Table. If it
7662 * doesn't understand something, it will go to the regular
7663 * strcoll.
7664 */
7665static UCollationResult
7666ucol_strcollUseLatin1( const UCollator    *coll,
7667              const UChar        *source,
7668              int32_t            sLen,
7669              const UChar        *target,
7670              int32_t            tLen,
7671              UErrorCode *status)
7672{
7673    U_ALIGN_CODE(16);
7674    int32_t strength = coll->strength;
7675
7676    int32_t sIndex = 0, tIndex = 0;
7677    UChar sChar = 0, tChar = 0;
7678    uint32_t sOrder=0, tOrder=0;
7679
7680    UBool endOfSource = FALSE;
7681
7682    uint32_t *elements = coll->latinOneCEs;
7683
7684    UBool haveContractions = FALSE; // if we have contractions in our string
7685                                    // we cannot do French secondary
7686
7687    // Do the primary level
7688    for(;;) {
7689        while(sOrder==0) { // this loop skips primary ignorables
7690            // sOrder=getNextlatinOneCE(source);
7691            if(sLen==-1) {   // handling zero terminated strings
7692                sChar=source[sIndex++];
7693                if(sChar==0) {
7694                    endOfSource = TRUE;
7695                    break;
7696                }
7697            } else {        // handling strings with known length
7698                if(sIndex==sLen) {
7699                    endOfSource = TRUE;
7700                    break;
7701                }
7702                sChar=source[sIndex++];
7703            }
7704            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7705                //fprintf(stderr, "R");
7706                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7707            }
7708            sOrder = elements[sChar];
7709            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7710                // specials can basically be either contractions or bail-out signs. If we get anything
7711                // else, we'll bail out anywasy
7712                if(getCETag(sOrder) == CONTRACTION_TAG) {
7713                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7714                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7715                    // However, if there are contractions in the table, but we always use just one char,
7716                    // we might be able to do French. This should be checked out.
7717                }
7718                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7719                    //fprintf(stderr, "S");
7720                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7721                }
7722            }
7723        }
7724
7725        while(tOrder==0) {  // this loop skips primary ignorables
7726            // tOrder=getNextlatinOneCE(target);
7727            if(tLen==-1) {    // handling zero terminated strings
7728                tChar=target[tIndex++];
7729                if(tChar==0) {
7730                    if(endOfSource) { // this is different than source loop,
7731                        // as we already know that source loop is done here,
7732                        // so we can either finish the primary loop if both
7733                        // strings are done or anounce the result if only
7734                        // target is done. Same below.
7735                        goto endOfPrimLoop;
7736                    } else {
7737                        return UCOL_GREATER;
7738                    }
7739                }
7740            } else {          // handling strings with known length
7741                if(tIndex==tLen) {
7742                    if(endOfSource) {
7743                        goto endOfPrimLoop;
7744                    } else {
7745                        return UCOL_GREATER;
7746                    }
7747                }
7748                tChar=target[tIndex++];
7749            }
7750            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7751                //fprintf(stderr, "R");
7752                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7753            }
7754            tOrder = elements[tChar];
7755            if(tOrder >= UCOL_NOT_FOUND) {
7756                // Handling specials, see the comments for source
7757                if(getCETag(tOrder) == CONTRACTION_TAG) {
7758                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
7759                    haveContractions = TRUE;
7760                }
7761                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7762                    //fprintf(stderr, "S");
7763                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7764                }
7765            }
7766        }
7767        if(endOfSource) { // source is finished, but target is not, say the result.
7768            return UCOL_LESS;
7769        }
7770
7771        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
7772            sOrder = 0; tOrder = 0;
7773            continue;
7774        } else {
7775            // compare current top bytes
7776            if(((sOrder^tOrder)&0xFF000000)!=0) {
7777                // top bytes differ, return difference
7778                if(sOrder < tOrder) {
7779                    return UCOL_LESS;
7780                } else if(sOrder > tOrder) {
7781                    return UCOL_GREATER;
7782                }
7783                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
7784                // since we must return enum value
7785            }
7786
7787            // top bytes match, continue with following bytes
7788            sOrder<<=8;
7789            tOrder<<=8;
7790        }
7791    }
7792
7793endOfPrimLoop:
7794    // after primary loop, we definitely know the sizes of strings,
7795    // so we set it and use simpler loop for secondaries and tertiaries
7796    sLen = sIndex; tLen = tIndex;
7797    if(strength >= UCOL_SECONDARY) {
7798        // adjust the table beggining
7799        elements += coll->latinOneTableLen;
7800        endOfSource = FALSE;
7801
7802        if(coll->frenchCollation == UCOL_OFF) { // non French
7803            // This loop is a simplified copy of primary loop
7804            // at this point we know that whole strings are latin-1, so we don't
7805            // check for that. We also know that we only have contractions as
7806            // specials.
7807            sIndex = 0; tIndex = 0;
7808            for(;;) {
7809                while(sOrder==0) {
7810                    if(sIndex==sLen) {
7811                        endOfSource = TRUE;
7812                        break;
7813                    }
7814                    sChar=source[sIndex++];
7815                    sOrder = elements[sChar];
7816                    if(sOrder > UCOL_NOT_FOUND) {
7817                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
7818                    }
7819                }
7820
7821                while(tOrder==0) {
7822                    if(tIndex==tLen) {
7823                        if(endOfSource) {
7824                            goto endOfSecLoop;
7825                        } else {
7826                            return UCOL_GREATER;
7827                        }
7828                    }
7829                    tChar=target[tIndex++];
7830                    tOrder = elements[tChar];
7831                    if(tOrder > UCOL_NOT_FOUND) {
7832                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
7833                    }
7834                }
7835                if(endOfSource) {
7836                    return UCOL_LESS;
7837                }
7838
7839                if(sOrder == tOrder) {
7840                    sOrder = 0; tOrder = 0;
7841                    continue;
7842                } else {
7843                    // see primary loop for comments on this
7844                    if(((sOrder^tOrder)&0xFF000000)!=0) {
7845                        if(sOrder < tOrder) {
7846                            return UCOL_LESS;
7847                        } else if(sOrder > tOrder) {
7848                            return UCOL_GREATER;
7849                        }
7850                    }
7851                    sOrder<<=8;
7852                    tOrder<<=8;
7853                }
7854            }
7855        } else { // French
7856            if(haveContractions) { // if we have contractions, we have to bail out
7857                // since we don't really know how to handle them here
7858                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7859            }
7860            // For French, we go backwards
7861            sIndex = sLen; tIndex = tLen;
7862            for(;;) {
7863                while(sOrder==0) {
7864                    if(sIndex==0) {
7865                        endOfSource = TRUE;
7866                        break;
7867                    }
7868                    sChar=source[--sIndex];
7869                    sOrder = elements[sChar];
7870                    // don't even look for contractions
7871                }
7872
7873                while(tOrder==0) {
7874                    if(tIndex==0) {
7875                        if(endOfSource) {
7876                            goto endOfSecLoop;
7877                        } else {
7878                            return UCOL_GREATER;
7879                        }
7880                    }
7881                    tChar=target[--tIndex];
7882                    tOrder = elements[tChar];
7883                    // don't even look for contractions
7884                }
7885                if(endOfSource) {
7886                    return UCOL_LESS;
7887                }
7888
7889                if(sOrder == tOrder) {
7890                    sOrder = 0; tOrder = 0;
7891                    continue;
7892                } else {
7893                    // see the primary loop for comments
7894                    if(((sOrder^tOrder)&0xFF000000)!=0) {
7895                        if(sOrder < tOrder) {
7896                            return UCOL_LESS;
7897                        } else if(sOrder > tOrder) {
7898                            return UCOL_GREATER;
7899                        }
7900                    }
7901                    sOrder<<=8;
7902                    tOrder<<=8;
7903                }
7904            }
7905        }
7906    }
7907
7908endOfSecLoop:
7909    if(strength >= UCOL_TERTIARY) {
7910        // tertiary loop is the same as secondary (except no French)
7911        elements += coll->latinOneTableLen;
7912        sIndex = 0; tIndex = 0;
7913        endOfSource = FALSE;
7914        for(;;) {
7915            while(sOrder==0) {
7916                if(sIndex==sLen) {
7917                    endOfSource = TRUE;
7918                    break;
7919                }
7920                sChar=source[sIndex++];
7921                sOrder = elements[sChar];
7922                if(sOrder > UCOL_NOT_FOUND) {
7923                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
7924                }
7925            }
7926            while(tOrder==0) {
7927                if(tIndex==tLen) {
7928                    if(endOfSource) {
7929                        return UCOL_EQUAL; // if both strings are at the end, they are equal
7930                    } else {
7931                        return UCOL_GREATER;
7932                    }
7933                }
7934                tChar=target[tIndex++];
7935                tOrder = elements[tChar];
7936                if(tOrder > UCOL_NOT_FOUND) {
7937                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
7938                }
7939            }
7940            if(endOfSource) {
7941                return UCOL_LESS;
7942            }
7943            if(sOrder == tOrder) {
7944                sOrder = 0; tOrder = 0;
7945                continue;
7946            } else {
7947                if(((sOrder^tOrder)&0xff000000)!=0) {
7948                    if(sOrder < tOrder) {
7949                        return UCOL_LESS;
7950                    } else if(sOrder > tOrder) {
7951                        return UCOL_GREATER;
7952                    }
7953                }
7954                sOrder<<=8;
7955                tOrder<<=8;
7956            }
7957        }
7958    }
7959    return UCOL_EQUAL;
7960}
7961
7962
7963U_CAPI UCollationResult U_EXPORT2
7964ucol_strcollIter( const UCollator    *coll,
7965                 UCharIterator *sIter,
7966                 UCharIterator *tIter,
7967                 UErrorCode         *status)
7968{
7969    if(!status || U_FAILURE(*status)) {
7970        return UCOL_EQUAL;
7971    }
7972
7973    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
7974    UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
7975
7976    if (sIter == tIter) {
7977        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
7978        return UCOL_EQUAL;
7979    }
7980    if(sIter == NULL || tIter == NULL || coll == NULL) {
7981        *status = U_ILLEGAL_ARGUMENT_ERROR;
7982        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
7983        return UCOL_EQUAL;
7984    }
7985
7986    UCollationResult result = UCOL_EQUAL;
7987
7988    // Preparing the context objects for iterating over strings
7989    collIterate sColl, tColl;
7990    IInit_collIterate(coll, NULL, -1, &sColl, status);
7991    IInit_collIterate(coll, NULL, -1, &tColl, status);
7992    if(U_FAILURE(*status)) {
7993        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
7994        return UCOL_EQUAL;
7995    }
7996    // The division for the array length may truncate the array size to
7997    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7998    // for all platforms anyway.
7999    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8000    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8001    UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8002
8003    sColl.iterator = sIter;
8004    sColl.flags |= UCOL_USE_ITERATOR;
8005    tColl.flags |= UCOL_USE_ITERATOR;
8006    tColl.iterator = tIter;
8007
8008    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8009        sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8010        sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8011        sColl.flags &= ~UCOL_ITER_NORM;
8012
8013        tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8014        tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8015        tColl.flags &= ~UCOL_ITER_NORM;
8016    }
8017
8018    UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8019
8020    while((sChar = sColl.iterator->next(sColl.iterator)) ==
8021        (tChar = tColl.iterator->next(tColl.iterator))) {
8022            if(sChar == U_SENTINEL) {
8023                result = UCOL_EQUAL;
8024                goto end_compare;
8025            }
8026    }
8027
8028    if(sChar == U_SENTINEL) {
8029        tChar = tColl.iterator->previous(tColl.iterator);
8030    }
8031
8032    if(tChar == U_SENTINEL) {
8033        sChar = sColl.iterator->previous(sColl.iterator);
8034    }
8035
8036    sChar = sColl.iterator->previous(sColl.iterator);
8037    tChar = tColl.iterator->previous(tColl.iterator);
8038
8039    if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8040    {
8041        // We are stopped in the middle of a contraction.
8042        // Scan backwards through the == part of the string looking for the start of the contraction.
8043        //   It doesn't matter which string we scan, since they are the same in this region.
8044        do
8045        {
8046            sChar = sColl.iterator->previous(sColl.iterator);
8047            tChar = tColl.iterator->previous(tColl.iterator);
8048        }
8049        while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8050    }
8051
8052
8053    if(U_SUCCESS(*status)) {
8054        result = ucol_strcollRegular(&sColl, &tColl, status);
8055    }
8056
8057end_compare:
8058    if(sNormIter || tNormIter) {
8059        unorm_closeIter(sNormIter);
8060        unorm_closeIter(tNormIter);
8061    }
8062
8063    UTRACE_EXIT_VALUE_STATUS(result, *status)
8064    return result;
8065}
8066
8067
8068/*                                                                      */
8069/* ucol_strcoll     Main public API string comparison function          */
8070/*                                                                      */
8071U_CAPI UCollationResult U_EXPORT2
8072ucol_strcoll( const UCollator    *coll,
8073              const UChar        *source,
8074              int32_t            sourceLength,
8075              const UChar        *target,
8076              int32_t            targetLength)
8077{
8078    U_ALIGN_CODE(16);
8079
8080    UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8081    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8082        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8083        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8084        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8085    }
8086
8087    if(source == NULL || target == NULL) {
8088        // do not crash, but return. Should have
8089        // status argument to return error.
8090        UTRACE_EXIT_VALUE(UCOL_EQUAL);
8091        return UCOL_EQUAL;
8092    }
8093
8094    /* Quick check if source and target are same strings. */
8095    /* They should either both be NULL terminated or the explicit length should be set on both. */
8096    if (source==target && sourceLength==targetLength) {
8097        UTRACE_EXIT_VALUE(UCOL_EQUAL);
8098        return UCOL_EQUAL;
8099    }
8100
8101    /* Scan the strings.  Find:                                                             */
8102    /*    The length of any leading portion that is equal                                   */
8103    /*    Whether they are exactly equal.  (in which case we just return)                   */
8104    const UChar    *pSrc    = source;
8105    const UChar    *pTarg   = target;
8106    int32_t        equalLength;
8107
8108    if (sourceLength == -1 && targetLength == -1) {
8109        // Both strings are null terminated.
8110        //    Scan through any leading equal portion.
8111        while (*pSrc == *pTarg && *pSrc != 0) {
8112            pSrc++;
8113            pTarg++;
8114        }
8115        if (*pSrc == 0 && *pTarg == 0) {
8116            UTRACE_EXIT_VALUE(UCOL_EQUAL);
8117            return UCOL_EQUAL;
8118        }
8119        equalLength = (int32_t)(pSrc - source);
8120    }
8121    else
8122    {
8123        // One or both strings has an explicit length.
8124        const UChar    *pSrcEnd = source + sourceLength;
8125        const UChar    *pTargEnd = target + targetLength;
8126
8127        // Scan while the strings are bitwise ==, or until one is exhausted.
8128        for (;;) {
8129            if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8130                break;
8131            }
8132            if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8133                break;
8134            }
8135            if (*pSrc != *pTarg) {
8136                break;
8137            }
8138            pSrc++;
8139            pTarg++;
8140        }
8141        equalLength = (int32_t)(pSrc - source);
8142
8143        // If we made it all the way through both strings, we are done.  They are ==
8144        if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8145            (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
8146        {
8147            UTRACE_EXIT_VALUE(UCOL_EQUAL);
8148            return UCOL_EQUAL;
8149        }
8150    }
8151    if (equalLength > 0) {
8152        /* There is an identical portion at the beginning of the two strings.        */
8153        /*   If the identical portion ends within a contraction or a comibining      */
8154        /*   character sequence, back up to the start of that sequence.              */
8155
8156        // These values should already be set by the code above.
8157        //pSrc  = source + equalLength;        /* point to the first differing chars   */
8158        //pTarg = target + equalLength;
8159        if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8160            (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8161        {
8162            // We are stopped in the middle of a contraction.
8163            // Scan backwards through the == part of the string looking for the start of the contraction.
8164            //   It doesn't matter which string we scan, since they are the same in this region.
8165            do
8166            {
8167                equalLength--;
8168                pSrc--;
8169            }
8170            while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8171        }
8172
8173        source += equalLength;
8174        target += equalLength;
8175        if (sourceLength > 0) {
8176            sourceLength -= equalLength;
8177        }
8178        if (targetLength > 0) {
8179            targetLength -= equalLength;
8180        }
8181    }
8182
8183    UErrorCode status = U_ZERO_ERROR;
8184    UCollationResult returnVal;
8185    if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8186        returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
8187    } else {
8188        returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8189    }
8190    UTRACE_EXIT_VALUE(returnVal);
8191    return returnVal;
8192}
8193
8194/* convenience function for comparing strings */
8195U_CAPI UBool U_EXPORT2
8196ucol_greater(    const    UCollator        *coll,
8197        const    UChar            *source,
8198        int32_t            sourceLength,
8199        const    UChar            *target,
8200        int32_t            targetLength)
8201{
8202    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8203        == UCOL_GREATER);
8204}
8205
8206/* convenience function for comparing strings */
8207U_CAPI UBool U_EXPORT2
8208ucol_greaterOrEqual(    const    UCollator    *coll,
8209            const    UChar        *source,
8210            int32_t        sourceLength,
8211            const    UChar        *target,
8212            int32_t        targetLength)
8213{
8214    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8215        != UCOL_LESS);
8216}
8217
8218/* convenience function for comparing strings */
8219U_CAPI UBool U_EXPORT2
8220ucol_equal(        const    UCollator        *coll,
8221            const    UChar            *source,
8222            int32_t            sourceLength,
8223            const    UChar            *target,
8224            int32_t            targetLength)
8225{
8226    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8227        == UCOL_EQUAL);
8228}
8229
8230U_CAPI void U_EXPORT2
8231ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8232    if(coll && coll->UCA) {
8233        uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8234    }
8235}
8236
8237#endif /* #if !UCONFIG_NO_COLLATION */
8238