1/*
2*******************************************************************************
3*   Copyright (C) 1996-2013, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ucol.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11* Modification history
12* Date        Name      Comments
13* 1996-1999   various members of ICU team maintained C API for collation framework
14* 02/16/2001  synwee    Added internal method getPrevSpecialCE
15* 03/01/2001  synwee    Added maxexpansion functionality.
16* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
17*/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_COLLATION
22
23#include "unicode/bytestream.h"
24#include "unicode/coleitr.h"
25#include "unicode/unorm.h"
26#include "unicode/udata.h"
27#include "unicode/ustring.h"
28#include "unicode/utf8.h"
29
30#include "ucol_imp.h"
31#include "bocsu.h"
32
33#include "normalizer2impl.h"
34#include "unorm_it.h"
35#include "umutex.h"
36#include "cmemory.h"
37#include "ucln_in.h"
38#include "cstring.h"
39#include "utracimp.h"
40#include "putilimp.h"
41#include "uassert.h"
42#include "unicode/coll.h"
43
44#ifdef UCOL_DEBUG
45#include <stdio.h>
46#endif
47
48U_NAMESPACE_USE
49
50#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
51
52#define LAST_BYTE_MASK_           0xFF
53#define SECOND_LAST_BYTE_SHIFT_   8
54
55#define ZERO_CC_LIMIT_            0xC0
56
57// These are static pointers to the NFC/NFD implementation instance.
58// Each of them is always the same between calls to u_cleanup
59// and therefore writing to it is not synchronized.
60// They are cleaned in ucol_cleanup
61static const Normalizer2 *g_nfd = NULL;
62static const Normalizer2Impl *g_nfcImpl = NULL;
63
// These are values from the UCA that are required for
// implicit CE generation and for suppressing sort key compression.
// They should normally come from the UCA data, but if one
// is running without the UCA, that could be a problem.
68static const int32_t maxRegularPrimary  = 0x7A;
69static const int32_t minImplicitPrimary = 0xE0;
70static const int32_t maxImplicitPrimary = 0xE4;
71
72U_CDECL_BEGIN
73static UBool U_CALLCONV
74ucol_cleanup(void)
75{
76    g_nfd = NULL;
77    g_nfcImpl = NULL;
78    return TRUE;
79}
80
81static int32_t U_CALLCONV
82_getFoldingOffset(uint32_t data) {
83    return (int32_t)(data&0xFFFFFF);
84}
85
86U_CDECL_END
87
88static inline
89UBool initializeNFD(UErrorCode *status) {
90    if (g_nfd != NULL) {
91        return TRUE;
92    } else {
93        // The result is constant, until the library is reloaded.
94        g_nfd = Normalizer2Factory::getNFDInstance(*status);
95        ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
96        return U_SUCCESS(*status);
97    }
98}
99
100// init FCD data
101static inline
102UBool initializeFCD(UErrorCode *status) {
103    if (g_nfcImpl != NULL) {
104        return TRUE;
105    } else {
106        // The result is constant, until the library is reloaded.
107        g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
108        // Note: Alternatively, we could also store this pointer in each collIterate struct,
109        // same as Normalizer2Factory::getImpl(collIterate->nfd).
110        ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
111        return U_SUCCESS(*status);
112    }
113}
114
115static
116inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
117                              int32_t sourceLen, collIterate *s,
118                              UErrorCode *status)
119{
120    (s)->string = (s)->pos = sourceString;
121    (s)->origFlags = 0;
122    (s)->flags = 0;
123    if (sourceLen >= 0) {
124        s->flags |= UCOL_ITER_HASLEN;
125        (s)->endp = (UChar *)sourceString+sourceLen;
126    }
127    else {
128        /* change to enable easier checking for end of string for fcdpositon */
129        (s)->endp = NULL;
130    }
131    (s)->extendCEs = NULL;
132    (s)->extendCEsSize = 0;
133    (s)->CEpos = (s)->toReturn = (s)->CEs;
134    (s)->offsetBuffer = NULL;
135    (s)->offsetBufferSize = 0;
136    (s)->offsetReturn = (s)->offsetStore = NULL;
137    (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
138    (s)->coll = (collator);
139    if (initializeNFD(status)) {
140        (s)->nfd = g_nfd;
141    } else {
142        return;
143    }
144    (s)->fcdPosition = 0;
145    if(collator->normalizationMode == UCOL_ON) {
146        (s)->flags |= UCOL_ITER_NORM;
147    }
148    if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
149        (s)->flags |= UCOL_HIRAGANA_Q;
150    }
151    (s)->iterator = NULL;
152    //(s)->iteratorIndex = 0;
153}
154
/**
 * Exported entry point for initializing a collIterate.
 * Forwards all arguments unchanged to the file-local inline
 * IInit_collIterate().
 *
 * @param collator     collator whose settings drive the iteration
 * @param sourceString text to iterate over
 * @param sourceLen    length of sourceString; a negative value means
 *                     no explicit length (no end pointer is recorded)
 * @param s            iterator state to initialize
 * @param status       in/out error code
 */
U_CAPI void  U_EXPORT2
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
                             int32_t sourceLen, collIterate *s,
                             UErrorCode *status) {
    /* Out-of-line version for use from other files. */
    IInit_collIterate(collator, sourceString, sourceLen, s, status);
}
162
163U_CAPI collIterate * U_EXPORT2
164uprv_new_collIterate(UErrorCode *status) {
165    if(U_FAILURE(*status)) {
166        return NULL;
167    }
168    collIterate *s = new collIterate;
169    if(s == NULL) {
170        *status = U_MEMORY_ALLOCATION_ERROR;
171        return NULL;
172    }
173    return s;
174}
175
/**
 * Destroys a collIterate object with the C++ delete operator.
 * @param s object to delete
 */
U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate *s) {
    delete s;
}
180
181U_CAPI UBool U_EXPORT2
182uprv_collIterateAtEnd(collIterate *s) {
183    return s == NULL || s->pos == s->endp;
184}
185
186/**
187* Backup the state of the collIterate struct data
188* @param data collIterate to backup
189* @param backup storage
190*/
191static
192inline void backupState(const collIterate *data, collIterateState *backup)
193{
194    backup->fcdPosition = data->fcdPosition;
195    backup->flags       = data->flags;
196    backup->origFlags   = data->origFlags;
197    backup->pos         = data->pos;
198    backup->bufferaddress = data->writableBuffer.getBuffer();
199    backup->buffersize    = data->writableBuffer.length();
200    backup->iteratorMove = 0;
201    backup->iteratorIndex = 0;
202    if(data->iterator != NULL) {
203        //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
204        backup->iteratorIndex = data->iterator->getState(data->iterator);
205        // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
206        if(backup->iteratorIndex == UITER_NO_STATE) {
207            while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
208                backup->iteratorMove++;
209                data->iterator->move(data->iterator, -1, UITER_CURRENT);
210            }
211            data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
212        }
213    }
214}
215
216/**
217* Loads the state into the collIterate struct data
218* @param data collIterate to backup
219* @param backup storage
220* @param forwards boolean to indicate if forwards iteration is used,
221*        false indicates backwards iteration
222*/
223static
224inline void loadState(collIterate *data, const collIterateState *backup,
225                      UBool        forwards)
226{
227    UErrorCode status = U_ZERO_ERROR;
228    data->flags       = backup->flags;
229    data->origFlags   = backup->origFlags;
230    if(data->iterator != NULL) {
231        //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
232        data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
233        if(backup->iteratorMove != 0) {
234            data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
235        }
236    }
237    data->pos         = backup->pos;
238
239    if ((data->flags & UCOL_ITER_INNORMBUF) &&
240        data->writableBuffer.getBuffer() != backup->bufferaddress) {
241        /*
242        this is when a new buffer has been reallocated and we'll have to
243        calculate the new position.
244        note the new buffer has to contain the contents of the old buffer.
245        */
246        if (forwards) {
247            data->pos = data->writableBuffer.getTerminatedBuffer() +
248                                         (data->pos - backup->bufferaddress);
249        }
250        else {
251            /* backwards direction */
252            int32_t temp = backup->buffersize -
253                                  (int32_t)(data->pos - backup->bufferaddress);
254            data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
255        }
256    }
257    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
258        /*
259        this is alittle tricky.
260        if we are initially not in the normalization buffer, even if we
261        normalize in the later stage, the data in the buffer will be
262        ignored, since we skip back up to the data string.
263        however if we are already in the normalization buffer, any
264        further normalization will pull data into the normalization
265        buffer and modify the fcdPosition.
266        since we are keeping the data in the buffer for use, the
267        fcdPosition can not be reverted back.
268        arrgghh....
269        */
270        data->fcdPosition = backup->fcdPosition;
271    }
272}
273
274static UBool
275reallocCEs(collIterate *data, int32_t newCapacity) {
276    uint32_t *oldCEs = data->extendCEs;
277    if(oldCEs == NULL) {
278        oldCEs = data->CEs;
279    }
280    int32_t length = data->CEpos - oldCEs;
281    uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
282    if(newCEs == NULL) {
283        return FALSE;
284    }
285    uprv_memcpy(newCEs, oldCEs, length * 4);
286    uprv_free(data->extendCEs);
287    data->extendCEs = newCEs;
288    data->extendCEsSize = newCapacity;
289    data->CEpos = newCEs + length;
290    return TRUE;
291}
292
293static UBool
294increaseCEsCapacity(collIterate *data) {
295    int32_t oldCapacity;
296    if(data->extendCEs != NULL) {
297        oldCapacity = data->extendCEsSize;
298    } else {
299        oldCapacity = LENGTHOF(data->CEs);
300    }
301    return reallocCEs(data, 2 * oldCapacity);
302}
303
304static UBool
305ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
306    int32_t oldCapacity;
307    if(data->extendCEs != NULL) {
308        oldCapacity = data->extendCEsSize;
309    } else {
310        oldCapacity = LENGTHOF(data->CEs);
311    }
312    if(minCapacity <= oldCapacity) {
313        return TRUE;
314    }
315    oldCapacity *= 2;
316    return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
317}
318
319void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
320    if(U_FAILURE(errorCode)) {
321        return;
322    }
323    int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
324    U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
325    if(length >= offsetBufferSize) {
326        int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
327        int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4));
328        if(newBuffer == NULL) {
329            errorCode = U_MEMORY_ALLOCATION_ERROR;
330            return;
331        }
332        if(length > 0) {
333            uprv_memcpy(newBuffer, offsetBuffer, length * 4);
334        }
335        uprv_free(offsetBuffer);
336        offsetBuffer = newBuffer;
337        offsetStore = offsetBuffer + length;
338        offsetBufferSize = newCapacity;
339    }
340    *offsetStore++ = offset;
341}
342
343/*
344* collIter_eos()
345*     Checks for a collIterate being positioned at the end of
346*     its source string.
347*
348*/
349static
350inline UBool collIter_eos(collIterate *s) {
351    if(s->flags & UCOL_USE_ITERATOR) {
352      return !(s->iterator->hasNext(s->iterator));
353    }
354    if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
355        // Null terminated string, but not at null, so not at end.
356        //   Whether in main or normalization buffer doesn't matter.
357        return FALSE;
358    }
359
360    // String with length.  Can't be in normalization buffer, which is always
361    //  null termintated.
362    if (s->flags & UCOL_ITER_HASLEN) {
363        return (s->pos == s->endp);
364    }
365
366    // We are at a null termination, could be either normalization buffer or main string.
367    if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
368        // At null at end of main string.
369        return TRUE;
370    }
371
372    // At null at end of normalization buffer.  Need to check whether there there are
373    //   any characters left in the main buffer.
374    if(s->origFlags & UCOL_USE_ITERATOR) {
375      return !(s->iterator->hasNext(s->iterator));
376    } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
377        // Null terminated main string.  fcdPosition is the 'return' position into main buf.
378        return (*s->fcdPosition == 0);
379    }
380    else {
381        // Main string with an end pointer.
382        return s->fcdPosition == s->endp;
383    }
384}
385
386/*
387* collIter_bos()
388*     Checks for a collIterate being positioned at the start of
389*     its source string.
390*
391*/
392static
393inline UBool collIter_bos(collIterate *source) {
394  // if we're going backwards, we need to know whether there is more in the
395  // iterator, even if we are in the side buffer
396  if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
397    return !source->iterator->hasPrevious(source->iterator);
398  }
399  if (source->pos <= source->string ||
400      ((source->flags & UCOL_ITER_INNORMBUF) &&
401      *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
402    return TRUE;
403  }
404  return FALSE;
405}
406
407/*static
408inline UBool collIter_SimpleBos(collIterate *source) {
409  // if we're going backwards, we need to know whether there is more in the
410  // iterator, even if we are in the side buffer
411  if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
412    return !source->iterator->hasPrevious(source->iterator);
413  }
414  if (source->pos == source->string) {
415    return TRUE;
416  }
417  return FALSE;
418}*/
419    //return (data->pos == data->string) ||
420
421
422/****************************************************************************/
423/* Following are the open/close functions                                   */
424/*                                                                          */
425/****************************************************************************/
426
427static UCollator*
428ucol_initFromBinary(const uint8_t *bin, int32_t length,
429                const UCollator *base,
430                UCollator *fillIn,
431                UErrorCode *status)
432{
433    UCollator *result = fillIn;
434    if(U_FAILURE(*status)) {
435        return NULL;
436    }
437    /*
438    if(base == NULL) {
439        // we don't support null base yet
440        *status = U_ILLEGAL_ARGUMENT_ERROR;
441        return NULL;
442    }
443    */
444    // We need these and we could be running without UCA
445    uprv_uca_initImplicitConstants(status);
446    UCATableHeader *colData = (UCATableHeader *)bin;
447    // do we want version check here? We're trying to figure out whether collators are compatible
448    if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
449        uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
450        colData->version[0] != UCOL_BUILDER_VERSION)
451    {
452        *status = U_COLLATOR_VERSION_MISMATCH;
453        return NULL;
454    }
455    else {
456        if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
457            result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
458            if(U_FAILURE(*status)){
459                return NULL;
460            }
461            result->hasRealData = TRUE;
462        }
463        else {
464            if(base) {
465                result = ucol_initCollator(base->image, result, base, status);
466                ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
467                if(U_FAILURE(*status)){
468                    return NULL;
469                }
470                result->hasRealData = FALSE;
471            }
472            else {
473                *status = U_USELESS_COLLATOR_ERROR;
474                return NULL;
475            }
476        }
477        result->freeImageOnClose = FALSE;
478    }
479    result->actualLocale = NULL;
480    result->validLocale = NULL;
481    result->requestedLocale = NULL;
482    result->rules = NULL;
483    result->rulesLength = 0;
484    result->freeRulesOnClose = FALSE;
485    result->ucaRules = NULL;
486    return result;
487}
488
/**
 * Opens a collator from a binary collation image.
 * Thin wrapper around ucol_initFromBinary() that passes NULL for the
 * fillIn argument, i.e. no caller-provided UCollator storage is used.
 * ucol_initFromBinary() sets freeImageOnClose to FALSE, so ucol_close()
 * does not free bin.
 *
 * @param bin    binary collation image
 * @param length size of the image, forwarded unchanged
 * @param base   base collator, forwarded unchanged
 * @param status in/out error code
 * @return the new collator, or NULL on failure
 */
U_CAPI UCollator* U_EXPORT2
ucol_openBinary(const uint8_t *bin, int32_t length,
                const UCollator *base,
                UErrorCode *status)
{
    return ucol_initFromBinary(bin, length, base, NULL, status);
}
496
497U_CAPI int32_t U_EXPORT2
498ucol_cloneBinary(const UCollator *coll,
499                 uint8_t *buffer, int32_t capacity,
500                 UErrorCode *status)
501{
502    int32_t length = 0;
503    if(U_FAILURE(*status)) {
504        return length;
505    }
506    if(capacity < 0) {
507        *status = U_ILLEGAL_ARGUMENT_ERROR;
508        return length;
509    }
510    if(coll->hasRealData == TRUE) {
511        length = coll->image->size;
512        if(length <= capacity) {
513            uprv_memcpy(buffer, coll->image, length);
514        } else {
515            *status = U_BUFFER_OVERFLOW_ERROR;
516        }
517    } else {
518        length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
519        if(length <= capacity) {
520            /* build the UCATableHeader with minimal entries */
521            /* do not copy the header from the UCA file because its values are wrong! */
522            /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
523
524            /* reset everything */
525            uprv_memset(buffer, 0, length);
526
527            /* set the tailoring-specific values */
528            UCATableHeader *myData = (UCATableHeader *)buffer;
529            myData->size = length;
530
531            /* offset for the options, the only part of the data that is present after the header */
532            myData->options = sizeof(UCATableHeader);
533
534            /* need to always set the expansion value for an upper bound of the options */
535            myData->expansion = myData->options + sizeof(UColOptionSet);
536
537            myData->magic = UCOL_HEADER_MAGIC;
538            myData->isBigEndian = U_IS_BIG_ENDIAN;
539            myData->charSetFamily = U_CHARSET_FAMILY;
540
541            /* copy UCA's version; genrb will override all but the builder version with tailoring data */
542            uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
543
544            uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
545            uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
546            uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
547            myData->jamoSpecial = coll->image->jamoSpecial;
548
549            /* copy the collator options */
550            uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
551        } else {
552            *status = U_BUFFER_OVERFLOW_ERROR;
553        }
554    }
555    return length;
556}
557
558U_CAPI UCollator* U_EXPORT2
559ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status)
560{
561    UCollator * localCollator;
562    int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
563    int32_t imageSize = 0;
564    int32_t rulesSize = 0;
565    int32_t rulesPadding = 0;
566    int32_t defaultReorderCodesSize = 0;
567    int32_t reorderCodesSize = 0;
568    uint8_t *image;
569    UChar *rules;
570    int32_t* defaultReorderCodes;
571    int32_t* reorderCodes;
572    uint8_t* leadBytePermutationTable;
573    UBool imageAllocated = FALSE;
574
575    if (status == NULL || U_FAILURE(*status)){
576        return NULL;
577    }
578    if (coll == NULL) {
579       *status = U_ILLEGAL_ARGUMENT_ERROR;
580        return NULL;
581    }
582
583    if (coll->rules && coll->freeRulesOnClose) {
584        rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
585        rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
586        bufferSizeNeeded += rulesSize + rulesPadding;
587    }
588    // no padding for alignment needed from here since the next two are 4 byte quantities
589    if (coll->defaultReorderCodes) {
590        defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
591        bufferSizeNeeded += defaultReorderCodesSize;
592    }
593    if (coll->reorderCodes) {
594        reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
595        bufferSizeNeeded += reorderCodesSize;
596    }
597    if (coll->leadBytePermutationTable) {
598        bufferSizeNeeded += 256 * sizeof(uint8_t);
599    }
600
601    if (pBufferSize != NULL) {
602        int32_t inputSize = *pBufferSize;
603        *pBufferSize = 1;
604        if (inputSize == 0) {
605            return NULL;  // preflighting for deprecated functionality
606        }
607    }
608
609    char *stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
610    // Null pointer check.
611    if (stackBufferChars == NULL) {
612        *status = U_MEMORY_ALLOCATION_ERROR;
613        return NULL;
614    }
615    *status = U_SAFECLONE_ALLOCATED_WARNING;
616
617    localCollator = (UCollator *)stackBufferChars;
618    rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
619    defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
620    reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
621    leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
622
623    {
624        UErrorCode tempStatus = U_ZERO_ERROR;
625        imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
626    }
627    if (coll->freeImageOnClose) {
628        image = (uint8_t *)uprv_malloc(imageSize);
629        // Null pointer check
630        if (image == NULL) {
631            *status = U_MEMORY_ALLOCATION_ERROR;
632            return NULL;
633        }
634        ucol_cloneBinary(coll, image, imageSize, status);
635        imageAllocated = TRUE;
636    }
637    else {
638        image = (uint8_t *)coll->image;
639    }
640    localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
641    if (U_FAILURE(*status)) {
642        return NULL;
643    }
644
645    if (coll->rules) {
646        if (coll->freeRulesOnClose) {
647            localCollator->rules = u_strcpy(rules, coll->rules);
648            //bufferEnd += rulesSize;
649        }
650        else {
651            localCollator->rules = coll->rules;
652        }
653        localCollator->freeRulesOnClose = FALSE;
654        localCollator->rulesLength = coll->rulesLength;
655    }
656
657    // collator reordering
658    if (coll->defaultReorderCodes) {
659        localCollator->defaultReorderCodes =
660            (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
661        localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
662        localCollator->freeDefaultReorderCodesOnClose = FALSE;
663    }
664    if (coll->reorderCodes) {
665        localCollator->reorderCodes =
666            (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
667        localCollator->reorderCodesLength = coll->reorderCodesLength;
668        localCollator->freeReorderCodesOnClose = FALSE;
669    }
670    if (coll->leadBytePermutationTable) {
671        localCollator->leadBytePermutationTable =
672            (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
673        localCollator->freeLeadBytePermutationTableOnClose = FALSE;
674    }
675
676    int32_t i;
677    for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
678        ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
679    }
680    // zero copies of pointers
681    localCollator->actualLocale = NULL;
682    localCollator->validLocale = NULL;
683    localCollator->requestedLocale = NULL;
684    localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
685    localCollator->freeOnClose = TRUE;
686    localCollator->freeImageOnClose = imageAllocated;
687    return localCollator;
688}
689
/**
 * Releases a collator and the memory it owns.
 *
 * The locale strings and the latinOneCEs table are always freed when set.
 * Options, rules, image, lead byte permutation table and reorder code
 * arrays are freed only when the matching free...OnClose flag is set.
 * A delegate, if present, is deleted as a Collator. The UCollator struct
 * itself is freed only if freeOnClose is set.
 *
 * @param coll collator to close; NULL is allowed and does nothing apart
 *             from tracing
 */
U_CAPI void U_EXPORT2
ucol_close(UCollator *coll)
{
    UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
    UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
    if(coll != NULL) {
        // these are always owned by each UCollator struct,
        // so we always free them
        if(coll->validLocale != NULL) {
            uprv_free(coll->validLocale);
        }
        if(coll->actualLocale != NULL) {
            uprv_free(coll->actualLocale);
        }
        if(coll->requestedLocale != NULL) {
            uprv_free(coll->requestedLocale);
        }
        if(coll->latinOneCEs != NULL) {
            uprv_free(coll->latinOneCEs);
        }
        // the following are freed only if this collator is flagged as their owner
        if(coll->options != NULL && coll->freeOptionsOnClose) {
            uprv_free(coll->options);
        }
        if(coll->rules != NULL && coll->freeRulesOnClose) {
            uprv_free((UChar *)coll->rules);
        }
        if(coll->image != NULL && coll->freeImageOnClose) {
            uprv_free((UCATableHeader *)coll->image);
        }

        if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
            uprv_free(coll->leadBytePermutationTable);
        }
        if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
            uprv_free(coll->defaultReorderCodes);
        }
        if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
            uprv_free(coll->reorderCodes);
        }

        // the delegate is a C++ Collator object and is destroyed with delete
        if(coll->delegate != NULL) {
          delete (Collator*)coll->delegate;
        }

        /* Here, it would be advisable to close: */
        /* - UData for UCA (unless we stuff it in the root resb */
        /* Again, do we need additional housekeeping... HMMM! */
        UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
        if(coll->freeOnClose){
            /* for safeClone, if freeOnClose is FALSE,
            don't free the other instance data */
            uprv_free(coll);
        }
    }
    UTRACE_EXIT();
}
746
747void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
748    if(U_FAILURE(*status)) {
749        return;
750    }
751    result->caseFirst = (UColAttributeValue)opts->caseFirst;
752    result->caseLevel = (UColAttributeValue)opts->caseLevel;
753    result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
754    result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
755    if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
756        return;
757    }
758    result->strength = (UColAttributeValue)opts->strength;
759    result->variableTopValue = opts->variableTopValue;
760    result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
761    result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
762    result->numericCollation = (UColAttributeValue)opts->numericCollation;
763    result->caseFirstisDefault = TRUE;
764    result->caseLevelisDefault = TRUE;
765    result->frenchCollationisDefault = TRUE;
766    result->normalizationModeisDefault = TRUE;
767    result->strengthisDefault = TRUE;
768    result->variableTopValueisDefault = TRUE;
769    result->alternateHandlingisDefault = TRUE;
770    result->hiraganaQisDefault = TRUE;
771    result->numericCollationisDefault = TRUE;
772
773    ucol_updateInternalState(result, status);
774
775    result->options = opts;
776}
777
778
779/**
780* Approximate determination if a character is at a contraction end.
781* Guaranteed to be TRUE if a character is at the end of a contraction,
782* otherwise it is not deterministic.
783* @param c character to be determined
784* @param coll collator
785*/
786static
787inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
788    if (c < coll->minContrEndCP) {
789        return FALSE;
790    }
791
792    int32_t  hash = c;
793    uint8_t  htbyte;
794    if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
795        if (U16_IS_TRAIL(c)) {
796            return TRUE;
797        }
798        hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
799    }
800    htbyte = coll->contrEndCP[hash>>3];
801    return (((htbyte >> (hash & 7)) & 1) == 1);
802}
803
804
805
806/*
807*   i_getCombiningClass()
808*        A fast, at least partly inline version of u_getCombiningClass()
809*        This is a candidate for further optimization.  Used heavily
810*        in contraction processing.
811*/
812static
813inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
814    uint8_t sCC = 0;
815    if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
816        sCC = u_getCombiningClass(c);
817    }
818    return sCC;
819}
820
/**
 * Sets up a UCollator on top of a collation data image.
 *
 * Most fields are either pointers computed as byte offsets from the start of
 * the image, or defaults (NULL / 0 / FALSE) that, per the comments below, are
 * expected to be filled in later by the caller.
 *
 * @param image  collation data image; if NULL the function returns NULL
 *               without touching *status
 * @param fillIn caller-provided UCollator to initialize, or NULL to have one
 *               allocated with uprv_malloc() (freeOnClose is then set to TRUE)
 * @param UCA    stored as result->UCA
 * @param status in/out error code; nothing is done if it already indicates
 *               failure. Set to U_MEMORY_ALLOCATION_ERROR when the allocation
 *               fails.
 * @return the initialized collator; NULL on a NULL image, a failed status on
 *         entry, or an allocation failure. If the trie cannot be
 *         unserialized, a collator allocated here is freed and NULL is
 *         returned, whereas a caller-provided fillIn is returned as-is with
 *         *status set to the failure.
 */
UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
    UChar c;
    UCollator *result = fillIn;
    if(U_FAILURE(*status) || image == NULL) {
        return NULL;
    }

    if(result == NULL) {
        result = (UCollator *)uprv_malloc(sizeof(UCollator));
        if(result == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return result;
        }
        result->freeOnClose = TRUE;
    } else {
        result->freeOnClose = FALSE;
    }

    result->delegate = NULL;

    result->image = image;
    result->mapping.getFoldingOffset = _getFoldingOffset;
    /* The serialized trie starts at mappingPosition; the length passed to      */
    /* utrie_unserialize() is the distance from there to the endExpansionCE     */
    /* offset.                                                                  */
    const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
    utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
    if(U_FAILURE(*status)) {
        /* Only release the struct if it was allocated above. */
        if(result->freeOnClose == TRUE) {
            uprv_free(result);
            result = NULL;
        }
        return result;
    }

    /* Table pointers: each one is the image base plus a byte offset from the header. */
    result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
    result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
    result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
    result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
    /* Rules, reorder codes and the lead byte permutation table start out empty and unowned. */
    result->rules = NULL;
    result->rulesLength = 0;
    result->freeRulesOnClose = FALSE;
    result->defaultReorderCodes = NULL;
    result->defaultReorderCodesLength = 0;
    result->freeDefaultReorderCodesOnClose = FALSE;
    result->reorderCodes = NULL;
    result->reorderCodesLength = 0;
    result->freeReorderCodesOnClose = FALSE;
    result->leadBytePermutationTable = NULL;
    result->freeLeadBytePermutationTableOnClose = FALSE;

    /* get the version info from UCATableHeader and populate the Collator struct*/
    result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
    result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
    result->dataVersion[2] = 0;
    result->dataVersion[3] = 0;

    result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
    /* NOTE(review): minUnsafeCP is zeroed before the scan, presumably because  */
    /* ucol_unsafeCP() itself consults it -- confirm against its definition     */
    /* before reordering these statements.                                      */
    result->minUnsafeCP = 0;
    for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
        if (ucol_unsafeCP(c, result)) break;
    }
    /* c is 0x300 here if no code unit below 0x300 was reported. */
    result->minUnsafeCP = c;

    result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
    /* Same pattern as for minUnsafeCP above. */
    result->minContrEndCP = 0;
    for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
        if (ucol_contractionEndCP(c, result)) break;
    }
    result->minContrEndCP = c;

    /* max expansion tables */
    result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
                                         result->image->endExpansionCE);
    /* Points at the last entry of the table (first + count - 1). */
    result->lastEndExpansionCE = result->endExpansionCE +
                                 result->image->endExpansionCECount - 1;
    result->expansionCESize = (uint8_t*)result->image +
                                               result->image->expansionCESize;


    //result->errorCode = *status;

    /* No Latin-1 fast-path table yet; the two flags start out cleared. */
    result->latinOneCEs = NULL;

    result->latinOneRegenTable = FALSE;
    result->latinOneFailed = FALSE;
    result->UCA = UCA;

    /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
    result->ucaRules = NULL;
    result->actualLocale = NULL;
    result->validLocale = NULL;
    result->requestedLocale = NULL;
    result->hasRealData = FALSE; // real data lives in .dat file...
    result->freeImageOnClose = FALSE;

    /* set attributes */
    /* The option set lives inside the image at the header's options offset;    */
    /* freeOptionsOnClose is cleared after the call.                            */
    ucol_setOptionsFromHeader(
        result,
        (UColOptionSet*)((uint8_t*)result->image+result->image->options),
        status);
    result->freeOptionsOnClose = FALSE;

    return result;
}
923
924/* new Mark's code */
925
926/**
927 * For generation of Implicit CEs
928 * @author Davis
929 *
930 * Cleaned up so that changes can be made more easily.
931 * Old values:
932# First Implicit: E26A792D
933# Last Implicit: E3DC70C0
934# First CJK: E0030300
935# Last CJK: E0A9DD00
936# First CJK_A: E0A9DF00
937# Last CJK_A: E0DE3100
938 */
/* Following is a port of Mark's code for new treatment of implicits.
 * It is positioned here, since ucol_initUCA needs to initialize the
 * variables below according to the data in the fractional UCA.
 */
943
944/**
945 * Function used to:
946 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
947 * b) bump any non-CJK characters by 10FFFF.
948 * The relevant blocks are:
949 * A:    4E00..9FFF; CJK Unified Ideographs
950 *       F900..FAFF; CJK Compatibility Ideographs
951 * B:    3400..4DBF; CJK Unified Ideographs Extension A
952 *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
953 * As long as
954 *   no new B characters are allocated between 4E00 and FAFF, and
955 *   no new A characters are outside of this range,
956 * (very high probability) this simple code will work.
957 * The reordered blocks are:
958 * Block1 is CJK
959 * Block2 is CJK_COMPAT_USED
960 * Block3 is CJK_A
961 * (all contiguous)
962 * Any other CJK gets its normal code point
963 * Any non-CJK gets +10FFFF
964 * When we reorder Block1, we make sure that it is at the very start,
965 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
968 */
969
970// CONSTANTS
971static const UChar32
972    NON_CJK_OFFSET = 0x110000,
973    UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
974
/**
 * Parameters of the implicit-weight byte layout.
 * All of them start out as 0 and are precomputed by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;  // step between used final bytes, 3-byte form
static int32_t final4Multiplier = 0;  // step between used final bytes, 4-byte form
static int32_t final3Count = 0;       // number of usable final bytes, 3-byte form
static int32_t final4Count = 0;       // number of usable final bytes, 4-byte form
static int32_t medialCount = 0;       // number of usable values of a middle byte
static int32_t min3Primary = 0;       // first lead byte of the 3-byte forms
static int32_t min4Primary = 0;       // first lead byte of the 4-byte forms
static int32_t max4Primary = 0;       // last lead byte of the 4-byte forms
static int32_t minTrail = 0;          // lowest trail byte value
static int32_t maxTrail = 0;          // highest trail byte value
static int32_t max3Trail = 0;         // highest final byte used by a 3-byte form
static int32_t max4Trail = 0;         // highest final byte used by a 4-byte form
static int32_t min4Boundary = 0;      // first raw value that gets a 4-byte form
992
993static const UChar32
994    // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
995    // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;  (Unicode 6.1)
996    CJK_BASE = 0x4E00,
997    CJK_LIMIT = 0x9FCC+1,
998    // Unified CJK ideographs in the compatibility ideographs block.
999    CJK_COMPAT_USED_BASE = 0xFA0E,
1000    CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1001    // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1002    // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1003    CJK_A_BASE = 0x3400,
1004    CJK_A_LIMIT = 0x4DB5+1,
1005    // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1006    // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1007    CJK_B_BASE = 0x20000,
1008    CJK_B_LIMIT = 0x2A6D6+1,
1009    // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1010    // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1011    CJK_C_BASE = 0x2A700,
1012    CJK_C_LIMIT = 0x2B734+1,
1013    // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1014    // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1015    CJK_D_BASE = 0x2B740,
1016    CJK_D_LIMIT = 0x2B81D+1;
1017    // when adding to this list, look for all occurrences (in project)
1018    // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
1019
1020static UChar32 swapCJK(UChar32 i) {
1021    if (i < CJK_A_BASE) {
1022        // non-CJK
1023    } else if (i < CJK_A_LIMIT) {
1024        // Extension A has lower code points than the original Unihan+compat
1025        // but sorts higher.
1026        return i - CJK_A_BASE
1027                + (CJK_LIMIT - CJK_BASE)
1028                + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1029    } else if (i < CJK_BASE) {
1030        // non-CJK
1031    } else if (i < CJK_LIMIT) {
1032        return i - CJK_BASE;
1033    } else if (i < CJK_COMPAT_USED_BASE) {
1034        // non-CJK
1035    } else if (i < CJK_COMPAT_USED_LIMIT) {
1036        return i - CJK_COMPAT_USED_BASE
1037                + (CJK_LIMIT - CJK_BASE);
1038    } else if (i < CJK_B_BASE) {
1039        // non-CJK
1040    } else if (i < CJK_B_LIMIT) {
1041        return i; // non-BMP-CJK
1042    } else if (i < CJK_C_BASE) {
1043        // non-CJK
1044    } else if (i < CJK_C_LIMIT) {
1045        return i; // non-BMP-CJK
1046    } else if (i < CJK_D_BASE) {
1047        // non-CJK
1048    } else if (i < CJK_D_LIMIT) {
1049        return i; // non-BMP-CJK
1050    }
1051    return i + NON_CJK_OFFSET; // non-CJK
1052}
1053
1054U_CAPI UChar32 U_EXPORT2
1055uprv_uca_getRawFromCodePoint(UChar32 i) {
1056    return swapCJK(i)+1;
1057}
1058
1059U_CAPI UChar32 U_EXPORT2
1060uprv_uca_getCodePointFromRaw(UChar32 i) {
1061    i--;
1062    UChar32 result = 0;
1063    if(i >= NON_CJK_OFFSET) {
1064        result = i - NON_CJK_OFFSET;
1065    } else if(i >= CJK_B_BASE) {
1066        result = i;
1067    } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1068        if(i < CJK_LIMIT - CJK_BASE) {
1069            result = i + CJK_BASE;
1070        } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1071            result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1072        } else {
1073            result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1074        }
1075    } else {
1076        result = -1;
1077    }
1078    return result;
1079}
1080
1081// GET IMPLICIT PRIMARY WEIGHTS
1082// Return value is left justified primary key
1083U_CAPI uint32_t U_EXPORT2
1084uprv_uca_getImplicitFromRaw(UChar32 cp) {
1085    /*
1086    if (cp < 0 || cp > UCOL_MAX_INPUT) {
1087        throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1088    }
1089    */
1090    int32_t last0 = cp - min4Boundary;
1091    if (last0 < 0) {
1092        int32_t last1 = cp / final3Count;
1093        last0 = cp % final3Count;
1094
1095        int32_t last2 = last1 / medialCount;
1096        last1 %= medialCount;
1097
1098        last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1099        last1 = minTrail + last1; // offset
1100        last2 = min3Primary + last2; // offset
1101        /*
1102        if (last2 >= min4Primary) {
1103            throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1104        }
1105        */
1106        return (last2 << 24) + (last1 << 16) + (last0 << 8);
1107    } else {
1108        int32_t last1 = last0 / final4Count;
1109        last0 %= final4Count;
1110
1111        int32_t last2 = last1 / medialCount;
1112        last1 %= medialCount;
1113
1114        int32_t last3 = last2 / medialCount;
1115        last2 %= medialCount;
1116
1117        last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1118        last1 = minTrail + last1; // offset
1119        last2 = minTrail + last2; // offset
1120        last3 = min4Primary + last3; // offset
1121        /*
1122        if (last3 > max4Primary) {
1123            throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1124        }
1125        */
1126        return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1127    }
1128}
1129
1130static uint32_t U_EXPORT2
1131uprv_uca_getImplicitPrimary(UChar32 cp) {
1132   //fprintf(stdout, "Incoming: %04x\n", cp);
1133    //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1134
1135    cp = swapCJK(cp);
1136    cp++;
1137    // we now have a range of numbers from 0 to 21FFFF.
1138
1139    //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1140    //fprintf(stdout, "CJK swapped: %04x\n", cp);
1141
1142    return uprv_uca_getImplicitFromRaw(cp);
1143}
1144
1145/**
1146 * Converts implicit CE into raw integer ("code point")
1147 * @param implicit
1148 * @return -1 if illegal format
1149 */
1150U_CAPI UChar32 U_EXPORT2
1151uprv_uca_getRawFromImplicit(uint32_t implicit) {
1152    UChar32 result;
1153    UChar32 b3 = implicit & 0xFF;
1154    UChar32 b2 = (implicit >> 8) & 0xFF;
1155    UChar32 b1 = (implicit >> 16) & 0xFF;
1156    UChar32 b0 = (implicit >> 24) & 0xFF;
1157
1158    // simple parameter checks
1159    if (b0 < min3Primary || b0 > max4Primary
1160        || b1 < minTrail || b1 > maxTrail)
1161        return -1;
1162    // normal offsets
1163    b1 -= minTrail;
1164
1165    // take care of the final values, and compose
1166    if (b0 < min4Primary) {
1167        if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1168            return -1;
1169        b2 -= minTrail;
1170        UChar32 remainder = b2 % final3Multiplier;
1171        if (remainder != 0)
1172            return -1;
1173        b0 -= min3Primary;
1174        b2 /= final3Multiplier;
1175        result = ((b0 * medialCount) + b1) * final3Count + b2;
1176    } else {
1177        if (b2 < minTrail || b2 > maxTrail
1178            || b3 < minTrail || b3 > max4Trail)
1179            return -1;
1180        b2 -= minTrail;
1181        b3 -= minTrail;
1182        UChar32 remainder = b3 % final4Multiplier;
1183        if (remainder != 0)
1184            return -1;
1185        b3 /= final4Multiplier;
1186        b0 -= min4Primary;
1187        result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1188    }
1189    // final check
1190    if (result < 0 || result > UCOL_MAX_INPUT)
1191        return -1;
1192    return result;
1193}
1194
1195
/* Computes 1 + (a-1)/b, which is a/b rounded up when a and b are positive. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int quotientBelow = (a - 1) / b;
    return quotientBelow + 1;
}
1199
1200/* this function is either called from initUCA or from genUCA before
1201 * doing canonical closure for the UCA.
1202 */
1203
1204/**
1205 * Set up to generate implicits.
1206 * Maintenance Note:  this function may end up being called more than once, due
1207 *                    to threading races during initialization.  Make sure that
1208 *                    none of the Constants is ever transiently assigned an
1209 *                    incorrect value.
1210 * @param minPrimary
1211 * @param maxPrimary
1212 * @param minTrail final byte
1213 * @param maxTrail final byte
1214 * @param gap3 the gap we leave for tailoring for 3-byte forms
1215 * @param gap4 the gap we leave for tailoring for 4-byte forms
1216 */
1217static void initImplicitConstants(int minPrimary, int maxPrimary,
1218                                    int minTrailIn, int maxTrailIn,
1219                                    int gap3, int primaries3count,
1220                                    UErrorCode *status) {
1221    // some simple parameter checks
1222    if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1223        || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1224        || (primaries3count < 1))
1225    {
1226        *status = U_ILLEGAL_ARGUMENT_ERROR;
1227        return;
1228    };
1229
1230    minTrail = minTrailIn;
1231    maxTrail = maxTrailIn;
1232
1233    min3Primary = minPrimary;
1234    max4Primary = maxPrimary;
1235    // compute constants for use later.
1236    // number of values we can use in trailing bytes
1237    // leave room for empty values between AND above, e.g. if gap = 2
1238    // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1239    // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1240    // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1241    final3Multiplier = gap3 + 1;
1242    final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1243    max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1244
1245    // medials can use full range
1246    medialCount = (maxTrail - minTrail + 1);
1247    // find out how many values fit in each form
1248    int32_t threeByteCount = medialCount * final3Count;
1249    // now determine where the 3/4 boundary is.
1250    // we use 3 bytes below the boundary, and 4 above
1251    int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1252    int32_t primaries4count = primariesAvailable - primaries3count;
1253
1254
1255    int32_t min3ByteCoverage = primaries3count * threeByteCount;
1256    min4Primary = minPrimary + primaries3count;
1257    min4Boundary = min3ByteCoverage;
1258    // Now expand out the multiplier for the 4 bytes, and redo.
1259
1260    int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1261    int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1262    int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1263    int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1264    if (gap4 < 1) {
1265        *status = U_ILLEGAL_ARGUMENT_ERROR;
1266        return;
1267    }
1268    final4Multiplier = gap4 + 1;
1269    final4Count = neededPerFinalByte;
1270    max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1271}
1272
1273    /**
1274     * Supply parameters for generating implicit CEs
1275     */
1276U_CAPI void U_EXPORT2
1277uprv_uca_initImplicitConstants(UErrorCode *status) {
1278    // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1279    //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1280    initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1281}
1282
1283
1284/*    collIterNormalize     Incremental Normalization happens here.                       */
1285/*                          pick up the range of chars identifed by FCD,                  */
1286/*                          normalize it into the collIterate's writable buffer,          */
1287/*                          switch the collIterate's state to use the writable buffer.    */
1288/*                                                                                        */
1289static
1290void collIterNormalize(collIterate *collationSource)
1291{
1292    UErrorCode  status = U_ZERO_ERROR;
1293    const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1294    const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1295
1296    collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1297                                    collationSource->writableBuffer,
1298                                    status);
1299    if (U_FAILURE(status)) {
1300#ifdef UCOL_DEBUG
1301        fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
1302#endif
1303        return;
1304    }
1305
1306    collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
1307    collationSource->origFlags  = collationSource->flags;
1308    collationSource->flags     |= UCOL_ITER_INNORMBUF;
1309    collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1310}
1311
1312
1313// This function takes the iterator and extracts normalized stuff up to the next boundary
1314// It is similar in the end results to the collIterNormalize, but for the cases when we
1315// use an iterator
1316/*static
1317inline void normalizeIterator(collIterate *collationSource) {
1318  UErrorCode status = U_ZERO_ERROR;
1319  UBool wasNormalized = FALSE;
1320  //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1321  uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1322  int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1323    (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1324  if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1325    // reallocate and terminate
1326    if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1327                               &collationSource->writableBuffer,
1328                               (int32_t *)&collationSource->writableBufSize, normLen + 1,
1329                               0)
1330    ) {
1331    #ifdef UCOL_DEBUG
1332        fprintf(stderr, "normalizeIterator(), out of memory\n");
1333    #endif
1334        return;
1335    }
1336    status = U_ZERO_ERROR;
1337    //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1338    collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1339    normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1340    (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1341  }
1342  // Terminate the buffer - we already checked that it is big enough
1343  collationSource->writableBuffer[normLen] = 0;
1344  if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1345      collationSource->flags |= UCOL_ITER_ALLOCATED;
1346  }
1347  collationSource->pos        = collationSource->writableBuffer;
1348  collationSource->origFlags  = collationSource->flags;
1349  collationSource->flags     |= UCOL_ITER_INNORMBUF;
1350  collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1351}*/
1352
1353
1354/* Incremental FCD check and normalize                                                    */
1355/*   Called from getNextCE when normalization state is suspect.                           */
1356/*   When entering, the state is known to be this:                                        */
1357/*      o   We are working in the main buffer of the collIterate, not the side            */
1358/*          writable buffer.  When in the side buffer, normalization mode is always off,  */
1359/*          so we won't get here.                                                         */
1360/*      o   The leading combining class from the current character is 0 or                */
1361/*          the trailing combining class of the previous char was zero.                   */
1362/*          True because the previous call to this function will have always exited       */
1363/*          that way, and we get called for every char where cc might be non-zero.        */
1364static
1365inline UBool collIterFCD(collIterate *collationSource) {
1366    const UChar *srcP, *endP;
1367    uint8_t     leadingCC;
1368    uint8_t     prevTrailingCC = 0;
1369    uint16_t    fcd;
1370    UBool       needNormalize = FALSE;
1371
1372    srcP = collationSource->pos-1;
1373
1374    if (collationSource->flags & UCOL_ITER_HASLEN) {
1375        endP = collationSource->endp;
1376    } else {
1377        endP = NULL;
1378    }
1379
1380    // Get the trailing combining class of the current character. If it's zero, we are OK.
1381    fcd = g_nfcImpl->nextFCD16(srcP, endP);
1382    if (fcd != 0) {
1383        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1384
1385        if (prevTrailingCC != 0) {
1386            // The current char has a non-zero trailing CC.  Scan forward until we find
1387            //   a char with a leading cc of zero.
1388            while (endP == NULL || srcP != endP)
1389            {
1390                const UChar *savedSrcP = srcP;
1391
1392                fcd = g_nfcImpl->nextFCD16(srcP, endP);
1393                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1394                if (leadingCC == 0) {
1395                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
1396                                           //   back up over it.  (Could be surrogate pair!)
1397                    break;
1398                }
1399
1400                if (leadingCC < prevTrailingCC) {
1401                    needNormalize = TRUE;
1402                }
1403
1404                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1405            }
1406        }
1407    }
1408
1409    collationSource->fcdPosition = (UChar *)srcP;
1410
1411    return needNormalize;
1412}
1413
1414/****************************************************************************/
1415/* Following are the CE retrieval functions                                 */
1416/*                                                                          */
1417/****************************************************************************/
1418
/* Forward declarations: getImplicit() is called by ucol_IGetNextCE() below,   */
/* before any definition of it appears in this part of the file.               */
static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1421
/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
/*                                                                      */
/* Outline:                                                             */
/*   1. If CEs from an earlier expansion are still buffered, return the */
/*      next one of those.                                              */
/*   2. Otherwise fetch the next UTF-16 code unit (from the string, the */
/*      iterator, or the normalization side buffer), running the        */
/*      incremental FCD check / normalization when UCOL_ITER_NORM is on.*/
/*   3. Look the code unit up: Latin-1 table for ch <= 0xFF, otherwise  */
/*      the collator's trie, then coll->UCA's trie if nothing was found.*/
/*      Values above UCOL_NOT_FOUND are resolved through                */
/*      ucol_prv_getSpecialCE().                                        */
/*   4. If the result is still UCOL_NOT_FOUND, use getImplicit().       */
/*                                                                      */
/* Returns UCOL_NO_MORE_CES when the input is exhausted.                */
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    uint32_t order = 0;
    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
        order = *(collationSource->toReturn++);                         /* if so, return them */
        if(collationSource->CEpos == collationSource->toReturn) {
            /* Buffer drained: reset both pointers to the start of the buffer in use. */
            collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
        }
        return order;
    }

    UChar ch = 0;
    collationSource->offsetReturn = NULL;

    /* The outer do/while fetches another character whenever a code unit in     */
    /* UCOL_FIRST_HANGUL..UCOL_LAST_HANGUL produced UCOL_IGNORABLE.             */
    do {
        for (;;)                           /* Loop handles case when incremental normalize switches   */
        {                                  /*   to or from the side buffer / original string, and we  */
            /*   need to start again to get the next character.        */

            if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
            {
                // The source string is null terminated and we're not working from the side buffer,
                //   and we're not normalizing.  This is the fast path.
                //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
                ch = *collationSource->pos++;
                if (ch != 0) {
                    break;
                }
                else {
                    return UCOL_NO_MORE_CES;
                }
            }

            if (collationSource->flags & UCOL_ITER_HASLEN) {
                // Normal path for strings when length is specified.
                //   (We can't be in side buffer because it is always null terminated.)
                if (collationSource->pos >= collationSource->endp) {
                    // Ran off of the end of the main source string.  We're done.
                    return UCOL_NO_MORE_CES;
                }
                ch = *collationSource->pos++;
            }
            else if(collationSource->flags & UCOL_USE_ITERATOR) {
                // Input comes from the character iterator; U_SENTINEL marks its end.
                UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
                if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                }
                ch = (UChar)iterCh;
            }
            else
            {
                // Null terminated string.
                ch = *collationSource->pos++;
                if (ch == 0) {
                    // Ran off end of buffer.
                    if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                        // Ran off end of main string. backing up one character.
                        collationSource->pos--;
                        return UCOL_NO_MORE_CES;
                    }
                    else
                    {
                        // Hit null in the normalize side buffer.
                        // Usually this means the end of the normalized data,
                        // except for one odd case: a null followed by combining chars,
                        //   which is the case if we are at the start of the buffer.
                        if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
                            break;
                        }

                        //  Null marked end of side buffer.
                        //   Revert to the main string and
                        //   loop back to top to try again to get a character.
                        collationSource->pos   = collationSource->fcdPosition;
                        collationSource->flags = collationSource->origFlags;
                        continue;
                    }
                }
            }

            if(collationSource->flags&UCOL_HIRAGANA_Q) {
                /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
                 * based on whether the previous codepoint was Hiragana or Katakana.
                 */
                if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
                        ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
                    collationSource->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    collationSource->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            // We've got a character.  See if there's any fcd and/or normalization stuff to do.
            //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
            if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
                break;
            }

            if (collationSource->fcdPosition >= collationSource->pos) {
                // An earlier FCD check has already covered the current character.
                // We can go ahead and process this char.
                break;
            }

            if (ch < ZERO_CC_LIMIT_ ) {
                // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                // We need to peek at the next character in order to tell if we are FCD
                if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
                    // We are at the last char of source string.
                    //  It is always OK for FCD check.
                    break;
                }

                // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
                if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }


            // Need a more complete FCD check and possible normalization.
            if (collIterFCD(collationSource)) {
                collIterNormalize(collationSource);
            }
            if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                //  No normalization was needed.  Go ahead and process the char we already had.
                break;
            }

            // Some normalization happened.  Next loop iteration will pick up a char
            //   from the normalization buffer.

        }   // end for (;;)


        if (ch <= 0xFF) {
            /*  For latin-1 characters we never need to fall back to the UCA table        */
            /*    because all of the UCA data is replicated in the latinOneMapping array  */
            order = coll->latinOneMapping[ch];
            if (order > UCOL_NOT_FOUND) {
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
            }
        }
        else
        {
            // Always use UCA for Han, Hangul
            // (Han extension A is before main Han block)
            // **** Han compatibility chars ?? ****
            if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
                if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
                    // between the two target ranges; do normal lookup
                    // **** this range is YI, Modifier tone letters, ****
                    // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                    // **** Latin-D might be tailored, so we need to ****
                    // **** do the normal lookup for these guys.     ****
                    order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                } else {
                    // in one of the target ranges; use UCA
                    order = UCOL_NOT_FOUND;
                }
            } else {
                order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            }

            if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
                order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
            }

            if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
                /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
                order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

                if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
                    order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
                }
            }
        }
    } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );

    /* Neither lookup produced a CE: generate an implicit one from the code unit. */
    if(order == UCOL_NOT_FOUND) {
        order = getImplicit(ch, collationSource);
    }
    return order; /* return the CE */
}
1615
1616/* ucol_getNextCE, out-of-line version for use from other files.   */
1617U_CAPI uint32_t  U_EXPORT2
1618ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1619    return ucol_IGetNextCE(coll, collationSource, status);
1620}
1621
1622
1623/**
1624* Incremental previous normalization happens here. Pick up the range of chars
1625* identifed by FCD, normalize it into the collIterate's writable buffer,
1626* switch the collIterate's state to use the writable buffer.
1627* @param data collation iterator data
1628*/
1629static
1630void collPrevIterNormalize(collIterate *data)
1631{
1632    UErrorCode status  = U_ZERO_ERROR;
1633    const UChar *pEnd   = data->pos;  /* End normalize + 1 */
1634    const UChar *pStart;
1635
1636    /* Start normalize */
1637    if (data->fcdPosition == NULL) {
1638        pStart = data->string;
1639    }
1640    else {
1641        pStart = data->fcdPosition + 1;
1642    }
1643
1644    int32_t normLen =
1645        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
1646                             data->writableBuffer,
1647                             status).
1648        length();
1649    if(U_FAILURE(status)) {
1650        return;
1651    }
1652    /*
1653    this puts the null termination infront of the normalized string instead
1654    of the end
1655    */
1656    data->writableBuffer.insert(0, (UChar)0);
1657
1658    /*
1659     * The usual case at this point is that we've got a base
1660     * character followed by marks that were normalized. If
1661     * fcdPosition is NULL, that means that we backed up to
1662     * the beginning of the string and there's no base character.
1663     *
1664     * Forward processing will usually normalize when it sees
1665     * the first mark, so that mark will get it's natural offset
1666     * and the rest will get the offset of the character following
1667     * the marks. The base character will also get its natural offset.
1668     *
1669     * We write the offset of the base character, if there is one,
1670     * followed by the offset of the first mark and then the offsets
1671     * of the rest of the marks.
1672     */
1673    int32_t firstMarkOffset = 0;
1674    int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
1675    int32_t trailCount      = normLen - 1;
1676
1677    if (data->fcdPosition != NULL) {
1678        int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1679        UChar   baseChar   = *data->fcdPosition;
1680
1681        firstMarkOffset = baseOffset + 1;
1682
1683        /*
1684         * If the base character is the start of a contraction, forward processing
1685         * will normalize the marks while checking for the contraction, which means
1686         * that the offset of the first mark will the same as the other marks.
1687         *
1688         * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1689         */
1690        if (baseChar >= 0x100) {
1691            uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1692
1693            if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1694                baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1695            }
1696
1697            if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1698                firstMarkOffset = trailOffset;
1699            }
1700        }
1701
1702        data->appendOffset(baseOffset, status);
1703    }
1704
1705    data->appendOffset(firstMarkOffset, status);
1706
1707    for (int32_t i = 0; i < trailCount; i += 1) {
1708        data->appendOffset(trailOffset, status);
1709    }
1710
1711    data->offsetRepeatValue = trailOffset;
1712
1713    data->offsetReturn = data->offsetStore - 1;
1714    if (data->offsetReturn == data->offsetBuffer) {
1715        data->offsetStore = data->offsetBuffer;
1716    }
1717
1718    data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1719    data->origFlags  = data->flags;
1720    data->flags     |= UCOL_ITER_INNORMBUF;
1721    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1722}
1723
1724
1725/**
1726* Incremental FCD check for previous iteration and normalize. Called from
1727* getPrevCE when normalization state is suspect.
1728* When entering, the state is known to be this:
1729* o  We are working in the main buffer of the collIterate, not the side
1730*    writable buffer. When in the side buffer, normalization mode is always
1731*    off, so we won't get here.
1732* o  The leading combining class from the current character is 0 or the
1733*    trailing combining class of the previous char was zero.
1734*    True because the previous call to this function will have always exited
1735*    that way, and we get called for every char where cc might be non-zero.
1736* @param data collation iterate struct
1737* @return normalization status, TRUE for normalization to be done, FALSE
1738*         otherwise
1739*/
1740static
1741inline UBool collPrevIterFCD(collIterate *data)
1742{
1743    const UChar *src, *start;
1744    uint8_t     leadingCC;
1745    uint8_t     trailingCC = 0;
1746    uint16_t    fcd;
1747    UBool       result = FALSE;
1748
1749    start = data->string;
1750    src = data->pos + 1;
1751
1752    /* Get the trailing combining class of the current character. */
1753    fcd = g_nfcImpl->previousFCD16(start, src);
1754
1755    leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1756
1757    if (leadingCC != 0) {
1758        /*
1759        The current char has a non-zero leading combining class.
1760        Scan backward until we find a char with a trailing cc of zero.
1761        */
1762        for (;;)
1763        {
1764            if (start == src) {
1765                data->fcdPosition = NULL;
1766                return result;
1767            }
1768
1769            fcd = g_nfcImpl->previousFCD16(start, src);
1770
1771            trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1772
1773            if (trailingCC == 0) {
1774                break;
1775            }
1776
1777            if (leadingCC < trailingCC) {
1778                result = TRUE;
1779            }
1780
1781            leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1782        }
1783    }
1784
1785    data->fcdPosition = (UChar *)src;
1786
1787    return result;
1788}
1789
1790/** gets a code unit from the string at a given offset
1791 *  Handles both normal and iterative cases.
1792 *  No error checking - caller beware!
1793 */
1794static inline
1795UChar peekCodeUnit(collIterate *source, int32_t offset) {
1796    if(source->pos != NULL) {
1797        return *(source->pos + offset);
1798    } else if(source->iterator != NULL) {
1799        UChar32 c;
1800        if(offset != 0) {
1801            source->iterator->move(source->iterator, offset, UITER_CURRENT);
1802            c = source->iterator->next(source->iterator);
1803            source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1804        } else {
1805            c = source->iterator->current(source->iterator);
1806        }
1807        return c >= 0 ? (UChar)c : 0xfffd;  // If the caller works properly, we should never see c<0.
1808    } else {
1809        return 0xfffd;
1810    }
1811}
1812
1813// Code point version. Treats the offset as a _code point_ delta.
1814// We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
1815// We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
1816static inline
1817UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1818    UChar32 c;
1819    if(source->pos != NULL) {
1820        const UChar *p = source->pos;
1821        if(offset >= 0) {
1822            // Skip forward over (offset-1) code points.
1823            while(--offset >= 0) {
1824                if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1825                    ++p;
1826                }
1827            }
1828            // Read the code point there.
1829            c = *p++;
1830            UChar trail;
1831            if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1832                c = U16_GET_SUPPLEMENTARY(c, trail);
1833            }
1834        } else /* offset<0 */ {
1835            // Skip backward over (offset-1) code points.
1836            while(++offset < 0) {
1837                if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1838                    --p;
1839                }
1840            }
1841            // Read the code point before that.
1842            c = *--p;
1843            UChar lead;
1844            if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1845                c = U16_GET_SUPPLEMENTARY(lead, c);
1846            }
1847        }
1848    } else if(source->iterator != NULL) {
1849        if(offset >= 0) {
1850            // Skip forward over (offset-1) code points.
1851            int32_t fwd = offset;
1852            while(fwd-- > 0) {
1853                uiter_next32(source->iterator);
1854            }
1855            // Read the code point there.
1856            c = uiter_current32(source->iterator);
1857            // Return to the starting point, skipping backward over (offset-1) code points.
1858            while(offset-- > 0) {
1859                uiter_previous32(source->iterator);
1860            }
1861        } else /* offset<0 */ {
1862            // Read backward, reading offset code points, remember only the last-read one.
1863            int32_t back = offset;
1864            do {
1865                c = uiter_previous32(source->iterator);
1866            } while(++back < 0);
1867            // Return to the starting position, skipping forward over offset code points.
1868            do {
1869                uiter_next32(source->iterator);
1870            } while(++offset < 0);
1871        }
1872    } else {
1873        c = U_SENTINEL;
1874    }
1875    return c;
1876}
1877
1878/**
1879* Determines if we are at the start of the data string in the backwards
1880* collation iterator
1881* @param data collation iterator
1882* @return TRUE if we are at the start
1883*/
1884static
1885inline UBool isAtStartPrevIterate(collIterate *data) {
1886    if(data->pos == NULL && data->iterator != NULL) {
1887        return !data->iterator->hasPrevious(data->iterator);
1888    }
1889    //return (collIter_bos(data)) ||
1890    return (data->pos == data->string) ||
1891              ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&
1892              *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1893}
1894
1895static
1896inline void goBackOne(collIterate *data) {
1897# if 0
1898    // somehow, it looks like we need to keep iterator synced up
1899    // at all times, as above.
1900    if(data->pos) {
1901        data->pos--;
1902    }
1903    if(data->iterator) {
1904        data->iterator->previous(data->iterator);
1905    }
1906#endif
1907    if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1908        data->iterator->previous(data->iterator);
1909    }
1910    if(data->pos) {
1911        data->pos --;
1912    }
1913}
1914
1915/**
1916* Inline function that gets a simple CE.
1917* So what it does is that it will first check the expansion buffer. If the
1918* expansion buffer is not empty, ie the end pointer to the expansion buffer
1919* is different from the string pointer, we return the collation element at the
1920* return pointer and decrement it.
1921* For more complicated CEs it resorts to getComplicatedCE.
1922* @param coll collator data
1923* @param data collation iterator struct
1924* @param status error status
1925*/
1926static
1927inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1928                               UErrorCode *status)
1929{
1930    uint32_t result = (uint32_t)UCOL_NULLORDER;
1931
1932    if (data->offsetReturn != NULL) {
1933        if (data->offsetRepeatCount > 0) {
1934                data->offsetRepeatCount -= 1;
1935        } else {
1936            if (data->offsetReturn == data->offsetBuffer) {
1937                data->offsetReturn = NULL;
1938                data->offsetStore  = data->offsetBuffer;
1939            } else {
1940                data->offsetReturn -= 1;
1941            }
1942        }
1943    }
1944
1945    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
1946            (!data->extendCEs && data->toReturn > data->CEs))
1947    {
1948        data->toReturn -= 1;
1949        result = *(data->toReturn);
1950        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
1951            data->CEpos = data->toReturn;
1952        }
1953    }
1954    else {
1955        UChar ch = 0;
1956
1957        do {
1958            /*
1959            Loop handles case when incremental normalize switches to or from the
1960            side buffer / original string, and we need to start again to get the
1961            next character.
1962            */
1963            for (;;) {
1964                if (data->flags & UCOL_ITER_HASLEN) {
1965                    /*
1966                    Normal path for strings when length is specified.
1967                    Not in side buffer because it is always null terminated.
1968                    */
1969                    if (data->pos <= data->string) {
1970                        /* End of the main source string */
1971                        return UCOL_NO_MORE_CES;
1972                    }
1973                    data->pos --;
1974                    ch = *data->pos;
1975                }
1976                // we are using an iterator to go back. Pray for us!
1977                else if (data->flags & UCOL_USE_ITERATOR) {
1978                  UChar32 iterCh = data->iterator->previous(data->iterator);
1979                  if(iterCh == U_SENTINEL) {
1980                    return UCOL_NO_MORE_CES;
1981                  } else {
1982                    ch = (UChar)iterCh;
1983                  }
1984                }
1985                else {
1986                    data->pos --;
1987                    ch = *data->pos;
1988                    /* we are in the side buffer. */
1989                    if (ch == 0) {
1990                        /*
1991                        At the start of the normalize side buffer.
1992                        Go back to string.
1993                        Because pointer points to the last accessed character,
1994                        hence we have to increment it by one here.
1995                        */
1996                        data->flags = data->origFlags;
1997                        data->offsetRepeatValue = 0;
1998
1999                         if (data->fcdPosition == NULL) {
2000                            data->pos = data->string;
2001                            return UCOL_NO_MORE_CES;
2002                        }
2003                        else {
2004                            data->pos   = data->fcdPosition + 1;
2005                        }
2006
2007                       continue;
2008                    }
2009                }
2010
2011                if(data->flags&UCOL_HIRAGANA_Q) {
2012                  if(ch>=0x3040 && ch<=0x309f) {
2013                    data->flags |= UCOL_WAS_HIRAGANA;
2014                  } else {
2015                    data->flags &= ~UCOL_WAS_HIRAGANA;
2016                  }
2017                }
2018
2019                /*
2020                * got a character to determine if there's fcd and/or normalization
2021                * stuff to do.
2022                * if the current character is not fcd.
2023                * if current character is at the start of the string
2024                * Trailing combining class == 0.
2025                * Note if pos is in the writablebuffer, norm is always 0
2026                */
2027                if (ch < ZERO_CC_LIMIT_ ||
2028                  // this should propel us out of the loop in the iterator case
2029                    (data->flags & UCOL_ITER_NORM) == 0 ||
2030                    (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
2031                    || data->string == data->pos) {
2032                    break;
2033                }
2034
2035                if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2036                    /* if next character is FCD */
2037                    if (data->pos == data->string) {
2038                        /* First char of string is always OK for FCD check */
2039                        break;
2040                    }
2041
2042                    /* Not first char of string, do the FCD fast test */
2043                    if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2044                        break;
2045                    }
2046                }
2047
2048                /* Need a more complete FCD check and possible normalization. */
2049                if (collPrevIterFCD(data)) {
2050                    collPrevIterNormalize(data);
2051                }
2052
2053                if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2054                    /*  No normalization. Go ahead and process the char. */
2055                    break;
2056                }
2057
2058                /*
2059                Some normalization happened.
2060                Next loop picks up a char from the normalization buffer.
2061                */
2062            }
2063
2064            /* attempt to handle contractions, after removal of the backwards
2065            contraction
2066            */
2067            if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2068                result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2069            } else {
2070                if (ch <= 0xFF) {
2071                    result = coll->latinOneMapping[ch];
2072                }
2073                else {
2074                    // Always use UCA for [3400..9FFF], [AC00..D7AF]
2075                    // **** [FA0E..FA2F] ?? ****
2076                    if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
2077                        (ch >= 0x3400 && ch <= 0xD7AF)) {
2078                        if (ch > 0x9FFF && ch < 0xAC00) {
2079                            // between the two target ranges; do normal lookup
2080                            // **** this range is YI, Modifier tone letters, ****
2081                            // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
2082                            // **** Latin-D might be tailored, so we need to ****
2083                            // **** do the normal lookup for these guys.     ****
2084                             result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2085                        } else {
2086                            result = UCOL_NOT_FOUND;
2087                        }
2088                    } else {
2089                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2090                    }
2091                }
2092                if (result > UCOL_NOT_FOUND) {
2093                    result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2094                }
2095                if (result == UCOL_NOT_FOUND) { // Not found in master list
2096                    if (!isAtStartPrevIterate(data) &&
2097                        ucol_contractionEndCP(ch, data->coll))
2098                    {
2099                        result = UCOL_CONTRACTION;
2100                    } else {
2101                        if(coll->UCA) {
2102                            result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
2103                        }
2104                    }
2105
2106                    if (result > UCOL_NOT_FOUND) {
2107                        if(coll->UCA) {
2108                            result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2109                        }
2110                    }
2111                }
2112            }
2113        } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
2114
2115        if(result == UCOL_NOT_FOUND) {
2116            result = getPrevImplicit(ch, data);
2117        }
2118    }
2119
2120    return result;
2121}
2122
2123
2124/*   ucol_getPrevCE, out-of-line version for use from other files.  */
2125U_CFUNC uint32_t  U_EXPORT2
2126ucol_getPrevCE(const UCollator *coll, collIterate *data,
2127                        UErrorCode *status) {
2128    return ucol_IGetPrevCE(coll, data, status);
2129}
2130
2131
2132/* this should be connected to special Jamo handling */
2133U_CFUNC uint32_t  U_EXPORT2
2134ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2135    collIterate colIt;
2136    IInit_collIterate(coll, &u, 1, &colIt, status);
2137    if(U_FAILURE(*status)) {
2138        return 0;
2139    }
2140    return ucol_IGetNextCE(coll, &colIt, status);
2141}
2142
2143/**
2144* Inserts the argument character into the end of the buffer pushing back the
2145* null terminator.
2146* @param data collIterate struct data
2147* @param ch character to be appended
2148* @return the position of the new addition
2149*/
2150static
2151inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2152{
2153    int32_t oldLength = data->writableBuffer.length();
2154    return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2155}
2156
2157/**
2158* Inserts the argument string into the end of the buffer pushing back the
2159* null terminator.
2160* @param data collIterate struct data
2161* @param string to be appended
2162* @param length of the string to be appended
2163* @return the position of the new addition
2164*/
2165static
2166inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
2167{
2168    int32_t oldLength = data->writableBuffer.length();
2169    return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
2170}
2171
2172/**
2173* Special normalization function for contraction in the forwards iterator.
2174* This normalization sequence will place the current character at source->pos
2175* and its following normalized sequence into the buffer.
2176* The fcd position, pos will be changed.
2177* pos will now point to positions in the buffer.
2178* Flags will be changed accordingly.
2179* @param data collation iterator data
2180*/
2181static
2182inline void normalizeNextContraction(collIterate *data)
2183{
2184    int32_t     strsize;
2185    UErrorCode  status     = U_ZERO_ERROR;
2186    /* because the pointer points to the next character */
2187    const UChar *pStart    = data->pos - 1;
2188    const UChar *pEnd;
2189
2190    if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2191        data->writableBuffer.setTo(*(pStart - 1));
2192        strsize               = 1;
2193    }
2194    else {
2195        strsize = data->writableBuffer.length();
2196    }
2197
2198    pEnd = data->fcdPosition;
2199
2200    data->writableBuffer.append(
2201        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
2202    if(U_FAILURE(status)) {
2203        return;
2204    }
2205
2206    data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
2207    data->origFlags  = data->flags;
2208    data->flags     |= UCOL_ITER_INNORMBUF;
2209    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2210}
2211
2212/**
2213* Contraction character management function that returns the next character
2214* for the forwards iterator.
2215* Does nothing if the next character is in buffer and not the first character
2216* in it.
2217* Else it checks next character in data string to see if it is normalizable.
2218* If it is not, the character is simply copied into the buffer, else
2219* the whole normalized substring is copied into the buffer, including the
2220* current character.
2221* @param data collation element iterator data
2222* @return next character
2223*/
2224static
2225inline UChar getNextNormalizedChar(collIterate *data)
2226{
2227    UChar  nextch;
2228    UChar  ch;
2229    // Here we need to add the iterator code. One problem is the way
2230    // end of string is handled. If we just return next char, it could
2231    // be the sentinel. Most of the cases already check for this, but we
2232    // need to be sure.
2233    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2234         /* if no normalization and not in buffer. */
2235      if(data->flags & UCOL_USE_ITERATOR) {
2236         return (UChar)data->iterator->next(data->iterator);
2237      } else {
2238         return *(data->pos ++);
2239      }
2240    }
2241
2242    //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2243      //normalizeIterator(data);
2244    //}
2245
2246    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2247    if ((innormbuf && *data->pos != 0) ||
2248        (data->fcdPosition != NULL && !innormbuf &&
2249        data->pos < data->fcdPosition)) {
2250        /*
2251        if next character is in normalized buffer, no further normalization
2252        is required
2253        */
2254        return *(data->pos ++);
2255    }
2256
2257    if (data->flags & UCOL_ITER_HASLEN) {
2258        /* in data string */
2259        if (data->pos + 1 == data->endp) {
2260            return *(data->pos ++);
2261        }
2262    }
2263    else {
2264        if (innormbuf) {
2265          // inside the normalization buffer, but at the end
2266          // (since we encountered zero). This means, in the
2267          // case we're using char iterator, that we need to
2268          // do another round of normalization.
2269          //if(data->origFlags & UCOL_USE_ITERATOR) {
2270            // we need to restore original flags,
2271            // otherwise, we'll lose them
2272            //data->flags = data->origFlags;
2273            //normalizeIterator(data);
2274            //return *(data->pos++);
2275          //} else {
2276            /*
2277            in writable buffer, at this point fcdPosition can not be
2278            pointing to the end of the data string. see contracting tag.
2279            */
2280          if(data->fcdPosition) {
2281            if (*(data->fcdPosition + 1) == 0 ||
2282                data->fcdPosition + 1 == data->endp) {
2283                /* at the end of the string, dump it into the normalizer */
2284                data->pos = insertBufferEnd(data, *(data->fcdPosition)) + 1;
2285                // Check if data->pos received a null pointer
2286                if (data->pos == NULL) {
2287                    return (UChar)-1; // Return to indicate error.
2288                }
2289                return *(data->fcdPosition ++);
2290            }
2291            data->pos = data->fcdPosition;
2292          } else if(data->origFlags & UCOL_USE_ITERATOR) {
2293            // if we are here, we're using a normalizing iterator.
2294            // we should just continue further.
2295            data->flags = data->origFlags;
2296            data->pos = NULL;
2297            return (UChar)data->iterator->next(data->iterator);
2298          }
2299          //}
2300        }
2301        else {
2302            if (*(data->pos + 1) == 0) {
2303                return *(data->pos ++);
2304            }
2305        }
2306    }
2307
2308    ch = *data->pos ++;
2309    nextch = *data->pos;
2310
2311    /*
2312    * if the current character is not fcd.
2313    * Trailing combining class == 0.
2314    */
2315    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2316        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2317         ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2318            /*
2319            Need a more complete FCD check and possible normalization.
2320            normalize substring will be appended to buffer
2321            */
2322        if (collIterFCD(data)) {
2323            normalizeNextContraction(data);
2324            return *(data->pos ++);
2325        }
2326        else if (innormbuf) {
2327            /* fcdposition shifted even when there's no normalization, if we
2328            don't input the rest into this, we'll get the wrong position when
2329            we reach the end of the writableBuffer */
2330            int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2331            data->pos = insertBufferEnd(data, data->pos - 1, length);
2332            // Check if data->pos received a null pointer
2333            if (data->pos == NULL) {
2334                return (UChar)-1; // Return to indicate error.
2335            }
2336            return *(data->pos ++);
2337        }
2338    }
2339
2340    if (innormbuf) {
2341        /*
2342        no normalization is to be done hence only one character will be
2343        appended to the buffer.
2344        */
2345        data->pos = insertBufferEnd(data, ch) + 1;
2346        // Check if data->pos received a null pointer
2347        if (data->pos == NULL) {
2348            return (UChar)-1; // Return to indicate error.
2349        }
2350    }
2351
2352    /* points back to the pos in string */
2353    return ch;
2354}
2355
2356
2357
2358/**
2359* Function to copy the buffer into writableBuffer and sets the fcd position to
2360* the correct position
2361* @param source data string source
2362* @param buffer character buffer
2363*/
2364static
2365inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
2366{
2367    /* okay confusing part here. to ensure that the skipped characters are
2368    considered later, we need to place it in the appropriate position in the
2369    normalization buffer and reassign the pos pointer. simple case if pos
2370    reside in string, simply copy to normalization buffer and
2371    fcdposition = pos, pos = start of normalization buffer. if pos in
2372    normalization buffer, we'll insert the copy infront of pos and point pos
2373    to the start of the normalization buffer. why am i doing these copies?
2374    well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2375    not require any changes, which be really painful. */
2376    if (source->flags & UCOL_ITER_INNORMBUF) {
2377        int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
2378        source->writableBuffer.replace(0, replaceLength, buffer);
2379    }
2380    else {
2381        source->fcdPosition  = source->pos;
2382        source->origFlags    = source->flags;
2383        source->flags       |= UCOL_ITER_INNORMBUF;
2384        source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2385        source->writableBuffer = buffer;
2386    }
2387
2388    source->pos = source->writableBuffer.getTerminatedBuffer();
2389}
2390
/**
* Function to get the discontiguous collation element within the source.
* Starting from a contraction table entry, this scans forward over the
* following characters, skipping the ones that are not in the table, and
* looks for one that completes the contraction.
* Note this function will set the position to the appropriate places:
* on a match the skipped characters are queued for re-reading through
* setDiscontiguosAttribute(); on failure the iterator state saved on entry
* is restored and moved back by one.
* @param coll current collator used
* @param source data string source
* @param constart pointer to the start character in the contraction table
* @return the CE read from the contraction table (or found directly) for the
*         discontiguous match; when nothing matches, the CE stored for
*         constart.
*/
static
uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
                                const UChar *constart)
{
    /* source->pos currently points to the second combining character after
       the start character */
          const UChar *temppos      = source->pos;
          /* collects the characters that are skipped over during the scan */
          UnicodeString buffer;
    /* table entry currently being matched; moves on for multi-contractions */
    const UChar   *tempconstart = constart;
          /* flags on entry, put back when the search fails */
          uint8_t  tempflags    = source->flags;
          UBool    multicontraction = FALSE;
          collIterateState discState;

          /* snapshot of the iterator so that a failed search can be undone */
          backupState(source, &discState);

    /* NOTE(review): peekCodePoint(source, -1) presumably yields the code
       point just before pos (the first skipped character) - confirm against
       its definition. */
    buffer.setTo(peekCodePoint(source, -1));
    for (;;) {
        UChar    *UCharOffset;
        UChar     schar,
                  tchar;
        uint32_t  result;

        /* Stop scanning when there is nothing more that could take part in
           a discontiguous match: end of a length-delimited string, a NUL
           that really ends the text, or a next code point whose combining
           class is 0. */
        if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
            || (peekCodeUnit(source, 0) == 0  &&
            //|| (*source->pos == 0  &&
                ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
                 source->fcdPosition == NULL ||
                 source->fcdPosition == source->endp ||
                 *(source->fcdPosition) == 0 ||
                 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
                 /* end of string in null terminated string or stopped by a
                 null character, note fcd does not always point to a base
                 character after the discontiguous change */
                 u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
                 //u_getCombiningClass(*(source->pos)) == 0) {
            //constart = (UChar *)coll->image + getContractOffset(CE);
            if (multicontraction) {
                /* A shorter contraction with a usable CE was seen earlier:
                   go back to the position remembered for it, queue the
                   skipped characters and return the CE of the current
                   table entry. */
                source->pos    = temppos - 1;
                setDiscontiguosAttribute(source, buffer);
                return *(coll->contractionCEs +
                                    (tempconstart - coll->contractionIndex));
            }
            constart = tempconstart;
            break;
        }

        UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
        schar = getNextNormalizedChar(source);

        /* linear search in the table for the first entry that is not
           smaller than schar */
        while (schar > (tchar = *UCharOffset)) {
            UCharOffset++;
        }

        if (schar != tchar) {
            /* not the correct codepoint. we stuff the current codepoint into
            the discontiguous buffer and try the next character */
            buffer.append(schar);
            continue;
        }
        else {
            /* schar is in the table, but it is not accepted as a match when
               its combining class equals that of peekCodePoint(source, -2);
               in that case it is treated like any other skipped character. */
            if (u_getCombiningClass(schar) ==
                u_getCombiningClass(peekCodePoint(source, -2))) {
                buffer.append(schar);
                continue;
            }
            result = *(coll->contractionCEs +
                                      (UCharOffset - coll->contractionIndex));
        }

        if (result == UCOL_NOT_FOUND) {
          break;
        } else if (isContraction(result)) {
            /* this is a multi-contraction*/
            tempconstart = (UChar *)coll->image + getContractOffset(result);
            /* only remember a fallback position when the entry at constart
               has a CE of its own */
            if (*(coll->contractionCEs + (constart - coll->contractionIndex))
                != UCOL_NOT_FOUND) {
                multicontraction = TRUE;
                temppos       = source->pos + 1;
            }
        } else {
            /* complete match: queue the skipped characters for later
               processing and hand back the CE */
            setDiscontiguosAttribute(source, buffer);
            return result;
        }
    }

    /* no problems simply reverting just like that,
    if we are in string before getting into this function, points back to
    string hence no problem.
    if we are in normalization buffer before getting into this function,
    since we'll never use another normalization within this function, we
    know that fcdposition points to a base character. the normalization buffer
    never change, hence this revert works. */
    loadState(source, &discState, TRUE);
    goBackOne(source);

    //source->pos   = temppos - 1;
    source->flags = tempflags;
    /* no discontiguous match: fall back to the CE stored for constart */
    return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
2498
/*
* Builds the two implicit CEs for a code point, now using Mark's
* getImplicitPrimary code.
* The second CE is stored at collationSource->CEpos (which is advanced) and
* offsetRepeatCount is incremented; the first CE is returned.
*/
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
    const uint32_t primary  = uprv_uca_getImplicitPrimary(cp);
    /* upper 16 bits of the primary go into the returned CE ... */
    const uint32_t firstCE  = (primary & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
    /* ... and the lower 16 bits are moved up into the CE that follows it */
    const uint32_t secondCE = ((primary & 0x0000FFFF) << 16) | 0x000000C0;

    *collationSource->CEpos = secondCE;
    ++collationSource->CEpos;
    ++collationSource->offsetRepeatCount;
    return firstCE;
}
2507
2508/**
2509* Inserts the argument character into the front of the buffer replacing the
2510* front null terminator.
2511* @param data collation element iterator data
2512* @param ch character to be appended
2513*/
2514static
2515inline void insertBufferFront(collIterate *data, UChar ch)
2516{
2517    data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
2518}
2519
2520/**
2521* Special normalization function for contraction in the previous iterator.
2522* This normalization sequence will place the current character at source->pos
2523* and its following normalized sequence into the buffer.
2524* The fcd position, pos will be changed.
2525* pos will now point to positions in the buffer.
2526* Flags will be changed accordingly.
2527* @param data collation iterator data
2528*/
2529static
2530inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2531{
2532    const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
2533    const UChar *pStart;
2534
2535    UnicodeString endOfBuffer;
2536    if (data->flags & UCOL_ITER_HASLEN) {
2537        /*
2538        normalization buffer not used yet, we'll pull down the next
2539        character into the end of the buffer
2540        */
2541        endOfBuffer.setTo(*pEnd);
2542    }
2543    else {
2544        endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
2545    }
2546
2547    if (data->fcdPosition == NULL) {
2548        pStart = data->string;
2549    }
2550    else {
2551        pStart = data->fcdPosition + 1;
2552    }
2553    int32_t normLen =
2554        data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
2555                             data->writableBuffer,
2556                             *status).
2557        length();
2558    if(U_FAILURE(*status)) {
2559        return;
2560    }
2561    /*
2562    this puts the null termination infront of the normalized string instead
2563    of the end
2564    */
2565    data->pos =
2566        data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
2567        1 + normLen;
2568    data->origFlags  = data->flags;
2569    data->flags     |= UCOL_ITER_INNORMBUF;
2570    data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2571}
2572
/**
* Contraction character management function that returns the previous character
* for the backwards iterator.
* Does nothing if the previous character is in buffer and not the first
* character in it.
* Else it checks previous character in data string to see if it is
* normalizable.
* If it is not, the character is simply copied into the buffer, else
* the whole normalized substring is copied into the buffer, including the
* current character.
* @param data collation element iterator data
* @param status error code, only passed on to normalizePrevContraction()
* @return previous character
*/
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    const UChar *start;
    /* non-zero while the iterator reads from the normalization buffer */
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
      if(data->flags & UCOL_USE_ITERATOR) {
        /* step the character iterator back by one and read that character;
           next() leaves the iterator where it was before the move */
        data->iterator->move(data->iterator, -1, UITER_CURRENT);
        return (UChar)data->iterator->next(data->iterator);
      } else {
        return *(data->pos - 1);
      }
    }

    start = data->pos;
    if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* the previous character is the first one of the string */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL.
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            insertBufferFront(data, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    * NOTE(review): NFC_ZERO_CC_BLOCK_LIMIT_ is defined outside this chunk;
    * presumably code units below it never need the FCD check - confirm.
    * NOTE(review): on the writable-buffer path above, start was set to
    * data->fcdPosition, so "data->fcdPosition > start" is false there and
    * this block is only entered from the data-string path - confirm that
    * this is intended.
    */
    if (data->fcdPosition > start &&
       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        const UChar *backuppos = data->pos;
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        /* no normalization was done: put pos back where it was */
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
    /*
    no normalization is to be done hence only one character will be
    appended to the buffer.
    */
        insertBufferFront(data, ch);
        data->fcdPosition --;
    }

    return ch;
}
2665
2666/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2667/* It is called by getNextCE */
2668
/* Limit on the digit index used when a run of digits is collated as a number
 * (see the DIGIT_TAG case in ucol_prv_getSpecialCE).
 * The following should be even: the digits are packed two per byte (as
 * base 100 values) and the temporary buffer for them is sized as
 * UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3. */
#define UCOL_MAX_DIGITS_FOR_NUMBER 254
2671
2672uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2673    collIterateState entryState;
2674    backupState(source, &entryState);
2675    UChar32 cp = ch;
2676
2677    for (;;) {
2678        // This loop will repeat only in the case of contractions, and only when a contraction
2679        //   is found and the first CE resulting from that contraction is itself a special
2680        //   (an expansion, for example.)  All other special CE types are fully handled the
2681        //   first time through, and the loop exits.
2682
2683        const uint32_t *CEOffset = NULL;
2684        switch(getCETag(CE)) {
2685        case NOT_FOUND_TAG:
2686            /* This one is not found, and we'll let somebody else bother about it... no more games */
2687            return CE;
2688        case SPEC_PROC_TAG:
2689            {
2690                // Special processing is getting a CE that is preceded by a certain prefix
2691                // Currently this is only needed for optimizing Japanese length and iteration marks.
2692                // When we encouter a special processing tag, we go backwards and try to see if
2693                // we have a match.
2694                // Contraction tables are used - so the whole process is not unlike contraction.
2695                // prefix data is stored backwards in the table.
2696                const UChar *UCharOffset;
2697                UChar schar, tchar;
2698                collIterateState prefixState;
2699                backupState(source, &prefixState);
2700                loadState(source, &entryState, TRUE);
2701                goBackOne(source); // We want to look at the point where we entered - actually one
2702                // before that...
2703
2704                for(;;) {
2705                    // This loop will run once per source string character, for as long as we
2706                    //  are matching a potential contraction sequence
2707
2708                    // First we position ourselves at the begining of contraction sequence
2709                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2710                    if (collIter_bos(source)) {
2711                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2712                        break;
2713                    }
2714                    schar = getPrevNormalizedChar(source, status);
2715                    goBackOne(source);
2716
2717                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2718                        UCharOffset++;
2719                    }
2720
2721                    if (schar == tchar) {
2722                        // Found the source string char in the table.
2723                        //  Pick up the corresponding CE from the table.
2724                        CE = *(coll->contractionCEs +
2725                            (UCharOffset - coll->contractionIndex));
2726                    }
2727                    else
2728                    {
2729                        // Source string char was not in the table.
2730                        //   We have not found the prefix.
2731                        CE = *(coll->contractionCEs +
2732                            (ContractionStart - coll->contractionIndex));
2733                    }
2734
2735                    if(!isPrefix(CE)) {
2736                        // The source string char was in the contraction table, and the corresponding
2737                        //   CE is not a prefix CE.  We found the prefix, break
2738                        //   out of loop, this CE will end up being returned.  This is the normal
2739                        //   way out of prefix handling when the source actually contained
2740                        //   the prefix.
2741                        break;
2742                    }
2743                }
2744                if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2745                    loadState(source, &prefixState, TRUE);
2746                    if(source->origFlags & UCOL_USE_ITERATOR) {
2747                        source->flags = source->origFlags;
2748                    }
2749                } else { // prefix search was a failure, we have to backup all the way to the start
2750                    loadState(source, &entryState, TRUE);
2751                }
2752                break;
2753            }
2754        case CONTRACTION_TAG:
2755            {
2756                /* This should handle contractions */
2757                collIterateState state;
2758                backupState(source, &state);
2759                uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2760                const UChar *UCharOffset;
2761                UChar schar, tchar;
2762
2763                for (;;) {
2764                    /* This loop will run once per source string character, for as long as we     */
2765                    /*  are matching a potential contraction sequence                  */
2766
2767                    /* First we position ourselves at the begining of contraction sequence */
2768                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2769
2770                    if (collIter_eos(source)) {
2771                        // Ran off the end of the source string.
2772                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2773                        // So we'll pick whatever we have at the point...
2774                        if (CE == UCOL_NOT_FOUND) {
2775                            // back up the source over all the chars we scanned going into this contraction.
2776                            CE = firstCE;
2777                            loadState(source, &state, TRUE);
2778                            if(source->origFlags & UCOL_USE_ITERATOR) {
2779                                source->flags = source->origFlags;
2780                            }
2781                        }
2782                        break;
2783                    }
2784
2785                    uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2786                    uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2787
2788                    schar = getNextNormalizedChar(source);
2789                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2790                        UCharOffset++;
2791                    }
2792
2793                    if (schar == tchar) {
2794                        // Found the source string char in the contraction table.
2795                        //  Pick up the corresponding CE from the table.
2796                        CE = *(coll->contractionCEs +
2797                            (UCharOffset - coll->contractionIndex));
2798                    }
2799                    else
2800                    {
2801                        // Source string char was not in contraction table.
2802                        //   Unless we have a discontiguous contraction, we have finished
2803                        //   with this contraction.
2804                        // in order to do the proper detection, we
2805                        // need to see if we're dealing with a supplementary
2806                        /* We test whether the next two char are surrogate pairs.
2807                        * This test is done if the iterator is not NULL.
2808                        * If there is no surrogate pair, the iterator
2809                        * goes back one if needed. */
2810                        UChar32 miss = schar;
2811                        if (source->iterator) {
2812                            UChar32 surrNextChar; /* the next char in the iteration to test */
2813                            int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2814                            if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2815                                prevPos = source->iterator->index;
2816                                surrNextChar = getNextNormalizedChar(source);
2817                                if (U16_IS_TRAIL(surrNextChar)) {
2818                                    miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2819                                } else if (prevPos < source->iterator->index){
2820                                    goBackOne(source);
2821                                }
2822                            }
2823                        } else if (U16_IS_LEAD(schar)) {
2824                            miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2825                        }
2826
2827                        uint8_t sCC;
2828                        if (miss < 0x300 ||
2829                            maxCC == 0 ||
2830                            (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2831                            sCC>maxCC ||
2832                            (allSame != 0 && sCC == maxCC) ||
2833                            collIter_eos(source))
2834                        {
2835                            //  Contraction can not be discontiguous.
2836                            goBackOne(source);  // back up the source string by one,
2837                            //  because  the character we just looked at was
2838                            //  not part of the contraction.   */
2839                            if(U_IS_SUPPLEMENTARY(miss)) {
2840                                goBackOne(source);
2841                            }
2842                            CE = *(coll->contractionCEs +
2843                                (ContractionStart - coll->contractionIndex));
2844                        } else {
2845                            //
2846                            // Contraction is possibly discontiguous.
2847                            //   Scan more of source string looking for a match
2848                            //
2849                            UChar tempchar;
2850                            /* find the next character if schar is not a base character
2851                            and we are not yet at the end of the string */
2852                            tempchar = getNextNormalizedChar(source);
2853                            // probably need another supplementary thingie here
2854                            goBackOne(source);
2855                            if (i_getCombiningClass(tempchar, coll) == 0) {
2856                                goBackOne(source);
2857                                if(U_IS_SUPPLEMENTARY(miss)) {
2858                                    goBackOne(source);
2859                                }
2860                                /* Spit out the last char of the string, wasn't tasty enough */
2861                                CE = *(coll->contractionCEs +
2862                                    (ContractionStart - coll->contractionIndex));
2863                            } else {
2864                                CE = getDiscontiguous(coll, source, ContractionStart);
2865                            }
2866                        }
2867                    } // else after if(schar == tchar)
2868
2869                    if(CE == UCOL_NOT_FOUND) {
2870                        /* The Source string did not match the contraction that we were checking.  */
2871                        /*  Back up the source position to undo the effects of having partially    */
2872                        /*   scanned through what ultimately proved to not be a contraction.       */
2873                        loadState(source, &state, TRUE);
2874                        CE = firstCE;
2875                        break;
2876                    }
2877
2878                    if(!isContraction(CE)) {
2879                        // The source string char was in the contraction table, and the corresponding
2880                        //   CE is not a contraction CE.  We completed the contraction, break
2881                        //   out of loop, this CE will end up being returned.  This is the normal
2882                        //   way out of contraction handling when the source actually contained
2883                        //   the contraction.
2884                        break;
2885                    }
2886
2887
2888                    // The source string char was in the contraction table, and the corresponding
2889                    //   CE is IS  a contraction CE.  We will continue looping to check the source
2890                    //   string for the remaining chars in the contraction.
2891                    uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2892                    if(tempCE != UCOL_NOT_FOUND) {
2893                        // We have scanned a a section of source string for which there is a
2894                        //  CE from the contraction table.  Remember the CE and scan position, so
2895                        //  that we can return to this point if further scanning fails to
2896                        //  match a longer contraction sequence.
2897                        firstCE = tempCE;
2898
2899                        goBackOne(source);
2900                        backupState(source, &state);
2901                        getNextNormalizedChar(source);
2902
2903                        // Another way to do this is:
2904                        //collIterateState tempState;
2905                        //backupState(source, &tempState);
2906                        //goBackOne(source);
2907                        //backupState(source, &state);
2908                        //loadState(source, &tempState, TRUE);
2909
2910                        // The problem is that for incomplete contractions we have to remember the previous
2911                        // position. Before, the only thing I needed to do was state.pos--;
2912                        // After iterator introduction and especially after introduction of normalizing
2913                        // iterators, it became much more difficult to decrease the saved state.
2914                        // I'm not yet sure which of the two methods above is faster.
2915                    }
2916                } // for(;;)
2917                break;
2918            } // case CONTRACTION_TAG:
2919        case LONG_PRIMARY_TAG:
2920            {
2921                *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2922                CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2923                source->offsetRepeatCount += 1;
2924                return CE;
2925            }
2926        case EXPANSION_TAG:
2927            {
2928                /* This should handle expansion. */
2929                /* NOTE: we can encounter both continuations and expansions in an expansion! */
2930                /* I have to decide where continuations are going to be dealt with */
2931                uint32_t size;
2932                uint32_t i;    /* general counter */
2933
2934                CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2935                size = getExpansionCount(CE);
2936                CE = *CEOffset++;
2937              //source->offsetRepeatCount = -1;
2938
2939                if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2940                    for(i = 1; i<size; i++) {
2941                        *(source->CEpos++) = *CEOffset++;
2942                        source->offsetRepeatCount += 1;
2943                    }
2944                } else { /* else, we do */
2945                    while(*CEOffset != 0) {
2946                        *(source->CEpos++) = *CEOffset++;
2947                        source->offsetRepeatCount += 1;
2948                    }
2949                }
2950
2951                return CE;
2952            }
2953        case DIGIT_TAG:
2954            {
2955                /*
2956                We do a check to see if we want to collate digits as numbers; if so we generate
2957                a custom collation key. Otherwise we pull out the value stored in the expansion table.
2958                */
2959                //uint32_t size;
2960                uint32_t i;    /* general counter */
2961
2962                if (source->coll->numericCollation == UCOL_ON){
2963                    collIterateState digitState = {0,0,0,0,0,0,0,0,0};
2964                    UChar32 char32 = 0;
2965                    int32_t digVal = 0;
2966
2967                    uint32_t digIndx = 0;
2968                    uint32_t endIndex = 0;
2969                    uint32_t trailingZeroIndex = 0;
2970
2971                    uint8_t collateVal = 0;
2972
2973                    UBool nonZeroValReached = FALSE;
2974
2975                    uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
2976                    /*
2977                         We parse the source string until we hit a char that's NOT a digit.
2978                        Use this u_charDigitValue. This might be slow because we have to
2979                        handle surrogates...
2980                    */
2981            /*
2982                    if (U16_IS_LEAD(ch)){
2983                      if (!collIter_eos(source)) {
2984                        backupState(source, &digitState);
2985                        UChar trail = getNextNormalizedChar(source);
2986                        if(U16_IS_TRAIL(trail)) {
2987                          char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2988                        } else {
2989                          loadState(source, &digitState, TRUE);
2990                          char32 = ch;
2991                        }
2992                      } else {
2993                        char32 = ch;
2994                      }
2995                    } else {
2996                      char32 = ch;
2997                    }
2998                    digVal = u_charDigitValue(char32);
2999            */
3000                    digVal = u_charDigitValue(cp); // if we have arrived here, we have
3001                    // already processed possible supplementaries that trigered the digit tag -
3002                    // all supplementaries are marked in the UCA.
3003                    /*
3004                        We  pad a zero in front of the first element anyways. This takes
3005                        care of the (probably) most common case where people are sorting things followed
3006                        by a single digit
3007                    */
3008                    digIndx++;
3009                    for(;;){
3010                        // Make sure we have enough space. No longer needed;
3011                        // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3012                        // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3013                        // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3014
3015                        // Skipping over leading zeroes.
3016                        if (digVal != 0) {
3017                            nonZeroValReached = TRUE;
3018                        }
3019                        if (nonZeroValReached) {
3020                            /*
3021                            We parse the digit string into base 100 numbers (this fits into a byte).
3022                            We only add to the buffer in twos, thus if we are parsing an odd character,
3023                            that serves as the 'tens' digit while the if we are parsing an even one, that
3024                            is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3025                            a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3026                            overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3027                            than all the other bytes.
3028                            */
3029
3030                            if (digIndx % 2 == 1){
3031                                collateVal += (uint8_t)digVal;
3032
3033                                // We don't enter the low-order-digit case unless we've already seen
3034                                // the high order, or for the first digit, which is always non-zero.
3035                                if (collateVal != 0)
3036                                    trailingZeroIndex = 0;
3037
3038                                numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3039                                collateVal = 0;
3040                            }
3041                            else{
3042                                // We drop the collation value into the buffer so if we need to do
3043                                // a "front patch" we don't have to check to see if we're hitting the
3044                                // last element.
3045                                collateVal = (uint8_t)(digVal * 10);
3046
3047                                // Check for trailing zeroes.
3048                                if (collateVal == 0)
3049                                {
3050                                    if (!trailingZeroIndex)
3051                                        trailingZeroIndex = (digIndx/2) + 2;
3052                                }
3053                                else
3054                                    trailingZeroIndex = 0;
3055
3056                                numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3057                            }
3058                            digIndx++;
3059                        }
3060
3061                        // Get next character.
3062                        if (!collIter_eos(source)){
3063                            ch = getNextNormalizedChar(source);
3064                            if (U16_IS_LEAD(ch)){
3065                                if (!collIter_eos(source)) {
3066                                    backupState(source, &digitState);
3067                                    UChar trail = getNextNormalizedChar(source);
3068                                    if(U16_IS_TRAIL(trail)) {
3069                                        char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3070                                    } else {
3071                                        loadState(source, &digitState, TRUE);
3072                                        char32 = ch;
3073                                    }
3074                                }
3075                            } else {
3076                                char32 = ch;
3077                            }
3078
3079                            if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3080                                // Resetting position to point to the next unprocessed char. We
3081                                // overshot it when doing our test/set for numbers.
3082                                if (char32 > 0xFFFF) { // For surrogates.
3083                                    loadState(source, &digitState, TRUE);
3084                                    //goBackOne(source);
3085                                }
3086                                goBackOne(source);
3087                                break;
3088                            }
3089                        } else {
3090                            break;
3091                        }
3092                    }
3093
3094                    if (nonZeroValReached == FALSE){
3095                        digIndx = 2;
3096                        numTempBuf[2] = 6;
3097                    }
3098
3099                    endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3100                    if (digIndx % 2 != 0){
3101                        /*
3102                        We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3103                        we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3104                        Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3105                        single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3106                        */
3107
3108                        for(i = 2; i < endIndex; i++){
3109                            numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3110                                (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3111                        }
3112                        --digIndx;
3113                    }
3114
3115                    // Subtract one off of the last byte.
3116                    numTempBuf[endIndex-1] -= 1;
3117
3118                    /*
3119                    We want to skip over the first two slots in the buffer. The first slot
3120                    is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3121                    sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3122                    */
3123                    numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3124                    numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3125
3126                    // Now transfer the collation key to our collIterate struct.
3127                    // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3128                    //size = ((endIndex+1) & ~1)/2;
3129                    CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3130                        (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3131                        UCOL_BYTE_COMMON; // Tertiary weight.
3132                    i = 2; // Reset the index into the buffer.
3133                    while(i < endIndex)
3134                    {
3135                        uint32_t primWeight = numTempBuf[i++] << 8;
3136                        if ( i < endIndex)
3137                            primWeight |= numTempBuf[i++];
3138                        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3139                    }
3140
3141                } else {
3142                    // no numeric mode, we'll just switch to whatever we stashed and continue
3143                    CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3144                    CE = *CEOffset++;
3145                    break;
3146                }
3147                return CE;
3148            }
3149            /* various implicits optimization */
3150        case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3151            /* UCA is filled with these. Tailorings are NOT_FOUND */
3152            return getImplicit(cp, source);
3153        case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3154            // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3155            return getImplicit(cp, source);
3156        case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3157            {
3158                static const uint32_t
3159                    SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3160                //const uint32_t LCount = 19;
3161                static const uint32_t VCount = 21;
3162                static const uint32_t TCount = 28;
3163                //const uint32_t NCount = VCount * TCount;   // 588
3164                //const uint32_t SCount = LCount * NCount;   // 11172
3165                uint32_t L = ch - SBase;
3166
3167                // divide into pieces
3168
3169                uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3170                L /= TCount;
3171                uint32_t V = L % VCount;
3172                L /= VCount;
3173
3174                // offset them
3175
3176                L += LBase;
3177                V += VBase;
3178                T += TBase;
3179
3180                // return the first CE, but first put the rest into the expansion buffer
3181                if (!source->coll->image->jamoSpecial) { // FAST PATH
3182
3183                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3184                    if (T != TBase) {
3185                        *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3186                    }
3187
3188                    return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3189
3190                } else { // Jamo is Special
3191                    // Since Hanguls pass the FCD check, it is
3192                    // guaranteed that we won't be in
3193                    // the normalization buffer if something like this happens
3194
3195                    // However, if we are using a uchar iterator and normalization
3196                    // is ON, the Hangul that lead us here is going to be in that
3197                    // normalization buffer. Here we want to restore the uchar
3198                    // iterator state and pull out of the normalization buffer
3199                    if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3200                        source->flags = source->origFlags; // restore the iterator
3201                        source->pos = NULL;
3202                    }
3203
3204                    // Move Jamos into normalization buffer
3205                    UChar *buffer = source->writableBuffer.getBuffer(4);
3206                    int32_t bufferLength;
3207                    buffer[0] = (UChar)L;
3208                    buffer[1] = (UChar)V;
3209                    if (T != TBase) {
3210                        buffer[2] = (UChar)T;
3211                        bufferLength = 3;
3212                    } else {
3213                        bufferLength = 2;
3214                    }
3215                    source->writableBuffer.releaseBuffer(bufferLength);
3216
3217                    // Indicate where to continue in main input string after exhausting the writableBuffer
3218                    source->fcdPosition       = source->pos;
3219
3220                    source->pos   = source->writableBuffer.getTerminatedBuffer();
3221                    source->origFlags   = source->flags;
3222                    source->flags       |= UCOL_ITER_INNORMBUF;
3223                    source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3224
3225                    return(UCOL_IGNORABLE);
3226                }
3227            }
3228        case SURROGATE_TAG:
3229            /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3230            /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3231            /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3232            /* we treat it like an unassigned code point. */
3233            {
3234                UChar trail;
3235                collIterateState state;
3236                backupState(source, &state);
3237                if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3238                    // we chould have stepped one char forward and it might have turned that it
3239                    // was not a trail surrogate. In that case, we have to backup.
3240                    loadState(source, &state, TRUE);
3241                    return UCOL_NOT_FOUND;
3242                } else {
3243                    /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3244                    CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3245                    if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3246                        // We need to backup
3247                        loadState(source, &state, TRUE);
3248                        return CE;
3249                    }
3250                    // calculate the supplementary code point value, if surrogate was not tailored
3251                    cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3252                }
3253            }
3254            break;
3255        case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3256            UChar nextChar;
3257            if( source->flags & UCOL_USE_ITERATOR) {
3258                if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3259                    cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3260                    source->iterator->next(source->iterator);
3261                    return getImplicit(cp, source);
3262                }
3263            } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3264                      U_IS_TRAIL((nextChar=*source->pos))) {
3265                cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3266                source->pos++;
3267                return getImplicit(cp, source);
3268            }
3269            return UCOL_NOT_FOUND;
3270        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3271            return UCOL_NOT_FOUND; /* broken surrogate sequence */
3272        case CHARSET_TAG:
3273            /* not yet implemented */
3274            /* probably after 1.8 */
3275            return UCOL_NOT_FOUND;
3276        default:
3277            *status = U_INTERNAL_PROGRAM_ERROR;
3278            CE=0;
3279            break;
3280    }
3281    if (CE <= UCOL_NOT_FOUND) break;
3282  }
3283  return CE;
3284}
3285
3286
3287/* now uses Mark's getImplicitPrimary code */
3288static
3289inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3290    uint32_t r = uprv_uca_getImplicitPrimary(cp);
3291
3292    *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3293    collationSource->toReturn = collationSource->CEpos;
3294
3295    // **** doesn't work if using iterator ****
3296    if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3297        collationSource->offsetRepeatCount = 1;
3298    } else {
3299        int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3300
3301        UErrorCode errorCode = U_ZERO_ERROR;
3302        collationSource->appendOffset(firstOffset, errorCode);
3303        collationSource->appendOffset(firstOffset + 1, errorCode);
3304
3305        collationSource->offsetReturn = collationSource->offsetStore - 1;
3306        *(collationSource->offsetBuffer) = firstOffset;
3307        if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3308            collationSource->offsetStore = collationSource->offsetBuffer;
3309        }
3310    }
3311
3312    return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3313}
3314
/**
 * This function handles the special CEs (contractions, expansions,
 * surrogates, Thai) when iterating backwards.
 * It is called by getPrevCE.
 */
3320uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3321                          collIterate *source,
3322                          UErrorCode *status)
3323{
3324    const uint32_t *CEOffset    = NULL;
3325          UChar    *UCharOffset = NULL;
3326          UChar    schar;
3327    const UChar    *constart    = NULL;
3328          uint32_t size;
3329          UChar    buffer[UCOL_MAX_BUFFER];
3330          uint32_t *endCEBuffer;
3331          UChar   *strbuffer;
3332          int32_t noChars = 0;
3333          int32_t CECount = 0;
3334
3335    for(;;)
3336    {
3337        /* the only ces that loops are thai and contractions */
3338        switch (getCETag(CE))
3339        {
3340        case NOT_FOUND_TAG:  /* this tag always returns */
3341            return CE;
3342
3343        case SPEC_PROC_TAG:
3344            {
3345                // Special processing is getting a CE that is preceded by a certain prefix
3346                // Currently this is only needed for optimizing Japanese length and iteration marks.
3347                // When we encouter a special processing tag, we go backwards and try to see if
3348                // we have a match.
3349                // Contraction tables are used - so the whole process is not unlike contraction.
3350                // prefix data is stored backwards in the table.
3351                const UChar *UCharOffset;
3352                UChar schar, tchar;
3353                collIterateState prefixState;
3354                backupState(source, &prefixState);
3355                for(;;) {
3356                    // This loop will run once per source string character, for as long as we
3357                    //  are matching a potential contraction sequence
3358
3359                    // First we position ourselves at the begining of contraction sequence
3360                    const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3361
3362                    if (collIter_bos(source)) {
3363                        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3364                        break;
3365                    }
3366                    schar = getPrevNormalizedChar(source, status);
3367                    goBackOne(source);
3368
3369                    while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3370                        UCharOffset++;
3371                    }
3372
3373                    if (schar == tchar) {
3374                        // Found the source string char in the table.
3375                        //  Pick up the corresponding CE from the table.
3376                        CE = *(coll->contractionCEs +
3377                            (UCharOffset - coll->contractionIndex));
3378                    }
3379                    else
3380                    {
3381                        // if there is a completely ignorable code point in the middle of
3382                        // a prefix, we need to act as if it's not there
3383                        // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3384                        // lone surrogates cannot be set to zero as it would break other processing
3385                        uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3386                        // it's easy for BMP code points
3387                        if(isZeroCE == 0) {
3388                            continue;
3389                        } else if(U16_IS_SURROGATE(schar)) {
3390                            // for supplementary code points, we have to check the next one
3391                            // situations where we are going to ignore
3392                            // 1. beginning of the string: schar is a lone surrogate
3393                            // 2. schar is a lone surrogate
3394                            // 3. schar is a trail surrogate in a valid surrogate sequence
3395                            //    that is explicitly set to zero.
3396                            if (!collIter_bos(source)) {
3397                                UChar lead;
3398                                if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3399                                    isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3400                                    if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3401                                        uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3402                                        if(finalCE == 0) {
3403                                            // this is a real, assigned completely ignorable code point
3404                                            goBackOne(source);
3405                                            continue;
3406                                        }
3407                                    }
3408                                } else {
3409                                    // lone surrogate, treat like unassigned
3410                                    return UCOL_NOT_FOUND;
3411                                }
3412                            } else {
3413                                // lone surrogate at the beggining, treat like unassigned
3414                                return UCOL_NOT_FOUND;
3415                            }
3416                        }
3417                        // Source string char was not in the table.
3418                        //   We have not found the prefix.
3419                        CE = *(coll->contractionCEs +
3420                            (ContractionStart - coll->contractionIndex));
3421                    }
3422
3423                    if(!isPrefix(CE)) {
3424                        // The source string char was in the contraction table, and the corresponding
3425                        //   CE is not a prefix CE.  We found the prefix, break
3426                        //   out of loop, this CE will end up being returned.  This is the normal
3427                        //   way out of prefix handling when the source actually contained
3428                        //   the prefix.
3429                        break;
3430                    }
3431                }
3432                loadState(source, &prefixState, TRUE);
3433                break;
3434            }
3435
3436        case CONTRACTION_TAG: {
3437            /* to ensure that the backwards and forwards iteration matches, we
3438            take the current region of most possible match and pass it through
3439            the forward iteration. this will ensure that the obstinate problem of
3440            overlapping contractions will not occur.
3441            */
3442            schar = peekCodeUnit(source, 0);
3443            constart = (UChar *)coll->image + getContractOffset(CE);
3444            if (isAtStartPrevIterate(source)
3445                /* commented away contraction end checks after adding the checks
3446                in getPrevCE  */) {
3447                    /* start of string or this is not the end of any contraction */
3448                    CE = *(coll->contractionCEs +
3449                        (constart - coll->contractionIndex));
3450                    break;
3451            }
3452            strbuffer = buffer;
3453            UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3454            *(UCharOffset --) = 0;
3455            noChars = 0;
3456            // have to swap thai characters
3457            while (ucol_unsafeCP(schar, coll)) {
3458                *(UCharOffset) = schar;
3459                noChars++;
3460                UCharOffset --;
3461                schar = getPrevNormalizedChar(source, status);
3462                goBackOne(source);
3463                // TODO: when we exhaust the contraction buffer,
3464                // it needs to get reallocated. The problem is
3465                // that the size depends on the string which is
3466                // not iterated over. However, since we're travelling
3467                // backwards, we already had to set the iterator at
3468                // the end - so we might as well know where we are?
3469                if (UCharOffset + 1 == buffer) {
3470                    /* we have exhausted the buffer */
3471                    int32_t newsize = 0;
3472                    if(source->pos) { // actually dealing with a position
3473                        newsize = (int32_t)(source->pos - source->string + 1);
3474                    } else { // iterator
3475                        newsize = 4 * UCOL_MAX_BUFFER;
3476                    }
3477                    strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3478                        (newsize + UCOL_MAX_BUFFER));
3479                    /* test for NULL */
3480                    if (strbuffer == NULL) {
3481                        *status = U_MEMORY_ALLOCATION_ERROR;
3482                        return UCOL_NO_MORE_CES;
3483                    }
3484                    UCharOffset = strbuffer + newsize;
3485                    uprv_memcpy(UCharOffset, buffer,
3486                        UCOL_MAX_BUFFER * sizeof(UChar));
3487                    UCharOffset --;
3488                }
3489                if ((source->pos && (source->pos == source->string ||
3490                    ((source->flags & UCOL_ITER_INNORMBUF) &&
3491                    *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3492                    || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3493                        break;
3494                }
3495            }
3496            /* adds the initial base character to the string */
3497            *(UCharOffset) = schar;
3498            noChars++;
3499
3500            int32_t offsetBias;
3501
3502            // **** doesn't work if using iterator ****
3503            if (source->flags & UCOL_ITER_INNORMBUF) {
3504                offsetBias = -1;
3505            } else {
3506                offsetBias = (int32_t)(source->pos - source->string);
3507            }
3508
3509            /* a new collIterate is used to simplify things, since using the current
3510            collIterate will mean that the forward and backwards iteration will
3511            share and change the same buffers. we don't want to get into that. */
3512            collIterate temp;
3513            int32_t rawOffset;
3514
3515            IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3516            if(U_FAILURE(*status)) {
3517                return (uint32_t)UCOL_NULLORDER;
3518            }
3519            temp.flags &= ~UCOL_ITER_NORM;
3520            temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3521
3522            rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
3523            CE = ucol_IGetNextCE(coll, &temp, status);
3524
3525            if (source->extendCEs) {
3526                endCEBuffer = source->extendCEs + source->extendCEsSize;
3527                CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
3528            } else {
3529                endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3530                CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
3531            }
3532
3533            while (CE != UCOL_NO_MORE_CES) {
3534                *(source->CEpos ++) = CE;
3535
3536                if (offsetBias >= 0) {
3537                    source->appendOffset(rawOffset + offsetBias, *status);
3538                }
3539
3540                CECount++;
3541                if (source->CEpos == endCEBuffer) {
3542                    /* ran out of CE space, reallocate to new buffer.
3543                    If reallocation fails, reset pointers and bail out,
3544                    there's no guarantee of the right character position after
3545                    this bail*/
3546                    if (!increaseCEsCapacity(source)) {
3547                        *status = U_MEMORY_ALLOCATION_ERROR;
3548                        break;
3549                    }
3550
3551                    endCEBuffer = source->extendCEs + source->extendCEsSize;
3552                }
3553
3554                if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3555                    rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3556                } else {
3557                    rawOffset = (int32_t)(temp.pos - temp.string);
3558                }
3559
3560                CE = ucol_IGetNextCE(coll, &temp, status);
3561            }
3562
3563            if (strbuffer != buffer) {
3564                uprv_free(strbuffer);
3565            }
3566            if (U_FAILURE(*status)) {
3567                return (uint32_t)UCOL_NULLORDER;
3568            }
3569
3570            if (source->offsetRepeatValue != 0) {
3571                if (CECount > noChars) {
3572                    source->offsetRepeatCount += temp.offsetRepeatCount;
3573                } else {
3574                    // **** does this really skip the right offsets? ****
3575                    source->offsetReturn -= (noChars - CECount);
3576                }
3577            }
3578
3579            if (offsetBias >= 0) {
3580                source->offsetReturn = source->offsetStore - 1;
3581                if (source->offsetReturn == source->offsetBuffer) {
3582                    source->offsetStore = source->offsetBuffer;
3583                }
3584            }
3585
3586            source->toReturn = source->CEpos - 1;
3587            if (source->toReturn == source->CEs) {
3588                source->CEpos = source->CEs;
3589            }
3590
3591            return *(source->toReturn);
3592        }
3593        case LONG_PRIMARY_TAG:
3594            {
3595                *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3596                *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3597                source->toReturn = source->CEpos - 1;
3598
3599                if (source->flags & UCOL_ITER_INNORMBUF) {
3600                    source->offsetRepeatCount = 1;
3601                } else {
3602                    int32_t firstOffset = (int32_t)(source->pos - source->string);
3603
3604                    source->appendOffset(firstOffset, *status);
3605                    source->appendOffset(firstOffset + 1, *status);
3606
3607                    source->offsetReturn = source->offsetStore - 1;
3608                    *(source->offsetBuffer) = firstOffset;
3609                    if (source->offsetReturn == source->offsetBuffer) {
3610                        source->offsetStore = source->offsetBuffer;
3611                    }
3612                }
3613
3614
3615                return *(source->toReturn);
3616            }
3617
3618        case EXPANSION_TAG: /* this tag always returns */
3619            {
3620            /*
3621            This should handle expansion.
3622            NOTE: we can encounter both continuations and expansions in an expansion!
3623            I have to decide where continuations are going to be dealt with
3624            */
3625            int32_t firstOffset = (int32_t)(source->pos - source->string);
3626
3627            // **** doesn't work if using iterator ****
3628            if (source->offsetReturn != NULL) {
3629                if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3630                    source->offsetStore = source->offsetBuffer;
3631                }else {
3632                  firstOffset = -1;
3633                }
3634            }
3635
3636            /* find the offset to expansion table */
3637            CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3638            size     = getExpansionCount(CE);
3639            if (size != 0) {
3640                /*
3641                if there are less than 16 elements in expansion, we don't terminate
3642                */
3643                uint32_t count;
3644
3645                for (count = 0; count < size; count++) {
3646                    *(source->CEpos ++) = *CEOffset++;
3647
3648                    if (firstOffset >= 0) {
3649                        source->appendOffset(firstOffset + 1, *status);
3650                    }
3651                }
3652            } else {
3653                /* else, we do */
3654                while (*CEOffset != 0) {
3655                    *(source->CEpos ++) = *CEOffset ++;
3656
3657                    if (firstOffset >= 0) {
3658                        source->appendOffset(firstOffset + 1, *status);
3659                    }
3660                }
3661            }
3662
3663            if (firstOffset >= 0) {
3664                source->offsetReturn = source->offsetStore - 1;
3665                *(source->offsetBuffer) = firstOffset;
3666                if (source->offsetReturn == source->offsetBuffer) {
3667                    source->offsetStore = source->offsetBuffer;
3668                }
3669            } else {
3670                source->offsetRepeatCount += size - 1;
3671            }
3672
3673            source->toReturn = source->CEpos - 1;
3674            // in case of one element expansion, we
3675            // want to immediately return CEpos
3676            if(source->toReturn == source->CEs) {
3677                source->CEpos = source->CEs;
3678            }
3679
3680            return *(source->toReturn);
3681            }
3682
3683        case DIGIT_TAG:
3684            {
3685                /*
3686                We do a check to see if we want to collate digits as numbers; if so we generate
3687                a custom collation key. Otherwise we pull out the value stored in the expansion table.
3688                */
3689                uint32_t i;    /* general counter */
3690
3691                if (source->coll->numericCollation == UCOL_ON){
3692                    uint32_t digIndx = 0;
3693                    uint32_t endIndex = 0;
3694                    uint32_t leadingZeroIndex = 0;
3695                    uint32_t trailingZeroCount = 0;
3696
3697                    uint8_t collateVal = 0;
3698
3699                    UBool nonZeroValReached = FALSE;
3700
3701                    uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3702                    /*
3703                    We parse the source string until we hit a char that's NOT a digit.
3704                    Use this u_charDigitValue. This might be slow because we have to
3705                    handle surrogates...
3706                    */
3707                    /*
3708                    We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3709                    with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3710                    element we process when going backward. To determine how long that chunk might be, we may need to make
3711                    two passes through the loop that collects digits - one to see how long the string is (and how much is
3712                    leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3713                    more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3714                    element chunk after resetting the state to the initialState at the right side of the digit string.
3715                    */
3716                    uint32_t ceLimit = 0;
3717                    UChar initial_ch = ch;
3718                    collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3719                    backupState(source, &initialState);
3720
3721                    for(;;) {
3722                        collIterateState state = {0,0,0,0,0,0,0,0,0};
3723                        UChar32 char32 = 0;
3724                        int32_t digVal = 0;
3725
3726                        if (U16_IS_TRAIL (ch)) {
3727                            if (!collIter_bos(source)){
3728                                UChar lead = getPrevNormalizedChar(source, status);
3729                                if(U16_IS_LEAD(lead)) {
3730                                    char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3731                                    goBackOne(source);
3732                                } else {
3733                                    char32 = ch;
3734                                }
3735                            } else {
3736                                char32 = ch;
3737                            }
3738                        } else {
3739                            char32 = ch;
3740                        }
3741                        digVal = u_charDigitValue(char32);
3742
3743                        for(;;) {
3744                            // Make sure we have enough space. No longer needed;
3745                            // at this point the largest value of digIndx when we need to save data in numTempBuf
3746                            // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3747                            // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3748
3749                            // Skip over trailing zeroes, and keep a count of them.
3750                            if (digVal != 0)
3751                                nonZeroValReached = TRUE;
3752
3753                            if (nonZeroValReached) {
3754                                /*
3755                                We parse the digit string into base 100 numbers (this fits into a byte).
3756                                We only add to the buffer in twos, thus if we are parsing an odd character,
3757                                that serves as the 'tens' digit while the if we are parsing an even one, that
3758                                is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3759                                a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3760                                overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3761                                than all the other bytes.
3762
3763                                Since we're doing in this reverse we want to put the first digit encountered into the
3764                                ones place and the second digit encountered into the tens place.
3765                                */
3766
3767                                if ((digIndx + trailingZeroCount) % 2 == 1) {
3768                                    // High-order digit case (tens place)
3769                                    collateVal += (uint8_t)(digVal * 10);
3770
3771                                    // We cannot set leadingZeroIndex unless it has been set for the
3772                                    // low-order digit. Therefore, all we can do for the high-order
3773                                    // digit is turn it off, never on.
3774                                    // The only time we will have a high digit without a low is for
3775                                    // the very first non-zero digit, so no zero check is necessary.
3776                                    if (collateVal != 0)
3777                                        leadingZeroIndex = 0;
3778
3779                                    // The first pass through, digIndx may exceed the limit, but in that case
3780                                    // we no longer care about numTempBuf contents since they will be discarded
3781                                    if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3782                                        numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3783                                    }
3784                                    collateVal = 0;
3785                                } else {
3786                                    // Low-order digit case (ones place)
3787                                    collateVal = (uint8_t)digVal;
3788
3789                                    // Check for leading zeroes.
3790                                    if (collateVal == 0) {
3791                                        if (!leadingZeroIndex)
3792                                            leadingZeroIndex = (digIndx/2) + 2;
3793                                    } else
3794                                        leadingZeroIndex = 0;
3795
3796                                    // No need to write to buffer; the case of a last odd digit
3797                                    // is handled below.
3798                                }
3799                                ++digIndx;
3800                            } else
3801                                ++trailingZeroCount;
3802
3803                            if (!collIter_bos(source)) {
3804                                ch = getPrevNormalizedChar(source, status);
3805                                //goBackOne(source);
3806                                if (U16_IS_TRAIL(ch)) {
3807                                    backupState(source, &state);
3808                                    if (!collIter_bos(source)) {
3809                                        goBackOne(source);
3810                                        UChar lead = getPrevNormalizedChar(source, status);
3811
3812                                        if(U16_IS_LEAD(lead)) {
3813                                            char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3814                                        } else {
3815                                            loadState(source, &state, FALSE);
3816                                            char32 = ch;
3817                                        }
3818                                    }
3819                                } else
3820                                    char32 = ch;
3821
3822                                if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3823                                    if (char32 > 0xFFFF) {// For surrogates.
3824                                        loadState(source, &state, FALSE);
3825                                    }
3826                                    // Don't need to "reverse" the goBackOne call,
3827                                    // as this points to the next position to process..
3828                                    //if (char32 > 0xFFFF) // For surrogates.
3829                                    //getNextNormalizedChar(source);
3830                                    break;
3831                                }
3832
3833                                goBackOne(source);
3834                            }else
3835                                break;
3836                        }
3837
3838                        if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3839                            // our collation element is not too big, go ahead and finish with it
3840                            break;
3841                        }
3842                        // our digit string is too long for a collation element;
3843                        // set the limit for it, reset the state and begin again
3844                        ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3845                        if ( ceLimit == 0 ) {
3846                            ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3847                        }
3848                        ch = initial_ch;
3849                        loadState(source, &initialState, FALSE);
3850                        digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3851                        collateVal = 0;
3852                        nonZeroValReached = FALSE;
3853                    }
3854
3855                    if (! nonZeroValReached) {
3856                        digIndx = 2;
3857                        trailingZeroCount = 0;
3858                        numTempBuf[2] = 6;
3859                    }
3860
3861                    if ((digIndx + trailingZeroCount) % 2 != 0) {
3862                        numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3863                        digIndx += 1;       // The implicit leading zero
3864                    }
3865                    if (trailingZeroCount % 2 != 0) {
3866                        // We had to consume one trailing zero for the low digit
3867                        // of the least significant byte
3868                        digIndx += 1;       // The trailing zero not in the exponent
3869                        trailingZeroCount -= 1;
3870                    }
3871
3872                    endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3873
3874                    // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3875                    numTempBuf[2] -= 1;
3876
3877                    /*
3878                    We want to skip over the first two slots in the buffer. The first slot
3879                    is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3880                    sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3881                    The exponent must be adjusted by the number of leading zeroes, and the number of
3882                    trailing zeroes.
3883                    */
3884                    numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3885                    uint32_t exponent = (digIndx+trailingZeroCount)/2;
3886                    if (leadingZeroIndex)
3887                        exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3888                    numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3889
3890                    // Now transfer the collation key to our collIterate struct.
3891                    // The total size for our collation key is half of endIndex, rounded up.
3892                    int32_t size = (endIndex+1)/2;
3893                    if(!ensureCEsCapacity(source, size)) {
3894                        return (uint32_t)UCOL_NULLORDER;
3895                    }
3896                    *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3897                        (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3898                        UCOL_BYTE_COMMON; // Tertiary weight.
3899                    i = endIndex - 1; // Reset the index into the buffer.
3900                    while(i >= 2) {
3901                        uint32_t primWeight = numTempBuf[i--] << 8;
3902                        if ( i >= 2)
3903                            primWeight |= numTempBuf[i--];
3904                        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3905                    }
3906
3907                    source->toReturn = source->CEpos -1;
3908                    return *(source->toReturn);
3909                } else {
3910                    CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3911                    CE = *(CEOffset++);
3912                    break;
3913                }
3914            }
3915
3916        case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3917            {
3918                static const uint32_t
3919                    SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3920                //const uint32_t LCount = 19;
3921                static const uint32_t VCount = 21;
3922                static const uint32_t TCount = 28;
3923                //const uint32_t NCount = VCount * TCount;   /* 588 */
3924                //const uint32_t SCount = LCount * NCount;   /* 11172 */
3925
3926                uint32_t L = ch - SBase;
3927                /*
3928                divide into pieces.
3929                we do it in this order since some compilers can do % and / in one
3930                operation
3931                */
3932                uint32_t T = L % TCount;
3933                L /= TCount;
3934                uint32_t V = L % VCount;
3935                L /= VCount;
3936
3937                /* offset them */
3938                L += LBase;
3939                V += VBase;
3940                T += TBase;
3941
3942                int32_t firstOffset = (int32_t)(source->pos - source->string);
3943                source->appendOffset(firstOffset, *status);
3944
3945                /*
3946                 * return the first CE, but first put the rest into the expansion buffer
3947                 */
3948                if (!source->coll->image->jamoSpecial) {
3949                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3950                    *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3951                    source->appendOffset(firstOffset + 1, *status);
3952
3953                    if (T != TBase) {
3954                        *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3955                        source->appendOffset(firstOffset + 1, *status);
3956                    }
3957
3958                    source->toReturn = source->CEpos - 1;
3959
3960                    source->offsetReturn = source->offsetStore - 1;
3961                    if (source->offsetReturn == source->offsetBuffer) {
3962                        source->offsetStore = source->offsetBuffer;
3963                    }
3964
3965                    return *(source->toReturn);
3966                } else {
3967                    // Since Hanguls pass the FCD check, it is
3968                    // guaranteed that we won't be in
3969                    // the normalization buffer if something like this happens
3970
3971                    // Move Jamos into normalization buffer
3972                    UChar *tempbuffer = source->writableBuffer.getBuffer(5);
3973                    int32_t tempbufferLength, jamoOffset;
3974                    tempbuffer[0] = 0;
3975                    tempbuffer[1] = (UChar)L;
3976                    tempbuffer[2] = (UChar)V;
3977                    if (T != TBase) {
3978                        tempbuffer[3] = (UChar)T;
3979                        tempbufferLength = 4;
3980                    } else {
3981                        tempbufferLength = 3;
3982                    }
3983                    source->writableBuffer.releaseBuffer(tempbufferLength);
3984
3985                    // Indicate where to continue in main input string after exhausting the writableBuffer
3986                    if (source->pos  == source->string) {
3987                        jamoOffset = 0;
3988                        source->fcdPosition = NULL;
3989                    } else {
3990                        jamoOffset = source->pos - source->string;
3991                        source->fcdPosition       = source->pos-1;
3992                    }
3993
3994                    // Append offsets for the additional chars
3995                    // (not the 0, and not the L whose offsets match the original Hangul)
3996                    int32_t jamoRemaining = tempbufferLength - 2;
3997                    jamoOffset++; // appended offsets should match end of original Hangul
3998                    while (jamoRemaining-- > 0) {
3999                        source->appendOffset(jamoOffset, *status);
4000                    }
4001
4002                    source->offsetRepeatValue = jamoOffset;
4003
4004                    source->offsetReturn = source->offsetStore - 1;
4005                    if (source->offsetReturn == source->offsetBuffer) {
4006                        source->offsetStore = source->offsetBuffer;
4007                    }
4008
4009                    source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
4010                    source->origFlags         = source->flags;
4011                    source->flags            |= UCOL_ITER_INNORMBUF;
4012                    source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4013
4014                    return(UCOL_IGNORABLE);
4015                }
4016            }
4017
4018        case IMPLICIT_TAG:        /* everything that is not defined otherwise */
4019            return getPrevImplicit(ch, source);
4020
4021            // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4022        case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4023            return getPrevImplicit(ch, source);
4024
4025        case SURROGATE_TAG:  /* This is a surrogate pair */
4026            /* essentially an engaged lead surrogate. */
4027            /* if you have encountered it here, it means that a */
4028            /* broken sequence was encountered and this is an error */
4029            return UCOL_NOT_FOUND;
4030
4031        case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
4032            return UCOL_NOT_FOUND; /* broken surrogate sequence */
4033
4034        case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4035            {
4036                UChar32 cp = 0;
4037                UChar  prevChar;
4038                const UChar *prev;
4039                if (isAtStartPrevIterate(source)) {
4040                    /* we are at the start of the string, wrong place to be at */
4041                    return UCOL_NOT_FOUND;
4042                }
4043                if (source->pos != source->writableBuffer.getBuffer()) {
4044                    prev     = source->pos - 1;
4045                } else {
4046                    prev     = source->fcdPosition;
4047                }
4048                prevChar = *prev;
4049
4050                /* Handles Han and Supplementary characters here.*/
4051                if (U16_IS_LEAD(prevChar)) {
4052                    cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4053                    source->pos = prev;
4054                } else {
4055                    return UCOL_NOT_FOUND; /* like unassigned */
4056                }
4057
4058                return getPrevImplicit(cp, source);
4059            }
4060
4061            /* UCA is filled with these. Tailorings are NOT_FOUND */
4062            /* not yet implemented */
4063        case CHARSET_TAG:  /* this tag always returns */
4064            /* probably after 1.8 */
4065            return UCOL_NOT_FOUND;
4066
4067        default:           /* this tag always returns */
4068            *status = U_INTERNAL_PROGRAM_ERROR;
4069            CE=0;
4070            break;
4071        }
4072
4073        if (CE <= UCOL_NOT_FOUND) {
4074            break;
4075        }
4076    }
4077
4078    return CE;
4079}
4080
/* This used to be a function (kept below for reference); it is now implemented as a macro.           */
/* It is used to reverse parts of a buffer. We need this operation when doing continuation            */
/* secondaries in French.                                                                             */
4084/*
4085void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4086  uint8_t temp;
4087  while(start<end) {
4088    temp = *start;
4089    *start++ = *end;
4090    *end-- = temp;
4091  }
4092}
4093*/
4094
/*
 * Reverses, in place, the elements of type TYPE located between the pointers
 * start and end, both inclusive.
 *
 * start and end must be modifiable pointer lvalues: the macro evaluates them
 * several times and moves them toward each other, so after the macro has run
 * they no longer point at the original ends of the range.
 *
 * The body is wrapped in do { ... } while(0) so that an invocation followed by
 * a semicolon is exactly one statement and can be used safely as the
 * unbraced body of an if/else.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
} while(0)
4103
4104/****************************************************************************/
4105/* Following are the sortkey generation functions                           */
4106/*                                                                          */
4107/****************************************************************************/
4108
4109U_CAPI int32_t U_EXPORT2
4110ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4111                   const uint8_t *src2, int32_t src2Length,
4112                   uint8_t *dest, int32_t destCapacity) {
4113    /* check arguments */
4114    if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4115        src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4116        destCapacity<0 || (destCapacity>0 && dest==NULL)
4117    ) {
4118        /* error, attempt to write a zero byte and return 0 */
4119        if(dest!=NULL && destCapacity>0) {
4120            *dest=0;
4121        }
4122        return 0;
4123    }
4124
4125    /* check lengths and capacity */
4126    if(src1Length<0) {
4127        src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4128    }
4129    if(src2Length<0) {
4130        src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4131    }
4132
4133    int32_t destLength=src1Length+src2Length;
4134    if(destLength>destCapacity) {
4135        /* the merged sort key does not fit into the destination */
4136        return destLength;
4137    }
4138
4139    /* merge the sort keys with the same number of levels */
4140    uint8_t *p=dest;
4141    for(;;) {
4142        /* copy level from src1 not including 00 or 01 */
4143        uint8_t b;
4144        while((b=*src1)>=2) {
4145            ++src1;
4146            *p++=b;
4147        }
4148
4149        /* add a 02 merge separator */
4150        *p++=2;
4151
4152        /* copy level from src2 not including 00 or 01 */
4153        while((b=*src2)>=2) {
4154            ++src2;
4155            *p++=b;
4156        }
4157
4158        /* if both sort keys have another level, then add a 01 level separator and continue */
4159        if(*src1==1 && *src2==1) {
4160            ++src1;
4161            ++src2;
4162            *p++=1;
4163        } else {
4164            break;
4165        }
4166    }
4167
4168    /*
4169     * here, at least one sort key is finished now, but the other one
4170     * might have some contents left from containing more levels;
4171     * that contents is just appended to the result
4172     */
4173    if(*src1!=0) {
4174        /* src1 is not finished, therefore *src2==0, and src1 is appended */
4175        src2=src1;
4176    }
4177    /* append src2, "the other, unfinished sort key" */
4178    while((*p++=*src2++)!=0) {}
4179
4180    /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */
4181    return (int32_t)(p-dest);
4182}
4183
4184U_NAMESPACE_BEGIN
4185
4186class SortKeyByteSink : public ByteSink {
4187public:
4188    SortKeyByteSink(char *dest, int32_t destCapacity)
4189            : buffer_(dest), capacity_(destCapacity),
4190              appended_(0) {
4191        if (buffer_ == NULL) {
4192            capacity_ = 0;
4193        } else if(capacity_ < 0) {
4194            buffer_ = NULL;
4195            capacity_ = 0;
4196        }
4197    }
4198    virtual ~SortKeyByteSink();
4199
4200    virtual void Append(const char *bytes, int32_t n);
4201    void Append(uint32_t b) {
4202        if (appended_ < capacity_ || Resize(1, appended_)) {
4203            buffer_[appended_] = (char)b;
4204        }
4205        ++appended_;
4206    }
4207    void Append(uint32_t b1, uint32_t b2) {
4208        int32_t a2 = appended_ + 2;
4209        if (a2 <= capacity_ || Resize(2, appended_)) {
4210            buffer_[appended_] = (char)b1;
4211            buffer_[appended_ + 1] = (char)b2;
4212        } else if(appended_ < capacity_) {
4213            buffer_[appended_] = (char)b1;
4214        }
4215        appended_ = a2;
4216    }
4217    virtual char *GetAppendBuffer(int32_t min_capacity,
4218                                  int32_t desired_capacity_hint,
4219                                  char *scratch, int32_t scratch_capacity,
4220                                  int32_t *result_capacity);
4221    int32_t NumberOfBytesAppended() const { return appended_; }
4222    /** @return FALSE if memory allocation failed */
4223    UBool IsOk() const { return buffer_ != NULL; }
4224
4225protected:
4226    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0;
4227    virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
4228
4229    void SetNotOk() {
4230        buffer_ = NULL;
4231        capacity_ = 0;
4232    }
4233
4234    char *buffer_;
4235    int32_t capacity_;
4236    int32_t appended_;
4237
4238private:
4239    SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
4240    SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
4241};
4242
4243SortKeyByteSink::~SortKeyByteSink() {}
4244
4245void
4246SortKeyByteSink::Append(const char *bytes, int32_t n) {
4247    if (n <= 0 || bytes == NULL) {
4248        return;
4249    }
4250    int32_t length = appended_;
4251    appended_ += n;
4252    if ((buffer_ + length) == bytes) {
4253        return;  // the caller used GetAppendBuffer() and wrote the bytes already
4254    }
4255    int32_t available = capacity_ - length;
4256    if (n <= available) {
4257        uprv_memcpy(buffer_ + length, bytes, n);
4258    } else {
4259        AppendBeyondCapacity(bytes, n, length);
4260    }
4261}
4262
4263char *
4264SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
4265                                 int32_t desired_capacity_hint,
4266                                 char *scratch,
4267                                 int32_t scratch_capacity,
4268                                 int32_t *result_capacity) {
4269    if (min_capacity < 1 || scratch_capacity < min_capacity) {
4270        *result_capacity = 0;
4271        return NULL;
4272    }
4273    int32_t available = capacity_ - appended_;
4274    if (available >= min_capacity) {
4275        *result_capacity = available;
4276        return buffer_ + appended_;
4277    } else if (Resize(desired_capacity_hint, appended_)) {
4278        *result_capacity = capacity_ - appended_;
4279        return buffer_ + appended_;
4280    } else {
4281        *result_capacity = scratch_capacity;
4282        return scratch;
4283    }
4284}
4285
4286class FixedSortKeyByteSink : public SortKeyByteSink {
4287public:
4288    FixedSortKeyByteSink(char *dest, int32_t destCapacity)
4289            : SortKeyByteSink(dest, destCapacity) {}
4290    virtual ~FixedSortKeyByteSink();
4291
4292private:
4293    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4294    virtual UBool Resize(int32_t appendCapacity, int32_t length);
4295};
4296
4297FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
4298
4299void
4300FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
4301    // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4302    // Fill the buffer completely.
4303    int32_t available = capacity_ - length;
4304    if (available > 0) {
4305        uprv_memcpy(buffer_ + length, bytes, available);
4306    }
4307}
4308
4309UBool
4310FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
4311    return FALSE;
4312}
4313
4314class CollationKeyByteSink : public SortKeyByteSink {
4315public:
4316    CollationKeyByteSink(CollationKey &key)
4317            : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
4318              key_(key) {}
4319    virtual ~CollationKeyByteSink();
4320
4321private:
4322    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4323    virtual UBool Resize(int32_t appendCapacity, int32_t length);
4324
4325    CollationKey &key_;
4326};
4327
4328CollationKeyByteSink::~CollationKeyByteSink() {}
4329
4330void
4331CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
4332    // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4333    if (Resize(n, length)) {
4334        uprv_memcpy(buffer_ + length, bytes, n);
4335    }
4336}
4337
4338UBool
4339CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
4340    if (buffer_ == NULL) {
4341        return FALSE;  // allocation failed before already
4342    }
4343    int32_t newCapacity = 2 * capacity_;
4344    int32_t altCapacity = length + 2 * appendCapacity;
4345    if (newCapacity < altCapacity) {
4346        newCapacity = altCapacity;
4347    }
4348    if (newCapacity < 200) {
4349        newCapacity = 200;
4350    }
4351    uint8_t *newBuffer = key_.reallocate(newCapacity, length);
4352    if (newBuffer == NULL) {
4353        SetNotOk();
4354        return FALSE;
4355    }
4356    buffer_ = reinterpret_cast<char *>(newBuffer);
4357    capacity_ = newCapacity;
4358    return TRUE;
4359}
4360
4361/**
4362 * uint8_t byte buffer, similar to CharString but simpler.
4363 */
4364class SortKeyLevel : public UMemory {
4365public:
4366    SortKeyLevel() : len(0), ok(TRUE) {}
4367    ~SortKeyLevel() {}
4368
4369    /** @return FALSE if memory allocation failed */
4370    UBool isOk() const { return ok; }
4371    UBool isEmpty() const { return len == 0; }
4372    int32_t length() const { return len; }
4373    const uint8_t *data() const { return buffer.getAlias(); }
4374    uint8_t operator[](int32_t index) const { return buffer[index]; }
4375
4376    void appendByte(uint32_t b);
4377
4378    void appendTo(ByteSink &sink) const {
4379        sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);
4380    }
4381
4382    uint8_t &lastByte() {
4383        U_ASSERT(len > 0);
4384        return buffer[len - 1];
4385    }
4386
4387    uint8_t *getLastFewBytes(int32_t n) {
4388        if (ok && len >= n) {
4389            return buffer.getAlias() + len - n;
4390        } else {
4391            return NULL;
4392        }
4393    }
4394
4395private:
4396    MaybeStackArray<uint8_t, 40> buffer;
4397    int32_t len;
4398    UBool ok;
4399
4400    UBool ensureCapacity(int32_t appendCapacity);
4401
4402    SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class
4403    SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class
4404};
4405
4406void SortKeyLevel::appendByte(uint32_t b) {
4407    if(len < buffer.getCapacity() || ensureCapacity(1)) {
4408        buffer[len++] = (uint8_t)b;
4409    }
4410}
4411
4412UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {
4413    if(!ok) {
4414        return FALSE;
4415    }
4416    int32_t newCapacity = 2 * buffer.getCapacity();
4417    int32_t altCapacity = len + 2 * appendCapacity;
4418    if (newCapacity < altCapacity) {
4419        newCapacity = altCapacity;
4420    }
4421    if (newCapacity < 200) {
4422        newCapacity = 200;
4423    }
4424    if(buffer.resize(newCapacity, len)==NULL) {
4425        return ok = FALSE;
4426    }
4427    return TRUE;
4428}
4429
4430U_NAMESPACE_END
4431
4432/* sortkey API */
4433U_CAPI int32_t U_EXPORT2
4434ucol_getSortKey(const    UCollator    *coll,
4435        const    UChar        *source,
4436        int32_t        sourceLength,
4437        uint8_t        *result,
4438        int32_t        resultLength)
4439{
4440    UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4441    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4442        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4443            ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4444    }
4445
4446    if(coll->delegate != NULL) {
4447      return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);
4448    }
4449
4450    UErrorCode status = U_ZERO_ERROR;
4451    int32_t keySize   = 0;
4452
4453    if(source != NULL) {
4454        // source == NULL is actually an error situation, but we would need to
4455        // have an error code to return it. Until we introduce a new
4456        // API, it stays like this
4457
4458        /* this uses the function pointer that is set in updateinternalstate */
4459        /* currently, there are two funcs: */
4460        /*ucol_calcSortKey(...);*/
4461        /*ucol_calcSortKeySimpleTertiary(...);*/
4462
4463        uint8_t noDest[1] = { 0 };
4464        if(result == NULL) {
4465            // Distinguish pure preflighting from an allocation error.
4466            result = noDest;
4467            resultLength = 0;
4468        }
4469        FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength);
4470        coll->sortKeyGen(coll, source, sourceLength, sink, &status);
4471        if(U_SUCCESS(status)) {
4472            keySize = sink.NumberOfBytesAppended();
4473        }
4474    }
4475    UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4476    UTRACE_EXIT_STATUS(status);
4477    return keySize;
4478}
4479
4480U_CFUNC int32_t
4481ucol_getCollationKey(const UCollator *coll,
4482                     const UChar *source, int32_t sourceLength,
4483                     CollationKey &key,
4484                     UErrorCode &errorCode) {
4485    CollationKeyByteSink sink(key);
4486    coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);
4487    return sink.NumberOfBytesAppended();
4488}
4489
4490// Is this primary weight compressible?
4491// Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4492// TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4493static inline UBool
4494isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4495    return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
4496}
4497
4498static
4499inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {
4500    if (caseShift  == 0) {
4501        cases.appendByte(UCOL_CASE_BYTE_START);
4502        caseShift = UCOL_CASE_SHIFT_START;
4503    }
4504}
4505
4506// Packs the secondary buffer when processing French locale.
4507static void
4508packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
4509    secondaries += secsize;  // We read the secondary-level bytes back to front.
4510    uint8_t secondary;
4511    int32_t count2 = 0;
4512    int32_t i = 0;
4513    // we use i here since the key size already accounts for terminators, so we'll discard the increment
4514    for(i = 0; i<secsize; i++) {
4515        secondary = *(secondaries-i-1);
4516        /* This is compression code. */
4517        if (secondary == UCOL_COMMON2) {
4518            ++count2;
4519        } else {
4520            if (count2 > 0) {
4521                if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4522                    while (count2 > UCOL_TOP_COUNT2) {
4523                        result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4524                        count2 -= (uint32_t)UCOL_TOP_COUNT2;
4525                    }
4526                    result.Append(UCOL_COMMON_TOP2 - (count2-1));
4527                } else {
4528                    while (count2 > UCOL_BOT_COUNT2) {
4529                        result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4530                        count2 -= (uint32_t)UCOL_BOT_COUNT2;
4531                    }
4532                    result.Append(UCOL_COMMON_BOT2 + (count2-1));
4533                }
4534                count2 = 0;
4535            }
4536            result.Append(secondary);
4537        }
4538    }
4539    if (count2 > 0) {
4540        while (count2 > UCOL_BOT_COUNT2) {
4541            result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4542            count2 -= (uint32_t)UCOL_BOT_COUNT2;
4543        }
4544        result.Append(UCOL_COMMON_BOT2 + (count2-1));
4545    }
4546}
4547
4548#define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4549
/**
 * This is the sortkey work horse function.
 *
 * It fetches the collation elements (CEs) of source one at a time and
 * distributes their weights over the levels: primary bytes go straight
 * into result, while secondary, case, tertiary and quaternary bytes are
 * collected in separate SortKeyLevel buffers. Runs of "common" weights on the
 * secondary, tertiary and quaternary levels are counted and written as single
 * count bytes. After the last CE, the enabled levels are appended to result,
 * each preceded by UCOL_LEVELTERMINATOR, and the key is closed with a 0 byte.
 *
 * @param coll         collator whose attributes (strength, caseLevel, frenchCollation,
 *                     alternateHandling, hiraganaQ, normalizationMode, ...) control the key
 * @param source       the text
 * @param sourceLength length of source, or -1, in which case u_strlen(source) is used
 * @param result       sink that receives the sort key bytes
 * @param status       in/out error code; the function returns immediately if it already
 *                     indicates a failure. Set to U_MEMORY_ALLOCATION_ERROR if one of the
 *                     level buffers or the sink reports that it is not ok.
 */
U_CFUNC void U_CALLCONV
ucol_calcSortKey(const    UCollator    *coll,
        const    UChar        *source,
        int32_t        sourceLength,
        SortKeyByteSink &result,
        UErrorCode *status)
{
    if(U_FAILURE(*status)) {
        return;
    }

    /* Primary bytes are written directly to the output; the other levels are buffered. */
    SortKeyByteSink &primaries = result;
    SortKeyLevel secondaries;
    SortKeyLevel tertiaries;
    SortKeyLevel cases;
    SortKeyLevel quads;

    UnicodeString normSource;

    int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);

    UColAttributeValue strength = coll->strength;

    /* Per-level thresholds: 0 if the level is enabled, 0xFF if it is not. */
    /* A weight byte is only recorded if it is greater than the threshold, */
    /* and no uint8_t is greater than 0xFF. */
    uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
    uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
    uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
    UBool  compareIdent = (strength == UCOL_IDENTICAL);
    UBool  doCase = (coll->caseLevel == UCOL_ON);
    UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
    UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
    //UBool  qShifted = shifted && (compareQuad == 0);
    UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);

    uint32_t variableTopValue = coll->variableTopValue;
    // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
    // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
    // Note: despite the macro-like names, the next three are local variables.
    uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
    uint8_t UCOL_HIRAGANA_QUAD = 0;
    if(doHiragana) {
        UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
        /* allocate one more space for hiragana, value for hiragana */
    }
    uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);

    /* support for special features like caselevel and funky secondaries */
    int32_t lastSecondaryLength = 0;
    uint32_t caseShift = 0;

    /* If we need to normalize, we'll do it all at once at the beginning! */
    /* Identical strength uses NFD, normalization mode on uses FCD, otherwise no normalizer. */
    const Normalizer2 *norm2;
    if(compareIdent) {
        norm2 = Normalizer2Factory::getNFDInstance(*status);
    } else if(coll->normalizationMode != UCOL_OFF) {
        norm2 = Normalizer2Factory::getFCDInstance(*status);
    } else {
        norm2 = NULL;
    }
    if(norm2 != NULL) {
        /* Alias the input without copying, then normalize only the part after the quick-check-yes prefix. */
        normSource.setTo(FALSE, source, len);
        int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
        if(qcYesLength != len) {
            UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
            normSource.truncate(qcYesLength);
            norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
            /* From here on, iterate over the normalized copy. */
            source = normSource.getBuffer();
            len = normSource.length();
        }
    }
    collIterate s;
    IInit_collIterate(coll, source, len, &s, status);
    if(U_FAILURE(*status)) {
        return;
    }
    s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.

    uint32_t order = 0;

    uint8_t primary1 = 0;
    uint8_t primary2 = 0;
    uint8_t secondary = 0;
    uint8_t tertiary = 0;
    uint8_t caseSwitch = coll->caseSwitch;
    uint8_t tertiaryMask = coll->tertiaryMask;
    int8_t tertiaryAddition = coll->tertiaryAddition;
    uint8_t tertiaryTop = coll->tertiaryTop;
    uint8_t tertiaryBottom = coll->tertiaryBottom;
    uint8_t tertiaryCommon = coll->tertiaryCommon;
    uint8_t caseBits = 0;

    UBool wasShifted = FALSE;
    UBool notIsContinuation = FALSE;

    /* Pending run lengths of common weights on levels 2, 3 and 4. */
    uint32_t count2 = 0, count3 = 0, count4 = 0;
    /* Lead byte of the current primary compression run; 0 if there is none. */
    uint8_t leadPrimary = 0;

    for(;;) {
        order = ucol_IGetNextCE(coll, &s, status);
        if(order == UCOL_NO_MORE_CES) {
            break;
        }

        /* A completely zero CE contributes nothing to any level. */
        if(order == 0) {
            continue;
        }

        notIsContinuation = !isContinuation(order);

        if(notIsContinuation) {
            tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
        } else {
            tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
        }

        /* order is shifted in place: after these three lines it holds only the */
        /* upper 16 bits of the CE, i.e. the two primary bytes. */
        secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
        primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
        primary1 = (uint8_t)(order >> 8);

        /* Keep the unpermuted lead byte: isCompressible() is asked about the original one. */
        uint8_t originalPrimary1 = primary1;
        if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
            primary1 = coll->leadBytePermutationTable[primary1];
        }

        /* Here order is the 16-bit primary, which is what gets compared with variableTopValue. */
        if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
                        || (!notIsContinuation && wasShifted)))
            || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
        {
            /* and other ignorables should be removed if following a shifted code point */
            if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
                /* we should just completely ignore it */
                continue;
            }
            if(compareQuad == 0) {
                /* Flush the pending run of common quaternaries before writing the shifted primary. */
                if(count4 > 0) {
                    while (count4 > UCOL_BOT_COUNT4) {
                        quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
                        count4 -= UCOL_BOT_COUNT4;
                    }
                    quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
                    count4 = 0;
                }
                /* We are dealing with a variable and we're treating them as shifted */
                /* This is a shifted ignorable */
                if(primary1 != 0) { /* we need to check this since we could be in continuation */
                    quads.appendByte(primary1);
                }
                if(primary2 != 0) {
                    quads.appendByte(primary2);
                }
            }
            wasShifted = TRUE;
        } else {
            wasShifted = FALSE;
            /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
            /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
            /* be zero with non zero primary1.                                                                             */
            /* regular and simple sortkey calc */
            if(primary1 != UCOL_IGNORABLE) {
                if(notIsContinuation) {
                    if(leadPrimary == primary1) {
                        /* Same lead byte as the running compression: only the second byte is written. */
                        primaries.Append(primary2);
                    } else {
                        if(leadPrimary != 0) {
                            /* Close the previous compression run with a byte that depends on */
                            /* whether the new lead byte is greater or smaller than the old one. */
                            primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
                        }
                        if(primary2 == UCOL_IGNORABLE) {
                            /* one byter, not compressed */
                            primaries.Append(primary1);
                            leadPrimary = 0;
                        } else if(isCompressible(coll, originalPrimary1)) {
                            /* compress */
                            primaries.Append(leadPrimary = primary1, primary2);
                        } else {
                            leadPrimary = 0;
                            primaries.Append(primary1, primary2);
                        }
                    }
                } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
                    if(primary2 == UCOL_IGNORABLE) {
                        primaries.Append(primary1);
                    } else {
                        primaries.Append(primary1, primary2);
                    }
                }
            }

            if(secondary > compareSec) {
                if(!isFrenchSec) {
                    /* This is compression code. */
                    if (secondary == UCOL_COMMON2 && notIsContinuation) {
                        ++count2;
                    } else {
                        if (count2 > 0) {
                            if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
                                while (count2 > UCOL_TOP_COUNT2) {
                                    secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
                                    count2 -= (uint32_t)UCOL_TOP_COUNT2;
                                }
                                secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
                            } else {
                                while (count2 > UCOL_BOT_COUNT2) {
                                    secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
                                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
                                }
                                secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
                            }
                            count2 = 0;
                        }
                        secondaries.appendByte(secondary);
                    }
                } else {
                    /* Do the special handling for French secondaries */
                    /* We need to get continuation elements and do intermediate restore */
                    /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
                    /* No compression happens here; packFrench() compresses while reversing the whole level. */
                    if(notIsContinuation) {
                        if (lastSecondaryLength > 1) {
                            uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
                            if (frenchStartPtr != NULL) {
                                /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
                                uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
                                uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
                            }
                        }
                        lastSecondaryLength = 1;
                    } else {
                        ++lastSecondaryLength;
                    }
                    secondaries.appendByte(secondary);
                }
            }

            if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
                // do the case level if we need to do it. We don't want to calculate
                // case level for primary ignorables if we have only primary strength and case level
                // otherwise we would break well formedness of CEs
                doCaseShift(cases, caseShift);
                if(notIsContinuation) {
                    /* The case bits are the top two bits of the tertiary byte. */
                    caseBits = (uint8_t)(tertiary & 0xC0);

                    /* Case bits are packed one bit at a time into the last byte of cases; */
                    /* caseShift is the number of bit positions still free in that byte. */
                    if(tertiary != 0) {
                        if(coll->caseFirst == UCOL_UPPER_FIRST) {
                            if((caseBits & 0xC0) == 0) {
                                cases.lastByte() |= 1 << (--caseShift);
                            } else {
                                cases.lastByte() |= 0 << (--caseShift);
                                /* second bit */
                                doCaseShift(cases, caseShift);
                                cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);
                            }
                        } else {
                            if((caseBits & 0xC0) == 0) {
                                cases.lastByte() |= 0 << (--caseShift);
                            } else {
                                cases.lastByte() |= 1 << (--caseShift);
                                /* second bit */
                                doCaseShift(cases, caseShift);
                                cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);
                            }
                        }
                    }
                }
            } else {
                if(notIsContinuation) {
                    tertiary ^= caseSwitch;
                }
            }

            tertiary &= tertiaryMask;
            if(tertiary > compareTer) {
                /* This is compression code. */
                /* sequence size check is included in the if clause */
                if (tertiary == tertiaryCommon && notIsContinuation) {
                    ++count3;
                } else {
                    if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
                        tertiary += tertiaryAddition;
                    } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
                        tertiary -= tertiaryAddition;
                    }
                    if (count3 > 0) {
                        if ((tertiary > tertiaryCommon)) {
                            while (count3 > coll->tertiaryTopCount) {
                                tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
                                count3 -= (uint32_t)coll->tertiaryTopCount;
                            }
                            tertiaries.appendByte(tertiaryTop - (count3-1));
                        } else {
                            while (count3 > coll->tertiaryBottomCount) {
                                tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
                                count3 -= (uint32_t)coll->tertiaryBottomCount;
                            }
                            tertiaries.appendByte(tertiaryBottom + (count3-1));
                        }
                        count3 = 0;
                    }
                    tertiaries.appendByte(tertiary);
                }
            }

            if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
                if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
                    if(count4>0) { // Close this part
                        while (count4 > UCOL_BOT_COUNT4) {
                            quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
                            count4 -= UCOL_BOT_COUNT4;
                        }
                        quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
                        count4 = 0;
                    }
                    quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana
                } else { // This wasn't Hiragana, so we can continue adding stuff
                    count4++;
                }
            }
        }
    }

    /* Here, we are generally done with processing */
    /* bailing out would not be too productive */

    /* ok collects the allocation state of every level buffer that is written to the key. */
    UBool ok = TRUE;
    if(U_SUCCESS(*status)) {
        /* we have done all the CE's, now let's put them together to form a key */
        if(compareSec == 0) {
            /* Flush a pending run of common secondaries (only counted in the non-French path). */
            if (count2 > 0) {
                while (count2 > UCOL_BOT_COUNT2) {
                    secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
                    count2 -= (uint32_t)UCOL_BOT_COUNT2;
                }
                secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
            }
            result.Append(UCOL_LEVELTERMINATOR);
            if(!secondaries.isOk()) {
                ok = FALSE;
            } else if(!isFrenchSec) {
                secondaries.appendTo(result);
            } else {
                // If there are any unresolved continuation secondaries,
                // reverse them here so that we can reverse the whole secondary thing.
                if (lastSecondaryLength > 1) {
                    uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
                    if (frenchStartPtr != NULL) {
                        /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
                        uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
                        uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
                    }
                }
                packFrench(secondaries.data(), secondaries.length(), result);
            }
        }

        if(doCase) {
            ok &= cases.isOk();
            result.Append(UCOL_LEVELTERMINATOR);
            cases.appendTo(result);
        }

        if(compareTer == 0) {
            if (count3 > 0) {
                /* NOTE(review): unlike the in-loop flush, this branch loops with >= and */
                /* writes tertiaryTop - count3 without the -1; presumably deliberate - confirm. */
                if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
                    while (count3 >= coll->tertiaryTopCount) {
                        tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
                        count3 -= (uint32_t)coll->tertiaryTopCount;
                    }
                    tertiaries.appendByte(tertiaryTop - count3);
                } else {
                    while (count3 > coll->tertiaryBottomCount) {
                        tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
                        count3 -= (uint32_t)coll->tertiaryBottomCount;
                    }
                    tertiaries.appendByte(tertiaryBottom + (count3-1));
                }
            }
            ok &= tertiaries.isOk();
            result.Append(UCOL_LEVELTERMINATOR);
            tertiaries.appendTo(result);

            /* The quaternary and identical levels are only written inside the tertiary block. */
            if(compareQuad == 0/*qShifted == TRUE*/) {
                if(count4 > 0) {
                    while (count4 > UCOL_BOT_COUNT4) {
                        quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
                        count4 -= UCOL_BOT_COUNT4;
                    }
                    quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
                }
                ok &= quads.isOk();
                result.Append(UCOL_LEVELTERMINATOR);
                quads.appendTo(result);
            }

            if(compareIdent) {
                result.Append(UCOL_LEVELTERMINATOR);
                u_writeIdenticalLevelRun(s.string, len, result);
            }
        }
        /* Final terminating zero byte of the sort key. */
        result.Append(0);
    }

    /* To avoid memory leak, free the offset buffer if necessary. */
    ucol_freeOffsetBuffer(&s);

    /* Report a failed level buffer or sink as an allocation error, without overwriting an earlier failure. */
    ok &= result.IsOk();
    if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
}
4953
4954
4955U_CFUNC void U_CALLCONV
4956ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
4957        const    UChar        *source,
4958        int32_t        sourceLength,
4959        SortKeyByteSink &result,
4960        UErrorCode *status)
4961{
4962    U_ALIGN_CODE(16);
4963
4964    if(U_FAILURE(*status)) {
4965        return;
4966    }
4967
4968    SortKeyByteSink &primaries = result;
4969    SortKeyLevel secondaries;
4970    SortKeyLevel tertiaries;
4971
4972    UnicodeString normSource;
4973
4974    int32_t len =  sourceLength;
4975
4976    /* If we need to normalize, we'll do it all at once at the beginning! */
4977    if(coll->normalizationMode != UCOL_OFF) {
4978        normSource.setTo(len < 0, source, len);
4979        const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
4980        int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
4981        if(qcYesLength != normSource.length()) {
4982            UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4983            normSource.truncate(qcYesLength);
4984            norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4985            source = normSource.getBuffer();
4986            len = normSource.length();
4987        }
4988    }
4989    collIterate s;
4990    IInit_collIterate(coll, (UChar *)source, len, &s, status);
4991    if(U_FAILURE(*status)) {
4992        return;
4993    }
4994    s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
4995
4996    uint32_t order = 0;
4997
4998    uint8_t primary1 = 0;
4999    uint8_t primary2 = 0;
5000    uint8_t secondary = 0;
5001    uint8_t tertiary = 0;
5002    uint8_t caseSwitch = coll->caseSwitch;
5003    uint8_t tertiaryMask = coll->tertiaryMask;
5004    int8_t tertiaryAddition = coll->tertiaryAddition;
5005    uint8_t tertiaryTop = coll->tertiaryTop;
5006    uint8_t tertiaryBottom = coll->tertiaryBottom;
5007    uint8_t tertiaryCommon = coll->tertiaryCommon;
5008
5009    UBool notIsContinuation = FALSE;
5010
5011    uint32_t count2 = 0, count3 = 0;
5012    uint8_t leadPrimary = 0;
5013
5014    for(;;) {
5015        order = ucol_IGetNextCE(coll, &s, status);
5016
5017        if(order == 0) {
5018            continue;
5019        }
5020
5021        if(order == UCOL_NO_MORE_CES) {
5022            break;
5023        }
5024
5025        notIsContinuation = !isContinuation(order);
5026
5027        if(notIsContinuation) {
5028            tertiary = (uint8_t)((order & tertiaryMask));
5029        } else {
5030            tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5031        }
5032
5033        secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5034        primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5035        primary1 = (uint8_t)(order >> 8);
5036
5037        uint8_t originalPrimary1 = primary1;
5038        if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5039            primary1 = coll->leadBytePermutationTable[primary1];
5040        }
5041
5042        /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5043        /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
5044        /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5045        /* regular and simple sortkey calc */
5046        if(primary1 != UCOL_IGNORABLE) {
5047            if(notIsContinuation) {
5048                if(leadPrimary == primary1) {
5049                    primaries.Append(primary2);
5050                } else {
5051                    if(leadPrimary != 0) {
5052                        primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5053                    }
5054                    if(primary2 == UCOL_IGNORABLE) {
5055                        /* one byter, not compressed */
5056                        primaries.Append(primary1);
5057                        leadPrimary = 0;
5058                    } else if(isCompressible(coll, originalPrimary1)) {
5059                        /* compress */
5060                        primaries.Append(leadPrimary = primary1, primary2);
5061                    } else {
5062                        leadPrimary = 0;
5063                        primaries.Append(primary1, primary2);
5064                    }
5065                }
5066            } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5067                if(primary2 == UCOL_IGNORABLE) {
5068                    primaries.Append(primary1);
5069                } else {
5070                    primaries.Append(primary1, primary2);
5071                }
5072            }
5073        }
5074
5075        if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5076            /* This is compression code. */
5077            if (secondary == UCOL_COMMON2 && notIsContinuation) {
5078                ++count2;
5079            } else {
5080                if (count2 > 0) {
5081                    if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5082                        while (count2 > UCOL_TOP_COUNT2) {
5083                            secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5084                            count2 -= (uint32_t)UCOL_TOP_COUNT2;
5085                        }
5086                        secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
5087                    } else {
5088                        while (count2 > UCOL_BOT_COUNT2) {
5089                            secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5090                            count2 -= (uint32_t)UCOL_BOT_COUNT2;
5091                        }
5092                        secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5093                    }
5094                    count2 = 0;
5095                }
5096                secondaries.appendByte(secondary);
5097            }
5098        }
5099
5100        if(notIsContinuation) {
5101            tertiary ^= caseSwitch;
5102        }
5103
5104        if(tertiary > 0) {
5105            /* This is compression code. */
5106            /* sequence size check is included in the if clause */
5107            if (tertiary == tertiaryCommon && notIsContinuation) {
5108                ++count3;
5109            } else {
5110                if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5111                    tertiary += tertiaryAddition;
5112                } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5113                    tertiary -= tertiaryAddition;
5114                }
5115                if (count3 > 0) {
5116                    if ((tertiary > tertiaryCommon)) {
5117                        while (count3 > coll->tertiaryTopCount) {
5118                            tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5119                            count3 -= (uint32_t)coll->tertiaryTopCount;
5120                        }
5121                        tertiaries.appendByte(tertiaryTop - (count3-1));
5122                    } else {
5123                        while (count3 > coll->tertiaryBottomCount) {
5124                            tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5125                            count3 -= (uint32_t)coll->tertiaryBottomCount;
5126                        }
5127                        tertiaries.appendByte(tertiaryBottom + (count3-1));
5128                    }
5129                    count3 = 0;
5130                }
5131                tertiaries.appendByte(tertiary);
5132            }
5133        }
5134    }
5135
5136    UBool ok = TRUE;
5137    if(U_SUCCESS(*status)) {
5138        /* we have done all the CE's, now let's put them together to form a key */
5139        if (count2 > 0) {
5140            while (count2 > UCOL_BOT_COUNT2) {
5141                secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5142                count2 -= (uint32_t)UCOL_BOT_COUNT2;
5143            }
5144            secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5145        }
5146        ok &= secondaries.isOk();
5147        result.Append(UCOL_LEVELTERMINATOR);
5148        secondaries.appendTo(result);
5149
5150        if (count3 > 0) {
5151            if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5152                while (count3 >= coll->tertiaryTopCount) {
5153                    tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5154                    count3 -= (uint32_t)coll->tertiaryTopCount;
5155                }
5156                tertiaries.appendByte(tertiaryTop - count3);
5157            } else {
5158                while (count3 > coll->tertiaryBottomCount) {
5159                    tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5160                    count3 -= (uint32_t)coll->tertiaryBottomCount;
5161                }
5162                tertiaries.appendByte(tertiaryBottom + (count3-1));
5163            }
5164        }
5165        ok &= tertiaries.isOk();
5166        result.Append(UCOL_LEVELTERMINATOR);
5167        tertiaries.appendTo(result);
5168
5169        result.Append(0);
5170    }
5171
5172    /* To avoid memory leak, free the offset buffer if necessary. */
5173    ucol_freeOffsetBuffer(&s);
5174
5175    ok &= result.IsOk();
5176    if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
5177}
5178
5179static inline
5180UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5181    UBool notIsContinuation = !isContinuation(CE);
5182    uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5183    if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5184               || (!notIsContinuation && *wasShifted)))
5185        || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5186    {
5187        // The stuff below should probably be in the sortkey code... maybe not...
5188        if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5189            /* we should just completely ignore it */
5190            *wasShifted = TRUE;
5191            //continue;
5192        }
5193        //*wasShifted = TRUE;
5194        return TRUE;
5195    } else {
5196        *wasShifted = FALSE;
5197        return FALSE;
5198    }
5199}
5200static inline
5201void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5202    if(level < maxLevel) {
5203        dest[i++] = UCOL_LEVELTERMINATOR;
5204    } else {
5205        dest[i++] = 0;
5206    }
5207}
5208
/**
 * Level identifiers for partial sort key generation.
 * The identifiers are meant to fit into three bits, which is why eight
 * values (0..7) are defined even though one of them is never produced.
 */
enum {
    /** primary level */
    UCOL_PSK_PRIMARY    = 0,
    /** secondary level */
    UCOL_PSK_SECONDARY  = 1,
    /** case level */
    UCOL_PSK_CASE       = 2,
    /** tertiary level */
    UCOL_PSK_TERTIARY   = 3,
    /** quaternary level */
    UCOL_PSK_QUATERNARY = 4,
    /** extra level, not used - it only exists because three bits are available */
    UCOL_PSK_QUIN       = 5,
    /** identical level */
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of the sort key; will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** number of level identifiers (one past the last valid value) */
    UCOL_PSK_LIMIT
};
5221
/**
 * Layout of the collation processing state kept in the state[1] word.
 * Each piece of state is described by a pair of constants: shift the word
 * right by *_SHIFT and then AND the result with *_MASK to extract it.
 */
enum {
    /* --- level identifier: one of the UCOL_PSK_* level values, three bits --- */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x07,

    /* --- byte count / French done: a single bit ---
     * Number of bytes of a primary or quaternary weight already written.
     * It can only be 0 or 1, since at most two bytes are taken from a
     * primary or quaternary. The same bit also records that the French
     * secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x01,

    /* --- was shifted: Boolean, a single bit; was the last value shifted --- */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x01,

    /* --- used French: how many French bytes have already been written, two bits ---
     * For French the secondary values have to be reversed, but continuations
     * must keep their order: abc1c2c3de has to become edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x03,

    /* --- bocsu bytes: two bits --- */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK  = 0x03,

    /* --- consumed CEs: the mask is 19 bits wide --- */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK  = 0x7FFFF
};
5247
// Macro calculating the number of expansion CEs available in the iteration
// state s, computed as (s).CEpos - (s).toReturn.
// The whole expansion is parenthesized so that the result stays a single
// operand when the macro is used inside a larger expression
// (for example after '!' or next to '*').
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5250
5251
5252/** main sortkey part procedure. On the first call,
5253 *  you should pass in a collator, an iterator, empty state
5254 *  state[0] == state[1] == 0, a buffer to hold results
5255 *  number of bytes you need and an error code pointer.
5256 *  Make sure your buffer is big enough to hold the wanted
5257 *  number of sortkey bytes. I don't check.
5258 *  The only meaningful status you can get back is
5259 *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5260 *  have been dealt a raw deal and that you probably won't
5261 *  be able to use partial sortkey generation for this
5262 *  particular combination of string and collator. This
5263 *  is highly unlikely, but you should still check the error code.
5264 *  Any other status means that you're not in a sane situation
5265 *  anymore. After the first call, preserve state values and
5266 *  use them on subsequent calls to obtain more bytes of a sortkey.
5267 *  Use until the number of bytes written is smaller than the requested
5268 *  number of bytes. Generated sortkey is not compatible with the
5269 *  one generated by ucol_getSortKey, as we don't do any compression.
5270 *  However, levels are still terminated by a 1 (one) and the sortkey
5271 *  is terminated by a 0 (zero). Identical level is the same as in the
5272 *  regular sortkey - internal bocu-1 implementation is used.
 *  For the curious, although you cannot do much about this, here is
 *  the structure of the state words.
5275 *  state[0] - iterator state. Depends on the iterator implementation,
5276 *             but allows the iterator to continue where it stopped in
5277 *             the last iteration.
5278 *  state[1] - collation processing state. Here is the distribution
5279 *             of the bits:
5280 *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5281 *             quaternary, quin (we don't use this one), identical and
5282 *             null (producing only zeroes - first one to terminate the
5283 *             sortkey and subsequent to fill the buffer).
5284 *   3       - byte count. Number of bytes written on the primary level.
5285 *   4       - was shifted. Whether the previous iteration finished in the
5286 *             shifted state.
5287 *   5, 6    - French continuation bytes written. See the comment in the enum
5288 *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
5289 *             the identical level.
 *   9..31   - CEs consumed. Number of getCE or next32 operations performed
 *             since the last successful update of the iterator state.
5292 */
5293U_CAPI int32_t U_EXPORT2
5294ucol_nextSortKeyPart(const UCollator *coll,
5295                     UCharIterator *iter,
5296                     uint32_t state[2],
5297                     uint8_t *dest, int32_t count,
5298                     UErrorCode *status)
5299{
5300    /* error checking */
5301    if(status==NULL || U_FAILURE(*status)) {
5302        return 0;
5303    }
5304    UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5305    if( coll==NULL || iter==NULL ||
5306        state==NULL ||
5307        count<0 || (count>0 && dest==NULL)
5308    ) {
5309        *status=U_ILLEGAL_ARGUMENT_ERROR;
5310        UTRACE_EXIT_STATUS(status);
5311        return 0;
5312    }
5313
5314    UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5315                  coll, iter, state[0], state[1], dest, count);
5316
5317    if(count==0) {
5318        /* nothing to do */
5319        UTRACE_EXIT_VALUE(0);
5320        return 0;
5321    }
5322    /** Setting up situation according to the state we got from the previous iteration */
5323    // The state of the iterator from the previous invocation
5324    uint32_t iterState = state[0];
5325    // Has the last iteration ended in the shifted state
5326    UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5327    // What is the current level of the sortkey?
5328    int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5329    // Have we written only one byte from a two byte primary in the previous iteration?
5330    // Also on secondary level - have we finished with the French secondary?
5331    int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5332    // number of bytes in the continuation buffer for French
5333    int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5334    // Number of bytes already written from a bocsu sequence. Since
5335    // the longes bocsu sequence is 4 long, this can be up to 3.
5336    int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5337    // Number of elements that need to be consumed in this iteration because
5338    // the iterator returned UITER_NO_STATE at the end of the last iteration,
5339    // so we had to save the last valid state.
5340    int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5341
5342    /** values that depend on the collator attributes */
5343    // strength of the collator.
5344    int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5345    // maximal level of the partial sortkey. Need to take whether case level is done
5346    int32_t maxLevel = 0;
5347    if(strength < UCOL_TERTIARY) {
5348        if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5349            maxLevel = UCOL_PSK_CASE;
5350        } else {
5351            maxLevel = strength;
5352        }
5353    } else {
5354        if(strength == UCOL_TERTIARY) {
5355            maxLevel = UCOL_PSK_TERTIARY;
5356        } else if(strength == UCOL_QUATERNARY) {
5357            maxLevel = UCOL_PSK_QUATERNARY;
5358        } else { // identical
5359            maxLevel = UCOL_IDENTICAL;
5360        }
5361    }
5362    // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5363    uint8_t UCOL_HIRAGANA_QUAD =
5364      (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5365    // Boundary value that decides whether a CE is shifted or not
5366    uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5367    // Are we doing French collation?
5368    UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5369
5370    /** initializing the collation state */
5371    UBool notIsContinuation = FALSE;
5372    uint32_t CE = UCOL_NO_MORE_CES;
5373
5374    collIterate s;
5375    IInit_collIterate(coll, NULL, -1, &s, status);
5376    if(U_FAILURE(*status)) {
5377        UTRACE_EXIT_STATUS(*status);
5378        return 0;
5379    }
5380    s.iterator = iter;
5381    s.flags |= UCOL_USE_ITERATOR;
5382    // This variable tells us whether we have produced some other levels in this iteration
5383    // before we moved to the identical level. In that case, we need to switch the
5384    // type of the iterator.
5385    UBool doingIdenticalFromStart = FALSE;
5386    // Normalizing iterator
5387    // The division for the array length may truncate the array size to
5388    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5389    // for all platforms anyway.
5390    UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5391    UNormIterator *normIter = NULL;
5392    // If the normalization is turned on for the collator and we are below identical level
5393    // we will use a FCD normalizing iterator
5394    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5395        normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5396        s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5397        s.flags &= ~UCOL_ITER_NORM;
5398        if(U_FAILURE(*status)) {
5399            UTRACE_EXIT_STATUS(*status);
5400            return 0;
5401        }
5402    } else if(level == UCOL_PSK_IDENTICAL) {
5403        // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5404        // will be updating the state - and this cannot be done on an ordinary iterator.
5405        normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5406        s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5407        s.flags &= ~UCOL_ITER_NORM;
5408        if(U_FAILURE(*status)) {
5409            UTRACE_EXIT_STATUS(*status);
5410            return 0;
5411        }
5412        doingIdenticalFromStart = TRUE;
5413    }
5414
5415    // This is the tentative new state of the iterator. The problem
5416    // is that the iterator might return an undefined state, in
5417    // which case we should save the last valid state and increase
5418    // the iterator skip value.
5419    uint32_t newState = 0;
5420
5421    // First, we set the iterator to the last valid position
5422    // from the last iteration. This was saved in state[0].
5423    if(iterState == 0) {
5424        /* initial state */
5425        if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5426            s.iterator->move(s.iterator, 0, UITER_LIMIT);
5427        } else {
5428            s.iterator->move(s.iterator, 0, UITER_START);
5429        }
5430    } else {
5431        /* reset to previous state */
5432        s.iterator->setState(s.iterator, iterState, status);
5433        if(U_FAILURE(*status)) {
5434            UTRACE_EXIT_STATUS(*status);
5435            return 0;
5436        }
5437    }
5438
5439
5440
5441    // This variable tells us whether we can attempt to update the state
5442    // of iterator. Situations where we don't want to update iterator state
5443    // are the existence of expansion CEs that are not yet processed, and
5444    // finishing the case level without enough space in the buffer to insert
5445    // a level terminator.
5446    UBool canUpdateState = TRUE;
5447
5448    // Consume all the CEs that were consumed at the end of the previous
5449    // iteration without updating the iterator state. On identical level,
5450    // consume the code points.
5451    int32_t counter = cces;
5452    if(level < UCOL_PSK_IDENTICAL) {
5453        while(counter-->0) {
5454            // If we're doing French and we are on the secondary level,
5455            // we go backwards.
5456            if(level == UCOL_PSK_SECONDARY && doingFrench) {
5457                CE = ucol_IGetPrevCE(coll, &s, status);
5458            } else {
5459                CE = ucol_IGetNextCE(coll, &s, status);
5460            }
5461            if(CE==UCOL_NO_MORE_CES) {
5462                /* should not happen */
5463                *status=U_INTERNAL_PROGRAM_ERROR;
5464                UTRACE_EXIT_STATUS(*status);
5465                return 0;
5466            }
5467            if(uprv_numAvailableExpCEs(s)) {
5468                canUpdateState = FALSE;
5469            }
5470        }
5471    } else {
5472        while(counter-->0) {
5473            uiter_next32(s.iterator);
5474        }
5475    }
5476
5477    // French secondary needs to know whether the iterator state of zero came from previous level OR
5478    // from a new invocation...
5479    UBool wasDoingPrimary = FALSE;
5480    // destination buffer byte counter. When this guy
5481    // gets to count, we're done with the iteration
5482    int32_t i = 0;
5483    // used to count the zero bytes written after we
5484    // have finished with the sort key
5485    int32_t j = 0;
5486
5487
5488    // Hm.... I think we're ready to plunge in. Basic story is as following:
5489    // we have a fall through case based on level. This is used for initial
5490    // positioning on iteration start. Every level processor contains a
5491    // for(;;) which will be broken when we exhaust all the CEs. Other
5492    // way to exit is a goto saveState, which happens when we have filled
5493    // out our buffer.
5494    switch(level) {
5495    case UCOL_PSK_PRIMARY:
5496        wasDoingPrimary = TRUE;
5497        for(;;) {
5498            if(i==count) {
5499                goto saveState;
5500            }
5501            // We should save the state only if we
5502            // are sure that we are done with the
5503            // previous iterator state
5504            if(canUpdateState && byteCountOrFrenchDone == 0) {
5505                newState = s.iterator->getState(s.iterator);
5506                if(newState != UITER_NO_STATE) {
5507                    iterState = newState;
5508                    cces = 0;
5509                }
5510            }
5511            CE = ucol_IGetNextCE(coll, &s, status);
5512            cces++;
5513            if(CE==UCOL_NO_MORE_CES) {
5514                // Add the level separator
5515                terminatePSKLevel(level, maxLevel, i, dest);
5516                byteCountOrFrenchDone=0;
5517                // Restart the iteration an move to the
5518                // second level
5519                s.iterator->move(s.iterator, 0, UITER_START);
5520                cces = 0;
5521                level = UCOL_PSK_SECONDARY;
5522                break;
5523            }
5524            if(!isContinuation(CE)){
5525                if(coll->leadBytePermutationTable != NULL){
5526                    CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
5527                }
5528            }
5529            if(!isShiftedCE(CE, LVT, &wasShifted)) {
5530                CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5531                if(CE != 0) {
5532                    if(byteCountOrFrenchDone == 0) {
5533                        // get the second byte of primary
5534                        dest[i++]=(uint8_t)(CE >> 8);
5535                    } else {
5536                        byteCountOrFrenchDone = 0;
5537                    }
5538                    if((CE &=0xff)!=0) {
5539                        if(i==count) {
5540                            /* overflow */
5541                            byteCountOrFrenchDone = 1;
5542                            cces--;
5543                            goto saveState;
5544                        }
5545                        dest[i++]=(uint8_t)CE;
5546                    }
5547                }
5548            }
5549            if(uprv_numAvailableExpCEs(s)) {
5550                canUpdateState = FALSE;
5551            } else {
5552                canUpdateState = TRUE;
5553            }
5554        }
5555        /* fall through to next level */
5556    case UCOL_PSK_SECONDARY:
5557        if(strength >= UCOL_SECONDARY) {
5558            if(!doingFrench) {
5559                for(;;) {
5560                    if(i == count) {
5561                        goto saveState;
5562                    }
5563                    // We should save the state only if we
5564                    // are sure that we are done with the
5565                    // previous iterator state
5566                    if(canUpdateState) {
5567                        newState = s.iterator->getState(s.iterator);
5568                        if(newState != UITER_NO_STATE) {
5569                            iterState = newState;
5570                            cces = 0;
5571                        }
5572                    }
5573                    CE = ucol_IGetNextCE(coll, &s, status);
5574                    cces++;
5575                    if(CE==UCOL_NO_MORE_CES) {
5576                        // Add the level separator
5577                        terminatePSKLevel(level, maxLevel, i, dest);
5578                        byteCountOrFrenchDone = 0;
5579                        // Restart the iteration an move to the
5580                        // second level
5581                        s.iterator->move(s.iterator, 0, UITER_START);
5582                        cces = 0;
5583                        level = UCOL_PSK_CASE;
5584                        break;
5585                    }
5586                    if(!isShiftedCE(CE, LVT, &wasShifted)) {
5587                        CE >>= 8; /* get secondary */
5588                        if(CE != 0) {
5589                            dest[i++]=(uint8_t)CE;
5590                        }
5591                    }
5592                    if(uprv_numAvailableExpCEs(s)) {
5593                        canUpdateState = FALSE;
5594                    } else {
5595                        canUpdateState = TRUE;
5596                    }
5597                }
5598            } else { // French secondary processing
5599                uint8_t frenchBuff[UCOL_MAX_BUFFER];
5600                int32_t frenchIndex = 0;
5601                // Here we are going backwards.
5602                // If the iterator is at the beggining, it should be
5603                // moved to end.
5604                if(wasDoingPrimary) {
5605                    s.iterator->move(s.iterator, 0, UITER_LIMIT);
5606                    cces = 0;
5607                }
5608                for(;;) {
5609                    if(i == count) {
5610                        goto saveState;
5611                    }
5612                    if(canUpdateState) {
5613                        newState = s.iterator->getState(s.iterator);
5614                        if(newState != UITER_NO_STATE) {
5615                            iterState = newState;
5616                            cces = 0;
5617                        }
5618                    }
5619                    CE = ucol_IGetPrevCE(coll, &s, status);
5620                    cces++;
5621                    if(CE==UCOL_NO_MORE_CES) {
5622                        // Add the level separator
5623                        terminatePSKLevel(level, maxLevel, i, dest);
5624                        byteCountOrFrenchDone = 0;
5625                        // Restart the iteration an move to the next level
5626                        s.iterator->move(s.iterator, 0, UITER_START);
5627                        level = UCOL_PSK_CASE;
5628                        break;
5629                    }
5630                    if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5631                        // reverse when we get a first non-continuation CE.
5632                        CE >>= 8;
5633                        frenchBuff[frenchIndex++] = (uint8_t)CE;
5634                    } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5635                        CE >>= 8; /* get secondary */
5636                        if(!frenchIndex) {
5637                            if(CE != 0) {
5638                                dest[i++]=(uint8_t)CE;
5639                            }
5640                        } else {
5641                            frenchBuff[frenchIndex++] = (uint8_t)CE;
5642                            frenchIndex -= usedFrench;
5643                            usedFrench = 0;
5644                            while(i < count && frenchIndex) {
5645                                dest[i++] = frenchBuff[--frenchIndex];
5646                                usedFrench++;
5647                            }
5648                        }
5649                    }
5650                    if(uprv_numAvailableExpCEs(s)) {
5651                        canUpdateState = FALSE;
5652                    } else {
5653                        canUpdateState = TRUE;
5654                    }
5655                }
5656            }
5657        } else {
5658            level = UCOL_PSK_CASE;
5659        }
5660        /* fall through to next level */
5661    case UCOL_PSK_CASE:
5662        if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5663            uint32_t caseShift = UCOL_CASE_SHIFT_START;
5664            uint8_t caseByte = UCOL_CASE_BYTE_START;
5665            uint8_t caseBits = 0;
5666
5667            for(;;) {
5668                U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
5669                if(i == count) {
5670                    goto saveState;
5671                }
5672                // We should save the state only if we
5673                // are sure that we are done with the
5674                // previous iterator state
5675                if(canUpdateState) {
5676                    newState = s.iterator->getState(s.iterator);
5677                    if(newState != UITER_NO_STATE) {
5678                        iterState = newState;
5679                        cces = 0;
5680                    }
5681                }
5682                CE = ucol_IGetNextCE(coll, &s, status);
5683                cces++;
5684                if(CE==UCOL_NO_MORE_CES) {
5685                    // On the case level we might have an unfinished
5686                    // case byte. Add one if it's started.
5687                    if(caseShift != UCOL_CASE_SHIFT_START) {
5688                        dest[i++] = caseByte;
5689                    }
5690                    cces = 0;
5691                    // We have finished processing CEs on this level.
5692                    // However, we don't know if we have enough space
5693                    // to add a case level terminator.
5694                    if(i < count) {
5695                        // Add the level separator
5696                        terminatePSKLevel(level, maxLevel, i, dest);
5697                        // Restart the iteration and move to the
5698                        // next level
5699                        s.iterator->move(s.iterator, 0, UITER_START);
5700                        level = UCOL_PSK_TERTIARY;
5701                    } else {
5702                        canUpdateState = FALSE;
5703                    }
5704                    break;
5705                }
5706
5707                if(!isShiftedCE(CE, LVT, &wasShifted)) {
5708                    if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5709                        // do the case level if we need to do it. We don't want to calculate
5710                        // case level for primary ignorables if we have only primary strength and case level
5711                        // otherwise we would break well formedness of CEs
5712                        CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5713                        caseBits = (uint8_t)(CE & 0xC0);
5714                        // this copies the case level logic from the
5715                        // sort key generation code
5716                        if(CE != 0) {
5717                            if (caseShift == 0) {
5718                                dest[i++] = caseByte;
5719                                caseShift = UCOL_CASE_SHIFT_START;
5720                                caseByte = UCOL_CASE_BYTE_START;
5721                            }
5722                            if(coll->caseFirst == UCOL_UPPER_FIRST) {
5723                                if((caseBits & 0xC0) == 0) {
5724                                    caseByte |= 1 << (--caseShift);
5725                                } else {
5726                                    caseByte |= 0 << (--caseShift);
5727                                    /* second bit */
5728                                    if(caseShift == 0) {
5729                                        dest[i++] = caseByte;
5730                                        caseShift = UCOL_CASE_SHIFT_START;
5731                                        caseByte = UCOL_CASE_BYTE_START;
5732                                    }
5733                                    caseByte |= ((caseBits>>6)&1) << (--caseShift);
5734                                }
5735                            } else {
5736                                if((caseBits & 0xC0) == 0) {
5737                                    caseByte |= 0 << (--caseShift);
5738                                } else {
5739                                    caseByte |= 1 << (--caseShift);
5740                                    /* second bit */
5741                                    if(caseShift == 0) {
5742                                        dest[i++] = caseByte;
5743                                        caseShift = UCOL_CASE_SHIFT_START;
5744                                        caseByte = UCOL_CASE_BYTE_START;
5745                                    }
5746                                    caseByte |= ((caseBits>>7)&1) << (--caseShift);
5747                                }
5748                            }
5749                        }
5750
5751                    }
5752                }
5753                // Not sure this is correct for the case level - revisit
5754                if(uprv_numAvailableExpCEs(s)) {
5755                    canUpdateState = FALSE;
5756                } else {
5757                    canUpdateState = TRUE;
5758                }
5759            }
5760        } else {
5761            level = UCOL_PSK_TERTIARY;
5762        }
5763        /* fall through to next level */
5764    case UCOL_PSK_TERTIARY:
5765        if(strength >= UCOL_TERTIARY) {
5766            for(;;) {
5767                if(i == count) {
5768                    goto saveState;
5769                }
5770                // We should save the state only if we
5771                // are sure that we are done with the
5772                // previous iterator state
5773                if(canUpdateState) {
5774                    newState = s.iterator->getState(s.iterator);
5775                    if(newState != UITER_NO_STATE) {
5776                        iterState = newState;
5777                        cces = 0;
5778                    }
5779                }
5780                CE = ucol_IGetNextCE(coll, &s, status);
5781                cces++;
5782                if(CE==UCOL_NO_MORE_CES) {
5783                    // Add the level separator
5784                    terminatePSKLevel(level, maxLevel, i, dest);
5785                    byteCountOrFrenchDone = 0;
5786                    // Restart the iteration an move to the
5787                    // second level
5788                    s.iterator->move(s.iterator, 0, UITER_START);
5789                    cces = 0;
5790                    level = UCOL_PSK_QUATERNARY;
5791                    break;
5792                }
5793                if(!isShiftedCE(CE, LVT, &wasShifted)) {
5794                    notIsContinuation = !isContinuation(CE);
5795
5796                    if(notIsContinuation) {
5797                        CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5798                        CE ^= coll->caseSwitch;
5799                        CE &= coll->tertiaryMask;
5800                    } else {
5801                        CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
5802                    }
5803
5804                    if(CE != 0) {
5805                        dest[i++]=(uint8_t)CE;
5806                    }
5807                }
5808                if(uprv_numAvailableExpCEs(s)) {
5809                    canUpdateState = FALSE;
5810                } else {
5811                    canUpdateState = TRUE;
5812                }
5813            }
5814        } else {
5815            // if we're not doing tertiary
5816            // skip to the end
5817            level = UCOL_PSK_NULL;
5818        }
5819        /* fall through to next level */
5820    case UCOL_PSK_QUATERNARY:
5821        if(strength >= UCOL_QUATERNARY) {
5822            for(;;) {
5823                if(i == count) {
5824                    goto saveState;
5825                }
5826                // We should save the state only if we
5827                // are sure that we are done with the
5828                // previous iterator state
5829                if(canUpdateState) {
5830                    newState = s.iterator->getState(s.iterator);
5831                    if(newState != UITER_NO_STATE) {
5832                        iterState = newState;
5833                        cces = 0;
5834                    }
5835                }
5836                CE = ucol_IGetNextCE(coll, &s, status);
5837                cces++;
5838                if(CE==UCOL_NO_MORE_CES) {
5839                    // Add the level separator
5840                    terminatePSKLevel(level, maxLevel, i, dest);
5841                    //dest[i++] = UCOL_LEVELTERMINATOR;
5842                    byteCountOrFrenchDone = 0;
5843                    // Restart the iteration an move to the
5844                    // second level
5845                    s.iterator->move(s.iterator, 0, UITER_START);
5846                    cces = 0;
5847                    level = UCOL_PSK_QUIN;
5848                    break;
5849                }
5850                if(CE==0)
5851                    continue;
5852                if(isShiftedCE(CE, LVT, &wasShifted)) {
5853                    CE >>= 16; /* get primary */
5854                    if(CE != 0) {
5855                        if(byteCountOrFrenchDone == 0) {
5856                            dest[i++]=(uint8_t)(CE >> 8);
5857                        } else {
5858                            byteCountOrFrenchDone = 0;
5859                        }
5860                        if((CE &=0xff)!=0) {
5861                            if(i==count) {
5862                                /* overflow */
5863                                byteCountOrFrenchDone = 1;
5864                                goto saveState;
5865                            }
5866                            dest[i++]=(uint8_t)CE;
5867                        }
5868                    }
5869                } else {
5870                    notIsContinuation = !isContinuation(CE);
5871                    if(notIsContinuation) {
5872                        if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5873                            dest[i++] = UCOL_HIRAGANA_QUAD;
5874                        } else {
5875                            dest[i++] = 0xFF;
5876                        }
5877                    }
5878                }
5879                if(uprv_numAvailableExpCEs(s)) {
5880                    canUpdateState = FALSE;
5881                } else {
5882                    canUpdateState = TRUE;
5883                }
5884            }
5885        } else {
5886            // if we're not doing quaternary
5887            // skip to the end
5888            level = UCOL_PSK_NULL;
5889        }
5890        /* fall through to next level */
5891    case UCOL_PSK_QUIN:
5892        level = UCOL_PSK_IDENTICAL;
5893        /* fall through to next level */
5894    case UCOL_PSK_IDENTICAL:
5895        if(strength >= UCOL_IDENTICAL) {
5896            UChar32 first, second;
5897            int32_t bocsuBytesWritten = 0;
5898            // We always need to do identical on
5899            // the NFD form of the string.
5900            if(normIter == NULL) {
5901                // we arrived from the level below and
5902                // normalization was not turned on.
5903                // therefore, we need to make a fresh NFD iterator
5904                normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5905                s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5906            } else if(!doingIdenticalFromStart) {
5907                // there is an iterator, but we did some other levels.
5908                // therefore, we have a FCD iterator - need to make
5909                // a NFD one.
5910                // normIter being at the beginning does not guarantee
5911                // that the underlying iterator is at the beginning
5912                iter->move(iter, 0, UITER_START);
5913                s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5914            }
5915            // At this point we have a NFD iterator that is positioned
5916            // in the right place
5917            if(U_FAILURE(*status)) {
5918                UTRACE_EXIT_STATUS(*status);
5919                return 0;
5920            }
5921            first = uiter_previous32(s.iterator);
5922            // maybe we're at the start of the string
5923            if(first == U_SENTINEL) {
5924                first = 0;
5925            } else {
5926                uiter_next32(s.iterator);
5927            }
5928
5929            j = 0;
5930            for(;;) {
5931                if(i == count) {
5932                    if(j+1 < bocsuBytesWritten) {
5933                        bocsuBytesUsed = j+1;
5934                    }
5935                    goto saveState;
5936                }
5937
5938                // On identical level, we will always save
5939                // the state if we reach this point, since
5940                // we don't depend on getNextCE for content
5941                // all the content is in our buffer and we
5942                // already either stored the full buffer OR
5943                // otherwise we won't arrive here.
5944                newState = s.iterator->getState(s.iterator);
5945                if(newState != UITER_NO_STATE) {
5946                    iterState = newState;
5947                    cces = 0;
5948                }
5949
5950                uint8_t buff[4];
5951                second = uiter_next32(s.iterator);
5952                cces++;
5953
5954                // end condition for identical level
5955                if(second == U_SENTINEL) {
5956                    terminatePSKLevel(level, maxLevel, i, dest);
5957                    level = UCOL_PSK_NULL;
5958                    break;
5959                }
5960                bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
5961                first = second;
5962
5963                j = 0;
5964                if(bocsuBytesUsed != 0) {
5965                    while(bocsuBytesUsed-->0) {
5966                        j++;
5967                    }
5968                }
5969
5970                while(i < count && j < bocsuBytesWritten) {
5971                    dest[i++] = buff[j++];
5972                }
5973            }
5974
5975        } else {
5976            level = UCOL_PSK_NULL;
5977        }
5978        /* fall through to next level */
5979    case UCOL_PSK_NULL:
5980        j = i;
5981        while(j<count) {
5982            dest[j++]=0;
5983        }
5984        break;
5985    default:
5986        *status = U_INTERNAL_PROGRAM_ERROR;
5987        UTRACE_EXIT_STATUS(*status);
5988        return 0;
5989    }
5990
5991saveState:
5992    // Now we need to return stuff. First we want to see whether we have
5993    // done everything for the current state of iterator.
5994    if(byteCountOrFrenchDone
5995        || canUpdateState == FALSE
5996        || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
5997    {
5998        // Any of above mean that the previous transaction
5999        // wasn't finished and that we should store the
6000        // previous iterator state.
6001        state[0] = iterState;
6002    } else {
6003        // The transaction is complete. We will continue in the next iteration.
6004        state[0] = s.iterator->getState(s.iterator);
6005        cces = 0;
6006    }
6007    // Store the number of bocsu bytes written.
6008    if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6009        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6010    }
6011    state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6012
6013    // Next we put in the level of comparison
6014    state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6015
6016    // If we are doing French, we need to store whether we have just finished the French level
6017    if(level == UCOL_PSK_SECONDARY && doingFrench) {
6018        state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6019    } else {
6020        state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6021    }
6022
6023    // Was the latest CE shifted
6024    if(wasShifted) {
6025        state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6026    }
6027    // Check for cces overflow
6028    if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6029        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6030    }
6031    // Store cces
6032    state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6033
6034    // Check for French overflow
6035    if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6036        *status = U_INDEX_OUTOFBOUNDS_ERROR;
6037    }
6038    // Store number of bytes written in the French secondary continuation sequence
6039    state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6040
6041
6042    // If we have used normalizing iterator, get rid of it
6043    if(normIter != NULL) {
6044        unorm_closeIter(normIter);
6045    }
6046
6047    /* To avoid memory leak, free the offset buffer if necessary. */
6048    ucol_freeOffsetBuffer(&s);
6049
6050    // Return number of meaningful sortkey bytes.
6051    UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6052                  dest,i, state[0], state[1]);
6053    UTRACE_EXIT_VALUE(i);
6054    return i;
6055}
6056
6057/**
6058 * Produce a bound for a given sortkey and a number of levels.
6059 */
6060U_CAPI int32_t U_EXPORT2
6061ucol_getBound(const uint8_t       *source,
6062        int32_t             sourceLength,
6063        UColBoundMode       boundType,
6064        uint32_t            noOfLevels,
6065        uint8_t             *result,
6066        int32_t             resultLength,
6067        UErrorCode          *status)
6068{
6069    // consistency checks
6070    if(status == NULL || U_FAILURE(*status)) {
6071        return 0;
6072    }
6073    if(source == NULL) {
6074        *status = U_ILLEGAL_ARGUMENT_ERROR;
6075        return 0;
6076    }
6077
6078    int32_t sourceIndex = 0;
6079    // Scan the string until we skip enough of the key OR reach the end of the key
6080    do {
6081        sourceIndex++;
6082        if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6083            noOfLevels--;
6084        }
6085    } while (noOfLevels > 0
6086        && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6087
6088    if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6089        && noOfLevels > 0) {
6090            *status = U_SORT_KEY_TOO_SHORT_WARNING;
6091    }
6092
6093
6094    // READ ME: this code assumes that the values for boundType
6095    // enum will not changes. They are set so that the enum value
6096    // corresponds to the number of extra bytes each bound type
6097    // needs.
6098    if(result != NULL && resultLength >= sourceIndex+boundType) {
6099        uprv_memcpy(result, source, sourceIndex);
6100        switch(boundType) {
6101            // Lower bound just gets terminated. No extra bytes
6102        case UCOL_BOUND_LOWER: // = 0
6103            break;
6104            // Upper bound needs one extra byte
6105        case UCOL_BOUND_UPPER: // = 1
6106            result[sourceIndex++] = 2;
6107            break;
6108            // Upper long bound needs two extra bytes
6109        case UCOL_BOUND_UPPER_LONG: // = 2
6110            result[sourceIndex++] = 0xFF;
6111            result[sourceIndex++] = 0xFF;
6112            break;
6113        default:
6114            *status = U_ILLEGAL_ARGUMENT_ERROR;
6115            return 0;
6116        }
6117        result[sourceIndex++] = 0;
6118
6119        return sourceIndex;
6120    } else {
6121        return sourceIndex+boundType+1;
6122    }
6123}
6124
6125/****************************************************************************/
6126/* Following are the functions that deal with the properties of a collator  */
6127/* there are new APIs and some compatibility APIs                           */
6128/****************************************************************************/
6129
6130static inline void
6131ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6132                    int32_t *primShift, int32_t *secShift, int32_t *terShift)
6133{
6134    uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6135    UBool reverseSecondary = FALSE;
6136    UBool continuation = isContinuation(CE);
6137    if(!continuation) {
6138        tertiary = (uint8_t)((CE & coll->tertiaryMask));
6139        tertiary ^= coll->caseSwitch;
6140        reverseSecondary = TRUE;
6141    } else {
6142        tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6143        tertiary &= UCOL_REMOVE_CASE;
6144        reverseSecondary = FALSE;
6145    }
6146
6147    secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6148    primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6149    primary1 = (uint8_t)(CE >> 8);
6150
6151    if(primary1 != 0) {
6152        if (coll->leadBytePermutationTable != NULL && !continuation) {
6153            primary1 = coll->leadBytePermutationTable[primary1];
6154        }
6155
6156        coll->latinOneCEs[ch] |= (primary1 << *primShift);
6157        *primShift -= 8;
6158    }
6159    if(primary2 != 0) {
6160        if(*primShift < 0) {
6161            coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6162            coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6163            coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6164            return;
6165        }
6166        coll->latinOneCEs[ch] |= (primary2 << *primShift);
6167        *primShift -= 8;
6168    }
6169    if(secondary != 0) {
6170        if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6171            coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6172            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6173        } else { // normal case
6174            coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6175        }
6176        *secShift -= 8;
6177    }
6178    if(tertiary != 0) {
6179        coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6180        *terShift -= 8;
6181    }
6182}
6183
6184static inline UBool
6185ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6186    uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6187    if(newTable == NULL) {
6188      *status = U_MEMORY_ALLOCATION_ERROR;
6189      coll->latinOneFailed = TRUE;
6190      return FALSE;
6191    }
6192    int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6193    uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6194    uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6195    uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6196    uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6197    coll->latinOneTableLen = size;
6198    uprv_free(coll->latinOneCEs);
6199    coll->latinOneCEs = newTable;
6200    return TRUE;
6201}
6202
/**
 * Build (or rebuild) the Latin-1 fast path table of a collator.
 *
 * The table has three sections of latinOneTableLen entries each; section k
 * for index x lives at latinOneCEs[k*latinOneTableLen + x]. The first
 * UCOL_ENDOFLATINONERANGE+1 indices of each section describe the code units
 * 0..UCOL_ENDOFLATINONERANGE; the indices after that receive the results of
 * contractions that start with one of those code units.
 *
 * @param coll   collator whose table is (re)built; the table is allocated
 *               here when it does not exist yet
 * @param status error code; set to U_MEMORY_ALLOCATION_ERROR on allocation
 *               failure and to U_UNSUPPORTED_ERROR when a contraction CE
 *               already uses the bits needed for the table offset
 * @return TRUE when the table was built; FALSE otherwise. Failures that go
 *         through cleanup_after_failure also set coll->latinOneFailed.
 */
static UBool
ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
    UBool result = TRUE;
    // First use: allocate the table with the default section length.
    if(coll->latinOneCEs == NULL) {
        coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
        if(coll->latinOneCEs == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return FALSE;
        }
        coll->latinOneTableLen = UCOL_LATINONETABLELEN;
    }
    UChar ch = 0;
    // The iterator is re-pointed at single code units below to expand
    // expansion and digit CEs.
    UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
    // Bail out if the iterator could not be opened
    if (U_FAILURE(*status)) {
        ucol_closeElements(it);
        return FALSE;
    }
    uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);

    // Bit positions at which the next primary/secondary/tertiary byte of the
    // current entry will be stored; reset to 24 for every new entry.
    int32_t primShift = 24, secShift = 24, terShift = 24;
    uint32_t CE = 0;
    // Next free index (within each section) for contraction results.
    int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;

    // TODO: make safe if you get more than you wanted...
    for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
        primShift = 24; secShift = 24; terShift = 24;
        // Look up the CE: direct table below 0x100, otherwise the tailoring
        // trie with a fallback to the UCA trie.
        if(ch < 0x100) {
            CE = coll->latinOneMapping[ch];
        } else {
            CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
            if(CE == UCOL_NOT_FOUND && coll->UCA) {
                CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
            }
        }
        if(CE < UCOL_NOT_FOUND) {
            // Ordinary CE: store its weights directly.
            ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
        } else {
            // Special CE: handling depends on its tag.
            switch (getCETag(CE)) {
            case EXPANSION_TAG:
            case DIGIT_TAG:
                // Let the iterator produce all CEs for this code unit and
                // pack them into the entry until one of the sections is full.
                ucol_setText(it, &ch, 1, status);
                while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                        coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
                        break;
                    }
                    ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                }
                break;
            case CONTRACTION_TAG:
                // here is the trick
                // F2 is contraction. We do something very similar to contractions
                // but have two indices, one in the real contraction table and the
                // other to where we stuffed things. This hopes that we don't have
                // many contractions (this should work for latin-1 tables).
                {
                    // Bits 12..23 of the CE must be free because the offset
                    // into this table is stored there.
                    if((CE & 0x00FFF000) != 0) {
                        *status = U_UNSUPPORTED_ERROR;
                        goto cleanup_after_failure;
                    }

                    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);

                    CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table

                    // The entry for ch itself is the annotated contraction CE
                    // in all three sections.
                    coll->latinOneCEs[ch] = CE;
                    coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
                    coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;

                    // We're going to jump into contraction table, pick the elements
                    // and use them
                    do {
                        // CE that belongs to the current position in the
                        // contraction index.
                        CE = *(coll->contractionCEs +
                            (UCharOffset - coll->contractionIndex));
                        if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
                            uint32_t size;
                            uint32_t i;    /* general counter */
                            uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
                            size = getExpansionCount(CE);
                            //CE = *CEOffset++;
                            if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
                                for(i = 0; i<size; i++) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            } else { /* else, we do */
                                while(*CEOffset != 0) {
                                    if(primShift < 0 || secShift < 0 || terShift < 0) {
                                        coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                                        break;
                                    }
                                    ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
                                }
                            }
                            contractionOffset++;
                        } else if(CE < UCOL_NOT_FOUND) {
                            // Ordinary CE for this contraction result.
                            ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
                        } else {
                            // Any other special CE cannot be represented in the table.
                            coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
                            contractionOffset++;
                        }
                        UCharOffset++;
                        primShift = 24; secShift = 24; terShift = 24;
                        if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
                            if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
                                goto cleanup_after_failure;
                            }
                        }
                    } while(*UCharOffset != 0xFFFF); // 0xFFFF ends the contraction's index entries
                }
                break;;
            case SPEC_PROC_TAG:
                {
                    // 0xB7 is a precontext character defined in UCA5.1, a special
                    // handle is implemented in order to save LatinOne table for
                    // most locales.
                    if (ch==0xb7) {
                        ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
                    }
                    else {
                        // Fails without touching status.
                        goto cleanup_after_failure;
                    }
                }
                break;
            default:
                // Unsupported tag: fails without touching status.
                goto cleanup_after_failure;
            }
        }
    }
    // compact table
    if(contractionOffset < coll->latinOneTableLen) {
        if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
            goto cleanup_after_failure;
        }
    }
    ucol_closeElements(it);
    return result;

cleanup_after_failure:
    // NOTE: status is set only by some of the paths that arrive here
    // (U_UNSUPPORTED_ERROR or an allocation failure in the resize helper);
    // the SPEC_PROC_TAG and default cases leave it unchanged.
    coll->latinOneFailed = TRUE;
    ucol_closeElements(it);
    return FALSE;
}
6359
6360void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6361    if(U_SUCCESS(*status)) {
6362        if(coll->caseFirst == UCOL_UPPER_FIRST) {
6363            coll->caseSwitch = UCOL_CASE_SWITCH;
6364        } else {
6365            coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6366        }
6367
6368        if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6369            coll->tertiaryMask = UCOL_REMOVE_CASE;
6370            coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6371            coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6372            coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6373            coll->tertiaryBottom = UCOL_COMMON_BOT3;
6374        } else {
6375            coll->tertiaryMask = UCOL_KEEP_CASE;
6376            coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6377            if(coll->caseFirst == UCOL_UPPER_FIRST) {
6378                coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6379                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6380                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6381            } else {
6382                coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6383                coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6384                coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6385            }
6386        }
6387
6388        /* Set the compression values */
6389        uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
6390        coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6391        coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6392
6393        if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6394            && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6395        {
6396            coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6397        } else {
6398            coll->sortKeyGen = ucol_calcSortKey;
6399        }
6400        if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6401            && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
6402        {
6403            if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6404                if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6405                    //fprintf(stderr, "F");
6406                    coll->latinOneUse = TRUE;
6407                } else {
6408                    coll->latinOneUse = FALSE;
6409                }
6410                if(*status == U_UNSUPPORTED_ERROR) {
6411                    *status = U_ZERO_ERROR;
6412                }
6413            } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6414                coll->latinOneUse = TRUE;
6415            }
6416        } else {
6417            coll->latinOneUse = FALSE;
6418        }
6419    }
6420}
6421
6422U_CAPI uint32_t  U_EXPORT2
6423ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6424    if(U_FAILURE(*status) || coll == NULL) {
6425        return 0;
6426    }
6427    if(len == -1) {
6428        len = u_strlen(varTop);
6429    }
6430    if(len == 0) {
6431        *status = U_ILLEGAL_ARGUMENT_ERROR;
6432        return 0;
6433    }
6434
6435    if(coll->delegate!=NULL) {
6436      return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
6437    }
6438
6439
6440    collIterate s;
6441    IInit_collIterate(coll, varTop, len, &s, status);
6442    if(U_FAILURE(*status)) {
6443        return 0;
6444    }
6445
6446    uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6447
6448    /* here we check if we have consumed all characters */
6449    /* you can put in either one character or a contraction */
6450    /* you shouldn't put more... */
6451    if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6452        *status = U_CE_NOT_FOUND_ERROR;
6453        return 0;
6454    }
6455
6456    uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6457
6458    if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6459        *status = U_PRIMARY_TOO_LONG_ERROR;
6460        return 0;
6461    }
6462    if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6463        coll->variableTopValueisDefault = FALSE;
6464        coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6465    }
6466
6467    /* To avoid memory leak, free the offset buffer if necessary. */
6468    ucol_freeOffsetBuffer(&s);
6469
6470    return CE & UCOL_PRIMARYMASK;
6471}
6472
6473U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6474    if(U_FAILURE(*status) || coll == NULL) {
6475        return 0;
6476    }
6477    if(coll->delegate!=NULL) {
6478      return ((const Collator*)coll->delegate)->getVariableTop(*status);
6479    }
6480    return coll->variableTopValue<<16;
6481}
6482
6483U_CAPI void  U_EXPORT2
6484ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6485    if(U_FAILURE(*status) || coll == NULL) {
6486        return;
6487    }
6488
6489    if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6490        coll->variableTopValueisDefault = FALSE;
6491        coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6492    }
6493}
6494/* Attribute setter API */
6495U_CAPI void  U_EXPORT2
6496ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6497    if(U_FAILURE(*status) || coll == NULL) {
6498      return;
6499    }
6500
6501    if(coll->delegate != NULL) {
6502      ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
6503      return;
6504    }
6505
6506    UColAttributeValue oldFrench = coll->frenchCollation;
6507    UColAttributeValue oldCaseFirst = coll->caseFirst;
6508    switch(attr) {
6509    case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6510        if(value == UCOL_ON) {
6511            coll->numericCollation = UCOL_ON;
6512            coll->numericCollationisDefault = FALSE;
6513        } else if (value == UCOL_OFF) {
6514            coll->numericCollation = UCOL_OFF;
6515            coll->numericCollationisDefault = FALSE;
6516        } else if (value == UCOL_DEFAULT) {
6517            coll->numericCollationisDefault = TRUE;
6518            coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6519        } else {
6520            *status = U_ILLEGAL_ARGUMENT_ERROR;
6521        }
6522        break;
6523    case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6524        if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) {
6525            // This attribute is an implementation detail of the CLDR Japanese tailoring.
6526            // The implementation might change to use a different mechanism
6527            // to achieve the same Japanese sort order.
6528            // Since ICU 50, this attribute is not settable any more via API functions.
6529        } else {
6530            *status = U_ILLEGAL_ARGUMENT_ERROR;
6531        }
6532        break;
6533    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6534        if(value == UCOL_ON) {
6535            coll->frenchCollation = UCOL_ON;
6536            coll->frenchCollationisDefault = FALSE;
6537        } else if (value == UCOL_OFF) {
6538            coll->frenchCollation = UCOL_OFF;
6539            coll->frenchCollationisDefault = FALSE;
6540        } else if (value == UCOL_DEFAULT) {
6541            coll->frenchCollationisDefault = TRUE;
6542            coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6543        } else {
6544            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6545        }
6546        break;
6547    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6548        if(value == UCOL_SHIFTED) {
6549            coll->alternateHandling = UCOL_SHIFTED;
6550            coll->alternateHandlingisDefault = FALSE;
6551        } else if (value == UCOL_NON_IGNORABLE) {
6552            coll->alternateHandling = UCOL_NON_IGNORABLE;
6553            coll->alternateHandlingisDefault = FALSE;
6554        } else if (value == UCOL_DEFAULT) {
6555            coll->alternateHandlingisDefault = TRUE;
6556            coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6557        } else {
6558            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6559        }
6560        break;
6561    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6562        if(value == UCOL_LOWER_FIRST) {
6563            coll->caseFirst = UCOL_LOWER_FIRST;
6564            coll->caseFirstisDefault = FALSE;
6565        } else if (value == UCOL_UPPER_FIRST) {
6566            coll->caseFirst = UCOL_UPPER_FIRST;
6567            coll->caseFirstisDefault = FALSE;
6568        } else if (value == UCOL_OFF) {
6569            coll->caseFirst = UCOL_OFF;
6570            coll->caseFirstisDefault = FALSE;
6571        } else if (value == UCOL_DEFAULT) {
6572            coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6573            coll->caseFirstisDefault = TRUE;
6574        } else {
6575            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6576        }
6577        break;
6578    case UCOL_CASE_LEVEL: /* do we have an extra case level */
6579        if(value == UCOL_ON) {
6580            coll->caseLevel = UCOL_ON;
6581            coll->caseLevelisDefault = FALSE;
6582        } else if (value == UCOL_OFF) {
6583            coll->caseLevel = UCOL_OFF;
6584            coll->caseLevelisDefault = FALSE;
6585        } else if (value == UCOL_DEFAULT) {
6586            coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6587            coll->caseLevelisDefault = TRUE;
6588        } else {
6589            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6590        }
6591        break;
6592    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6593        if(value == UCOL_ON) {
6594            coll->normalizationMode = UCOL_ON;
6595            coll->normalizationModeisDefault = FALSE;
6596            initializeFCD(status);
6597        } else if (value == UCOL_OFF) {
6598            coll->normalizationMode = UCOL_OFF;
6599            coll->normalizationModeisDefault = FALSE;
6600        } else if (value == UCOL_DEFAULT) {
6601            coll->normalizationModeisDefault = TRUE;
6602            coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6603            if(coll->normalizationMode == UCOL_ON) {
6604                initializeFCD(status);
6605            }
6606        } else {
6607            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6608        }
6609        break;
6610    case UCOL_STRENGTH:         /* attribute for strength */
6611        if (value == UCOL_DEFAULT) {
6612            coll->strengthisDefault = TRUE;
6613            coll->strength = (UColAttributeValue)coll->options->strength;
6614        } else if (value <= UCOL_IDENTICAL) {
6615            coll->strengthisDefault = FALSE;
6616            coll->strength = value;
6617        } else {
6618            *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6619        }
6620        break;
6621    case UCOL_ATTRIBUTE_COUNT:
6622    default:
6623        *status = U_ILLEGAL_ARGUMENT_ERROR;
6624        break;
6625    }
6626    if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6627        coll->latinOneRegenTable = TRUE;
6628    } else {
6629        coll->latinOneRegenTable = FALSE;
6630    }
6631    ucol_updateInternalState(coll, status);
6632}
6633
6634U_CAPI UColAttributeValue  U_EXPORT2
6635ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6636    if(U_FAILURE(*status) || coll == NULL) {
6637      return UCOL_DEFAULT;
6638    }
6639
6640    if(coll->delegate != NULL) {
6641      return ((Collator*)coll->delegate)->getAttribute(attr,*status);
6642    }
6643
6644    switch(attr) {
6645    case UCOL_NUMERIC_COLLATION:
6646      return coll->numericCollation;
6647    case UCOL_HIRAGANA_QUATERNARY_MODE:
6648      return coll->hiraganaQ;
6649    case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6650        return coll->frenchCollation;
6651    case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6652        return coll->alternateHandling;
6653    case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6654        return coll->caseFirst;
6655    case UCOL_CASE_LEVEL: /* do we have an extra case level */
6656        return coll->caseLevel;
6657    case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6658        return coll->normalizationMode;
6659    case UCOL_STRENGTH:         /* attribute for strength */
6660        return coll->strength;
6661    case UCOL_ATTRIBUTE_COUNT:
6662    default:
6663        *status = U_ILLEGAL_ARGUMENT_ERROR;
6664        break;
6665    }
6666    return UCOL_DEFAULT;
6667}
6668
6669U_CAPI void U_EXPORT2
6670ucol_setStrength(    UCollator                *coll,
6671            UCollationStrength        strength)
6672{
6673    UErrorCode status = U_ZERO_ERROR;
6674    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6675}
6676
6677U_CAPI UCollationStrength U_EXPORT2
6678ucol_getStrength(const UCollator *coll)
6679{
6680    UErrorCode status = U_ZERO_ERROR;
6681    return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6682}
6683
6684U_CAPI int32_t U_EXPORT2
6685ucol_getReorderCodes(const UCollator *coll,
6686                    int32_t *dest,
6687                    int32_t destCapacity,
6688                    UErrorCode *status) {
6689    if (U_FAILURE(*status)) {
6690        return 0;
6691    }
6692
6693    if(coll->delegate!=NULL) {
6694      return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);
6695    }
6696
6697    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6698        *status = U_ILLEGAL_ARGUMENT_ERROR;
6699        return 0;
6700    }
6701
6702#ifdef UCOL_DEBUG
6703    printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
6704    printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
6705#endif
6706
6707    if (coll->reorderCodesLength > destCapacity) {
6708        *status = U_BUFFER_OVERFLOW_ERROR;
6709        return coll->reorderCodesLength;
6710    }
6711    for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
6712        dest[i] = coll->reorderCodes[i];
6713    }
6714    return coll->reorderCodesLength;
6715}
6716
6717U_CAPI void U_EXPORT2
6718ucol_setReorderCodes(UCollator* coll,
6719                    const int32_t* reorderCodes,
6720                    int32_t reorderCodesLength,
6721                    UErrorCode *status) {
6722    if (U_FAILURE(*status)) {
6723        return;
6724    }
6725
6726    if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
6727        *status = U_ILLEGAL_ARGUMENT_ERROR;
6728        return;
6729    }
6730
6731    if(coll->delegate!=NULL) {
6732      ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
6733      return;
6734    }
6735
6736    if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
6737        uprv_free(coll->reorderCodes);
6738    }
6739    coll->reorderCodes = NULL;
6740    coll->freeReorderCodesOnClose = FALSE;
6741    coll->reorderCodesLength = 0;
6742    if (reorderCodesLength == 0) {
6743        if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
6744            uprv_free(coll->leadBytePermutationTable);
6745        }
6746        coll->leadBytePermutationTable = NULL;
6747        coll->freeLeadBytePermutationTableOnClose = FALSE;
6748        return;
6749    }
6750    coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
6751    if (coll->reorderCodes == NULL) {
6752        *status = U_MEMORY_ALLOCATION_ERROR;
6753        return;
6754    }
6755    coll->freeReorderCodesOnClose = TRUE;
6756    for (int32_t i = 0; i < reorderCodesLength; i++) {
6757        coll->reorderCodes[i] = reorderCodes[i];
6758    }
6759    coll->reorderCodesLength = reorderCodesLength;
6760    ucol_buildPermutationTable(coll, status);
6761}
6762
6763U_CAPI int32_t U_EXPORT2
6764ucol_getEquivalentReorderCodes(int32_t reorderCode,
6765                    int32_t* dest,
6766                    int32_t destCapacity,
6767                    UErrorCode *pErrorCode) {
6768    bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
6769    uint16_t leadBytes[256];
6770    int leadBytesCount;
6771    int leadByteIndex;
6772    int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
6773    int reorderCodesForLeadByteCount;
6774    int reorderCodeIndex;
6775
6776    int32_t equivalentCodesCount = 0;
6777    int setIndex;
6778
6779    if (U_FAILURE(*pErrorCode)) {
6780        return 0;
6781    }
6782
6783    if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6784        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
6785        return 0;
6786    }
6787
6788    uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
6789
6790    const UCollator* uca = ucol_initUCA(pErrorCode);
6791    if (U_FAILURE(*pErrorCode)) {
6792	return 0;
6793    }
6794    leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
6795    for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
6796        reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
6797            uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
6798        for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
6799            equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
6800        }
6801    }
6802
6803    for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6804        if (equivalentCodesSet[setIndex] == true) {
6805            equivalentCodesCount++;
6806        }
6807    }
6808
6809    if (destCapacity == 0) {
6810        return equivalentCodesCount;
6811    }
6812
6813    equivalentCodesCount = 0;
6814    for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6815        if (equivalentCodesSet[setIndex] == true) {
6816            dest[equivalentCodesCount++] = setIndex;
6817            if (equivalentCodesCount >= destCapacity) {
6818                break;
6819            }
6820        }
6821    }
6822    return equivalentCodesCount;
6823}
6824
6825
6826/****************************************************************************/
6827/* Following are misc functions                                             */
6828/* there are new APIs and some compatibility APIs                           */
6829/****************************************************************************/
6830
6831U_CAPI void U_EXPORT2
6832ucol_getVersion(const UCollator* coll,
6833                UVersionInfo versionInfo)
6834{
6835    if(coll->delegate!=NULL) {
6836      ((const Collator*)coll->delegate)->getVersion(versionInfo);
6837      return;
6838    }
6839    /* RunTime version  */
6840    uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6841    /* Builder version*/
6842    uint8_t bdVersion = coll->image->version[0];
6843
6844    /* Charset Version. Need to get the version from cnv files
6845     * makeconv should populate cnv files with version and
6846     * an api has to be provided in ucnv.h to obtain this version
6847     */
6848    uint8_t csVersion = 0;
6849
6850    /* combine the version info */
6851    uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6852
6853    /* Tailoring rules */
6854    versionInfo[0] = (uint8_t)(cmbVersion>>8);
6855    versionInfo[1] = (uint8_t)cmbVersion;
6856    versionInfo[2] = coll->image->version[1];
6857    if(coll->UCA) {
6858        /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
6859        versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
6860    } else {
6861        versionInfo[3] = 0;
6862    }
6863}
6864
6865
6866/* This internal API checks whether a character is tailored or not */
6867U_CAPI UBool  U_EXPORT2
6868ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6869    if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6870        return FALSE;
6871    }
6872
6873    uint32_t CE = UCOL_NOT_FOUND;
6874    const UChar *ContractionStart = NULL;
6875    if(u < 0x100) { /* latin-1 */
6876        CE = coll->latinOneMapping[u];
6877        if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6878            return FALSE;
6879        }
6880    } else { /* regular */
6881        CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6882    }
6883
6884    if(isContraction(CE)) {
6885        ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6886        CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6887    }
6888
6889    return (UBool)(CE != UCOL_NOT_FOUND);
6890}
6891
6892
6893/****************************************************************************/
6894/* Following are the string compare functions                               */
6895/*                                                                          */
6896/****************************************************************************/
6897
6898
6899/*  ucol_checkIdent    internal function.  Does byte level string compare.   */
6900/*                     Used by strcoll if strength == identical and strings  */
6901/*                     are otherwise equal.                                  */
6902/*                                                                           */
6903/*                     Comparison must be done on NFD normalized strings.    */
6904/*                     FCD is not good enough.                               */
6905
6906static
6907UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6908{
6909    // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6910    // of same type, but that doesn't really mean that it will stay that way.
6911    int32_t            comparison;
6912
6913    if (sColl->flags & UCOL_USE_ITERATOR) {
6914        // The division for the array length may truncate the array size to
6915        // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6916        // for all platforms anyway.
6917        UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6918        UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6919        UNormIterator *sNIt = NULL, *tNIt = NULL;
6920        sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
6921        tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
6922        sColl->iterator->move(sColl->iterator, 0, UITER_START);
6923        tColl->iterator->move(tColl->iterator, 0, UITER_START);
6924        UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
6925        UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
6926        comparison = u_strCompareIter(sIt, tIt, TRUE);
6927        unorm_closeIter(sNIt);
6928        unorm_closeIter(tNIt);
6929    } else {
6930        int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
6931        const UChar *sBuf = sColl->string;
6932        int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
6933        const UChar *tBuf = tColl->string;
6934
6935        if (normalize) {
6936            *status = U_ZERO_ERROR;
6937            // Note: We could use Normalizer::compare() or similar, but for short strings
6938            // which may not be in FCD it might be faster to just NFD them.
6939            // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
6940            // NFD'ing immediately might be faster for long strings,
6941            // but string comparison is usually done on relatively short strings.
6942            sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
6943                                  sColl->writableBuffer,
6944                                  *status);
6945            tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
6946                                  tColl->writableBuffer,
6947                                  *status);
6948            if(U_FAILURE(*status)) {
6949                return UCOL_LESS;
6950            }
6951            comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
6952        } else {
6953            comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
6954        }
6955    }
6956
6957    if (comparison < 0) {
6958        return UCOL_LESS;
6959    } else if (comparison == 0) {
6960        return UCOL_EQUAL;
6961    } else /* comparison > 0 */ {
6962        return UCOL_GREATER;
6963    }
6964}
6965
/*  CEBuf - A struct and some inline functions to handle the saving    */
/*          of CEs in a buffer within ucol_strcoll                     */
/*                                                                     */
/*  The buffer starts out using its embedded localArray and is moved   */
/*  to heap storage by ucol_CEBuf_Expand() once that is full.          */

#define UCOL_CEBUF_SIZE 512
struct ucol_CEBuf {
    uint32_t    *buf;                         /* start of the storage currently in use      */
    uint32_t    *endp;                        /* one past the last usable slot of buf       */
    uint32_t    *pos;                         /* slot that receives the next CE             */
    uint32_t     localArray[UCOL_CEBUF_SIZE]; /* initial storage, embedded in the struct    */
};
typedef struct ucol_CEBuf ucol_CEBuf;
6976
6977
6978static
6979inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
6980    (b)->buf = (b)->pos = (b)->localArray;
6981    (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
6982}
6983
6984static
6985void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
6986    uint32_t  oldSize;
6987    uint32_t  newSize;
6988    uint32_t  *newBuf;
6989
6990    ci->flags |= UCOL_ITER_ALLOCATED;
6991    oldSize = (uint32_t)(b->pos - b->buf);
6992    newSize = oldSize * 2;
6993    newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
6994    if(newBuf == NULL) {
6995        *status = U_MEMORY_ALLOCATION_ERROR;
6996    }
6997    else {
6998        uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
6999        if (b->buf != b->localArray) {
7000            uprv_free(b->buf);
7001        }
7002        b->buf = newBuf;
7003        b->endp = b->buf + newSize;
7004        b->pos  = b->buf + oldSize;
7005    }
7006}
7007
7008static
7009inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7010    if (b->pos == b->endp) {
7011        ucol_CEBuf_Expand(b, ci, status);
7012    }
7013    if (U_SUCCESS(*status)) {
7014        *(b)->pos++ = ce;
7015    }
7016}
7017
7018/* This is a trick string compare function that goes in and uses sortkeys to compare */
7019/* It is used when compare gets in trouble and needs to bail out                     */
7020static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7021                                                  collIterate *tColl,
7022                                                  UErrorCode *status)
7023{
7024    uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7025    uint8_t *sourceKeyP = sourceKey;
7026    uint8_t *targetKeyP = targetKey;
7027    int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7028    const UCollator *coll = sColl->coll;
7029    const UChar *source = NULL;
7030    const UChar *target = NULL;
7031    int32_t result = UCOL_EQUAL;
7032    UnicodeString sourceString, targetString;
7033    int32_t sourceLength;
7034    int32_t targetLength;
7035
7036    if(sColl->flags & UCOL_USE_ITERATOR) {
7037        sColl->iterator->move(sColl->iterator, 0, UITER_START);
7038        tColl->iterator->move(tColl->iterator, 0, UITER_START);
7039        UChar32 c;
7040        while((c=sColl->iterator->next(sColl->iterator))>=0) {
7041            sourceString.append((UChar)c);
7042        }
7043        while((c=tColl->iterator->next(tColl->iterator))>=0) {
7044            targetString.append((UChar)c);
7045        }
7046        source = sourceString.getBuffer();
7047        sourceLength = sourceString.length();
7048        target = targetString.getBuffer();
7049        targetLength = targetString.length();
7050    } else { // no iterators
7051        sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
7052        targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
7053        source = sColl->string;
7054        target = tColl->string;
7055    }
7056
7057
7058
7059    sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7060    if(sourceKeyLen > UCOL_MAX_BUFFER) {
7061        sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7062        if(sourceKeyP == NULL) {
7063            *status = U_MEMORY_ALLOCATION_ERROR;
7064            goto cleanup_and_do_compare;
7065        }
7066        sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7067    }
7068
7069    targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7070    if(targetKeyLen > UCOL_MAX_BUFFER) {
7071        targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7072        if(targetKeyP == NULL) {
7073            *status = U_MEMORY_ALLOCATION_ERROR;
7074            goto cleanup_and_do_compare;
7075        }
7076        targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7077    }
7078
7079    result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7080
7081cleanup_and_do_compare:
7082    if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7083        uprv_free(sourceKeyP);
7084    }
7085
7086    if(targetKeyP != NULL && targetKeyP != targetKey) {
7087        uprv_free(targetKeyP);
7088    }
7089
7090    if(result<0) {
7091        return UCOL_LESS;
7092    } else if(result>0) {
7093        return UCOL_GREATER;
7094    } else {
7095        return UCOL_EQUAL;
7096    }
7097}
7098
7099
7100static UCollationResult
7101ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
7102{
7103    U_ALIGN_CODE(16);
7104
7105    const UCollator *coll = sColl->coll;
7106
7107
7108    // setting up the collator parameters
7109    UColAttributeValue strength = coll->strength;
7110    UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7111
7112    UBool checkSecTer = initialCheckSecTer;
7113    UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7114    UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7115    UBool checkIdent = (strength == UCOL_IDENTICAL);
7116    UBool checkCase = (coll->caseLevel == UCOL_ON);
7117    UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7118    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7119    UBool qShifted = shifted && checkQuad;
7120    UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7121
7122    if(doHiragana && shifted) {
7123        return (ucol_compareUsingSortKeys(sColl, tColl, status));
7124    }
7125    uint8_t caseSwitch = coll->caseSwitch;
7126    uint8_t tertiaryMask = coll->tertiaryMask;
7127
7128    // This is the lowest primary value that will not be ignored if shifted
7129    uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7130
7131    UCollationResult result = UCOL_EQUAL;
7132    UCollationResult hirResult = UCOL_EQUAL;
7133
7134    // Preparing the CE buffers. They will be filled during the primary phase
7135    ucol_CEBuf   sCEs;
7136    ucol_CEBuf   tCEs;
7137    UCOL_INIT_CEBUF(&sCEs);
7138    UCOL_INIT_CEBUF(&tCEs);
7139
7140    uint32_t secS = 0, secT = 0;
7141    uint32_t sOrder=0, tOrder=0;
7142
7143    // Non shifted primary processing is quite simple
7144    if(!shifted) {
7145        for(;;) {
7146            // We fetch CEs until we hit a non ignorable primary or end.
7147            uint32_t sPrimary;
7148            do {
7149                // We get the next CE
7150                sOrder = ucol_IGetNextCE(coll, sColl, status);
7151                // Stuff it in the buffer
7152                UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7153                // And keep just the primary part.
7154                sPrimary = sOrder & UCOL_PRIMARYMASK;
7155            } while(sPrimary == 0);
7156
7157            // see the comments on the above block
7158            uint32_t tPrimary;
7159            do {
7160                tOrder = ucol_IGetNextCE(coll, tColl, status);
7161                UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7162                tPrimary = tOrder & UCOL_PRIMARYMASK;
7163            } while(tPrimary == 0);
7164
7165            // if both primaries are the same
7166            if(sPrimary == tPrimary) {
7167                // and there are no more CEs, we advance to the next level
7168                if(sPrimary == UCOL_NO_MORE_CES_PRIMARY) {
7169                    break;
7170                }
7171                if(doHiragana && hirResult == UCOL_EQUAL) {
7172                    if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7173                        hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7174                            ? UCOL_LESS:UCOL_GREATER;
7175                    }
7176                }
7177            } else {
7178                // only need to check one for continuation
7179                // if one is then the other must be or the preceding CE would be a prefix of the other
7180                if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
7181                    sPrimary = (coll->leadBytePermutationTable[sPrimary>>24] << 24) | (sPrimary & 0x00FFFFFF);
7182                    tPrimary = (coll->leadBytePermutationTable[tPrimary>>24] << 24) | (tPrimary & 0x00FFFFFF);
7183                }
7184                // if two primaries are different, we are done
7185                result = (sPrimary < tPrimary) ?  UCOL_LESS: UCOL_GREATER;
7186                goto commonReturn;
7187            }
7188        } // no primary difference... do the rest from the buffers
7189    } else { // shifted - do a slightly more complicated processing :)
7190        for(;;) {
7191            UBool sInShifted = FALSE;
7192            UBool tInShifted = FALSE;
7193            // This version of code can be refactored. However, it seems easier to understand this way.
7194            // Source loop. Same as the target loop.
7195            for(;;) {
7196                sOrder = ucol_IGetNextCE(coll, sColl, status);
7197                if(sOrder == UCOL_NO_MORE_CES) {
7198                    UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7199                    break;
7200                } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7201                    /* UCA amendment - ignore ignorables that follow shifted code points */
7202                    continue;
7203                } else if(isContinuation(sOrder)) {
7204                    if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7205                        if(sInShifted) {
7206                            sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7207                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7208                            continue;
7209                        } else {
7210                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7211                            break;
7212                        }
7213                    } else { /* Just lower level values */
7214                        if(sInShifted) {
7215                            continue;
7216                        } else {
7217                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7218                            continue;
7219                        }
7220                    }
7221                } else { /* regular */
7222                    if(coll->leadBytePermutationTable != NULL){
7223                        sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7224                    }
7225                    if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7226                        UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7227                        break;
7228                    } else {
7229                        if((sOrder & UCOL_PRIMARYMASK) > 0) {
7230                            sInShifted = TRUE;
7231                            sOrder &= UCOL_PRIMARYMASK;
7232                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7233                            continue;
7234                        } else {
7235                            UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7236                            sInShifted = FALSE;
7237                            continue;
7238                        }
7239                    }
7240                }
7241            }
7242            sOrder &= UCOL_PRIMARYMASK;
7243            sInShifted = FALSE;
7244
7245            for(;;) {
7246                tOrder = ucol_IGetNextCE(coll, tColl, status);
7247                if(tOrder == UCOL_NO_MORE_CES) {
7248                    UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7249                    break;
7250                } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7251                    /* UCA amendment - ignore ignorables that follow shifted code points */
7252                    continue;
7253                } else if(isContinuation(tOrder)) {
7254                    if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7255                        if(tInShifted) {
7256                            tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7257                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7258                            continue;
7259                        } else {
7260                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7261                            break;
7262                        }
7263                    } else { /* Just lower level values */
7264                        if(tInShifted) {
7265                            continue;
7266                        } else {
7267                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7268                            continue;
7269                        }
7270                    }
7271                } else { /* regular */
7272                    if(coll->leadBytePermutationTable != NULL){
7273                        tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7274                    }
7275                    if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7276                        UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7277                        break;
7278                    } else {
7279                        if((tOrder & UCOL_PRIMARYMASK) > 0) {
7280                            tInShifted = TRUE;
7281                            tOrder &= UCOL_PRIMARYMASK;
7282                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7283                            continue;
7284                        } else {
7285                            UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7286                            tInShifted = FALSE;
7287                            continue;
7288                        }
7289                    }
7290                }
7291            }
7292            tOrder &= UCOL_PRIMARYMASK;
7293            tInShifted = FALSE;
7294
7295            if(sOrder == tOrder) {
7296                /*
7297                if(doHiragana && hirResult == UCOL_EQUAL) {
7298                if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7299                hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7300                ? UCOL_LESS:UCOL_GREATER;
7301                }
7302                }
7303                */
7304                if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7305                    break;
7306                } else {
7307                    sOrder = 0;
7308                    tOrder = 0;
7309                    continue;
7310                }
7311            } else {
7312                result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7313                goto commonReturn;
7314            }
7315        } /* no primary difference... do the rest from the buffers */
7316    }
7317
7318    /* now, we're gonna reexamine collected CEs */
7319    uint32_t    *sCE;
7320    uint32_t    *tCE;
7321
7322    /* This is the secondary level of comparison */
7323    if(checkSecTer) {
7324        if(!isFrenchSec) { /* normal */
7325            sCE = sCEs.buf;
7326            tCE = tCEs.buf;
7327            for(;;) {
7328                while (secS == 0) {
7329                    secS = *(sCE++) & UCOL_SECONDARYMASK;
7330                }
7331
7332                while(secT == 0) {
7333                    secT = *(tCE++) & UCOL_SECONDARYMASK;
7334                }
7335
7336                if(secS == secT) {
7337                    if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7338                        break;
7339                    } else {
7340                        secS = 0; secT = 0;
7341                        continue;
7342                    }
7343                } else {
7344                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7345                    goto commonReturn;
7346                }
7347            }
7348        } else { /* do the French */
7349            uint32_t *sCESave = NULL;
7350            uint32_t *tCESave = NULL;
7351            sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7352            tCE = tCEs.pos-2;
7353            for(;;) {
7354                while (secS == 0 && sCE >= sCEs.buf) {
7355                    if(sCESave == NULL) {
7356                        secS = *(sCE--);
7357                        if(isContinuation(secS)) {
7358                            while(isContinuation(secS = *(sCE--)))
7359                                ;
7360                            /* after this, secS has the start of continuation, and sCEs points before that */
7361                            sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7362                            sCE+=2;  /* need to point to the first continuation CP */
7363                            /* However, now you can just continue doing stuff */
7364                        }
7365                    } else {
7366                        secS = *(sCE++);
7367                        if(!isContinuation(secS)) { /* This means we have finished with this cont */
7368                            sCE = sCESave;            /* reset the pointer to before continuation */
7369                            sCESave = NULL;
7370                            secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
7371                            continue;
7372                        }
7373                    }
7374                    secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7375                }
7376
7377                while(secT == 0 && tCE >= tCEs.buf) {
7378                    if(tCESave == NULL) {
7379                        secT = *(tCE--);
7380                        if(isContinuation(secT)) {
7381                            while(isContinuation(secT = *(tCE--)))
7382                                ;
                            /* after this, secT has the start of continuation, and tCE points before that */
7384                            tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7385                            tCE+=2;  /* need to point to the first continuation CP */
7386                            /* However, now you can just continue doing stuff */
7387                        }
7388                    } else {
7389                        secT = *(tCE++);
7390                        if(!isContinuation(secT)) { /* This means we have finished with this cont */
7391                            tCE = tCESave;          /* reset the pointer to before continuation */
7392                            tCESave = NULL;
7393                            secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
7394                            continue;
7395                        }
7396                    }
7397                    secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7398                }
7399
7400                if(secS == secT) {
7401                    if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7402                        break;
7403                    } else {
7404                        secS = 0; secT = 0;
7405                        continue;
7406                    }
7407                } else {
7408                    result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7409                    goto commonReturn;
7410                }
7411            }
7412        }
7413    }
7414
7415    /* doing the case bit */
7416    if(checkCase) {
7417        sCE = sCEs.buf;
7418        tCE = tCEs.buf;
7419        for(;;) {
7420            while((secS & UCOL_REMOVE_CASE) == 0) {
7421                if(!isContinuation(*sCE++)) {
7422                    secS =*(sCE-1);
7423                    if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7424                        // primary ignorables should not be considered on the case level when the strength is primary
7425                        // otherwise, the CEs stop being well-formed
7426                        secS &= UCOL_TERT_CASE_MASK;
7427                        secS ^= caseSwitch;
7428                    } else {
7429                        secS = 0;
7430                    }
7431                } else {
7432                    secS = 0;
7433                }
7434            }
7435
7436            while((secT & UCOL_REMOVE_CASE) == 0) {
7437                if(!isContinuation(*tCE++)) {
7438                    secT = *(tCE-1);
7439                    if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7440                        // primary ignorables should not be considered on the case level when the strength is primary
7441                        // otherwise, the CEs stop being well-formed
7442                        secT &= UCOL_TERT_CASE_MASK;
7443                        secT ^= caseSwitch;
7444                    } else {
7445                        secT = 0;
7446                    }
7447                } else {
7448                    secT = 0;
7449                }
7450            }
7451
7452            if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7453                result = UCOL_LESS;
7454                goto commonReturn;
7455            } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7456                result = UCOL_GREATER;
7457                goto commonReturn;
7458            }
7459
7460            if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7461                break;
7462            } else {
7463                secS = 0;
7464                secT = 0;
7465            }
7466        }
7467    }
7468
7469    /* Tertiary level */
7470    if(checkTertiary) {
7471        secS = 0;
7472        secT = 0;
7473        sCE = sCEs.buf;
7474        tCE = tCEs.buf;
7475        for(;;) {
7476            while((secS & UCOL_REMOVE_CASE) == 0) {
7477                sOrder = *sCE++;
7478                secS = sOrder & tertiaryMask;
7479                if(!isContinuation(sOrder)) {
7480                    secS ^= caseSwitch;
7481                } else {
7482                    secS &= UCOL_REMOVE_CASE;
7483                }
7484            }
7485
7486            while((secT & UCOL_REMOVE_CASE)  == 0) {
7487                tOrder = *tCE++;
7488                secT = tOrder & tertiaryMask;
7489                if(!isContinuation(tOrder)) {
7490                    secT ^= caseSwitch;
7491                } else {
7492                    secT &= UCOL_REMOVE_CASE;
7493                }
7494            }
7495
7496            if(secS == secT) {
7497                if((secS & UCOL_REMOVE_CASE) == 1) {
7498                    break;
7499                } else {
7500                    secS = 0; secT = 0;
7501                    continue;
7502                }
7503            } else {
7504                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7505                goto commonReturn;
7506            }
7507        }
7508    }
7509
7510
7511    if(qShifted /*checkQuad*/) {
7512        UBool sInShifted = TRUE;
7513        UBool tInShifted = TRUE;
7514        secS = 0;
7515        secT = 0;
7516        sCE = sCEs.buf;
7517        tCE = tCEs.buf;
7518        for(;;) {
7519            while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
7520                secS = *(sCE++);
7521                if(isContinuation(secS)) {
7522                    if(!sInShifted) {
7523                        continue;
7524                    }
7525                } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7526                    secS = UCOL_PRIMARYMASK;
7527                    sInShifted = FALSE;
7528                } else {
7529                    sInShifted = TRUE;
7530                }
7531            }
7532            secS &= UCOL_PRIMARYMASK;
7533
7534
7535            while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
7536                secT = *(tCE++);
7537                if(isContinuation(secT)) {
7538                    if(!tInShifted) {
7539                        continue;
7540                    }
7541                } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7542                    secT = UCOL_PRIMARYMASK;
7543                    tInShifted = FALSE;
7544                } else {
7545                    tInShifted = TRUE;
7546                }
7547            }
7548            secT &= UCOL_PRIMARYMASK;
7549
7550            if(secS == secT) {
7551                if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7552                    break;
7553                } else {
7554                    secS = 0; secT = 0;
7555                    continue;
7556                }
7557            } else {
7558                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7559                goto commonReturn;
7560            }
7561        }
7562    } else if(doHiragana && hirResult != UCOL_EQUAL) {
7563        // If we're fine on quaternaries, we might be different
7564        // on Hiragana. This, however, might fail us in shifted.
7565        result = hirResult;
7566        goto commonReturn;
7567    }
7568
7569    /*  For IDENTICAL comparisons, we use a bitwise character comparison */
7570    /*  as a tiebreaker if all else is equal.                                */
7571    /*  Getting here  should be quite rare - strings are not identical -     */
7572    /*     that is checked first, but compared == through all other checks.  */
7573    if(checkIdent)
7574    {
7575        //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7576        result = ucol_checkIdent(sColl, tColl, TRUE, status);
7577    }
7578
7579commonReturn:
7580    if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7581        if (sCEs.buf != sCEs.localArray ) {
7582            uprv_free(sCEs.buf);
7583        }
7584        if (tCEs.buf != tCEs.localArray ) {
7585            uprv_free(tCEs.buf);
7586        }
7587    }
7588
7589    return result;
7590}
7591
7592static UCollationResult
7593ucol_strcollRegular(const UCollator *coll,
7594                    const UChar *source, int32_t sourceLength,
7595                    const UChar *target, int32_t targetLength,
7596                    UErrorCode *status) {
7597    collIterate sColl, tColl;
7598    // Preparing the context objects for iterating over strings
7599    IInit_collIterate(coll, source, sourceLength, &sColl, status);
7600    IInit_collIterate(coll, target, targetLength, &tColl, status);
7601    if(U_FAILURE(*status)) {
7602        return UCOL_LESS;
7603    }
7604    return ucol_strcollRegular(&sColl, &tColl, status);
7605}
7606
7607static inline uint32_t
7608ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7609                          uint32_t CE, const UChar *s, int32_t *index, int32_t len)
7610{
7611    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7612    int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7613    int32_t offset = 1;
7614    UChar schar = 0, tchar = 0;
7615
7616    for(;;) {
7617        if(len == -1) {
7618            if(s[*index] == 0) { // end of string
7619                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7620            } else {
7621                schar = s[*index];
7622            }
7623        } else {
7624            if(*index == len) {
7625                return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7626            } else {
7627                schar = s[*index];
7628            }
7629        }
7630
7631        while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7632            offset++;
7633        }
7634
7635        if (schar == tchar) {
7636            (*index)++;
7637            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7638        }
7639        else
7640        {
7641            if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7642                return UCOL_BAIL_OUT_CE;
7643            }
7644            // skip completely ignorables
7645            uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7646            if(isZeroCE == 0) { // we have to ignore completely ignorables
7647                (*index)++;
7648                continue;
7649            }
7650
7651            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7652        }
7653    }
7654}
7655
7656
7657/**
7658 * This is a fast strcoll, geared towards text in Latin-1.
7659 * It supports contractions of size two, French secondaries
7660 * and case switching. You can use it with strengths primary
7661 * to tertiary. It does not support shifted and case level.
7662 * It relies on the table build by setupLatin1Table. If it
7663 * doesn't understand something, it will go to the regular
7664 * strcoll.
7665 */
7666static UCollationResult
7667ucol_strcollUseLatin1( const UCollator    *coll,
7668              const UChar        *source,
7669              int32_t            sLen,
7670              const UChar        *target,
7671              int32_t            tLen,
7672              UErrorCode *status)
7673{
7674    U_ALIGN_CODE(16);
7675    int32_t strength = coll->strength;
7676
7677    int32_t sIndex = 0, tIndex = 0;
7678    UChar sChar = 0, tChar = 0;
7679    uint32_t sOrder=0, tOrder=0;
7680
7681    UBool endOfSource = FALSE;
7682
7683    uint32_t *elements = coll->latinOneCEs;
7684
7685    UBool haveContractions = FALSE; // if we have contractions in our string
7686                                    // we cannot do French secondary
7687
7688    // Do the primary level
7689    for(;;) {
7690        while(sOrder==0) { // this loop skips primary ignorables
7691            // sOrder=getNextlatinOneCE(source);
7692            if(sLen==-1) {   // handling zero terminated strings
7693                sChar=source[sIndex++];
7694                if(sChar==0) {
7695                    endOfSource = TRUE;
7696                    break;
7697                }
7698            } else {        // handling strings with known length
7699                if(sIndex==sLen) {
7700                    endOfSource = TRUE;
7701                    break;
7702                }
7703                sChar=source[sIndex++];
7704            }
7705            if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7706                //fprintf(stderr, "R");
7707                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7708            }
7709            sOrder = elements[sChar];
7710            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7711                // specials can basically be either contractions or bail-out signs. If we get anything
7712                // else, we'll bail out anywasy
7713                if(getCETag(sOrder) == CONTRACTION_TAG) {
7714                    sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7715                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7716                    // However, if there are contractions in the table, but we always use just one char,
7717                    // we might be able to do French. This should be checked out.
7718                }
7719                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7720                    //fprintf(stderr, "S");
7721                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7722                }
7723            }
7724        }
7725
7726        while(tOrder==0) {  // this loop skips primary ignorables
7727            // tOrder=getNextlatinOneCE(target);
7728            if(tLen==-1) {    // handling zero terminated strings
7729                tChar=target[tIndex++];
7730                if(tChar==0) {
7731                    if(endOfSource) { // this is different than source loop,
7732                        // as we already know that source loop is done here,
7733                        // so we can either finish the primary loop if both
7734                        // strings are done or anounce the result if only
7735                        // target is done. Same below.
7736                        goto endOfPrimLoop;
7737                    } else {
7738                        return UCOL_GREATER;
7739                    }
7740                }
7741            } else {          // handling strings with known length
7742                if(tIndex==tLen) {
7743                    if(endOfSource) {
7744                        goto endOfPrimLoop;
7745                    } else {
7746                        return UCOL_GREATER;
7747                    }
7748                }
7749                tChar=target[tIndex++];
7750            }
7751            if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7752                //fprintf(stderr, "R");
7753                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7754            }
7755            tOrder = elements[tChar];
7756            if(tOrder >= UCOL_NOT_FOUND) {
7757                // Handling specials, see the comments for source
7758                if(getCETag(tOrder) == CONTRACTION_TAG) {
7759                    tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
7760                    haveContractions = TRUE;
7761                }
7762                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7763                    //fprintf(stderr, "S");
7764                    return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7765                }
7766            }
7767        }
7768        if(endOfSource) { // source is finished, but target is not, say the result.
7769            return UCOL_LESS;
7770        }
7771
7772        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
7773            sOrder = 0; tOrder = 0;
7774            continue;
7775        } else {
7776            // compare current top bytes
7777            if(((sOrder^tOrder)&0xFF000000)!=0) {
7778                // top bytes differ, return difference
7779                if(sOrder < tOrder) {
7780                    return UCOL_LESS;
7781                } else if(sOrder > tOrder) {
7782                    return UCOL_GREATER;
7783                }
7784                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
7785                // since we must return enum value
7786            }
7787
7788            // top bytes match, continue with following bytes
7789            sOrder<<=8;
7790            tOrder<<=8;
7791        }
7792    }
7793
7794endOfPrimLoop:
7795    // after primary loop, we definitely know the sizes of strings,
7796    // so we set it and use simpler loop for secondaries and tertiaries
7797    sLen = sIndex; tLen = tIndex;
7798    if(strength >= UCOL_SECONDARY) {
7799        // adjust the table beggining
7800        elements += coll->latinOneTableLen;
7801        endOfSource = FALSE;
7802
7803        if(coll->frenchCollation == UCOL_OFF) { // non French
7804            // This loop is a simplified copy of primary loop
7805            // at this point we know that whole strings are latin-1, so we don't
7806            // check for that. We also know that we only have contractions as
7807            // specials.
7808            sIndex = 0; tIndex = 0;
7809            for(;;) {
7810                while(sOrder==0) {
7811                    if(sIndex==sLen) {
7812                        endOfSource = TRUE;
7813                        break;
7814                    }
7815                    sChar=source[sIndex++];
7816                    sOrder = elements[sChar];
7817                    if(sOrder > UCOL_NOT_FOUND) {
7818                        sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
7819                    }
7820                }
7821
7822                while(tOrder==0) {
7823                    if(tIndex==tLen) {
7824                        if(endOfSource) {
7825                            goto endOfSecLoop;
7826                        } else {
7827                            return UCOL_GREATER;
7828                        }
7829                    }
7830                    tChar=target[tIndex++];
7831                    tOrder = elements[tChar];
7832                    if(tOrder > UCOL_NOT_FOUND) {
7833                        tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
7834                    }
7835                }
7836                if(endOfSource) {
7837                    return UCOL_LESS;
7838                }
7839
7840                if(sOrder == tOrder) {
7841                    sOrder = 0; tOrder = 0;
7842                    continue;
7843                } else {
7844                    // see primary loop for comments on this
7845                    if(((sOrder^tOrder)&0xFF000000)!=0) {
7846                        if(sOrder < tOrder) {
7847                            return UCOL_LESS;
7848                        } else if(sOrder > tOrder) {
7849                            return UCOL_GREATER;
7850                        }
7851                    }
7852                    sOrder<<=8;
7853                    tOrder<<=8;
7854                }
7855            }
7856        } else { // French
7857            if(haveContractions) { // if we have contractions, we have to bail out
7858                // since we don't really know how to handle them here
7859                return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7860            }
7861            // For French, we go backwards
7862            sIndex = sLen; tIndex = tLen;
7863            for(;;) {
7864                while(sOrder==0) {
7865                    if(sIndex==0) {
7866                        endOfSource = TRUE;
7867                        break;
7868                    }
7869                    sChar=source[--sIndex];
7870                    sOrder = elements[sChar];
7871                    // don't even look for contractions
7872                }
7873
7874                while(tOrder==0) {
7875                    if(tIndex==0) {
7876                        if(endOfSource) {
7877                            goto endOfSecLoop;
7878                        } else {
7879                            return UCOL_GREATER;
7880                        }
7881                    }
7882                    tChar=target[--tIndex];
7883                    tOrder = elements[tChar];
7884                    // don't even look for contractions
7885                }
7886                if(endOfSource) {
7887                    return UCOL_LESS;
7888                }
7889
7890                if(sOrder == tOrder) {
7891                    sOrder = 0; tOrder = 0;
7892                    continue;
7893                } else {
7894                    // see the primary loop for comments
7895                    if(((sOrder^tOrder)&0xFF000000)!=0) {
7896                        if(sOrder < tOrder) {
7897                            return UCOL_LESS;
7898                        } else if(sOrder > tOrder) {
7899                            return UCOL_GREATER;
7900                        }
7901                    }
7902                    sOrder<<=8;
7903                    tOrder<<=8;
7904                }
7905            }
7906        }
7907    }
7908
7909endOfSecLoop:
7910    if(strength >= UCOL_TERTIARY) {
7911        // tertiary loop is the same as secondary (except no French)
7912        elements += coll->latinOneTableLen;
7913        sIndex = 0; tIndex = 0;
7914        endOfSource = FALSE;
7915        for(;;) {
7916            while(sOrder==0) {
7917                if(sIndex==sLen) {
7918                    endOfSource = TRUE;
7919                    break;
7920                }
7921                sChar=source[sIndex++];
7922                sOrder = elements[sChar];
7923                if(sOrder > UCOL_NOT_FOUND) {
7924                    sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
7925                }
7926            }
7927            while(tOrder==0) {
7928                if(tIndex==tLen) {
7929                    if(endOfSource) {
7930                        return UCOL_EQUAL; // if both strings are at the end, they are equal
7931                    } else {
7932                        return UCOL_GREATER;
7933                    }
7934                }
7935                tChar=target[tIndex++];
7936                tOrder = elements[tChar];
7937                if(tOrder > UCOL_NOT_FOUND) {
7938                    tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
7939                }
7940            }
7941            if(endOfSource) {
7942                return UCOL_LESS;
7943            }
7944            if(sOrder == tOrder) {
7945                sOrder = 0; tOrder = 0;
7946                continue;
7947            } else {
7948                if(((sOrder^tOrder)&0xff000000)!=0) {
7949                    if(sOrder < tOrder) {
7950                        return UCOL_LESS;
7951                    } else if(sOrder > tOrder) {
7952                        return UCOL_GREATER;
7953                    }
7954                }
7955                sOrder<<=8;
7956                tOrder<<=8;
7957            }
7958        }
7959    }
7960    return UCOL_EQUAL;
7961}
7962
7963/*
7964  Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
7965  null terminated input string takes extra amount of CPU cycles.
7966*/
7967static UCollationResult
7968ucol_strcollRegularUTF8(
7969                    const UCollator *coll,
7970                    const char      *source,
7971                    int32_t         sourceLength,
7972                    const char      *target,
7973                    int32_t         targetLength,
7974                    UErrorCode      *status)
7975{
7976    UCharIterator src;
7977    UCharIterator tgt;
7978
7979    uiter_setUTF8(&src, source, sourceLength);
7980    uiter_setUTF8(&tgt, target, targetLength);
7981
7982    // Preparing the context objects for iterating over strings
7983    collIterate sColl, tColl;
7984    IInit_collIterate(coll, NULL, -1, &sColl, status);
7985    IInit_collIterate(coll, NULL, -1, &tColl, status);
7986    if(U_FAILURE(*status)) {
7987        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
7988        return UCOL_EQUAL;
7989    }
7990    // The division for the array length may truncate the array size to
7991    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7992    // for all platforms anyway.
7993    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7994    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7995    UNormIterator *sNormIter = NULL, *tNormIter = NULL;
7996
7997    sColl.iterator = &src;
7998    sColl.flags |= UCOL_USE_ITERATOR;
7999    tColl.flags |= UCOL_USE_ITERATOR;
8000    tColl.iterator = &tgt;
8001
8002    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8003        sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8004        sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
8005        sColl.flags &= ~UCOL_ITER_NORM;
8006
8007        tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8008        tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
8009        tColl.flags &= ~UCOL_ITER_NORM;
8010    }
8011
8012    return ucol_strcollRegular(&sColl, &tColl, status);
8013}
8014
8015static inline uint32_t
8016ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
8017                          uint32_t CE, const char *s, int32_t *index, int32_t len)
8018{
8019    const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8020    int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8021    int32_t offset = 1;
8022    UChar32 schar = 0, tchar = 0;
8023
8024    for(;;) {
8025        if (*index == len) {
8026            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8027        }
8028        U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar);
8029        if (len < 0 && schar == 0) {
8030            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8031        }
8032
8033        while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8034            offset++;
8035        }
8036
8037        if (schar == tchar) {
8038            U8_FWD_1(s, *index, len);
8039            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8040        }
8041        else
8042        {
8043            if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8044                return UCOL_BAIL_OUT_CE;
8045            }
8046            // skip completely ignorables
8047            uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8048            if(isZeroCE == 0) { // we have to ignore completely ignorables
8049                U8_FWD_1(s, *index, len);
8050                continue;
8051            }
8052
8053            return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8054        }
8055    }
8056}
8057
8058static inline UCollationResult
8059ucol_strcollUseLatin1UTF8(
8060                const UCollator *coll,
8061                const char      *source,
8062                int32_t         sLen,
8063                const char      *target,
8064                int32_t         tLen,
8065                UErrorCode      *status)
8066{
8067    U_ALIGN_CODE(16);
8068    int32_t strength = coll->strength;
8069
8070    int32_t sIndex = 0, tIndex = 0;
8071    UChar32 sChar = 0, tChar = 0;
8072    uint32_t sOrder=0, tOrder=0;
8073
8074    UBool endOfSource = FALSE;
8075
8076    uint32_t *elements = coll->latinOneCEs;
8077
8078    UBool haveContractions = FALSE; // if we have contractions in our string
8079                                    // we cannot do French secondary
8080
8081    // Do the primary level
8082    for(;;) {
8083        while(sOrder==0) { // this loop skips primary ignorables
8084            // sOrder=getNextlatinOneCE(source);
8085            if (sIndex == sLen) {
8086                endOfSource = TRUE;
8087                break;
8088            }
8089            U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
8090            if (sLen < 0 && sChar == 0) {
8091                endOfSource = TRUE;
8092                sLen = sIndex;
8093                break;
8094            }
8095            if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8096                //fprintf(stderr, "R");
8097                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8098            }
8099            sOrder = elements[sChar];
8100            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8101                // specials can basically be either contractions or bail-out signs. If we get anything
8102                // else, we'll bail out anywasy
8103                if(getCETag(sOrder) == CONTRACTION_TAG) {
8104                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8105                    haveContractions = TRUE; // if there are contractions, we cannot do French secondary
8106                    // However, if there are contractions in the table, but we always use just one char,
8107                    // we might be able to do French. This should be checked out.
8108                }
8109                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8110                    //fprintf(stderr, "S");
8111                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8112                }
8113            }
8114        }
8115
8116        while(tOrder==0) {  // this loop skips primary ignorables
8117            // tOrder=getNextlatinOneCE(target);
8118            if (tIndex == tLen) {
8119                if(endOfSource) {
8120                    goto endOfPrimLoopU8;
8121                } else {
8122                    return UCOL_GREATER;
8123                }
8124            }
8125            U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
8126            if (tLen < 0 && tChar == 0) {
8127                if(endOfSource) {
8128                    tLen = tIndex;
8129                    goto endOfPrimLoopU8;
8130                } else {
8131                    return UCOL_GREATER;
8132                }
8133            }
8134            if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8135                //fprintf(stderr, "R");
8136                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8137            }
8138            tOrder = elements[tChar];
8139            if(tOrder >= UCOL_NOT_FOUND) {
8140                // Handling specials, see the comments for source
8141                if(getCETag(tOrder) == CONTRACTION_TAG) {
8142                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8143                    haveContractions = TRUE;
8144                }
8145                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8146                    //fprintf(stderr, "S");
8147                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8148                }
8149            }
8150        }
8151        if(endOfSource) { // source is finished, but target is not, say the result.
8152            return UCOL_LESS;
8153        }
8154
8155        if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8156            sOrder = 0; tOrder = 0;
8157            continue;
8158        } else {
8159            // compare current top bytes
8160            if(((sOrder^tOrder)&0xFF000000)!=0) {
8161                // top bytes differ, return difference
8162                if(sOrder < tOrder) {
8163                    return UCOL_LESS;
8164                } else if(sOrder > tOrder) {
8165                    return UCOL_GREATER;
8166                }
8167                // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8168                // since we must return enum value
8169            }
8170
8171            // top bytes match, continue with following bytes
8172            sOrder<<=8;
8173            tOrder<<=8;
8174        }
8175    }
8176
8177endOfPrimLoopU8:
8178    // after primary loop, we definitely know the sizes of strings,
8179    // so we set it and use simpler loop for secondaries and tertiaries
8180    sLen = sIndex; tLen = tIndex;
8181    if(strength >= UCOL_SECONDARY) {
8182        // adjust the table beggining
8183        elements += coll->latinOneTableLen;
8184        endOfSource = FALSE;
8185
8186        if(coll->frenchCollation == UCOL_OFF) { // non French
8187            // This loop is a simplified copy of primary loop
8188            // at this point we know that whole strings are latin-1, so we don't
8189            // check for that. We also know that we only have contractions as
8190            // specials.
8191            sIndex = 0; tIndex = 0;
8192            for(;;) {
8193                while(sOrder==0) {
8194                    if(sIndex==sLen) {
8195                        endOfSource = TRUE;
8196                        break;
8197                    }
8198                    U_ASSERT(sLen >= 0);
8199                    U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
8200                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8201                    sOrder = elements[sChar];
8202                    if(sOrder > UCOL_NOT_FOUND) {
8203                        sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8204                    }
8205                }
8206
8207                while(tOrder==0) {
8208                    if(tIndex==tLen) {
8209                        if(endOfSource) {
8210                            goto endOfSecLoopU8;
8211                        } else {
8212                            return UCOL_GREATER;
8213                        }
8214                    }
8215                    U_ASSERT(tLen >= 0);
8216                    U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
8217                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8218                    tOrder = elements[tChar];
8219                    if(tOrder > UCOL_NOT_FOUND) {
8220                        tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8221                    }
8222                }
8223                if(endOfSource) {
8224                    return UCOL_LESS;
8225                }
8226
8227                if(sOrder == tOrder) {
8228                    sOrder = 0; tOrder = 0;
8229                    continue;
8230                } else {
8231                    // see primary loop for comments on this
8232                    if(((sOrder^tOrder)&0xFF000000)!=0) {
8233                        if(sOrder < tOrder) {
8234                            return UCOL_LESS;
8235                        } else if(sOrder > tOrder) {
8236                            return UCOL_GREATER;
8237                        }
8238                    }
8239                    sOrder<<=8;
8240                    tOrder<<=8;
8241                }
8242            }
8243        } else { // French
8244            if(haveContractions) { // if we have contractions, we have to bail out
8245                // since we don't really know how to handle them here
8246                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8247            }
8248            // For French, we go backwards
8249            sIndex = sLen; tIndex = tLen;
8250            for(;;) {
8251                while(sOrder==0) {
8252                    if(sIndex==0) {
8253                        endOfSource = TRUE;
8254                        break;
8255                    }
8256                    U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
8257                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8258                    sOrder = elements[sChar];
8259                    // don't even look for contractions
8260                }
8261
8262                while(tOrder==0) {
8263                    if(tIndex==0) {
8264                        if(endOfSource) {
8265                            goto endOfSecLoopU8;
8266                        } else {
8267                            return UCOL_GREATER;
8268                        }
8269                    }
8270                    U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
8271                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8272                    tOrder = elements[tChar];
8273                    // don't even look for contractions
8274                }
8275                if(endOfSource) {
8276                    return UCOL_LESS;
8277                }
8278
8279                if(sOrder == tOrder) {
8280                    sOrder = 0; tOrder = 0;
8281                    continue;
8282                } else {
8283                    // see the primary loop for comments
8284                    if(((sOrder^tOrder)&0xFF000000)!=0) {
8285                        if(sOrder < tOrder) {
8286                            return UCOL_LESS;
8287                        } else if(sOrder > tOrder) {
8288                            return UCOL_GREATER;
8289                        }
8290                    }
8291                    sOrder<<=8;
8292                    tOrder<<=8;
8293                }
8294            }
8295        }
8296    }
8297
8298endOfSecLoopU8:
8299    if(strength >= UCOL_TERTIARY) {
8300        // tertiary loop is the same as secondary (except no French)
8301        elements += coll->latinOneTableLen;
8302        sIndex = 0; tIndex = 0;
8303        endOfSource = FALSE;
8304        for(;;) {
8305            while(sOrder==0) {
8306                if(sIndex==sLen) {
8307                    endOfSource = TRUE;
8308                    break;
8309                }
8310                U_ASSERT(sLen >= 0);
8311                U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
8312                U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8313                sOrder = elements[sChar];
8314                if(sOrder > UCOL_NOT_FOUND) {
8315                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8316                }
8317            }
8318            while(tOrder==0) {
8319                if(tIndex==tLen) {
8320                    if(endOfSource) {
8321                        return UCOL_EQUAL; // if both strings are at the end, they are equal
8322                    } else {
8323                        return UCOL_GREATER;
8324                    }
8325                }
8326                U_ASSERT(tLen >= 0);
8327                U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
8328                U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8329                tOrder = elements[tChar];
8330                if(tOrder > UCOL_NOT_FOUND) {
8331                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8332                }
8333            }
8334            if(endOfSource) {
8335                return UCOL_LESS;
8336            }
8337            if(sOrder == tOrder) {
8338                sOrder = 0; tOrder = 0;
8339                continue;
8340            } else {
8341                if(((sOrder^tOrder)&0xff000000)!=0) {
8342                    if(sOrder < tOrder) {
8343                        return UCOL_LESS;
8344                    } else if(sOrder > tOrder) {
8345                        return UCOL_GREATER;
8346                    }
8347                }
8348                sOrder<<=8;
8349                tOrder<<=8;
8350            }
8351        }
8352    }
8353    return UCOL_EQUAL;
8354}
8355
8356U_CAPI UCollationResult U_EXPORT2
8357ucol_strcollIter( const UCollator    *coll,
8358                 UCharIterator *sIter,
8359                 UCharIterator *tIter,
8360                 UErrorCode         *status)
8361{
8362    if(!status || U_FAILURE(*status)) {
8363        return UCOL_EQUAL;
8364    }
8365
8366    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8367    UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8368
8369    if (sIter == tIter) {
8370        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8371        return UCOL_EQUAL;
8372    }
8373    if(sIter == NULL || tIter == NULL || coll == NULL) {
8374        *status = U_ILLEGAL_ARGUMENT_ERROR;
8375        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8376        return UCOL_EQUAL;
8377    }
8378
8379    UCollationResult result = UCOL_EQUAL;
8380
8381    // Preparing the context objects for iterating over strings
8382    collIterate sColl, tColl;
8383    IInit_collIterate(coll, NULL, -1, &sColl, status);
8384    IInit_collIterate(coll, NULL, -1, &tColl, status);
8385    if(U_FAILURE(*status)) {
8386        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8387        return UCOL_EQUAL;
8388    }
8389    // The division for the array length may truncate the array size to
8390    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8391    // for all platforms anyway.
8392    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8393    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8394    UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8395
8396    sColl.iterator = sIter;
8397    sColl.flags |= UCOL_USE_ITERATOR;
8398    tColl.flags |= UCOL_USE_ITERATOR;
8399    tColl.iterator = tIter;
8400
8401    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8402        sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8403        sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8404        sColl.flags &= ~UCOL_ITER_NORM;
8405
8406        tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8407        tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8408        tColl.flags &= ~UCOL_ITER_NORM;
8409    }
8410
8411    UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8412
8413    while((sChar = sColl.iterator->next(sColl.iterator)) ==
8414        (tChar = tColl.iterator->next(tColl.iterator))) {
8415            if(sChar == U_SENTINEL) {
8416                result = UCOL_EQUAL;
8417                goto end_compare;
8418            }
8419    }
8420
8421    if(sChar == U_SENTINEL) {
8422        tChar = tColl.iterator->previous(tColl.iterator);
8423    }
8424
8425    if(tChar == U_SENTINEL) {
8426        sChar = sColl.iterator->previous(sColl.iterator);
8427    }
8428
8429    sChar = sColl.iterator->previous(sColl.iterator);
8430    tChar = tColl.iterator->previous(tColl.iterator);
8431
8432    if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8433    {
8434        // We are stopped in the middle of a contraction.
8435        // Scan backwards through the == part of the string looking for the start of the contraction.
8436        //   It doesn't matter which string we scan, since they are the same in this region.
8437        do
8438        {
8439            sChar = sColl.iterator->previous(sColl.iterator);
8440            tChar = tColl.iterator->previous(tColl.iterator);
8441        }
8442        while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8443    }
8444
8445
8446    if(U_SUCCESS(*status)) {
8447        result = ucol_strcollRegular(&sColl, &tColl, status);
8448    }
8449
8450end_compare:
8451    if(sNormIter || tNormIter) {
8452        unorm_closeIter(sNormIter);
8453        unorm_closeIter(tNormIter);
8454    }
8455
8456    UTRACE_EXIT_VALUE_STATUS(result, *status)
8457    return result;
8458}
8459
8460
8461/*                                                                      */
8462/* ucol_strcoll     Main public API string comparison function          */
8463/*                                                                      */
8464U_CAPI UCollationResult U_EXPORT2
8465ucol_strcoll( const UCollator    *coll,
8466              const UChar        *source,
8467              int32_t            sourceLength,
8468              const UChar        *target,
8469              int32_t            targetLength)
8470{
8471    U_ALIGN_CODE(16);
8472
8473    UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8474    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8475        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8476        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8477        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8478    }
8479
8480    if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
8481        // do not crash, but return. Should have
8482        // status argument to return error.
8483        UTRACE_EXIT_VALUE(UCOL_EQUAL);
8484        return UCOL_EQUAL;
8485    }
8486
8487    /* Quick check if source and target are same strings. */
8488    /* They should either both be NULL terminated or the explicit length should be set on both. */
8489    if (source==target && sourceLength==targetLength) {
8490        UTRACE_EXIT_VALUE(UCOL_EQUAL);
8491        return UCOL_EQUAL;
8492    }
8493
8494    if(coll->delegate != NULL) {
8495      UErrorCode status = U_ZERO_ERROR;
8496      return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
8497    }
8498
8499    /* Scan the strings.  Find:                                                             */
8500    /*    The length of any leading portion that is equal                                   */
8501    /*    Whether they are exactly equal.  (in which case we just return)                   */
8502    const UChar    *pSrc    = source;
8503    const UChar    *pTarg   = target;
8504    int32_t        equalLength;
8505
8506    if (sourceLength == -1 && targetLength == -1) {
8507        // Both strings are null terminated.
8508        //    Scan through any leading equal portion.
8509        while (*pSrc == *pTarg && *pSrc != 0) {
8510            pSrc++;
8511            pTarg++;
8512        }
8513        if (*pSrc == 0 && *pTarg == 0) {
8514            UTRACE_EXIT_VALUE(UCOL_EQUAL);
8515            return UCOL_EQUAL;
8516        }
8517        equalLength = (int32_t)(pSrc - source);
8518    }
8519    else
8520    {
8521        // One or both strings has an explicit length.
8522        const UChar    *pSrcEnd = source + sourceLength;
8523        const UChar    *pTargEnd = target + targetLength;
8524
8525        // Scan while the strings are bitwise ==, or until one is exhausted.
8526        for (;;) {
8527            if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8528                break;
8529            }
8530            if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8531                break;
8532            }
8533            if (*pSrc != *pTarg) {
8534                break;
8535            }
8536            pSrc++;
8537            pTarg++;
8538        }
8539        equalLength = (int32_t)(pSrc - source);
8540
8541        // If we made it all the way through both strings, we are done.  They are ==
8542        if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8543            (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
8544        {
8545            UTRACE_EXIT_VALUE(UCOL_EQUAL);
8546            return UCOL_EQUAL;
8547        }
8548    }
8549    if (equalLength > 0) {
8550        /* There is an identical portion at the beginning of the two strings.        */
8551        /*   If the identical portion ends within a contraction or a comibining      */
8552        /*   character sequence, back up to the start of that sequence.              */
8553
8554        // These values should already be set by the code above.
8555        //pSrc  = source + equalLength;        /* point to the first differing chars   */
8556        //pTarg = target + equalLength;
8557        if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8558            (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8559        {
8560            // We are stopped in the middle of a contraction.
8561            // Scan backwards through the == part of the string looking for the start of the contraction.
8562            //   It doesn't matter which string we scan, since they are the same in this region.
8563            do
8564            {
8565                equalLength--;
8566                pSrc--;
8567            }
8568            while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8569        }
8570
8571        source += equalLength;
8572        target += equalLength;
8573        if (sourceLength > 0) {
8574            sourceLength -= equalLength;
8575        }
8576        if (targetLength > 0) {
8577            targetLength -= equalLength;
8578        }
8579    }
8580
8581    UErrorCode status = U_ZERO_ERROR;
8582    UCollationResult returnVal;
8583    if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8584        returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
8585    } else {
8586        returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8587    }
8588    UTRACE_EXIT_VALUE(returnVal);
8589    return returnVal;
8590}
8591
8592U_CAPI UCollationResult U_EXPORT2
8593ucol_strcollUTF8(
8594        const UCollator *coll,
8595        const char      *source,
8596        int32_t         sourceLength,
8597        const char      *target,
8598        int32_t         targetLength,
8599        UErrorCode      *status)
8600{
8601    U_ALIGN_CODE(16);
8602
8603    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
8604    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8605        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8606        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
8607        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
8608    }
8609
8610    if (U_FAILURE(*status)) {
8611        /* do nothing */
8612        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8613        return UCOL_EQUAL;
8614    }
8615
8616    if((source == NULL && sourceLength != 0) || (target == NULL && targetLength != 0)) {
8617        *status = U_ILLEGAL_ARGUMENT_ERROR;
8618        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8619        return UCOL_EQUAL;
8620    }
8621
8622    /* Quick check if source and target are same strings. */
8623    /* They should either both be NULL terminated or the explicit length should be set on both. */
8624    if (source==target && sourceLength==targetLength) {
8625        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8626        return UCOL_EQUAL;
8627    }
8628
8629    if(coll->delegate != NULL) {
8630        return ((const Collator*)coll->delegate)->compareUTF8(
8631            StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength),
8632            StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength),
8633            *status);
8634    }
8635
8636    /* Scan the strings.  Find:                                                             */
8637    /*    The length of any leading portion that is equal                                   */
8638    /*    Whether they are exactly equal.  (in which case we just return)                   */
8639    const char  *pSrc = source;
8640    const char  *pTarg = target;
8641    UBool       bSrcLimit = FALSE;
8642    UBool       bTargLimit = FALSE;
8643
8644    if (sourceLength == -1 && targetLength == -1) {
8645        // Both strings are null terminated.
8646        //    Scan through any leading equal portion.
8647        while (*pSrc == *pTarg && *pSrc != 0) {
8648            pSrc++;
8649            pTarg++;
8650        }
8651        if (*pSrc == 0 && *pTarg == 0) {
8652            UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8653            return UCOL_EQUAL;
8654        }
8655        bSrcLimit = (*pSrc == 0);
8656        bTargLimit = (*pTarg == 0);
8657    }
8658    else
8659    {
8660        // One or both strings has an explicit length.
8661        const char *pSrcEnd = source + sourceLength;
8662        const char *pTargEnd = target + targetLength;
8663
8664        // Scan while the strings are bitwise ==, or until one is exhausted.
8665        for (;;) {
8666            if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8667                break;
8668            }
8669            if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8670                break;
8671            }
8672            if (*pSrc != *pTarg) {
8673                break;
8674            }
8675            pSrc++;
8676            pTarg++;
8677        }
8678        bSrcLimit = (pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0));
8679        bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
8680
8681        // If we made it all the way through both strings, we are done.  They are ==
8682        if (bSrcLimit &&    /* At end of src string, however it was specified. */
8683            bTargLimit)     /* and also at end of dest string                  */
8684        {
8685            UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8686            return UCOL_EQUAL;
8687        }
8688    }
8689
8690    U_ASSERT(!(bSrcLimit && bTargLimit));
8691
8692    int32_t    equalLength = pSrc - source;
8693    UBool       bSawNonLatin1 = FALSE;
8694
8695    if (equalLength > 0) {
8696        // Align position to the start of UTF-8 code point.
8697        if (bTargLimit) {
8698            U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
8699        } else {
8700            U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
8701        }
8702        pSrc = source + equalLength;
8703        pTarg = target + equalLength;
8704    }
8705
8706    if (equalLength > 0) {
8707        /* There is an identical portion at the beginning of the two strings.        */
8708        /*   If the identical portion ends within a contraction or a comibining      */
8709        /*   character sequence, back up to the start of that sequence.              */
8710        UBool bUnsafeCP = FALSE;
8711        UChar32 uc32 = -1;
8712
8713        if (!bSrcLimit) {
8714            U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);
8715            if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8716                bUnsafeCP = TRUE;
8717            }
8718            bSawNonLatin1 |= (uc32 > 0xff);
8719        }
8720        if (!bTargLimit) {
8721            U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);
8722            if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8723                bUnsafeCP = TRUE;
8724            }
8725            bSawNonLatin1 |= (uc32 > 0xff);
8726        }
8727
8728        if (bUnsafeCP) {
8729            while (equalLength > 0) {
8730                // We are stopped in the middle of a contraction.
8731                // Scan backwards through the == part of the string looking for the start of the contraction.
8732                //   It doesn't matter which string we scan, since they are the same in this region.
8733                U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);
8734                bSawNonLatin1 |= (uc32 > 0xff);
8735                if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
8736                    break;
8737                }
8738            }
8739        }
8740        source += equalLength;
8741        target += equalLength;
8742        if (sourceLength > 0) {
8743            sourceLength -= equalLength;
8744        }
8745        if (targetLength > 0) {
8746            targetLength -= equalLength;
8747        }
8748    } else {
8749        // Lead byte of Latin 1 character is 0x00 - 0xC3
8750        bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3);
8751        bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3);
8752    }
8753
8754    UCollationResult returnVal;
8755
8756    if(!coll->latinOneUse || bSawNonLatin1) {
8757        returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
8758    } else {
8759        returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
8760    }
8761    UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
8762    return returnVal;
8763}
8764
8765
8766/* convenience function for comparing strings */
8767U_CAPI UBool U_EXPORT2
8768ucol_greater(    const    UCollator        *coll,
8769        const    UChar            *source,
8770        int32_t            sourceLength,
8771        const    UChar            *target,
8772        int32_t            targetLength)
8773{
8774    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8775        == UCOL_GREATER);
8776}
8777
8778/* convenience function for comparing strings */
8779U_CAPI UBool U_EXPORT2
8780ucol_greaterOrEqual(    const    UCollator    *coll,
8781            const    UChar        *source,
8782            int32_t        sourceLength,
8783            const    UChar        *target,
8784            int32_t        targetLength)
8785{
8786    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8787        != UCOL_LESS);
8788}
8789
8790/* convenience function for comparing strings */
8791U_CAPI UBool U_EXPORT2
8792ucol_equal(        const    UCollator        *coll,
8793            const    UChar            *source,
8794            int32_t            sourceLength,
8795            const    UChar            *target,
8796            int32_t            targetLength)
8797{
8798    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8799        == UCOL_EQUAL);
8800}
8801
8802U_CAPI void U_EXPORT2
8803ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8804    if(coll && coll->UCA) {
8805        uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8806    }
8807}
8808
8809#endif /* #if !UCONFIG_NO_COLLATION */
8810