1/*
2******************************************************************************
3*   Copyright (C) 2001-2009, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5******************************************************************************
6*
7* File ucoleitr.cpp
8*
9* Modification History:
10*
11* Date        Name        Description
12* 02/15/2001  synwee      Modified all methods to process its own function
13*                         instead of calling the equivalent c++ api (coleitr.h)
14******************************************************************************/
15
16#include "unicode/utypes.h"
17
18#if !UCONFIG_NO_COLLATION
19
20#include "unicode/ucoleitr.h"
21#include "unicode/ustring.h"
22#include "unicode/sortkey.h"
23#include "unicode/uobject.h"
24#include "ucol_imp.h"
25#include "cmemory.h"
26
27U_NAMESPACE_USE
28
29#define BUFFER_LENGTH             100
30
31#define DEFAULT_BUFFER_SIZE 16
32#define BUFFER_GROW 8
33
34#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
35
36#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
37
38#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39
40#define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
41
42#define DELETE_ARRAY(array) uprv_free((void *) (array))
43
44typedef struct collIterate collIterator;
45
46struct RCEI
47{
48    uint32_t ce;
49    int32_t  low;
50    int32_t  high;
51};
52
53U_NAMESPACE_BEGIN
54
55struct RCEBuffer
56{
57    RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
58    RCEI   *buffer;
59    int32_t bufferIndex;
60    int32_t bufferSize;
61
62    RCEBuffer();
63    ~RCEBuffer();
64
65    UBool empty() const;
66    void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
67    const RCEI *get();
68};
69
70RCEBuffer::RCEBuffer()
71{
72    buffer = defaultBuffer;
73    bufferIndex = 0;
74    bufferSize = DEFAULT_BUFFER_SIZE;
75}
76
77RCEBuffer::~RCEBuffer()
78{
79    if (buffer != defaultBuffer) {
80        DELETE_ARRAY(buffer);
81    }
82}
83
84UBool RCEBuffer::empty() const
85{
86    return bufferIndex <= 0;
87}
88
89void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
90{
91    if (bufferIndex >= bufferSize) {
92        RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
93
94        ARRAY_COPY(newBuffer, buffer, bufferSize);
95
96        if (buffer != defaultBuffer) {
97            DELETE_ARRAY(buffer);
98        }
99
100        buffer = newBuffer;
101        bufferSize += BUFFER_GROW;
102    }
103
104    buffer[bufferIndex].ce   = ce;
105    buffer[bufferIndex].low  = ixLow;
106    buffer[bufferIndex].high = ixHigh;
107
108    bufferIndex += 1;
109}
110
111const RCEI *RCEBuffer::get()
112{
113    if (bufferIndex > 0) {
114     return &buffer[--bufferIndex];
115    }
116
117    return NULL;
118}
119
120struct PCEI
121{
122    uint64_t ce;
123    int32_t  low;
124    int32_t  high;
125};
126
127struct PCEBuffer
128{
129    PCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
130    PCEI   *buffer;
131    int32_t bufferIndex;
132    int32_t bufferSize;
133
134    PCEBuffer();
135    ~PCEBuffer();
136
137    void  reset();
138    UBool empty() const;
139    void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
140    const PCEI *get();
141};
142
143PCEBuffer::PCEBuffer()
144{
145    buffer = defaultBuffer;
146    bufferIndex = 0;
147    bufferSize = DEFAULT_BUFFER_SIZE;
148}
149
150PCEBuffer::~PCEBuffer()
151{
152    if (buffer != defaultBuffer) {
153        DELETE_ARRAY(buffer);
154    }
155}
156
157void PCEBuffer::reset()
158{
159    bufferIndex = 0;
160}
161
162UBool PCEBuffer::empty() const
163{
164    return bufferIndex <= 0;
165}
166
167void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
168{
169    if (bufferIndex >= bufferSize) {
170        PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
171
172        ARRAY_COPY(newBuffer, buffer, bufferSize);
173
174        if (buffer != defaultBuffer) {
175            DELETE_ARRAY(buffer);
176        }
177
178        buffer = newBuffer;
179        bufferSize += BUFFER_GROW;
180    }
181
182    buffer[bufferIndex].ce   = ce;
183    buffer[bufferIndex].low  = ixLow;
184    buffer[bufferIndex].high = ixHigh;
185
186    bufferIndex += 1;
187}
188
189const PCEI *PCEBuffer::get()
190{
191    if (bufferIndex > 0) {
192     return &buffer[--bufferIndex];
193    }
194
195    return NULL;
196}
197
198/*
199 * This inherits from UObject so that
200 * it can be allocated by new and the
201 * constructor for PCEBuffer is called.
202 */
203struct UCollationPCE : public UObject
204{
205    PCEBuffer          pceBuffer;
206    UCollationStrength strength;
207    UBool              toShift;
208    UBool              isShifted;
209    uint32_t           variableTop;
210
211    UCollationPCE(UCollationElements *elems);
212    ~UCollationPCE();
213
214    void init(const UCollator *coll);
215
216    virtual UClassID getDynamicClassID() const;
217    static UClassID getStaticClassID();
218};
219
220UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE)
221
222UCollationPCE::UCollationPCE(UCollationElements *elems)
223{
224    init(elems->iteratordata_.coll);
225}
226
227void UCollationPCE::init(const UCollator *coll)
228{
229    UErrorCode status = U_ZERO_ERROR;
230
231    strength    = ucol_getStrength(coll);
232    toShift     = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
233    isShifted   = FALSE;
234    variableTop = coll->variableTopValue << 16;
235}
236
237UCollationPCE::~UCollationPCE()
238{
239    // nothing to do
240}
241
242
243U_NAMESPACE_END
244
245
246inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
247{
248    uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
249
250    // This is clean, but somewhat slow...
251    // We could apply the mask to ce and then
252    // just get all three orders...
253    switch(elems->pce->strength) {
254    default:
255        tertiary = ucol_tertiaryOrder(ce);
256        /* note fall-through */
257
258    case UCOL_SECONDARY:
259        secondary = ucol_secondaryOrder(ce);
260        /* note fall-through */
261
262    case UCOL_PRIMARY:
263        primary = ucol_primaryOrder(ce);
264    }
265
266    // **** This should probably handle continuations too.  ****
267    // **** That means that we need 24 bits for the primary ****
268    // **** instead of the 16 that we're currently using.   ****
269    // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
270    // **** Another complication with continuations is that ****
271    // **** the *second* CE is marked as a continuation, so ****
272    // **** we always have to peek ahead to know how long   ****
273    // **** the primary is...                               ****
274    if (elems->pce->toShift && (elems->pce->variableTop > ce && primary != 0)
275                || (elems->pce->isShifted && primary == 0)) {
276
277        if (primary == 0) {
278            return UCOL_IGNORABLE;
279        }
280
281        if (elems->pce->strength >= UCOL_QUATERNARY) {
282            quaternary = primary;
283        }
284
285        primary = secondary = tertiary = 0;
286        elems->pce->isShifted = TRUE;
287    } else {
288        if (elems->pce->strength >= UCOL_QUATERNARY) {
289            quaternary = 0xFFFF;
290        }
291
292        elems->pce->isShifted = FALSE;
293    }
294
295    return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
296}
297
298U_CAPI void U_EXPORT2
299uprv_init_pce(const UCollationElements *elems)
300{
301    if (elems->pce != NULL) {
302        elems->pce->init(elems->iteratordata_.coll);
303    }
304}
305
306
307
308/* public methods ---------------------------------------------------- */
309
310U_CAPI UCollationElements* U_EXPORT2
311ucol_openElements(const UCollator  *coll,
312                  const UChar      *text,
313                        int32_t    textLength,
314                        UErrorCode *status)
315{
316    UCollationElements *result;
317
318    if (U_FAILURE(*status)) {
319        return NULL;
320    }
321
322    result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements));
323    /* test for NULL */
324    if (result == NULL) {
325        *status = U_MEMORY_ALLOCATION_ERROR;
326        return NULL;
327    }
328
329    result->reset_ = TRUE;
330    result->isWritable = FALSE;
331    result->pce = NULL;
332
333    if (text == NULL) {
334        textLength = 0;
335    }
336    uprv_init_collIterate(coll, text, textLength, &result->iteratordata_);
337
338    return result;
339}
340
341
342U_CAPI void U_EXPORT2
343ucol_closeElements(UCollationElements *elems)
344{
345	if (elems != NULL) {
346	  collIterate *ci = &elems->iteratordata_;
347
348	  if (ci != NULL) {
349		  if (ci->writableBuffer != ci->stackWritableBuffer) {
350			uprv_free(ci->writableBuffer);
351		  }
352
353		  if (ci->extendCEs) {
354			  uprv_free(ci->extendCEs);
355		  }
356
357		  if (ci->offsetBuffer) {
358			  uprv_free(ci->offsetBuffer);
359		  }
360	  }
361
362	  if (elems->isWritable && elems->iteratordata_.string != NULL)
363	  {
364		uprv_free(elems->iteratordata_.string);
365	  }
366
367	  if (elems->pce != NULL) {
368		  delete elems->pce;
369	  }
370
371	  uprv_free(elems);
372	}
373}
374
375U_CAPI void U_EXPORT2
376ucol_reset(UCollationElements *elems)
377{
378    collIterate *ci = &(elems->iteratordata_);
379    elems->reset_   = TRUE;
380    ci->pos         = ci->string;
381    if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) {
382        ci->endp      = ci->string + u_strlen(ci->string);
383    }
384    ci->CEpos       = ci->toReturn = ci->CEs;
385    ci->flags       = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN;
386    if (ci->coll->normalizationMode == UCOL_ON) {
387        ci->flags |= UCOL_ITER_NORM;
388    }
389
390    if (ci->stackWritableBuffer != ci->writableBuffer) {
391        uprv_free(ci->writableBuffer);
392        ci->writableBuffer = ci->stackWritableBuffer;
393        ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
394    }
395    ci->fcdPosition = NULL;
396
397  //ci->offsetReturn = ci->offsetStore = NULL;
398	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
399}
400
401U_CAPI void U_EXPORT2
402ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status)
403{
404    if (U_FAILURE(*status)) {
405        return;
406    }
407
408    if (elems == NULL) {
409        *status = U_ILLEGAL_ARGUMENT_ERROR;
410        return;
411    }
412
413    elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT;
414}
415
416U_CAPI int32_t U_EXPORT2
417ucol_next(UCollationElements *elems,
418          UErrorCode         *status)
419{
420    int32_t result;
421    if (U_FAILURE(*status)) {
422        return UCOL_NULLORDER;
423    }
424
425    elems->reset_ = FALSE;
426
427    result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll,
428                                     &elems->iteratordata_,
429                                     status);
430
431    if (result == UCOL_NO_MORE_CES) {
432        result = UCOL_NULLORDER;
433    }
434    return result;
435}
436
437U_CAPI int64_t U_EXPORT2
438ucol_nextProcessed(UCollationElements *elems,
439                   int32_t            *ixLow,
440                   int32_t            *ixHigh,
441                   UErrorCode         *status)
442{
443    const UCollator *coll = elems->iteratordata_.coll;
444    int64_t result = UCOL_IGNORABLE;
445    uint32_t low = 0, high = 0;
446
447    if (U_FAILURE(*status)) {
448        return UCOL_PROCESSED_NULLORDER;
449    }
450
451    if (elems->pce == NULL) {
452        elems->pce = new UCollationPCE(elems);
453    } else {
454        elems->pce->pceBuffer.reset();
455    }
456
457    elems->reset_ = FALSE;
458
459    do {
460        low = ucol_getOffset(elems);
461        uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status);
462        high = ucol_getOffset(elems);
463
464        if (ce == UCOL_NO_MORE_CES) {
465             result = UCOL_PROCESSED_NULLORDER;
466             break;
467        }
468
469        result = processCE(elems, ce);
470    } while (result == UCOL_IGNORABLE);
471
472    if (ixLow != NULL) {
473        *ixLow = low;
474    }
475
476    if (ixHigh != NULL) {
477        *ixHigh = high;
478    }
479
480    return result;
481}
482
483U_CAPI int32_t U_EXPORT2
484ucol_previous(UCollationElements *elems,
485              UErrorCode         *status)
486{
487    if(U_FAILURE(*status)) {
488        return UCOL_NULLORDER;
489    }
490    else
491    {
492        int32_t result;
493
494        if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) {
495            if (elems->iteratordata_.endp == NULL) {
496                elems->iteratordata_.endp = elems->iteratordata_.string +
497                                            u_strlen(elems->iteratordata_.string);
498                elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
499            }
500            elems->iteratordata_.pos = elems->iteratordata_.endp;
501            elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
502        }
503
504        elems->reset_ = FALSE;
505
506        result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll,
507                                         &(elems->iteratordata_),
508                                         status);
509
510        if (result == UCOL_NO_MORE_CES) {
511            result = UCOL_NULLORDER;
512        }
513
514        return result;
515    }
516}
517
518U_CAPI int64_t U_EXPORT2
519ucol_previousProcessed(UCollationElements *elems,
520                   int32_t            *ixLow,
521                   int32_t            *ixHigh,
522                   UErrorCode         *status)
523{
524    const UCollator *coll = elems->iteratordata_.coll;
525    int64_t result = UCOL_IGNORABLE;
526 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
527 // UCollationStrength strength = ucol_getStrength(coll);
528 //  UBool toShift   = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) ==  UCOL_SHIFTED;
529 // uint32_t variableTop = coll->variableTopValue;
530    int32_t  low = 0, high = 0;
531
532    if (U_FAILURE(*status)) {
533        return UCOL_PROCESSED_NULLORDER;
534    }
535
536    if (elems->reset_ &&
537        (elems->iteratordata_.pos == elems->iteratordata_.string)) {
538        if (elems->iteratordata_.endp == NULL) {
539            elems->iteratordata_.endp = elems->iteratordata_.string +
540                                        u_strlen(elems->iteratordata_.string);
541            elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
542        }
543
544        elems->iteratordata_.pos = elems->iteratordata_.endp;
545        elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
546    }
547
548    if (elems->pce == NULL) {
549        elems->pce = new UCollationPCE(elems);
550    } else {
551      //elems->pce->pceBuffer.reset();
552    }
553
554    elems->reset_ = FALSE;
555
556    while (elems->pce->pceBuffer.empty()) {
557        // buffer raw CEs up to non-ignorable primary
558        RCEBuffer rceb;
559        uint32_t ce;
560
561        // **** do we need to reset rceb, or will it always be empty at this point ****
562        do {
563            high = ucol_getOffset(elems);
564            ce   = ucol_getPrevCE(coll, &elems->iteratordata_, status);
565            low  = ucol_getOffset(elems);
566
567            if (ce == UCOL_NO_MORE_CES) {
568                if (! rceb.empty()) {
569                    break;
570                }
571
572                goto finish;
573            }
574
575            rceb.put(ce, low, high);
576        } while ((ce & UCOL_PRIMARYMASK) == 0);
577
578        // process the raw CEs
579        while (! rceb.empty()) {
580            const RCEI *rcei = rceb.get();
581
582            result = processCE(elems, rcei->ce);
583
584            if (result != UCOL_IGNORABLE) {
585                elems->pce->pceBuffer.put(result, rcei->low, rcei->high);
586            }
587        }
588    }
589
590finish:
591    if (elems->pce->pceBuffer.empty()) {
592        // **** Is -1 the right value for ixLow, ixHigh? ****
593    	if (ixLow != NULL) {
594    		*ixLow = -1;
595    	}
596
597    	if (ixHigh != NULL) {
598    		*ixHigh = -1
599    		;
600    	}
601        return UCOL_PROCESSED_NULLORDER;
602    }
603
604    const PCEI *pcei = elems->pce->pceBuffer.get();
605
606    if (ixLow != NULL) {
607        *ixLow = pcei->low;
608    }
609
610    if (ixHigh != NULL) {
611        *ixHigh = pcei->high;
612    }
613
614    return pcei->ce;
615}
616
617U_CAPI int32_t U_EXPORT2
618ucol_getMaxExpansion(const UCollationElements *elems,
619                           int32_t            order)
620{
621    uint8_t result;
622
623#if 0
624    UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result);
625#else
626    const UCollator *coll = elems->iteratordata_.coll;
627    const uint32_t *start;
628    const uint32_t *limit;
629    const uint32_t *mid;
630          uint32_t strengthMask = 0;
631          uint32_t mOrder = (uint32_t) order;
632
633    switch (coll->strength)
634    {
635    default:
636        strengthMask |= UCOL_TERTIARYORDERMASK;
637        /* fall through */
638
639    case UCOL_SECONDARY:
640        strengthMask |= UCOL_SECONDARYORDERMASK;
641        /* fall through */
642
643    case UCOL_PRIMARY:
644        strengthMask |= UCOL_PRIMARYORDERMASK;
645    }
646
647    mOrder &= strengthMask;
648    start = (coll)->endExpansionCE;
649    limit = (coll)->lastEndExpansionCE;
650
651    while (start < limit - 1) {
652        mid = start + ((limit - start) >> 1);
653        if (mOrder <= (*mid & strengthMask)) {
654          limit = mid;
655        } else {
656          start = mid;
657        }
658    }
659
660    // FIXME: with a masked search, there might be more than one hit,
661    // so we need to look forward and backward from the match to find all
662    // of the hits...
663    if ((*start & strengthMask) == mOrder) {
664        result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE));
665    } else if ((*limit & strengthMask) == mOrder) {
666         result = *(coll->expansionCESize + (limit - coll->endExpansionCE));
667   } else if ((mOrder & 0xFFFF) == 0x00C0) {
668        result = 2;
669   } else {
670       result = 1;
671   }
672#endif
673
674    return result;
675}
676
677U_CAPI void U_EXPORT2
678ucol_setText(      UCollationElements *elems,
679             const UChar              *text,
680                   int32_t            textLength,
681                   UErrorCode         *status)
682{
683    if (U_FAILURE(*status)) {
684        return;
685    }
686
687    if (elems->isWritable && elems->iteratordata_.string != NULL)
688    {
689        uprv_free(elems->iteratordata_.string);
690    }
691
692    if (text == NULL) {
693        textLength = 0;
694    }
695
696    elems->isWritable = FALSE;
697
698    /* free offset buffer to avoid memory leak before initializing. */
699    ucol_freeOffsetBuffer(&(elems->iteratordata_));
700    uprv_init_collIterate(elems->iteratordata_.coll, text, textLength,
701                          &elems->iteratordata_);
702
703    elems->reset_   = TRUE;
704}
705
706U_CAPI int32_t U_EXPORT2
707ucol_getOffset(const UCollationElements *elems)
708{
709  const collIterate *ci = &(elems->iteratordata_);
710
711  if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) {
712      return ci->offsetRepeatValue;
713  }
714
715  if (ci->offsetReturn != NULL) {
716      return *ci->offsetReturn;
717  }
718
719  // while processing characters in normalization buffer getOffset will
720  // return the next non-normalized character.
721  // should be inline with the old implementation since the old codes uses
722  // nextDecomp in normalizer which also decomposes the string till the
723  // first base character is found.
724  if (ci->flags & UCOL_ITER_INNORMBUF) {
725      if (ci->fcdPosition == NULL) {
726        return 0;
727      }
728      return (int32_t)(ci->fcdPosition - ci->string);
729  }
730  else {
731      return (int32_t)(ci->pos - ci->string);
732  }
733}
734
735U_CAPI void U_EXPORT2
736ucol_setOffset(UCollationElements    *elems,
737               int32_t           offset,
738               UErrorCode            *status)
739{
740    if (U_FAILURE(*status)) {
741        return;
742    }
743
744    // this methods will clean up any use of the writable buffer and points to
745    // the original string
746    collIterate *ci = &(elems->iteratordata_);
747    ci->pos         = ci->string + offset;
748    ci->CEpos       = ci->toReturn = ci->CEs;
749    if (ci->flags & UCOL_ITER_INNORMBUF) {
750        ci->flags = ci->origFlags;
751    }
752    if ((ci->flags & UCOL_ITER_HASLEN) == 0) {
753        ci->endp  = ci->string + u_strlen(ci->string);
754        ci->flags |= UCOL_ITER_HASLEN;
755    }
756    ci->fcdPosition = NULL;
757    elems->reset_ = FALSE;
758
759	ci->offsetReturn = NULL;
760    ci->offsetStore = ci->offsetBuffer;
761	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
762}
763
764U_CAPI int32_t U_EXPORT2
765ucol_primaryOrder (int32_t order)
766{
767    order &= UCOL_PRIMARYMASK;
768    return (order >> UCOL_PRIMARYORDERSHIFT);
769}
770
771U_CAPI int32_t U_EXPORT2
772ucol_secondaryOrder (int32_t order)
773{
774    order &= UCOL_SECONDARYMASK;
775    return (order >> UCOL_SECONDARYORDERSHIFT);
776}
777
778U_CAPI int32_t U_EXPORT2
779ucol_tertiaryOrder (int32_t order)
780{
781    return (order & UCOL_TERTIARYMASK);
782}
783
784
785void ucol_freeOffsetBuffer(collIterate *s) {
786    if (s != NULL && s->offsetBuffer != NULL) {
787        uprv_free(s->offsetBuffer);
788        s->offsetBuffer = NULL;
789        s->offsetBufferSize = 0;
790    }
791}
792
793#endif /* #if !UCONFIG_NO_COLLATION */
794