1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*   Copyright (C) 2001-2016, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7******************************************************************************
8*
9* File ucoleitr.cpp
10*
11* Modification History:
12*
13* Date        Name        Description
14* 02/15/2001  synwee      Modified all methods to process its own function
15*                         instead of calling the equivalent c++ api (coleitr.h)
16* 2012-2014   markus      Rewritten in C++ again.
17******************************************************************************/
18
19#include "unicode/utypes.h"
20
21#if !UCONFIG_NO_COLLATION
22
23#include "unicode/coleitr.h"
24#include "unicode/tblcoll.h"
25#include "unicode/ucoleitr.h"
26#include "unicode/ustring.h"
27#include "unicode/sortkey.h"
28#include "unicode/uobject.h"
29#include "cmemory.h"
30#include "usrchimp.h"
31
32U_NAMESPACE_USE
33
34#define BUFFER_LENGTH             100
35
36#define DEFAULT_BUFFER_SIZE 16
37#define BUFFER_GROW 8
38
39#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])
40
41#define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))
42
43#define DELETE_ARRAY(array) uprv_free((void *) (array))
44
45struct RCEI
46{
47    uint32_t ce;
48    int32_t  low;
49    int32_t  high;
50};
51
52U_NAMESPACE_BEGIN
53
54struct RCEBuffer
55{
56    RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
57    RCEI   *buffer;
58    int32_t bufferIndex;
59    int32_t bufferSize;
60
61    RCEBuffer();
62    ~RCEBuffer();
63
64    UBool isEmpty() const;
65    void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
66    const RCEI *get();
67};
68
69RCEBuffer::RCEBuffer()
70{
71    buffer = defaultBuffer;
72    bufferIndex = 0;
73    bufferSize = UPRV_LENGTHOF(defaultBuffer);
74}
75
76RCEBuffer::~RCEBuffer()
77{
78    if (buffer != defaultBuffer) {
79        DELETE_ARRAY(buffer);
80    }
81}
82
83UBool RCEBuffer::isEmpty() const
84{
85    return bufferIndex <= 0;
86}
87
88void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
89{
90    if (U_FAILURE(errorCode)) {
91        return;
92    }
93    if (bufferIndex >= bufferSize) {
94        RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
95        if (newBuffer == NULL) {
96            errorCode = U_MEMORY_ALLOCATION_ERROR;
97            return;
98        }
99
100        ARRAY_COPY(newBuffer, buffer, bufferSize);
101
102        if (buffer != defaultBuffer) {
103            DELETE_ARRAY(buffer);
104        }
105
106        buffer = newBuffer;
107        bufferSize += BUFFER_GROW;
108    }
109
110    buffer[bufferIndex].ce   = ce;
111    buffer[bufferIndex].low  = ixLow;
112    buffer[bufferIndex].high = ixHigh;
113
114    bufferIndex += 1;
115}
116
117const RCEI *RCEBuffer::get()
118{
119    if (bufferIndex > 0) {
120     return &buffer[--bufferIndex];
121    }
122
123    return NULL;
124}
125
126PCEBuffer::PCEBuffer()
127{
128    buffer = defaultBuffer;
129    bufferIndex = 0;
130    bufferSize = UPRV_LENGTHOF(defaultBuffer);
131}
132
133PCEBuffer::~PCEBuffer()
134{
135    if (buffer != defaultBuffer) {
136        DELETE_ARRAY(buffer);
137    }
138}
139
140void PCEBuffer::reset()
141{
142    bufferIndex = 0;
143}
144
145UBool PCEBuffer::isEmpty() const
146{
147    return bufferIndex <= 0;
148}
149
150void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
151{
152    if (U_FAILURE(errorCode)) {
153        return;
154    }
155    if (bufferIndex >= bufferSize) {
156        PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
157        if (newBuffer == NULL) {
158            errorCode = U_MEMORY_ALLOCATION_ERROR;
159            return;
160        }
161
162        ARRAY_COPY(newBuffer, buffer, bufferSize);
163
164        if (buffer != defaultBuffer) {
165            DELETE_ARRAY(buffer);
166        }
167
168        buffer = newBuffer;
169        bufferSize += BUFFER_GROW;
170    }
171
172    buffer[bufferIndex].ce   = ce;
173    buffer[bufferIndex].low  = ixLow;
174    buffer[bufferIndex].high = ixHigh;
175
176    bufferIndex += 1;
177}
178
179const PCEI *PCEBuffer::get()
180{
181    if (bufferIndex > 0) {
182     return &buffer[--bufferIndex];
183    }
184
185    return NULL;
186}
187
188UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
189
190UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
191
192void UCollationPCE::init(UCollationElements *elems) {
193    init(CollationElementIterator::fromUCollationElements(elems));
194}
195
196void UCollationPCE::init(CollationElementIterator *iter)
197{
198    cei = iter;
199    init(*iter->rbc_);
200}
201
202void UCollationPCE::init(const Collator &coll)
203{
204    UErrorCode status = U_ZERO_ERROR;
205
206    strength    = coll.getAttribute(UCOL_STRENGTH, status);
207    toShift     = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
208    isShifted   = FALSE;
209    variableTop = coll.getVariableTop(status);
210}
211
212UCollationPCE::~UCollationPCE()
213{
214    // nothing to do
215}
216
217uint64_t UCollationPCE::processCE(uint32_t ce)
218{
219    uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
220
221    // This is clean, but somewhat slow...
222    // We could apply the mask to ce and then
223    // just get all three orders...
224    switch(strength) {
225    default:
226        tertiary = ucol_tertiaryOrder(ce);
227        U_FALLTHROUGH;
228
229    case UCOL_SECONDARY:
230        secondary = ucol_secondaryOrder(ce);
231        U_FALLTHROUGH;
232
233    case UCOL_PRIMARY:
234        primary = ucol_primaryOrder(ce);
235    }
236
237    // **** This should probably handle continuations too.  ****
238    // **** That means that we need 24 bits for the primary ****
239    // **** instead of the 16 that we're currently using.   ****
240    // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
241    // **** Another complication with continuations is that ****
242    // **** the *second* CE is marked as a continuation, so ****
243    // **** we always have to peek ahead to know how long   ****
244    // **** the primary is...                               ****
245    if ((toShift && variableTop > ce && primary != 0)
246                || (isShifted && primary == 0)) {
247
248        if (primary == 0) {
249            return UCOL_IGNORABLE;
250        }
251
252        if (strength >= UCOL_QUATERNARY) {
253            quaternary = primary;
254        }
255
256        primary = secondary = tertiary = 0;
257        isShifted = TRUE;
258    } else {
259        if (strength >= UCOL_QUATERNARY) {
260            quaternary = 0xFFFF;
261        }
262
263        isShifted = FALSE;
264    }
265
266    return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
267}
268
269U_NAMESPACE_END
270
271/* public methods ---------------------------------------------------- */
272
273U_CAPI UCollationElements* U_EXPORT2
274ucol_openElements(const UCollator  *coll,
275                  const UChar      *text,
276                        int32_t    textLength,
277                        UErrorCode *status)
278{
279    if (U_FAILURE(*status)) {
280        return NULL;
281    }
282    if (coll == NULL || (text == NULL && textLength != 0)) {
283        *status = U_ILLEGAL_ARGUMENT_ERROR;
284        return NULL;
285    }
286    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
287    if (rbc == NULL) {
288        *status = U_UNSUPPORTED_ERROR;  // coll is a Collator but not a RuleBasedCollator
289        return NULL;
290    }
291
292    UnicodeString s((UBool)(textLength < 0), text, textLength);
293    CollationElementIterator *cei = rbc->createCollationElementIterator(s);
294    if (cei == NULL) {
295        *status = U_MEMORY_ALLOCATION_ERROR;
296        return NULL;
297    }
298
299    return cei->toUCollationElements();
300}
301
302
303U_CAPI void U_EXPORT2
304ucol_closeElements(UCollationElements *elems)
305{
306    delete CollationElementIterator::fromUCollationElements(elems);
307}
308
309U_CAPI void U_EXPORT2
310ucol_reset(UCollationElements *elems)
311{
312    CollationElementIterator::fromUCollationElements(elems)->reset();
313}
314
315U_CAPI int32_t U_EXPORT2
316ucol_next(UCollationElements *elems,
317          UErrorCode         *status)
318{
319    if (U_FAILURE(*status)) {
320        return UCOL_NULLORDER;
321    }
322
323    return CollationElementIterator::fromUCollationElements(elems)->next(*status);
324}
325
326U_NAMESPACE_BEGIN
327
328int64_t
329UCollationPCE::nextProcessed(
330                   int32_t            *ixLow,
331                   int32_t            *ixHigh,
332                   UErrorCode         *status)
333{
334    int64_t result = UCOL_IGNORABLE;
335    uint32_t low = 0, high = 0;
336
337    if (U_FAILURE(*status)) {
338        return UCOL_PROCESSED_NULLORDER;
339    }
340
341    pceBuffer.reset();
342
343    do {
344        low = cei->getOffset();
345        int32_t ce = cei->next(*status);
346        high = cei->getOffset();
347
348        if (ce == UCOL_NULLORDER) {
349             result = UCOL_PROCESSED_NULLORDER;
350             break;
351        }
352
353        result = processCE((uint32_t)ce);
354    } while (result == UCOL_IGNORABLE);
355
356    if (ixLow != NULL) {
357        *ixLow = low;
358    }
359
360    if (ixHigh != NULL) {
361        *ixHigh = high;
362    }
363
364    return result;
365}
366
367U_NAMESPACE_END
368
369U_CAPI int32_t U_EXPORT2
370ucol_previous(UCollationElements *elems,
371              UErrorCode         *status)
372{
373    if(U_FAILURE(*status)) {
374        return UCOL_NULLORDER;
375    }
376    return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
377}
378
379U_NAMESPACE_BEGIN
380
381int64_t
382UCollationPCE::previousProcessed(
383                   int32_t            *ixLow,
384                   int32_t            *ixHigh,
385                   UErrorCode         *status)
386{
387    int64_t result = UCOL_IGNORABLE;
388    int32_t  low = 0, high = 0;
389
390    if (U_FAILURE(*status)) {
391        return UCOL_PROCESSED_NULLORDER;
392    }
393
394    // pceBuffer.reset();
395
396    while (pceBuffer.isEmpty()) {
397        // buffer raw CEs up to non-ignorable primary
398        RCEBuffer rceb;
399        int32_t ce;
400
401        // **** do we need to reset rceb, or will it always be empty at this point ****
402        do {
403            high = cei->getOffset();
404            ce   = cei->previous(*status);
405            low  = cei->getOffset();
406
407            if (ce == UCOL_NULLORDER) {
408                if (!rceb.isEmpty()) {
409                    break;
410                }
411
412                goto finish;
413            }
414
415            rceb.put((uint32_t)ce, low, high, *status);
416        } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)));
417
418        // process the raw CEs
419        while (U_SUCCESS(*status) && !rceb.isEmpty()) {
420            const RCEI *rcei = rceb.get();
421
422            result = processCE(rcei->ce);
423
424            if (result != UCOL_IGNORABLE) {
425                pceBuffer.put(result, rcei->low, rcei->high, *status);
426            }
427        }
428        if (U_FAILURE(*status)) {
429            return UCOL_PROCESSED_NULLORDER;
430        }
431    }
432
433finish:
434    if (pceBuffer.isEmpty()) {
435        // **** Is -1 the right value for ixLow, ixHigh? ****
436    	if (ixLow != NULL) {
437    		*ixLow = -1;
438    	}
439
440    	if (ixHigh != NULL) {
441    		*ixHigh = -1
442    		;
443    	}
444        return UCOL_PROCESSED_NULLORDER;
445    }
446
447    const PCEI *pcei = pceBuffer.get();
448
449    if (ixLow != NULL) {
450        *ixLow = pcei->low;
451    }
452
453    if (ixHigh != NULL) {
454        *ixHigh = pcei->high;
455    }
456
457    return pcei->ce;
458}
459
460U_NAMESPACE_END
461
462U_CAPI int32_t U_EXPORT2
463ucol_getMaxExpansion(const UCollationElements *elems,
464                           int32_t            order)
465{
466    return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
467
468    // TODO: The old code masked the order according to strength and then did a binary search.
469    // However this was probably at least partially broken because of the following comment.
470    // Still, it might have found a match when this version may not.
471
472    // FIXME: with a masked search, there might be more than one hit,
473    // so we need to look forward and backward from the match to find all
474    // of the hits...
475}
476
477U_CAPI void U_EXPORT2
478ucol_setText(      UCollationElements *elems,
479             const UChar              *text,
480                   int32_t            textLength,
481                   UErrorCode         *status)
482{
483    if (U_FAILURE(*status)) {
484        return;
485    }
486
487    if ((text == NULL && textLength != 0)) {
488        *status = U_ILLEGAL_ARGUMENT_ERROR;
489        return;
490    }
491    UnicodeString s((UBool)(textLength < 0), text, textLength);
492    return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
493}
494
495U_CAPI int32_t U_EXPORT2
496ucol_getOffset(const UCollationElements *elems)
497{
498    return CollationElementIterator::fromUCollationElements(elems)->getOffset();
499}
500
501U_CAPI void U_EXPORT2
502ucol_setOffset(UCollationElements    *elems,
503               int32_t           offset,
504               UErrorCode            *status)
505{
506    if (U_FAILURE(*status)) {
507        return;
508    }
509
510    CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
511}
512
513U_CAPI int32_t U_EXPORT2
514ucol_primaryOrder (int32_t order)
515{
516    return (order >> 16) & 0xffff;
517}
518
519U_CAPI int32_t U_EXPORT2
520ucol_secondaryOrder (int32_t order)
521{
522    return (order >> 8) & 0xff;
523}
524
525U_CAPI int32_t U_EXPORT2
526ucol_tertiaryOrder (int32_t order)
527{
528    return order & 0xff;
529}
530
531#endif /* #if !UCONFIG_NO_COLLATION */
532