ucoleitr.cpp revision c14898b482f76ecab9026615e2e4c6fe78358bdc
1/*
2******************************************************************************
3*   Copyright (C) 2001-2015, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5******************************************************************************
6*
7* File ucoleitr.cpp
8*
9* Modification History:
10*
11* Date        Name        Description
12* 02/15/2001  synwee      Modified all methods to process its own function
13*                         instead of calling the equivalent c++ api (coleitr.h)
14* 2012-2014   markus      Rewritten in C++ again.
15******************************************************************************/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION
20
21#include "unicode/coleitr.h"
22#include "unicode/tblcoll.h"
23#include "unicode/ucoleitr.h"
24#include "unicode/ustring.h"
25#include "unicode/sortkey.h"
26#include "unicode/uobject.h"
27#include "cmemory.h"
28#include "usrchimp.h"
29
30U_NAMESPACE_USE
31
32#define BUFFER_LENGTH             100
33
34#define DEFAULT_BUFFER_SIZE 16
35#define BUFFER_GROW 8
36
37#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
38
39#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
40
41#define DELETE_ARRAY(array) uprv_free((void *) (array))
42
43struct RCEI
44{
45    uint32_t ce;
46    int32_t  low;
47    int32_t  high;
48};
49
50U_NAMESPACE_BEGIN
51
52struct RCEBuffer
53{
54    RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
55    RCEI   *buffer;
56    int32_t bufferIndex;
57    int32_t bufferSize;
58
59    RCEBuffer();
60    ~RCEBuffer();
61
62    UBool isEmpty() const;
63    void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
64    const RCEI *get();
65};
66
67RCEBuffer::RCEBuffer()
68{
69    buffer = defaultBuffer;
70    bufferIndex = 0;
71    bufferSize = UPRV_LENGTHOF(defaultBuffer);
72}
73
74RCEBuffer::~RCEBuffer()
75{
76    if (buffer != defaultBuffer) {
77        DELETE_ARRAY(buffer);
78    }
79}
80
81UBool RCEBuffer::isEmpty() const
82{
83    return bufferIndex <= 0;
84}
85
86void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
87{
88    if (U_FAILURE(errorCode)) {
89        return;
90    }
91    if (bufferIndex >= bufferSize) {
92        RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
93        if (newBuffer == NULL) {
94            errorCode = U_MEMORY_ALLOCATION_ERROR;
95            return;
96        }
97
98        ARRAY_COPY(newBuffer, buffer, bufferSize);
99
100        if (buffer != defaultBuffer) {
101            DELETE_ARRAY(buffer);
102        }
103
104        buffer = newBuffer;
105        bufferSize += BUFFER_GROW;
106    }
107
108    buffer[bufferIndex].ce   = ce;
109    buffer[bufferIndex].low  = ixLow;
110    buffer[bufferIndex].high = ixHigh;
111
112    bufferIndex += 1;
113}
114
115const RCEI *RCEBuffer::get()
116{
117    if (bufferIndex > 0) {
118     return &buffer[--bufferIndex];
119    }
120
121    return NULL;
122}
123
124PCEBuffer::PCEBuffer()
125{
126    buffer = defaultBuffer;
127    bufferIndex = 0;
128    bufferSize = UPRV_LENGTHOF(defaultBuffer);
129}
130
131PCEBuffer::~PCEBuffer()
132{
133    if (buffer != defaultBuffer) {
134        DELETE_ARRAY(buffer);
135    }
136}
137
138void PCEBuffer::reset()
139{
140    bufferIndex = 0;
141}
142
143UBool PCEBuffer::isEmpty() const
144{
145    return bufferIndex <= 0;
146}
147
148void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
149{
150    if (U_FAILURE(errorCode)) {
151        return;
152    }
153    if (bufferIndex >= bufferSize) {
154        PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
155        if (newBuffer == NULL) {
156            errorCode = U_MEMORY_ALLOCATION_ERROR;
157            return;
158        }
159
160        ARRAY_COPY(newBuffer, buffer, bufferSize);
161
162        if (buffer != defaultBuffer) {
163            DELETE_ARRAY(buffer);
164        }
165
166        buffer = newBuffer;
167        bufferSize += BUFFER_GROW;
168    }
169
170    buffer[bufferIndex].ce   = ce;
171    buffer[bufferIndex].low  = ixLow;
172    buffer[bufferIndex].high = ixHigh;
173
174    bufferIndex += 1;
175}
176
177const PCEI *PCEBuffer::get()
178{
179    if (bufferIndex > 0) {
180     return &buffer[--bufferIndex];
181    }
182
183    return NULL;
184}
185
186UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }
187
188UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }
189
190void UCollationPCE::init(UCollationElements *elems) {
191    init(CollationElementIterator::fromUCollationElements(elems));
192}
193
194void UCollationPCE::init(CollationElementIterator *iter)
195{
196    cei = iter;
197    init(*iter->rbc_);
198}
199
200void UCollationPCE::init(const Collator &coll)
201{
202    UErrorCode status = U_ZERO_ERROR;
203
204    strength    = coll.getAttribute(UCOL_STRENGTH, status);
205    toShift     = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
206    isShifted   = FALSE;
207    variableTop = coll.getVariableTop(status);
208}
209
210UCollationPCE::~UCollationPCE()
211{
212    // nothing to do
213}
214
215uint64_t UCollationPCE::processCE(uint32_t ce)
216{
217    uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
218
219    // This is clean, but somewhat slow...
220    // We could apply the mask to ce and then
221    // just get all three orders...
222    switch(strength) {
223    default:
224        tertiary = ucol_tertiaryOrder(ce);
225        /* note fall-through */
226
227    case UCOL_SECONDARY:
228        secondary = ucol_secondaryOrder(ce);
229        /* note fall-through */
230
231    case UCOL_PRIMARY:
232        primary = ucol_primaryOrder(ce);
233    }
234
235    // **** This should probably handle continuations too.  ****
236    // **** That means that we need 24 bits for the primary ****
237    // **** instead of the 16 that we're currently using.   ****
238    // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
239    // **** Another complication with continuations is that ****
240    // **** the *second* CE is marked as a continuation, so ****
241    // **** we always have to peek ahead to know how long   ****
242    // **** the primary is...                               ****
243    if ((toShift && variableTop > ce && primary != 0)
244                || (isShifted && primary == 0)) {
245
246        if (primary == 0) {
247            return UCOL_IGNORABLE;
248        }
249
250        if (strength >= UCOL_QUATERNARY) {
251            quaternary = primary;
252        }
253
254        primary = secondary = tertiary = 0;
255        isShifted = TRUE;
256    } else {
257        if (strength >= UCOL_QUATERNARY) {
258            quaternary = 0xFFFF;
259        }
260
261        isShifted = FALSE;
262    }
263
264    return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
265}
266
267U_NAMESPACE_END
268
269/* public methods ---------------------------------------------------- */
270
271U_CAPI UCollationElements* U_EXPORT2
272ucol_openElements(const UCollator  *coll,
273                  const UChar      *text,
274                        int32_t    textLength,
275                        UErrorCode *status)
276{
277    if (U_FAILURE(*status)) {
278        return NULL;
279    }
280    if (coll == NULL || (text == NULL && textLength != 0)) {
281        *status = U_ILLEGAL_ARGUMENT_ERROR;
282        return NULL;
283    }
284    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
285    if (rbc == NULL) {
286        *status = U_UNSUPPORTED_ERROR;  // coll is a Collator but not a RuleBasedCollator
287        return NULL;
288    }
289
290    UnicodeString s((UBool)(textLength < 0), text, textLength);
291    CollationElementIterator *cei = rbc->createCollationElementIterator(s);
292    if (cei == NULL) {
293        *status = U_MEMORY_ALLOCATION_ERROR;
294        return NULL;
295    }
296
297    return cei->toUCollationElements();
298}
299
300
301U_CAPI void U_EXPORT2
302ucol_closeElements(UCollationElements *elems)
303{
304    delete CollationElementIterator::fromUCollationElements(elems);
305}
306
307U_CAPI void U_EXPORT2
308ucol_reset(UCollationElements *elems)
309{
310    CollationElementIterator::fromUCollationElements(elems)->reset();
311}
312
313U_CAPI int32_t U_EXPORT2
314ucol_next(UCollationElements *elems,
315          UErrorCode         *status)
316{
317    if (U_FAILURE(*status)) {
318        return UCOL_NULLORDER;
319    }
320
321    return CollationElementIterator::fromUCollationElements(elems)->next(*status);
322}
323
324U_NAMESPACE_BEGIN
325
326int64_t
327UCollationPCE::nextProcessed(
328                   int32_t            *ixLow,
329                   int32_t            *ixHigh,
330                   UErrorCode         *status)
331{
332    int64_t result = UCOL_IGNORABLE;
333    uint32_t low = 0, high = 0;
334
335    if (U_FAILURE(*status)) {
336        return UCOL_PROCESSED_NULLORDER;
337    }
338
339    pceBuffer.reset();
340
341    do {
342        low = cei->getOffset();
343        int32_t ce = cei->next(*status);
344        high = cei->getOffset();
345
346        if (ce == UCOL_NULLORDER) {
347             result = UCOL_PROCESSED_NULLORDER;
348             break;
349        }
350
351        result = processCE((uint32_t)ce);
352    } while (result == UCOL_IGNORABLE);
353
354    if (ixLow != NULL) {
355        *ixLow = low;
356    }
357
358    if (ixHigh != NULL) {
359        *ixHigh = high;
360    }
361
362    return result;
363}
364
365U_NAMESPACE_END
366
367U_CAPI int32_t U_EXPORT2
368ucol_previous(UCollationElements *elems,
369              UErrorCode         *status)
370{
371    if(U_FAILURE(*status)) {
372        return UCOL_NULLORDER;
373    }
374    return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
375}
376
377U_NAMESPACE_BEGIN
378
379int64_t
380UCollationPCE::previousProcessed(
381                   int32_t            *ixLow,
382                   int32_t            *ixHigh,
383                   UErrorCode         *status)
384{
385    int64_t result = UCOL_IGNORABLE;
386    int32_t  low = 0, high = 0;
387
388    if (U_FAILURE(*status)) {
389        return UCOL_PROCESSED_NULLORDER;
390    }
391
392    // pceBuffer.reset();
393
394    while (pceBuffer.isEmpty()) {
395        // buffer raw CEs up to non-ignorable primary
396        RCEBuffer rceb;
397        int32_t ce;
398
399        // **** do we need to reset rceb, or will it always be empty at this point ****
400        do {
401            high = cei->getOffset();
402            ce   = cei->previous(*status);
403            low  = cei->getOffset();
404
405            if (ce == UCOL_NULLORDER) {
406                if (!rceb.isEmpty()) {
407                    break;
408                }
409
410                goto finish;
411            }
412
413            rceb.put((uint32_t)ce, low, high, *status);
414        } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)));
415
416        // process the raw CEs
417        while (U_SUCCESS(*status) && !rceb.isEmpty()) {
418            const RCEI *rcei = rceb.get();
419
420            result = processCE(rcei->ce);
421
422            if (result != UCOL_IGNORABLE) {
423                pceBuffer.put(result, rcei->low, rcei->high, *status);
424            }
425        }
426        if (U_FAILURE(*status)) {
427            return UCOL_PROCESSED_NULLORDER;
428        }
429    }
430
431finish:
432    if (pceBuffer.isEmpty()) {
433        // **** Is -1 the right value for ixLow, ixHigh? ****
434    	if (ixLow != NULL) {
435    		*ixLow = -1;
436    	}
437
438    	if (ixHigh != NULL) {
439    		*ixHigh = -1
440    		;
441    	}
442        return UCOL_PROCESSED_NULLORDER;
443    }
444
445    const PCEI *pcei = pceBuffer.get();
446
447    if (ixLow != NULL) {
448        *ixLow = pcei->low;
449    }
450
451    if (ixHigh != NULL) {
452        *ixHigh = pcei->high;
453    }
454
455    return pcei->ce;
456}
457
458U_NAMESPACE_END
459
460U_CAPI int32_t U_EXPORT2
461ucol_getMaxExpansion(const UCollationElements *elems,
462                           int32_t            order)
463{
464    return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);
465
466    // TODO: The old code masked the order according to strength and then did a binary search.
467    // However this was probably at least partially broken because of the following comment.
468    // Still, it might have found a match when this version may not.
469
470    // FIXME: with a masked search, there might be more than one hit,
471    // so we need to look forward and backward from the match to find all
472    // of the hits...
473}
474
475U_CAPI void U_EXPORT2
476ucol_setText(      UCollationElements *elems,
477             const UChar              *text,
478                   int32_t            textLength,
479                   UErrorCode         *status)
480{
481    if (U_FAILURE(*status)) {
482        return;
483    }
484
485    if ((text == NULL && textLength != 0)) {
486        *status = U_ILLEGAL_ARGUMENT_ERROR;
487        return;
488    }
489    UnicodeString s((UBool)(textLength < 0), text, textLength);
490    return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
491}
492
493U_CAPI int32_t U_EXPORT2
494ucol_getOffset(const UCollationElements *elems)
495{
496    return CollationElementIterator::fromUCollationElements(elems)->getOffset();
497}
498
499U_CAPI void U_EXPORT2
500ucol_setOffset(UCollationElements    *elems,
501               int32_t           offset,
502               UErrorCode            *status)
503{
504    if (U_FAILURE(*status)) {
505        return;
506    }
507
508    CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
509}
510
511U_CAPI int32_t U_EXPORT2
512ucol_primaryOrder (int32_t order)
513{
514    return (order >> 16) & 0xffff;
515}
516
517U_CAPI int32_t U_EXPORT2
518ucol_secondaryOrder (int32_t order)
519{
520    return (order >> 8) & 0xff;
521}
522
523U_CAPI int32_t U_EXPORT2
524ucol_tertiaryOrder (int32_t order)
525{
526    return order & 0xff;
527}
528
529#endif /* #if !UCONFIG_NO_COLLATION */
530