ucol.cpp revision c73f511526464f8e56c242df80552e9b0d94ae3d
1/*
2*******************************************************************************
3*   Copyright (C) 1996-2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  ucol.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11* Modification history
12* Date        Name      Comments
13* 1996-1999   various members of ICU team maintained C API for collation framework
14* 02/16/2001  synwee    Added internal method getPrevSpecialCE
15* 03/01/2001  synwee    Added maxexpansion functionality.
16* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
17* 2012-2014   markus    Rewritten in C++ again.
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_COLLATION
23
24#include "unicode/coll.h"
25#include "unicode/tblcoll.h"
26#include "unicode/bytestream.h"
27#include "unicode/coleitr.h"
28#include "unicode/ucoleitr.h"
29#include "unicode/ustring.h"
30#include "cmemory.h"
31#include "collation.h"
32#include "cstring.h"
33#include "putilimp.h"
34#include "uassert.h"
35#include "utracimp.h"
36
37U_NAMESPACE_USE
38
39U_CAPI UCollator* U_EXPORT2
40ucol_openBinary(const uint8_t *bin, int32_t length,
41                const UCollator *base,
42                UErrorCode *status)
43{
44    if(U_FAILURE(*status)) { return NULL; }
45    RuleBasedCollator *coll = new RuleBasedCollator(
46            bin, length,
47            RuleBasedCollator::rbcFromUCollator(base),
48            *status);
49    if(coll == NULL) {
50        *status = U_MEMORY_ALLOCATION_ERROR;
51        return NULL;
52    }
53    if(U_FAILURE(*status)) {
54        delete coll;
55        return NULL;
56    }
57    return coll->toUCollator();
58}
59
60U_CAPI int32_t U_EXPORT2
61ucol_cloneBinary(const UCollator *coll,
62                 uint8_t *buffer, int32_t capacity,
63                 UErrorCode *status)
64{
65    if(U_FAILURE(*status)) {
66        return 0;
67    }
68    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
69    if(rbc == NULL && coll != NULL) {
70        *status = U_UNSUPPORTED_ERROR;
71        return 0;
72    }
73    return rbc->cloneBinary(buffer, capacity, *status);
74}
75
76U_CAPI UCollator* U_EXPORT2
77ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status)
78{
79    if (status == NULL || U_FAILURE(*status)){
80        return NULL;
81    }
82    if (coll == NULL) {
83       *status = U_ILLEGAL_ARGUMENT_ERROR;
84        return NULL;
85    }
86    if (pBufferSize != NULL) {
87        int32_t inputSize = *pBufferSize;
88        *pBufferSize = 1;
89        if (inputSize == 0) {
90            return NULL;  // preflighting for deprecated functionality
91        }
92    }
93    Collator *newColl = Collator::fromUCollator(coll)->clone();
94    if (newColl == NULL) {
95        *status = U_MEMORY_ALLOCATION_ERROR;
96    } else {
97        *status = U_SAFECLONE_ALLOCATED_WARNING;
98    }
99    return newColl->toUCollator();
100}
101
102U_CAPI void U_EXPORT2
103ucol_close(UCollator *coll)
104{
105    UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
106    UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
107    if(coll != NULL) {
108        delete Collator::fromUCollator(coll);
109    }
110    UTRACE_EXIT();
111}
112
113U_CAPI int32_t U_EXPORT2
114ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
115                   const uint8_t *src2, int32_t src2Length,
116                   uint8_t *dest, int32_t destCapacity) {
117    /* check arguments */
118    if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
119        src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
120        destCapacity<0 || (destCapacity>0 && dest==NULL)
121    ) {
122        /* error, attempt to write a zero byte and return 0 */
123        if(dest!=NULL && destCapacity>0) {
124            *dest=0;
125        }
126        return 0;
127    }
128
129    /* check lengths and capacity */
130    if(src1Length<0) {
131        src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
132    }
133    if(src2Length<0) {
134        src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
135    }
136
137    int32_t destLength=src1Length+src2Length;
138    if(destLength>destCapacity) {
139        /* the merged sort key does not fit into the destination */
140        return destLength;
141    }
142
143    /* merge the sort keys with the same number of levels */
144    uint8_t *p=dest;
145    for(;;) {
146        /* copy level from src1 not including 00 or 01 */
147        uint8_t b;
148        while((b=*src1)>=2) {
149            ++src1;
150            *p++=b;
151        }
152
153        /* add a 02 merge separator */
154        *p++=2;
155
156        /* copy level from src2 not including 00 or 01 */
157        while((b=*src2)>=2) {
158            ++src2;
159            *p++=b;
160        }
161
162        /* if both sort keys have another level, then add a 01 level separator and continue */
163        if(*src1==1 && *src2==1) {
164            ++src1;
165            ++src2;
166            *p++=1;
167        } else {
168            break;
169        }
170    }
171
172    /*
173     * here, at least one sort key is finished now, but the other one
174     * might have some contents left from containing more levels;
175     * that contents is just appended to the result
176     */
177    if(*src1!=0) {
178        /* src1 is not finished, therefore *src2==0, and src1 is appended */
179        src2=src1;
180    }
181    /* append src2, "the other, unfinished sort key" */
182    while((*p++=*src2++)!=0) {}
183
184    /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */
185    return (int32_t)(p-dest);
186}
187
188U_CAPI int32_t U_EXPORT2
189ucol_getSortKey(const    UCollator    *coll,
190        const    UChar        *source,
191        int32_t        sourceLength,
192        uint8_t        *result,
193        int32_t        resultLength)
194{
195    UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
196    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
197        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
198            ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
199    }
200
201    int32_t keySize = Collator::fromUCollator(coll)->
202            getSortKey(source, sourceLength, result, resultLength);
203
204    UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
205    UTRACE_EXIT_VALUE(keySize);
206    return keySize;
207}
208
209U_CAPI int32_t U_EXPORT2
210ucol_nextSortKeyPart(const UCollator *coll,
211                     UCharIterator *iter,
212                     uint32_t state[2],
213                     uint8_t *dest, int32_t count,
214                     UErrorCode *status)
215{
216    /* error checking */
217    if(status==NULL || U_FAILURE(*status)) {
218        return 0;
219    }
220    UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
221    UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
222                  coll, iter, state[0], state[1], dest, count);
223
224    int32_t i = Collator::fromUCollator(coll)->
225            internalNextSortKeyPart(iter, state, dest, count, *status);
226
227    // Return number of meaningful sortkey bytes.
228    UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
229                  dest,i, state[0], state[1]);
230    UTRACE_EXIT_VALUE_STATUS(i, *status);
231    return i;
232}
233
234/**
235 * Produce a bound for a given sortkey and a number of levels.
236 */
237U_CAPI int32_t U_EXPORT2
238ucol_getBound(const uint8_t       *source,
239        int32_t             sourceLength,
240        UColBoundMode       boundType,
241        uint32_t            noOfLevels,
242        uint8_t             *result,
243        int32_t             resultLength,
244        UErrorCode          *status)
245{
246    // consistency checks
247    if(status == NULL || U_FAILURE(*status)) {
248        return 0;
249    }
250    if(source == NULL) {
251        *status = U_ILLEGAL_ARGUMENT_ERROR;
252        return 0;
253    }
254
255    int32_t sourceIndex = 0;
256    // Scan the string until we skip enough of the key OR reach the end of the key
257    do {
258        sourceIndex++;
259        if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) {
260            noOfLevels--;
261        }
262    } while (noOfLevels > 0
263        && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
264
265    if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
266        && noOfLevels > 0) {
267            *status = U_SORT_KEY_TOO_SHORT_WARNING;
268    }
269
270
271    // READ ME: this code assumes that the values for boundType
272    // enum will not changes. They are set so that the enum value
273    // corresponds to the number of extra bytes each bound type
274    // needs.
275    if(result != NULL && resultLength >= sourceIndex+boundType) {
276        uprv_memcpy(result, source, sourceIndex);
277        switch(boundType) {
278            // Lower bound just gets terminated. No extra bytes
279        case UCOL_BOUND_LOWER: // = 0
280            break;
281            // Upper bound needs one extra byte
282        case UCOL_BOUND_UPPER: // = 1
283            result[sourceIndex++] = 2;
284            break;
285            // Upper long bound needs two extra bytes
286        case UCOL_BOUND_UPPER_LONG: // = 2
287            result[sourceIndex++] = 0xFF;
288            result[sourceIndex++] = 0xFF;
289            break;
290        default:
291            *status = U_ILLEGAL_ARGUMENT_ERROR;
292            return 0;
293        }
294        result[sourceIndex++] = 0;
295
296        return sourceIndex;
297    } else {
298        return sourceIndex+boundType+1;
299    }
300}
301
302U_CAPI void U_EXPORT2
303ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode) {
304    if(U_FAILURE(*pErrorCode)) { return; }
305    Collator::fromUCollator(coll)->setMaxVariable(group, *pErrorCode);
306}
307
308U_CAPI UColReorderCode U_EXPORT2
309ucol_getMaxVariable(const UCollator *coll) {
310    return Collator::fromUCollator(coll)->getMaxVariable();
311}
312
313U_CAPI uint32_t  U_EXPORT2
314ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
315    if(U_FAILURE(*status) || coll == NULL) {
316        return 0;
317    }
318    return Collator::fromUCollator(coll)->setVariableTop(varTop, len, *status);
319}
320
321U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
322    if(U_FAILURE(*status) || coll == NULL) {
323        return 0;
324    }
325    return Collator::fromUCollator(coll)->getVariableTop(*status);
326}
327
328U_CAPI void  U_EXPORT2
329ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
330    if(U_FAILURE(*status) || coll == NULL) {
331        return;
332    }
333    Collator::fromUCollator(coll)->setVariableTop(varTop, *status);
334}
335
336U_CAPI void  U_EXPORT2
337ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
338    if(U_FAILURE(*status) || coll == NULL) {
339      return;
340    }
341
342    Collator::fromUCollator(coll)->setAttribute(attr, value, *status);
343}
344
345U_CAPI UColAttributeValue  U_EXPORT2
346ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
347    if(U_FAILURE(*status) || coll == NULL) {
348      return UCOL_DEFAULT;
349    }
350
351    return Collator::fromUCollator(coll)->getAttribute(attr, *status);
352}
353
354U_CAPI void U_EXPORT2
355ucol_setStrength(    UCollator                *coll,
356            UCollationStrength        strength)
357{
358    UErrorCode status = U_ZERO_ERROR;
359    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
360}
361
362U_CAPI UCollationStrength U_EXPORT2
363ucol_getStrength(const UCollator *coll)
364{
365    UErrorCode status = U_ZERO_ERROR;
366    return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
367}
368
369U_CAPI int32_t U_EXPORT2
370ucol_getReorderCodes(const UCollator *coll,
371                    int32_t *dest,
372                    int32_t destCapacity,
373                    UErrorCode *status) {
374    if (U_FAILURE(*status)) {
375        return 0;
376    }
377
378    return Collator::fromUCollator(coll)->getReorderCodes(dest, destCapacity, *status);
379}
380
381U_CAPI void U_EXPORT2
382ucol_setReorderCodes(UCollator* coll,
383                    const int32_t* reorderCodes,
384                    int32_t reorderCodesLength,
385                    UErrorCode *status) {
386    if (U_FAILURE(*status)) {
387        return;
388    }
389
390    Collator::fromUCollator(coll)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
391}
392
393U_CAPI int32_t U_EXPORT2
394ucol_getEquivalentReorderCodes(int32_t reorderCode,
395                    int32_t* dest,
396                    int32_t destCapacity,
397                    UErrorCode *pErrorCode) {
398    return Collator::getEquivalentReorderCodes(reorderCode, dest, destCapacity, *pErrorCode);
399}
400
401U_CAPI void U_EXPORT2
402ucol_getVersion(const UCollator* coll,
403                UVersionInfo versionInfo)
404{
405    Collator::fromUCollator(coll)->getVersion(versionInfo);
406}
407
408U_CAPI UCollationResult U_EXPORT2
409ucol_strcollIter( const UCollator    *coll,
410                 UCharIterator *sIter,
411                 UCharIterator *tIter,
412                 UErrorCode         *status)
413{
414    if(!status || U_FAILURE(*status)) {
415        return UCOL_EQUAL;
416    }
417
418    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
419    UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
420
421    if(sIter == NULL || tIter == NULL || coll == NULL) {
422        *status = U_ILLEGAL_ARGUMENT_ERROR;
423        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
424        return UCOL_EQUAL;
425    }
426
427    UCollationResult result = Collator::fromUCollator(coll)->compare(*sIter, *tIter, *status);
428
429    UTRACE_EXIT_VALUE_STATUS(result, *status);
430    return result;
431}
432
433
434/*                                                                      */
435/* ucol_strcoll     Main public API string comparison function          */
436/*                                                                      */
437U_CAPI UCollationResult U_EXPORT2
438ucol_strcoll( const UCollator    *coll,
439              const UChar        *source,
440              int32_t            sourceLength,
441              const UChar        *target,
442              int32_t            targetLength)
443{
444    U_ALIGN_CODE(16);
445
446    UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
447    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
448        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
449        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
450        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
451    }
452
453    UErrorCode status = U_ZERO_ERROR;
454    UCollationResult returnVal = Collator::fromUCollator(coll)->
455            compare(source, sourceLength, target, targetLength, status);
456    UTRACE_EXIT_VALUE_STATUS(returnVal, status);
457    return returnVal;
458}
459
460U_CAPI UCollationResult U_EXPORT2
461ucol_strcollUTF8(
462        const UCollator *coll,
463        const char      *source,
464        int32_t         sourceLength,
465        const char      *target,
466        int32_t         targetLength,
467        UErrorCode      *status)
468{
469    U_ALIGN_CODE(16);
470
471    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
472    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
473        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
474        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
475        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
476    }
477
478    if (U_FAILURE(*status)) {
479        /* do nothing */
480        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
481        return UCOL_EQUAL;
482    }
483
484    UCollationResult returnVal = Collator::fromUCollator(coll)->internalCompareUTF8(
485            source, sourceLength, target, targetLength, *status);
486    UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
487    return returnVal;
488}
489
490
491/* convenience function for comparing strings */
492U_CAPI UBool U_EXPORT2
493ucol_greater(    const    UCollator        *coll,
494        const    UChar            *source,
495        int32_t            sourceLength,
496        const    UChar            *target,
497        int32_t            targetLength)
498{
499    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
500        == UCOL_GREATER);
501}
502
503/* convenience function for comparing strings */
504U_CAPI UBool U_EXPORT2
505ucol_greaterOrEqual(    const    UCollator    *coll,
506            const    UChar        *source,
507            int32_t        sourceLength,
508            const    UChar        *target,
509            int32_t        targetLength)
510{
511    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
512        != UCOL_LESS);
513}
514
515/* convenience function for comparing strings */
516U_CAPI UBool U_EXPORT2
517ucol_equal(        const    UCollator        *coll,
518            const    UChar            *source,
519            int32_t            sourceLength,
520            const    UChar            *target,
521            int32_t            targetLength)
522{
523    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
524        == UCOL_EQUAL);
525}
526
527U_CAPI void U_EXPORT2
528ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
529    const Collator *c = Collator::fromUCollator(coll);
530    if(c != NULL) {
531        UVersionInfo v;
532        c->getVersion(v);
533        // Note: This is tied to how the current implementation encodes the UCA version
534        // in the overall getVersion().
535        // Alternatively, we could load the root collator and get at lower-level data from there.
536        // Either way, it will reflect the input collator's UCA version only
537        // if it is a known implementation.
538        // It would be cleaner to make this a virtual Collator method.
539        info[0] = v[1] >> 3;
540        info[1] = v[1] & 7;
541        info[2] = v[2] >> 6;
542        info[3] = 0;
543    }
544}
545
546U_CAPI const UChar * U_EXPORT2
547ucol_getRules(const UCollator *coll, int32_t *length) {
548    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
549    // OK to crash if coll==NULL: We do not want to check "this" pointers.
550    if(rbc != NULL || coll == NULL) {
551        const UnicodeString &rules = rbc->getRules();
552        U_ASSERT(rules.getBuffer()[rules.length()] == 0);
553        *length = rules.length();
554        return rules.getBuffer();
555    }
556    static const UChar _NUL = 0;
557    *length = 0;
558    return &_NUL;
559}
560
561U_CAPI int32_t U_EXPORT2
562ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
563    UnicodeString rules;
564    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
565    if(rbc != NULL || coll == NULL) {
566        rbc->getRules(delta, rules);
567    }
568    if(buffer != NULL && bufferLen > 0) {
569        UErrorCode errorCode = U_ZERO_ERROR;
570        return rules.extract(buffer, bufferLen, errorCode);
571    } else {
572        return rules.length();
573    }
574}
575
576U_CAPI const char * U_EXPORT2
577ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
578    return ucol_getLocaleByType(coll, type, status);
579}
580
581U_CAPI const char * U_EXPORT2
582ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
583    if(U_FAILURE(*status)) {
584        return NULL;
585    }
586    UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE);
587    UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll);
588
589    const char *result;
590    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
591    if(rbc == NULL && coll != NULL) {
592        *status = U_UNSUPPORTED_ERROR;
593        result = NULL;
594    } else {
595        result = rbc->internalGetLocaleID(type, *status);
596    }
597
598    UTRACE_DATA1(UTRACE_INFO, "result = %s", result);
599    UTRACE_EXIT_STATUS(*status);
600    return result;
601}
602
603U_CAPI USet * U_EXPORT2
604ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) {
605    if(U_FAILURE(*status)) {
606        return NULL;
607    }
608    UnicodeSet *set = Collator::fromUCollator(coll)->getTailoredSet(*status);
609    if(U_FAILURE(*status)) {
610        delete set;
611        return NULL;
612    }
613    return set->toUSet();
614}
615
616U_CAPI UBool U_EXPORT2
617ucol_equals(const UCollator *source, const UCollator *target) {
618    return source == target ||
619        (*Collator::fromUCollator(source)) == (*Collator::fromUCollator(target));
620}
621
622#endif /* #if !UCONFIG_NO_COLLATION */
623