1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*   Copyright (C) 1996-2015, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7*******************************************************************************
8*   file name:  ucol.cpp
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13* Modification history
14* Date        Name      Comments
15* 1996-1999   various members of ICU team maintained C API for collation framework
16* 02/16/2001  synwee    Added internal method getPrevSpecialCE
17* 03/01/2001  synwee    Added maxexpansion functionality.
18* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
19* 2012-2014   markus    Rewritten in C++ again.
20*/
21
22#include "unicode/utypes.h"
23
24#if !UCONFIG_NO_COLLATION
25
26#include "unicode/coll.h"
27#include "unicode/tblcoll.h"
28#include "unicode/bytestream.h"
29#include "unicode/coleitr.h"
30#include "unicode/ucoleitr.h"
31#include "unicode/ustring.h"
32#include "cmemory.h"
33#include "collation.h"
34#include "cstring.h"
35#include "putilimp.h"
36#include "uassert.h"
37#include "utracimp.h"
38
39U_NAMESPACE_USE
40
41U_CAPI UCollator* U_EXPORT2
42ucol_openBinary(const uint8_t *bin, int32_t length,
43                const UCollator *base,
44                UErrorCode *status)
45{
46    if(U_FAILURE(*status)) { return NULL; }
47    RuleBasedCollator *coll = new RuleBasedCollator(
48            bin, length,
49            RuleBasedCollator::rbcFromUCollator(base),
50            *status);
51    if(coll == NULL) {
52        *status = U_MEMORY_ALLOCATION_ERROR;
53        return NULL;
54    }
55    if(U_FAILURE(*status)) {
56        delete coll;
57        return NULL;
58    }
59    return coll->toUCollator();
60}
61
62U_CAPI int32_t U_EXPORT2
63ucol_cloneBinary(const UCollator *coll,
64                 uint8_t *buffer, int32_t capacity,
65                 UErrorCode *status)
66{
67    if(U_FAILURE(*status)) {
68        return 0;
69    }
70    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
71    if(rbc == NULL && coll != NULL) {
72        *status = U_UNSUPPORTED_ERROR;
73        return 0;
74    }
75    return rbc->cloneBinary(buffer, capacity, *status);
76}
77
78U_CAPI UCollator* U_EXPORT2
79ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status)
80{
81    if (status == NULL || U_FAILURE(*status)){
82        return NULL;
83    }
84    if (coll == NULL) {
85       *status = U_ILLEGAL_ARGUMENT_ERROR;
86        return NULL;
87    }
88    if (pBufferSize != NULL) {
89        int32_t inputSize = *pBufferSize;
90        *pBufferSize = 1;
91        if (inputSize == 0) {
92            return NULL;  // preflighting for deprecated functionality
93        }
94    }
95    Collator *newColl = Collator::fromUCollator(coll)->clone();
96    if (newColl == NULL) {
97        *status = U_MEMORY_ALLOCATION_ERROR;
98    } else {
99        *status = U_SAFECLONE_ALLOCATED_WARNING;
100    }
101    return newColl->toUCollator();
102}
103
104U_CAPI void U_EXPORT2
105ucol_close(UCollator *coll)
106{
107    UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
108    UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
109    if(coll != NULL) {
110        delete Collator::fromUCollator(coll);
111    }
112    UTRACE_EXIT();
113}
114
115U_CAPI int32_t U_EXPORT2
116ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
117                   const uint8_t *src2, int32_t src2Length,
118                   uint8_t *dest, int32_t destCapacity) {
119    /* check arguments */
120    if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
121        src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
122        destCapacity<0 || (destCapacity>0 && dest==NULL)
123    ) {
124        /* error, attempt to write a zero byte and return 0 */
125        if(dest!=NULL && destCapacity>0) {
126            *dest=0;
127        }
128        return 0;
129    }
130
131    /* check lengths and capacity */
132    if(src1Length<0) {
133        src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
134    }
135    if(src2Length<0) {
136        src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
137    }
138
139    int32_t destLength=src1Length+src2Length;
140    if(destLength>destCapacity) {
141        /* the merged sort key does not fit into the destination */
142        return destLength;
143    }
144
145    /* merge the sort keys with the same number of levels */
146    uint8_t *p=dest;
147    for(;;) {
148        /* copy level from src1 not including 00 or 01 */
149        uint8_t b;
150        while((b=*src1)>=2) {
151            ++src1;
152            *p++=b;
153        }
154
155        /* add a 02 merge separator */
156        *p++=2;
157
158        /* copy level from src2 not including 00 or 01 */
159        while((b=*src2)>=2) {
160            ++src2;
161            *p++=b;
162        }
163
164        /* if both sort keys have another level, then add a 01 level separator and continue */
165        if(*src1==1 && *src2==1) {
166            ++src1;
167            ++src2;
168            *p++=1;
169        } else {
170            break;
171        }
172    }
173
174    /*
175     * here, at least one sort key is finished now, but the other one
176     * might have some contents left from containing more levels;
177     * that contents is just appended to the result
178     */
179    if(*src1!=0) {
180        /* src1 is not finished, therefore *src2==0, and src1 is appended */
181        src2=src1;
182    }
183    /* append src2, "the other, unfinished sort key" */
184    while((*p++=*src2++)!=0) {}
185
186    /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */
187    return (int32_t)(p-dest);
188}
189
190U_CAPI int32_t U_EXPORT2
191ucol_getSortKey(const    UCollator    *coll,
192        const    UChar        *source,
193        int32_t        sourceLength,
194        uint8_t        *result,
195        int32_t        resultLength)
196{
197    UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
198    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
199        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
200            ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
201    }
202
203    int32_t keySize = Collator::fromUCollator(coll)->
204            getSortKey(source, sourceLength, result, resultLength);
205
206    UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
207    UTRACE_EXIT_VALUE(keySize);
208    return keySize;
209}
210
211U_CAPI int32_t U_EXPORT2
212ucol_nextSortKeyPart(const UCollator *coll,
213                     UCharIterator *iter,
214                     uint32_t state[2],
215                     uint8_t *dest, int32_t count,
216                     UErrorCode *status)
217{
218    /* error checking */
219    if(status==NULL || U_FAILURE(*status)) {
220        return 0;
221    }
222    UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
223    UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
224                  coll, iter, state[0], state[1], dest, count);
225
226    int32_t i = Collator::fromUCollator(coll)->
227            internalNextSortKeyPart(iter, state, dest, count, *status);
228
229    // Return number of meaningful sortkey bytes.
230    UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
231                  dest,i, state[0], state[1]);
232    UTRACE_EXIT_VALUE_STATUS(i, *status);
233    return i;
234}
235
236/**
237 * Produce a bound for a given sortkey and a number of levels.
238 */
239U_CAPI int32_t U_EXPORT2
240ucol_getBound(const uint8_t       *source,
241        int32_t             sourceLength,
242        UColBoundMode       boundType,
243        uint32_t            noOfLevels,
244        uint8_t             *result,
245        int32_t             resultLength,
246        UErrorCode          *status)
247{
248    // consistency checks
249    if(status == NULL || U_FAILURE(*status)) {
250        return 0;
251    }
252    if(source == NULL) {
253        *status = U_ILLEGAL_ARGUMENT_ERROR;
254        return 0;
255    }
256
257    int32_t sourceIndex = 0;
258    // Scan the string until we skip enough of the key OR reach the end of the key
259    do {
260        sourceIndex++;
261        if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) {
262            noOfLevels--;
263        }
264    } while (noOfLevels > 0
265        && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
266
267    if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
268        && noOfLevels > 0) {
269            *status = U_SORT_KEY_TOO_SHORT_WARNING;
270    }
271
272
273    // READ ME: this code assumes that the values for boundType
274    // enum will not changes. They are set so that the enum value
275    // corresponds to the number of extra bytes each bound type
276    // needs.
277    if(result != NULL && resultLength >= sourceIndex+boundType) {
278        uprv_memcpy(result, source, sourceIndex);
279        switch(boundType) {
280            // Lower bound just gets terminated. No extra bytes
281        case UCOL_BOUND_LOWER: // = 0
282            break;
283            // Upper bound needs one extra byte
284        case UCOL_BOUND_UPPER: // = 1
285            result[sourceIndex++] = 2;
286            break;
287            // Upper long bound needs two extra bytes
288        case UCOL_BOUND_UPPER_LONG: // = 2
289            result[sourceIndex++] = 0xFF;
290            result[sourceIndex++] = 0xFF;
291            break;
292        default:
293            *status = U_ILLEGAL_ARGUMENT_ERROR;
294            return 0;
295        }
296        result[sourceIndex++] = 0;
297
298        return sourceIndex;
299    } else {
300        return sourceIndex+boundType+1;
301    }
302}
303
304U_CAPI void U_EXPORT2
305ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode) {
306    if(U_FAILURE(*pErrorCode)) { return; }
307    Collator::fromUCollator(coll)->setMaxVariable(group, *pErrorCode);
308}
309
310U_CAPI UColReorderCode U_EXPORT2
311ucol_getMaxVariable(const UCollator *coll) {
312    return Collator::fromUCollator(coll)->getMaxVariable();
313}
314
315U_CAPI uint32_t  U_EXPORT2
316ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
317    if(U_FAILURE(*status) || coll == NULL) {
318        return 0;
319    }
320    return Collator::fromUCollator(coll)->setVariableTop(varTop, len, *status);
321}
322
323U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
324    if(U_FAILURE(*status) || coll == NULL) {
325        return 0;
326    }
327    return Collator::fromUCollator(coll)->getVariableTop(*status);
328}
329
330U_CAPI void  U_EXPORT2
331ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
332    if(U_FAILURE(*status) || coll == NULL) {
333        return;
334    }
335    Collator::fromUCollator(coll)->setVariableTop(varTop, *status);
336}
337
338U_CAPI void  U_EXPORT2
339ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
340    if(U_FAILURE(*status) || coll == NULL) {
341      return;
342    }
343
344    Collator::fromUCollator(coll)->setAttribute(attr, value, *status);
345}
346
347U_CAPI UColAttributeValue  U_EXPORT2
348ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
349    if(U_FAILURE(*status) || coll == NULL) {
350      return UCOL_DEFAULT;
351    }
352
353    return Collator::fromUCollator(coll)->getAttribute(attr, *status);
354}
355
356U_CAPI void U_EXPORT2
357ucol_setStrength(    UCollator                *coll,
358            UCollationStrength        strength)
359{
360    UErrorCode status = U_ZERO_ERROR;
361    ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
362}
363
364U_CAPI UCollationStrength U_EXPORT2
365ucol_getStrength(const UCollator *coll)
366{
367    UErrorCode status = U_ZERO_ERROR;
368    return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
369}
370
371U_CAPI int32_t U_EXPORT2
372ucol_getReorderCodes(const UCollator *coll,
373                    int32_t *dest,
374                    int32_t destCapacity,
375                    UErrorCode *status) {
376    if (U_FAILURE(*status)) {
377        return 0;
378    }
379
380    return Collator::fromUCollator(coll)->getReorderCodes(dest, destCapacity, *status);
381}
382
383U_CAPI void U_EXPORT2
384ucol_setReorderCodes(UCollator* coll,
385                    const int32_t* reorderCodes,
386                    int32_t reorderCodesLength,
387                    UErrorCode *status) {
388    if (U_FAILURE(*status)) {
389        return;
390    }
391
392    Collator::fromUCollator(coll)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
393}
394
395U_CAPI int32_t U_EXPORT2
396ucol_getEquivalentReorderCodes(int32_t reorderCode,
397                    int32_t* dest,
398                    int32_t destCapacity,
399                    UErrorCode *pErrorCode) {
400    return Collator::getEquivalentReorderCodes(reorderCode, dest, destCapacity, *pErrorCode);
401}
402
403U_CAPI void U_EXPORT2
404ucol_getVersion(const UCollator* coll,
405                UVersionInfo versionInfo)
406{
407    Collator::fromUCollator(coll)->getVersion(versionInfo);
408}
409
410U_CAPI UCollationResult U_EXPORT2
411ucol_strcollIter( const UCollator    *coll,
412                 UCharIterator *sIter,
413                 UCharIterator *tIter,
414                 UErrorCode         *status)
415{
416    if(!status || U_FAILURE(*status)) {
417        return UCOL_EQUAL;
418    }
419
420    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
421    UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
422
423    if(sIter == NULL || tIter == NULL || coll == NULL) {
424        *status = U_ILLEGAL_ARGUMENT_ERROR;
425        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
426        return UCOL_EQUAL;
427    }
428
429    UCollationResult result = Collator::fromUCollator(coll)->compare(*sIter, *tIter, *status);
430
431    UTRACE_EXIT_VALUE_STATUS(result, *status);
432    return result;
433}
434
435
436/*                                                                      */
437/* ucol_strcoll     Main public API string comparison function          */
438/*                                                                      */
439U_CAPI UCollationResult U_EXPORT2
440ucol_strcoll( const UCollator    *coll,
441              const UChar        *source,
442              int32_t            sourceLength,
443              const UChar        *target,
444              int32_t            targetLength)
445{
446    UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
447    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
448        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
449        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
450        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
451    }
452
453    UErrorCode status = U_ZERO_ERROR;
454    UCollationResult returnVal = Collator::fromUCollator(coll)->
455            compare(source, sourceLength, target, targetLength, status);
456    UTRACE_EXIT_VALUE_STATUS(returnVal, status);
457    return returnVal;
458}
459
460U_CAPI UCollationResult U_EXPORT2
461ucol_strcollUTF8(
462        const UCollator *coll,
463        const char      *source,
464        int32_t         sourceLength,
465        const char      *target,
466        int32_t         targetLength,
467        UErrorCode      *status)
468{
469    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
470    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
471        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
472        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
473        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
474    }
475
476    if (U_FAILURE(*status)) {
477        /* do nothing */
478        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
479        return UCOL_EQUAL;
480    }
481
482    UCollationResult returnVal = Collator::fromUCollator(coll)->internalCompareUTF8(
483            source, sourceLength, target, targetLength, *status);
484    UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
485    return returnVal;
486}
487
488
489/* convenience function for comparing strings */
490U_CAPI UBool U_EXPORT2
491ucol_greater(    const    UCollator        *coll,
492        const    UChar            *source,
493        int32_t            sourceLength,
494        const    UChar            *target,
495        int32_t            targetLength)
496{
497    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
498        == UCOL_GREATER);
499}
500
501/* convenience function for comparing strings */
502U_CAPI UBool U_EXPORT2
503ucol_greaterOrEqual(    const    UCollator    *coll,
504            const    UChar        *source,
505            int32_t        sourceLength,
506            const    UChar        *target,
507            int32_t        targetLength)
508{
509    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
510        != UCOL_LESS);
511}
512
513/* convenience function for comparing strings */
514U_CAPI UBool U_EXPORT2
515ucol_equal(        const    UCollator        *coll,
516            const    UChar            *source,
517            int32_t            sourceLength,
518            const    UChar            *target,
519            int32_t            targetLength)
520{
521    return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
522        == UCOL_EQUAL);
523}
524
525U_CAPI void U_EXPORT2
526ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
527    const Collator *c = Collator::fromUCollator(coll);
528    if(c != NULL) {
529        UVersionInfo v;
530        c->getVersion(v);
531        // Note: This is tied to how the current implementation encodes the UCA version
532        // in the overall getVersion().
533        // Alternatively, we could load the root collator and get at lower-level data from there.
534        // Either way, it will reflect the input collator's UCA version only
535        // if it is a known implementation.
536        // It would be cleaner to make this a virtual Collator method.
537        info[0] = v[1] >> 3;
538        info[1] = v[1] & 7;
539        info[2] = v[2] >> 6;
540        info[3] = 0;
541    }
542}
543
544U_CAPI const UChar * U_EXPORT2
545ucol_getRules(const UCollator *coll, int32_t *length) {
546    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
547    // OK to crash if coll==NULL: We do not want to check "this" pointers.
548    if(rbc != NULL || coll == NULL) {
549        const UnicodeString &rules = rbc->getRules();
550        U_ASSERT(rules.getBuffer()[rules.length()] == 0);
551        *length = rules.length();
552        return rules.getBuffer();
553    }
554    static const UChar _NUL = 0;
555    *length = 0;
556    return &_NUL;
557}
558
559U_CAPI int32_t U_EXPORT2
560ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
561    UnicodeString rules;
562    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
563    if(rbc != NULL || coll == NULL) {
564        rbc->getRules(delta, rules);
565    }
566    if(buffer != NULL && bufferLen > 0) {
567        UErrorCode errorCode = U_ZERO_ERROR;
568        return rules.extract(buffer, bufferLen, errorCode);
569    } else {
570        return rules.length();
571    }
572}
573
574U_CAPI const char * U_EXPORT2
575ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
576    return ucol_getLocaleByType(coll, type, status);
577}
578
579U_CAPI const char * U_EXPORT2
580ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
581    if(U_FAILURE(*status)) {
582        return NULL;
583    }
584    UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE);
585    UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll);
586
587    const char *result;
588    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
589    if(rbc == NULL && coll != NULL) {
590        *status = U_UNSUPPORTED_ERROR;
591        result = NULL;
592    } else {
593        result = rbc->internalGetLocaleID(type, *status);
594    }
595
596    UTRACE_DATA1(UTRACE_INFO, "result = %s", result);
597    UTRACE_EXIT_STATUS(*status);
598    return result;
599}
600
601U_CAPI USet * U_EXPORT2
602ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) {
603    if(U_FAILURE(*status)) {
604        return NULL;
605    }
606    UnicodeSet *set = Collator::fromUCollator(coll)->getTailoredSet(*status);
607    if(U_FAILURE(*status)) {
608        delete set;
609        return NULL;
610    }
611    return set->toUSet();
612}
613
614U_CAPI UBool U_EXPORT2
615ucol_equals(const UCollator *source, const UCollator *target) {
616    return source == target ||
617        (*Collator::fromUCollator(source)) == (*Collator::fromUCollator(target));
618}
619
620#endif /* #if !UCONFIG_NO_COLLATION */
621