1/*
2 ******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 */
7
8/**
9 * File tblcoll.cpp
10 *
11 * Created by: Helena Shih
12 *
13 * Modification History:
14 *
15 *  Date        Name        Description
16 *  2/5/97      aliu        Added streamIn and streamOut methods.  Added
17 *                          constructor which reads RuleBasedCollator object from
18 *                          a binary file.  Added writeToFile method which streams
19 *                          RuleBasedCollator out to a binary file.  The streamIn
20 *                          and streamOut methods use istream and ostream objects
21 *                          in binary mode.
22 *  2/11/97     aliu        Moved declarations out of for loop initializer.
23 *                          Added Mac compatibility #ifdef for ios::nocreate.
24 *  2/12/97     aliu        Modified to use TableCollationData sub-object to
25 *                          hold invariant data.
26 *  2/13/97     aliu        Moved several methods into this class from Collation.
27 *                          Added a private RuleBasedCollator(Locale&) constructor,
28 *                          to be used by Collator::getInstance().  General
29 *                          clean up.  Made use of UErrorCode variables consistent.
30 *  2/20/97     helena      Added clone, operator==, operator!=, operator=, and copy
31 *                          constructor and getDynamicClassID.
32 *  3/5/97      aliu        Changed compaction cycle to improve performance.  We
33 *                          use the maximum allowable value which is kBlockCount.
34 *                          Modified getRules() to load rules dynamically.  Changed
35 *                          constructFromFile() call to accomodate this (added
36 *                          parameter to specify whether binary loading is to
37 *                          take place).
38 * 05/06/97     helena      Added memory allocation error check.
39 *  6/20/97     helena      Java class name change.
40 *  6/23/97     helena      Adding comments to make code more readable.
41 * 09/03/97     helena      Added createCollationKeyValues().
42 * 06/26/98     erm         Changes for CollationKeys using byte arrays.
43 * 08/10/98     erm         Synched with 1.2 version of RuleBasedCollator.java
44 * 04/23/99     stephen     Removed EDecompositionMode, merged with
45 *                          Normalizer::EMode
46 * 06/14/99     stephen     Removed kResourceBundleSuffix
47 * 06/22/99     stephen     Fixed logic in constructFromFile() since .ctx
48 *                          files are no longer used.
49 * 11/02/99     helena      Collator performance enhancements.  Special case
50 *                          for NO_OP situations.
51 * 11/17/99     srl         More performance enhancements. Inlined some internal functions.
52 * 12/15/99     aliu        Update to support Thai collation.  Move NormalizerIterator
53 *                          to implementation file.
54 * 01/29/01     synwee      Modified into a C++ wrapper calling C APIs (ucol.h)
55 */
56
57#include "unicode/utypeinfo.h"  // for 'typeid' to work
58
59#include "unicode/utypes.h"
60
61#if !UCONFIG_NO_COLLATION
62
63#include "unicode/tblcoll.h"
64#include "unicode/coleitr.h"
65#include "unicode/ures.h"
66#include "unicode/uset.h"
67#include "ucol_imp.h"
68#include "uresimp.h"
69#include "uhash.h"
70#include "cmemory.h"
71#include "cstring.h"
72#include "putilimp.h"
73
74/* public RuleBasedCollator constructor ---------------------------------- */
75
76U_NAMESPACE_BEGIN
77
78/**
79* Copy constructor, aliasing, not write-through
80*/
81RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
82: Collator(that)
83, dataIsOwned(FALSE)
84, isWriteThroughAlias(FALSE)
85, ucollator(NULL)
86{
87    RuleBasedCollator::operator=(that);
88}
89
90RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
91                                     UErrorCode& status) :
92dataIsOwned(FALSE)
93{
94    construct(rules,
95        UCOL_DEFAULT_STRENGTH,
96        UCOL_DEFAULT,
97        status);
98}
99
100RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
101                                     ECollationStrength collationStrength,
102                                     UErrorCode& status) : dataIsOwned(FALSE)
103{
104    construct(rules,
105        getUCollationStrength(collationStrength),
106        UCOL_DEFAULT,
107        status);
108}
109
110RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
111                                     UColAttributeValue decompositionMode,
112                                     UErrorCode& status) :
113dataIsOwned(FALSE)
114{
115    construct(rules,
116        UCOL_DEFAULT_STRENGTH,
117        decompositionMode,
118        status);
119}
120
121RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
122                                     ECollationStrength collationStrength,
123                                     UColAttributeValue decompositionMode,
124                                     UErrorCode& status) : dataIsOwned(FALSE)
125{
126    construct(rules,
127        getUCollationStrength(collationStrength),
128        decompositionMode,
129        status);
130}
131RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
132                    const RuleBasedCollator *base,
133                    UErrorCode &status) :
134dataIsOwned(TRUE),
135isWriteThroughAlias(FALSE)
136{
137  ucollator = ucol_openBinary(bin, length, base->ucollator, &status);
138}
139
140void
141RuleBasedCollator::setRuleStringFromCollator()
142{
143    int32_t length;
144    const UChar *r = ucol_getRules(ucollator, &length);
145
146    if (r && length > 0) {
147        // alias the rules string
148        urulestring.setTo(TRUE, r, length);
149    }
150    else {
151        urulestring.truncate(0); // Clear string.
152    }
153}
154
155// not aliasing, not write-through
156void
157RuleBasedCollator::construct(const UnicodeString& rules,
158                             UColAttributeValue collationStrength,
159                             UColAttributeValue decompositionMode,
160                             UErrorCode& status)
161{
162    ucollator = ucol_openRules(rules.getBuffer(), rules.length(),
163        decompositionMode, collationStrength,
164        NULL, &status);
165
166    dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it
167    isWriteThroughAlias = FALSE;
168
169    if(ucollator == NULL) {
170        if(U_SUCCESS(status)) {
171            status = U_MEMORY_ALLOCATION_ERROR;
172        }
173        return; // Failure
174    }
175
176    setRuleStringFromCollator();
177}
178
179/* RuleBasedCollator public destructor ----------------------------------- */
180
181RuleBasedCollator::~RuleBasedCollator()
182{
183    if (dataIsOwned)
184    {
185        ucol_close(ucollator);
186    }
187    ucollator = 0;
188}
189
/* RuleBasedCollator public methods -------------------------------------- */
191
192UBool RuleBasedCollator::operator==(const Collator& that) const
193{
194  /* only checks for address equals here */
195  if (Collator::operator==(that))
196    return TRUE;
197
198  if (typeid(*this) != typeid(that))
199    return FALSE;  /* not the same class */
200
201  RuleBasedCollator& thatAlias = (RuleBasedCollator&)that;
202
203  // weiv: use C function, commented code below is wrong
204  return ucol_equals(this->ucollator, thatAlias.ucollator);
205  /*
206  synwee : orginal code does not check for data compatibility
207  */
208  /*
209  if (ucollator != thatAlias.ucollator)
210    return FALSE;
211
212  return TRUE;
213  */
214}
215
216UBool RuleBasedCollator::operator!=(const Collator& other) const
217{
218    return !(*this == other);
219}
220
221// aliasing, not write-through
222RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that)
223{
224    if (this != &that)
225    {
226        if (dataIsOwned)
227        {
228            ucol_close(ucollator);
229        }
230
231        urulestring.truncate(0); // empty the rule string
232        dataIsOwned = TRUE;
233        isWriteThroughAlias = FALSE;
234
235        UErrorCode intStatus = U_ZERO_ERROR;
236        int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
237        ucollator = ucol_safeClone(that.ucollator, NULL, &buffersize,
238                                        &intStatus);
239        if (U_SUCCESS(intStatus)) {
240            setRuleStringFromCollator();
241        }
242    }
243    return *this;
244}
245
246// aliasing, not write-through
247Collator* RuleBasedCollator::clone() const
248{
249    return new RuleBasedCollator(*this);
250}
251
252CollationElementIterator* RuleBasedCollator::createCollationElementIterator
253                                           (const UnicodeString& source) const
254{
255    UErrorCode status = U_ZERO_ERROR;
256    CollationElementIterator *result = new CollationElementIterator(source, this,
257                                                                    status);
258    if (U_FAILURE(status)) {
259        delete result;
260        return NULL;
261    }
262
263    return result;
264}
265
266/**
267* Create a CollationElementIterator object that will iterate over the
268* elements in a string, using the collation rules defined in this
269* RuleBasedCollator
270*/
271CollationElementIterator* RuleBasedCollator::createCollationElementIterator
272                                       (const CharacterIterator& source) const
273{
274    UErrorCode status = U_ZERO_ERROR;
275    CollationElementIterator *result = new CollationElementIterator(source, this,
276                                                                    status);
277
278    if (U_FAILURE(status)) {
279        delete result;
280        return NULL;
281    }
282
283    return result;
284}
285
286/**
287* Return a string representation of this collator's rules. The string can
288* later be passed to the constructor that takes a UnicodeString argument,
289* which will construct a collator that's functionally identical to this one.
290* You can also allow users to edit the string in order to change the collation
291* data, or you can print it out for inspection, or whatever.
292*/
293const UnicodeString& RuleBasedCollator::getRules() const
294{
295    return urulestring;
296}
297
298void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer)
299{
300    int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1);
301
302    if (rulesize > 0) {
303        UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) );
304        if(rules != NULL) {
305            ucol_getRulesEx(ucollator, delta, rules, rulesize);
306            buffer.setTo(rules, rulesize);
307            uprv_free(rules);
308        } else { // couldn't allocate
309            buffer.remove();
310        }
311    }
312    else {
313        buffer.remove();
314    }
315}
316
317UnicodeSet *
318RuleBasedCollator::getTailoredSet(UErrorCode &status) const
319{
320    if(U_FAILURE(status)) {
321        return NULL;
322    }
323    return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status);
324}
325
326
327void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const
328{
329    if (versionInfo!=NULL){
330        ucol_getVersion(ucollator, versionInfo);
331    }
332}
333
334Collator::EComparisonResult RuleBasedCollator::compare(
335                                               const UnicodeString& source,
336                                               const UnicodeString& target,
337                                               int32_t length) const
338{
339    UErrorCode status = U_ZERO_ERROR;
340    return getEComparisonResult(compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status));
341}
342
343UCollationResult RuleBasedCollator::compare(
344                                               const UnicodeString& source,
345                                               const UnicodeString& target,
346                                               int32_t length,
347                                               UErrorCode &status) const
348{
349    return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status);
350}
351
352Collator::EComparisonResult RuleBasedCollator::compare(const UChar* source,
353                                                       int32_t sourceLength,
354                                                       const UChar* target,
355                                                       int32_t targetLength)
356                                                       const
357{
358    return  getEComparisonResult(ucol_strcoll(ucollator, source, sourceLength,
359                                                         target, targetLength));
360}
361
362UCollationResult RuleBasedCollator::compare(const UChar* source,
363                                                       int32_t sourceLength,
364                                                       const UChar* target,
365                                                       int32_t targetLength,
366                                                       UErrorCode &status) const
367{
368    if(U_SUCCESS(status)) {
369        return  ucol_strcoll(ucollator, source, sourceLength, target, targetLength);
370    } else {
371        return UCOL_EQUAL;
372    }
373}
374
375/**
376* Compare two strings using this collator
377*/
378Collator::EComparisonResult RuleBasedCollator::compare(
379                                             const UnicodeString& source,
380                                             const UnicodeString& target) const
381{
382    return getEComparisonResult(ucol_strcoll(ucollator, source.getBuffer(), source.length(),
383                                                        target.getBuffer(), target.length()));
384}
385
386UCollationResult RuleBasedCollator::compare(
387                                             const UnicodeString& source,
388                                             const UnicodeString& target,
389                                             UErrorCode &status) const
390{
391    if(U_SUCCESS(status)) {
392        return ucol_strcoll(ucollator, source.getBuffer(), source.length(),
393                                       target.getBuffer(), target.length());
394    } else {
395        return UCOL_EQUAL;
396    }
397}
398
399UCollationResult RuleBasedCollator::compare(UCharIterator &sIter,
400                                            UCharIterator &tIter,
401                                            UErrorCode &status) const {
402    if(U_SUCCESS(status)) {
403        return ucol_strcollIter(ucollator, &sIter, &tIter, &status);
404    } else {
405        return UCOL_EQUAL;
406    }
407}
408
409/**
410* Retrieve a collation key for the specified string. The key can be compared
411* with other collation keys using a bitwise comparison (e.g. memcmp) to find
412* the ordering of their respective source strings. This is handy when doing a
413* sort, where each sort key must be compared many times.
414*
415* The basic algorithm here is to find all of the collation elements for each
416* character in the source string, convert them to an ASCII representation, and
417* put them into the collation key.  But it's trickier than that. Each
418* collation element in a string has three components: primary ('A' vs 'B'),
419* secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
420* at the end of a string takes precedence over a secondary or tertiary
421* difference earlier in the string.
422*
423* To account for this, we put all of the primary orders at the beginning of
424* the string, followed by the secondary and tertiary orders. Each set of
* orders is terminated by nulls so that a key for a string which is an initial
426* substring of another key will compare less without any special case.
427*
428* Here's a hypothetical example, with the collation element represented as a
429* three-digit number, one digit for primary, one for secondary, etc.
430*
431* String:              A     a     B    \u00C9
432* Collation Elements: 101   100   201  511
433* Collation Key:      1125<null>0001<null>1011<null>
434*
435* To make things even trickier, secondary differences (accent marks) are
436* compared starting at the *end* of the string in languages with French
437* secondary ordering. But when comparing the accent marks on a single base
438* character, they are compared from the beginning. To handle this, we reverse
439* all of the accents that belong to each base character, then we reverse the
440* entire string of secondary orderings at the end.
441*/
442CollationKey& RuleBasedCollator::getCollationKey(
443                                                  const UnicodeString& source,
444                                                  CollationKey& sortkey,
445                                                  UErrorCode& status) const
446{
447    return getCollationKey(source.getBuffer(), source.length(), sortkey, status);
448}
449
450CollationKey& RuleBasedCollator::getCollationKey(const UChar* source,
451                                                    int32_t sourceLen,
452                                                    CollationKey& sortkey,
453                                                    UErrorCode& status) const
454{
455    if (U_FAILURE(status))
456    {
457        return sortkey.setToBogus();
458    }
459
460    if ((!source) || (sourceLen == 0)) {
461        return sortkey.reset();
462    }
463
464    uint8_t *result;
465    int32_t resultLen = ucol_getSortKeyWithAllocation(ucollator,
466                                                      source, sourceLen,
467                                                      &result,
468                                                      &status);
469    sortkey.adopt(result, resultLen);
470    return sortkey;
471}
472
/**
 * Return the maximum length of any expansion sequences that end with the
 * specified comparison order.
 * @param order a collation order returned by previous or next.
 * @return the maximum length of any expansion sequences ending with the
 *         specified order or 1 if collation order does not occur at the end of any
 *         expansion sequence.
 * @see CollationElementIterator#getMaxExpansion
 */
int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const
{
    // 'result' is written by the UCOL_GETMAXEXPANSION macro, which receives
    // the collator, the order (cast to unsigned) and the output variable.
    uint8_t result;
    UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result);
    return result;
}
488
489uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length,
490                                              UErrorCode &status)
491{
492    return ucol_cloneRuleData(ucollator, &length, &status);
493}
494
495
496int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status)
497{
498  return ucol_cloneBinary(ucollator, buffer, capacity, &status);
499}
500
501void RuleBasedCollator::setAttribute(UColAttribute attr,
502                                     UColAttributeValue value,
503                                     UErrorCode &status)
504{
505    if (U_FAILURE(status))
506        return;
507    checkOwned();
508    ucol_setAttribute(ucollator, attr, value, &status);
509}
510
511UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr,
512                                                      UErrorCode &status)
513{
514    if (U_FAILURE(status))
515        return UCOL_DEFAULT;
516    return ucol_getAttribute(ucollator, attr, &status);
517}
518
519uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) {
520    checkOwned();
521    return ucol_setVariableTop(ucollator, varTop, len, &status);
522}
523
524uint32_t RuleBasedCollator::setVariableTop(const UnicodeString varTop, UErrorCode &status) {
525    checkOwned();
526    return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status);
527}
528
529void RuleBasedCollator::setVariableTop(const uint32_t varTop, UErrorCode &status) {
530    checkOwned();
531    ucol_restoreVariableTop(ucollator, varTop, &status);
532}
533
534uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const {
535  return ucol_getVariableTop(ucollator, &status);
536}
537
538Collator* RuleBasedCollator::safeClone(void)
539{
540    UErrorCode intStatus = U_ZERO_ERROR;
541    int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
542    UCollator *ucol = ucol_safeClone(ucollator, NULL, &buffersize,
543                                    &intStatus);
544    if (U_FAILURE(intStatus)) {
545        return NULL;
546    }
547
548    RuleBasedCollator *result = new RuleBasedCollator();
549    // Null pointer check
550    if (result != NULL) {
551	    result->ucollator = ucol;
552	    result->dataIsOwned = TRUE;
553	    result->isWriteThroughAlias = FALSE;
554	    setRuleStringFromCollator();
555    }
556
557    return result;
558}
559
560
561int32_t RuleBasedCollator::getSortKey(const UnicodeString& source,
562                                         uint8_t *result, int32_t resultLength)
563                                         const
564{
565    return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength);
566}
567
568int32_t RuleBasedCollator::getSortKey(const UChar *source,
569                                         int32_t sourceLength, uint8_t *result,
570                                         int32_t resultLength) const
571{
572    return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength);
573}
574
575Collator::ECollationStrength RuleBasedCollator::getStrength(void) const
576{
577    UErrorCode intStatus = U_ZERO_ERROR;
578    return getECollationStrength(ucol_getAttribute(ucollator, UCOL_STRENGTH,
579                                &intStatus));
580}
581
582void RuleBasedCollator::setStrength(ECollationStrength newStrength)
583{
584    checkOwned();
585    UErrorCode intStatus = U_ZERO_ERROR;
586    UCollationStrength strength = getUCollationStrength(newStrength);
587    ucol_setAttribute(ucollator, UCOL_STRENGTH, strength, &intStatus);
588}
589
590int32_t RuleBasedCollator::getReorderCodes(int32_t *dest,
591                                          int32_t destCapacity,
592                                          UErrorCode& status) const
593{
594    return ucol_getReorderCodes(ucollator, dest, destCapacity, &status);
595}
596
597void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes,
598                                       int32_t reorderCodesLength,
599                                       UErrorCode& status)
600{
601    ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status);
602}
603
604
605/**
606* Create a hash code for this collation. Just hash the main rule table -- that
607* should be good enough for almost any use.
608*/
609int32_t RuleBasedCollator::hashCode() const
610{
611    int32_t length;
612    const UChar *rules = ucol_getRules(ucollator, &length);
613    return uhash_hashUCharsN(rules, length);
614}
615
616/**
617* return the locale of this collator
618*/
619const Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const {
620    const char *result = ucol_getLocaleByType(ucollator, type, &status);
621    if(result == NULL) {
622        Locale res("");
623        res.setToBogus();
624        return res;
625    } else {
626        return Locale(result);
627    }
628}
629
630void
631RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) {
632    checkOwned();
633    char* rloc  = uprv_strdup(requestedLocale.getName());
634    if (rloc) {
635        char* vloc = uprv_strdup(validLocale.getName());
636        if (vloc) {
637            char* aloc = uprv_strdup(actualLocale.getName());
638            if (aloc) {
639                ucol_setReqValidLocales(ucollator, rloc, vloc, aloc);
640                return;
641            }
642            uprv_free(vloc);
643        }
644        uprv_free(rloc);
645    }
646}
647
// RuleBasedCollator private constructor ------------------------------------
649
650RuleBasedCollator::RuleBasedCollator()
651  : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
652{
653}
654
655RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale,
656                                           UErrorCode& status)
657 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
658{
659    if (U_FAILURE(status))
660        return;
661
662    /*
663    Try to load, in order:
664     1. The desired locale's collation.
665     2. A fallback of the desired locale.
666     3. The default locale's collation.
667     4. A fallback of the default locale.
668     5. The default collation rules, which contains en_US collation rules.
669
670     To reiterate, we try:
671     Specific:
672      language+country+variant
673      language+country
674      language
675     Default:
676      language+country+variant
677      language+country
678      language
679     Root: (aka DEFAULTRULES)
680     steps 1-5 are handled by resource bundle fallback mechanism.
681     however, in a very unprobable situation that no resource bundle
682     data exists, step 5 is repeated with hardcoded default rules.
683    */
684
685    setUCollator(desiredLocale, status);
686
687    if (U_FAILURE(status))
688    {
689        status = U_ZERO_ERROR;
690
691        setUCollator(kRootLocaleName, status);
692        if (status == U_ZERO_ERROR) {
693            status = U_USING_DEFAULT_WARNING;
694        }
695    }
696
697    if (U_SUCCESS(status))
698    {
699        setRuleStringFromCollator();
700    }
701}
702
703void
704RuleBasedCollator::setUCollator(const char *locale,
705                                UErrorCode &status)
706{
707    if (U_FAILURE(status))
708        return;
709    if (ucollator && dataIsOwned)
710        ucol_close(ucollator);
711    ucollator = ucol_open_internal(locale, &status);
712    dataIsOwned = TRUE;
713    isWriteThroughAlias = FALSE;
714}
715
716
717void
718RuleBasedCollator::checkOwned() {
719    if (!(dataIsOwned || isWriteThroughAlias)) {
720        UErrorCode status = U_ZERO_ERROR;
721        ucollator = ucol_safeClone(ucollator, NULL, NULL, &status);
722        setRuleStringFromCollator();
723        dataIsOwned = TRUE;
724        isWriteThroughAlias = FALSE;
725    }
726}
727
// Expands the RTTI implementation macro for RuleBasedCollator (presumably
// the class-ID accessors such as getDynamicClassID() -- confirm against the
// macro definition).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
729
730U_NAMESPACE_END
731
732#endif /* #if !UCONFIG_NO_COLLATION */
733