1/*
2 ******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 */
7
8/**
9 * File tblcoll.cpp
10 *
11 * Created by: Helena Shih
12 *
13 * Modification History:
14 *
15 *  Date        Name        Description
16 *  2/5/97      aliu        Added streamIn and streamOut methods.  Added
17 *                          constructor which reads RuleBasedCollator object from
18 *                          a binary file.  Added writeToFile method which streams
19 *                          RuleBasedCollator out to a binary file.  The streamIn
20 *                          and streamOut methods use istream and ostream objects
21 *                          in binary mode.
22 *  2/11/97     aliu        Moved declarations out of for loop initializer.
23 *                          Added Mac compatibility #ifdef for ios::nocreate.
24 *  2/12/97     aliu        Modified to use TableCollationData sub-object to
25 *                          hold invariant data.
26 *  2/13/97     aliu        Moved several methods into this class from Collation.
27 *                          Added a private RuleBasedCollator(Locale&) constructor,
28 *                          to be used by Collator::getInstance().  General
29 *                          clean up.  Made use of UErrorCode variables consistent.
30 *  2/20/97     helena      Added clone, operator==, operator!=, operator=, and copy
31 *                          constructor and getDynamicClassID.
32 *  3/5/97      aliu        Changed compaction cycle to improve performance.  We
33 *                          use the maximum allowable value which is kBlockCount.
34 *                          Modified getRules() to load rules dynamically.  Changed
 *                          constructFromFile() call to accommodate this (added
36 *                          parameter to specify whether binary loading is to
37 *                          take place).
38 * 05/06/97     helena      Added memory allocation error check.
39 *  6/20/97     helena      Java class name change.
40 *  6/23/97     helena      Adding comments to make code more readable.
41 * 09/03/97     helena      Added createCollationKeyValues().
42 * 06/26/98     erm         Changes for CollationKeys using byte arrays.
43 * 08/10/98     erm         Synched with 1.2 version of RuleBasedCollator.java
44 * 04/23/99     stephen     Removed EDecompositionMode, merged with
45 *                          Normalizer::EMode
46 * 06/14/99     stephen     Removed kResourceBundleSuffix
47 * 06/22/99     stephen     Fixed logic in constructFromFile() since .ctx
48 *                          files are no longer used.
49 * 11/02/99     helena      Collator performance enhancements.  Special case
50 *                          for NO_OP situations.
51 * 11/17/99     srl         More performance enhancements. Inlined some internal functions.
52 * 12/15/99     aliu        Update to support Thai collation.  Move NormalizerIterator
53 *                          to implementation file.
54 * 01/29/01     synwee      Modified into a C++ wrapper calling C APIs (ucol.h)
55 */
56
57#include "unicode/utypes.h"
58
59#if !UCONFIG_NO_COLLATION
60
61#include "unicode/tblcoll.h"
62#include "unicode/coleitr.h"
63#include "unicode/ures.h"
64#include "unicode/uset.h"
65#include "ucol_imp.h"
66#include "uresimp.h"
67#include "uhash.h"
68#include "cmemory.h"
69#include "cstring.h"
70#include "putilimp.h"
71#include "ustr_imp.h"
72
73/* public RuleBasedCollator constructor ---------------------------------- */
74
75U_NAMESPACE_BEGIN
76
77/**
78* Copy constructor, aliasing, not write-through
79*/
80RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
81: Collator(that)
82, dataIsOwned(FALSE)
83, isWriteThroughAlias(FALSE)
84, ucollator(NULL)
85{
86    RuleBasedCollator::operator=(that);
87}
88
89RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
90                                     UErrorCode& status) :
91dataIsOwned(FALSE)
92{
93    construct(rules,
94        UCOL_DEFAULT_STRENGTH,
95        UCOL_DEFAULT,
96        status);
97}
98
99RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
100                                     ECollationStrength collationStrength,
101                                     UErrorCode& status) : dataIsOwned(FALSE)
102{
103    construct(rules,
104        (UColAttributeValue)collationStrength,
105        UCOL_DEFAULT,
106        status);
107}
108
109RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
110                                     UColAttributeValue decompositionMode,
111                                     UErrorCode& status) :
112dataIsOwned(FALSE)
113{
114    construct(rules,
115        UCOL_DEFAULT_STRENGTH,
116        decompositionMode,
117        status);
118}
119
120RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
121                                     ECollationStrength collationStrength,
122                                     UColAttributeValue decompositionMode,
123                                     UErrorCode& status) : dataIsOwned(FALSE)
124{
125    construct(rules,
126        (UColAttributeValue)collationStrength,
127        decompositionMode,
128        status);
129}
130RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
131                    const RuleBasedCollator *base,
132                    UErrorCode &status) :
133dataIsOwned(TRUE),
134isWriteThroughAlias(FALSE)
135{
136  ucollator = ucol_openBinary(bin, length, base->ucollator, &status);
137}
138
139void
140RuleBasedCollator::setRuleStringFromCollator()
141{
142    int32_t length;
143    const UChar *r = ucol_getRules(ucollator, &length);
144
145    if (r && length > 0) {
146        // alias the rules string
147        urulestring.setTo(TRUE, r, length);
148    }
149    else {
150        urulestring.truncate(0); // Clear string.
151    }
152}
153
154// not aliasing, not write-through
155void
156RuleBasedCollator::construct(const UnicodeString& rules,
157                             UColAttributeValue collationStrength,
158                             UColAttributeValue decompositionMode,
159                             UErrorCode& status)
160{
161    ucollator = ucol_openRules(rules.getBuffer(), rules.length(),
162        decompositionMode, collationStrength,
163        NULL, &status);
164
165    dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it
166    isWriteThroughAlias = FALSE;
167
168    if(ucollator == NULL) {
169        if(U_SUCCESS(status)) {
170            status = U_MEMORY_ALLOCATION_ERROR;
171        }
172        return; // Failure
173    }
174
175    setRuleStringFromCollator();
176}
177
178/* RuleBasedCollator public destructor ----------------------------------- */
179
180RuleBasedCollator::~RuleBasedCollator()
181{
182    if (dataIsOwned)
183    {
184        ucol_close(ucollator);
185    }
186    ucollator = 0;
187}
188
/* RuleBasedCollator public methods -------------------------------------- */
190
191UBool RuleBasedCollator::operator==(const Collator& that) const
192{
193  /* only checks for address equals here */
194  if (this == &that) {
195    return TRUE;
196  }
197  if (!Collator::operator==(that)) {
198    return FALSE;  /* not the same class */
199  }
200
201  RuleBasedCollator& thatAlias = (RuleBasedCollator&)that;
202
203  return ucol_equals(this->ucollator, thatAlias.ucollator);
204}
205
206// aliasing, not write-through
207RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that)
208{
209    if (this == &that) { return *this; }
210
211    UErrorCode intStatus = U_ZERO_ERROR;
212    int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
213    UCollator *ucol = ucol_safeClone(that.ucollator, NULL, &buffersize, &intStatus);
214    if (U_FAILURE(intStatus)) { return *this; }
215
216    if (dataIsOwned) {
217        ucol_close(ucollator);
218    }
219    ucollator = ucol;
220    dataIsOwned = TRUE;
221    isWriteThroughAlias = FALSE;
222    setRuleStringFromCollator();
223    return *this;
224}
225
226// aliasing, not write-through
227Collator* RuleBasedCollator::clone() const
228{
229    RuleBasedCollator* coll = new RuleBasedCollator(*this);
230    // There is a small chance that the internal ucol_safeClone() call fails.
231    if (coll != NULL && coll->ucollator == NULL) {
232        delete coll;
233        return NULL;
234    }
235    return coll;
236}
237
238
239CollationElementIterator* RuleBasedCollator::createCollationElementIterator
240                                           (const UnicodeString& source) const
241{
242    UErrorCode status = U_ZERO_ERROR;
243    CollationElementIterator *result = new CollationElementIterator(source, this,
244                                                                    status);
245    if (U_FAILURE(status)) {
246        delete result;
247        return NULL;
248    }
249
250    return result;
251}
252
253/**
254* Create a CollationElementIterator object that will iterate over the
255* elements in a string, using the collation rules defined in this
256* RuleBasedCollator
257*/
258CollationElementIterator* RuleBasedCollator::createCollationElementIterator
259                                       (const CharacterIterator& source) const
260{
261    UErrorCode status = U_ZERO_ERROR;
262    CollationElementIterator *result = new CollationElementIterator(source, this,
263                                                                    status);
264
265    if (U_FAILURE(status)) {
266        delete result;
267        return NULL;
268    }
269
270    return result;
271}
272
273/**
274* Return a string representation of this collator's rules. The string can
275* later be passed to the constructor that takes a UnicodeString argument,
276* which will construct a collator that's functionally identical to this one.
277* You can also allow users to edit the string in order to change the collation
278* data, or you can print it out for inspection, or whatever.
279*/
280const UnicodeString& RuleBasedCollator::getRules() const
281{
282    return urulestring;
283}
284
285void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer)
286{
287    int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1);
288
289    if (rulesize > 0) {
290        UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) );
291        if(rules != NULL) {
292            ucol_getRulesEx(ucollator, delta, rules, rulesize);
293            buffer.setTo(rules, rulesize);
294            uprv_free(rules);
295        } else { // couldn't allocate
296            buffer.remove();
297        }
298    }
299    else {
300        buffer.remove();
301    }
302}
303
304UnicodeSet *
305RuleBasedCollator::getTailoredSet(UErrorCode &status) const
306{
307    if(U_FAILURE(status)) {
308        return NULL;
309    }
310    return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status);
311}
312
313
314void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const
315{
316    if (versionInfo!=NULL){
317        ucol_getVersion(ucollator, versionInfo);
318    }
319}
320
321/**
322* Compare two strings using this collator
323*/
324UCollationResult RuleBasedCollator::compare(
325                                               const UnicodeString& source,
326                                               const UnicodeString& target,
327                                               int32_t length,
328                                               UErrorCode &status) const
329{
330    return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status);
331}
332
333UCollationResult RuleBasedCollator::compare(const UChar* source,
334                                                       int32_t sourceLength,
335                                                       const UChar* target,
336                                                       int32_t targetLength,
337                                                       UErrorCode &status) const
338{
339    if(U_SUCCESS(status)) {
340        return  ucol_strcoll(ucollator, source, sourceLength, target, targetLength);
341    } else {
342        return UCOL_EQUAL;
343    }
344}
345
346UCollationResult RuleBasedCollator::compare(
347                                             const UnicodeString& source,
348                                             const UnicodeString& target,
349                                             UErrorCode &status) const
350{
351    if(U_SUCCESS(status)) {
352        return ucol_strcoll(ucollator, source.getBuffer(), source.length(),
353                                       target.getBuffer(), target.length());
354    } else {
355        return UCOL_EQUAL;
356    }
357}
358
359UCollationResult RuleBasedCollator::compare(UCharIterator &sIter,
360                                            UCharIterator &tIter,
361                                            UErrorCode &status) const {
362    if(U_SUCCESS(status)) {
363        return ucol_strcollIter(ucollator, &sIter, &tIter, &status);
364    } else {
365        return UCOL_EQUAL;
366    }
367}
368
369/**
370* Retrieve a collation key for the specified string. The key can be compared
371* with other collation keys using a bitwise comparison (e.g. memcmp) to find
372* the ordering of their respective source strings. This is handy when doing a
373* sort, where each sort key must be compared many times.
374*
375* The basic algorithm here is to find all of the collation elements for each
376* character in the source string, convert them to an ASCII representation, and
377* put them into the collation key.  But it's trickier than that. Each
378* collation element in a string has three components: primary ('A' vs 'B'),
379* secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
380* at the end of a string takes precedence over a secondary or tertiary
381* difference earlier in the string.
382*
383* To account for this, we put all of the primary orders at the beginning of
384* the string, followed by the secondary and tertiary orders. Each set of
* orders is terminated by nulls so that a key for a string which is an initial
386* substring of another key will compare less without any special case.
387*
388* Here's a hypothetical example, with the collation element represented as a
389* three-digit number, one digit for primary, one for secondary, etc.
390*
391* String:              A     a     B    \u00C9
392* Collation Elements: 101   100   201  511
393* Collation Key:      1125<null>0001<null>1011<null>
394*
395* To make things even trickier, secondary differences (accent marks) are
396* compared starting at the *end* of the string in languages with French
397* secondary ordering. But when comparing the accent marks on a single base
398* character, they are compared from the beginning. To handle this, we reverse
399* all of the accents that belong to each base character, then we reverse the
400* entire string of secondary orderings at the end.
401*/
402CollationKey& RuleBasedCollator::getCollationKey(
403                                                  const UnicodeString& source,
404                                                  CollationKey& sortkey,
405                                                  UErrorCode& status) const
406{
407    return getCollationKey(source.getBuffer(), source.length(), sortkey, status);
408}
409
410CollationKey& RuleBasedCollator::getCollationKey(const UChar* source,
411                                                    int32_t sourceLen,
412                                                    CollationKey& sortkey,
413                                                    UErrorCode& status) const
414{
415    if (U_FAILURE(status)) {
416        return sortkey.setToBogus();
417    }
418    if (sourceLen < -1 || (source == NULL && sourceLen != 0)) {
419        status = U_ILLEGAL_ARGUMENT_ERROR;
420        return sortkey.setToBogus();
421    }
422
423    if (sourceLen < 0) {
424        sourceLen = u_strlen(source);
425    }
426    if (sourceLen == 0) {
427        return sortkey.reset();
428    }
429
430    int32_t resultLen = ucol_getCollationKey(ucollator, source, sourceLen, sortkey, status);
431
432    if (U_SUCCESS(status)) {
433        sortkey.setLength(resultLen);
434    } else {
435        sortkey.setToBogus();
436    }
437    return sortkey;
438}
439
440/**
441 * Return the maximum length of any expansion sequences that end with the
442 * specified comparison order.
443 * @param order a collation order returned by previous or next.
444 * @return the maximum length of any expansion seuences ending with the
445 *         specified order or 1 if collation order does not occur at the end of any
446 *         expansion sequence.
447 * @see CollationElementIterator#getMaxExpansion
448 */
449int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const
450{
451    uint8_t result;
452    UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result);
453    return result;
454}
455
456uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length,
457                                              UErrorCode &status)
458{
459    return ucol_cloneRuleData(ucollator, &length, &status);
460}
461
462
463int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status)
464{
465  return ucol_cloneBinary(ucollator, buffer, capacity, &status);
466}
467
468void RuleBasedCollator::setAttribute(UColAttribute attr,
469                                     UColAttributeValue value,
470                                     UErrorCode &status)
471{
472    if (U_FAILURE(status))
473        return;
474    checkOwned();
475    ucol_setAttribute(ucollator, attr, value, &status);
476}
477
478UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr,
479                                                      UErrorCode &status) const
480{
481    if (U_FAILURE(status))
482        return UCOL_DEFAULT;
483    return ucol_getAttribute(ucollator, attr, &status);
484}
485
486uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) {
487    checkOwned();
488    return ucol_setVariableTop(ucollator, varTop, len, &status);
489}
490
491uint32_t RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &status) {
492    checkOwned();
493    return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status);
494}
495
496void RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &status) {
497    checkOwned();
498    ucol_restoreVariableTop(ucollator, varTop, &status);
499}
500
501uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const {
502  return ucol_getVariableTop(ucollator, &status);
503}
504
505int32_t RuleBasedCollator::getSortKey(const UnicodeString& source,
506                                         uint8_t *result, int32_t resultLength)
507                                         const
508{
509    return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength);
510}
511
512int32_t RuleBasedCollator::getSortKey(const UChar *source,
513                                         int32_t sourceLength, uint8_t *result,
514                                         int32_t resultLength) const
515{
516    return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength);
517}
518
519int32_t RuleBasedCollator::getReorderCodes(int32_t *dest,
520                                          int32_t destCapacity,
521                                          UErrorCode& status) const
522{
523    return ucol_getReorderCodes(ucollator, dest, destCapacity, &status);
524}
525
526void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes,
527                                       int32_t reorderCodesLength,
528                                       UErrorCode& status)
529{
530    checkOwned();
531    ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status);
532}
533
534int32_t RuleBasedCollator::getEquivalentReorderCodes(int32_t reorderCode,
535                                int32_t* dest,
536                                int32_t destCapacity,
537                                UErrorCode& status)
538{
539    return ucol_getEquivalentReorderCodes(reorderCode, dest, destCapacity, &status);
540}
541
542/**
543* Create a hash code for this collation. Just hash the main rule table -- that
544* should be good enough for almost any use.
545*/
546int32_t RuleBasedCollator::hashCode() const
547{
548    int32_t length;
549    const UChar *rules = ucol_getRules(ucollator, &length);
550    return ustr_hashUCharsN(rules, length);
551}
552
553/**
554* return the locale of this collator
555*/
556Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const {
557    const char *result = ucol_getLocaleByType(ucollator, type, &status);
558    if(result == NULL) {
559        Locale res("");
560        res.setToBogus();
561        return res;
562    } else {
563        return Locale(result);
564    }
565}
566
567void
568RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) {
569    checkOwned();
570    char* rloc  = uprv_strdup(requestedLocale.getName());
571    if (rloc) {
572        char* vloc = uprv_strdup(validLocale.getName());
573        if (vloc) {
574            char* aloc = uprv_strdup(actualLocale.getName());
575            if (aloc) {
576                ucol_setReqValidLocales(ucollator, rloc, vloc, aloc);
577                return;
578            }
579            uprv_free(vloc);
580        }
581        uprv_free(rloc);
582    }
583}
584
// RuleBasedCollator private constructors -----------------------------------
586
587RuleBasedCollator::RuleBasedCollator()
588  : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
589{
590}
591
592RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale,
593                                           UErrorCode& status)
594 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
595{
596    if (U_FAILURE(status))
597        return;
598
599    /*
600    Try to load, in order:
601     1. The desired locale's collation.
602     2. A fallback of the desired locale.
603     3. The default locale's collation.
604     4. A fallback of the default locale.
605     5. The default collation rules, which contains en_US collation rules.
606
607     To reiterate, we try:
608     Specific:
609      language+country+variant
610      language+country
611      language
612     Default:
613      language+country+variant
614      language+country
615      language
616     Root: (aka DEFAULTRULES)
617     steps 1-5 are handled by resource bundle fallback mechanism.
618     however, in a very unprobable situation that no resource bundle
619     data exists, step 5 is repeated with hardcoded default rules.
620    */
621
622    setUCollator(desiredLocale, status);
623
624    if (U_FAILURE(status))
625    {
626        status = U_ZERO_ERROR;
627
628        setUCollator(kRootLocaleName, status);
629        if (status == U_ZERO_ERROR) {
630            status = U_USING_DEFAULT_WARNING;
631        }
632    }
633
634    if (U_SUCCESS(status))
635    {
636        setRuleStringFromCollator();
637    }
638}
639
640void
641RuleBasedCollator::setUCollator(const char *locale,
642                                UErrorCode &status)
643{
644    if (U_FAILURE(status)) {
645        return;
646    }
647    if (ucollator && dataIsOwned)
648        ucol_close(ucollator);
649    ucollator = ucol_open_internal(locale, &status);
650    dataIsOwned = TRUE;
651    isWriteThroughAlias = FALSE;
652}
653
654
655void
656RuleBasedCollator::checkOwned() {
657    if (!(dataIsOwned || isWriteThroughAlias)) {
658        UErrorCode status = U_ZERO_ERROR;
659        ucollator = ucol_safeClone(ucollator, NULL, NULL, &status);
660        setRuleStringFromCollator();
661        dataIsOwned = TRUE;
662        isWriteThroughAlias = FALSE;
663    }
664}
665
666
667int32_t RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
668                                                                      char *buffer,
669                                                                      int32_t capacity,
670                                                                      UErrorCode &status) const {
671  /* simply delegate */
672  return ucol_getShortDefinitionString(ucollator, locale, buffer, capacity, &status);
673}
674
675
676UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
677
678U_NAMESPACE_END
679
680#endif /* #if !UCONFIG_NO_COLLATION */
681